Using Tricks in NLP Competitions

[TOC]

A summary of tricks for NLP text-classification competitions.

Data cleaning, a sensible model architecture, and a strong base model are what really matter; the tricks below build on top of that.

Model selection

Chinese: roberta, ernie-3.0-xbase-zh, nezha

English: deberta

Data augmentation with LLMs

Use a large language model to generate additional training data; in essence, this taps into the prior knowledge baked into the LLM.
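A minimal sketch of the idea, assuming the OpenAI Python client is available; the model name and the paraphrasing prompt are illustrative choices, and any instruction-tuned LLM can play the same role:

# Sketch of LLM-based augmentation: ask an instruction-tuned model to paraphrase
# each training sentence. Model name and prompt are illustrative assumptions.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def augment(text, label, n=2):
    prompt = (
        f"Rewrite the following sentence {n} times, keeping its meaning "
        f"(label: {label}). Return one rewrite per line.\n\n{text}"
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # any capable chat model works here
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
    )
    content = response.choices[0].message.content
    return [line.strip() for line in content.splitlines() if line.strip()]

# new_samples = [(aug, label) for text, label in train_set for aug in augment(text, label)]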

Freezing parameters

from transformers import BertForSequenceClassification

# Load a pre-trained BERT model; 'bert-base-uncased' is used as an example
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# First mark every parameter as non-trainable
for param in model.parameters():
    param.requires_grad = False

# Then make the last two encoder layers trainable again
for param in model.bert.encoder.layer[-2:].parameters():
    param.requires_grad = True

# Also unfreeze the classification head, which the first loop froze as well
for param in model.classifier.parameters():
    param.requires_grad = True

# Now only the last two encoder layers and the classification head are trained
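When most of the network is frozen, it is common to hand only the trainable parameters to the optimizer; a small usage sketch (the learning rate here is just an example):

import torch

# Only parameters with requires_grad=True reach the optimizer,
# which also keeps the optimizer state small.
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=2e-5
)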

Layer-wise learning rates

Grouping parameters by layer

import torch
from transformers import AutoModel

def get_bert_layerwise_lr_groups(bert_model, learning_rate=1e-5, layer_decay=0.9):
    # Lower layers get exponentially smaller learning rates than upper ones.
    n_layers = len(bert_model.encoder.layer) + 1  # + 1 for the embedding layer
    embedding_decayed_lr = learning_rate * (layer_decay ** (n_layers + 1))
    grouped_parameters = [{"params": bert_model.embeddings.parameters(), 'lr': embedding_decayed_lr}]
    for depth in range(1, n_layers):
        decayed_lr = learning_rate * (layer_decay ** (n_layers + 1 - depth))
        grouped_parameters.append(
            {"params": bert_model.encoder.layer[depth - 1].parameters(), 'lr': decayed_lr}
        )
    return grouped_parameters

model = AutoModel.from_pretrained("roberta-base")
lr_groups = get_bert_layerwise_lr_groups(model, learning_rate=1e-5)
optimizer = torch.optim.AdamW(
    lr_groups, lr=1e-5, weight_decay=0
)

# Alternative: instead of decaying the learning rate, freeze layers and unfreeze the rest
from transformers import BertModel
# Load the pre-trained BERT model
model = BertModel.from_pretrained('bert-base-uncased')
# Total number of encoder layers (BERT-base has 12)
total_layers = len(model.encoder.layer)
# Freeze the first half of the layers
for i in range(total_layers // 2):
    for param in model.encoder.layer[i].parameters():
        param.requires_grad = False
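If the layer-wise setup above is later wrapped with a task-specific classification head, that head has no entry in lr_groups yet. A minimal sketch of adding one, where the linear head and num_labels are hypothetical; the randomly initialized head gets the full, undecayed learning rate:

import torch
import torch.nn as nn

num_labels = 2  # hypothetical number of classes
classifier = nn.Linear(768, num_labels)  # hypothetical task head; 768 = hidden size of roberta-base
lr_groups.append({"params": classifier.parameters(), "lr": 1e-5})
optimizer = torch.optim.AdamW(lr_groups, lr=1e-5, weight_decay=0)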

In-domain MLM training

For BERT-style models, continuing masked language model (MLM) training on text from the target domain can improve downstream performance to some extent.

Using the GitHub script

Run the commands below (the command-line flags follow the Trainer arguments):

git clone https://github.com/huggingface/transformers.git
git clone https://github.com/huggingface/evaluate.git
pip install -q ./transformers
pip install datasets==2.14.6
pip install evaluate -q
python /root/transformers/examples/pytorch/language-modeling/run_mlm.py \
    --model_name_or_path /gemini/data-1 \
    --num_train_epochs 8 \
    --train_file /root/train.txt \
    --validation_file /root/test.txt \
    --per_device_train_batch_size 32 \
    --do_train \
    --do_eval \
    --output_dir /root/save \
    --line_by_line \
    --eval_steps 500 \
    --save_steps 500 \
    --logging_steps 50 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --overwrite_output_dir \
    --max_seq_length 266 \
    --save_total_limit 1

Adversarial training

For example FGM (Fast Gradient Method), implemented below.

import torch

class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=1., emb_name='emb.'):
        # emb_name should be the name of the embedding parameters in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='emb.'):
        # emb_name should be the name of the embedding parameters in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

# Initialization
fgm = FGM(model)
for batch_input, batch_label in data:
    # Normal training step
    loss = model(batch_input, batch_label)
    loss.backward()  # backprop to get the normal gradients
    # Adversarial step
    fgm.attack()  # add an adversarial perturbation to the embeddings
    loss_adv = model(batch_input, batch_label)
    loss_adv.backward()  # backprop, accumulating the adversarial gradients on top of the normal ones
    fgm.restore()  # restore the original embedding parameters
    # Gradient descent: update the parameters
    optimizer.step()
    model.zero_grad()

R-drop

A reference implementation:

import torch.nn.functional as F

model = TaskModel()

def compute_kl_loss(p, q, pad_mask=None):

    p_loss = F.kl_div(F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction='none')
    q_loss = F.kl_div(F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction='none')

    # pad_mask is for seq-level tasks
    if pad_mask is not None:
        p_loss.masked_fill_(pad_mask, 0.)
        q_loss.masked_fill_(pad_mask, 0.)

    # You can choose between "sum" and "mean" depending on your task
    p_loss = p_loss.sum()
    q_loss = q_loss.sum()

    loss = (p_loss + q_loss) / 2
    return loss

# keep dropout enabled and forward twice
logits = model(x)

logits2 = model(x)

# cross-entropy loss for the classifier
ce_loss = 0.5 * (cross_entropy_loss(logits, label) + cross_entropy_loss(logits2, label))

kl_loss = compute_kl_loss(logits, logits2)

# carefully choose the weighting hyper-parameter alpha
loss = ce_loss + alpha * kl_loss

Turning off dropout

self.config.hidden_dropout_prob = 0.
self.config.attention_probs_dropout_prob = 0.
# the classification head may also contain dropout layers
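A sketch of one way to apply this when loading the backbone (the model name is just an example):

from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("roberta-base")
config.hidden_dropout_prob = 0.0           # dropout inside the transformer layers
config.attention_probs_dropout_prob = 0.0  # dropout on the attention weights
model = AutoModel.from_pretrained("roberta-base", config=config)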

Model ensembling

The guiding principle of ensembling: good and diverse, i.e., each individual model should be strong and the models should differ from each other.
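In its simplest form this means averaging the class probabilities of several diverse models; a minimal sketch, where the probability matrices are assumed to come from models trained with different seeds, architectures, or folds:

import numpy as np

# probs_list: one (num_samples, num_classes) probability matrix per model.
def average_ensemble(probs_list, weights=None):
    probs = np.average(np.stack(probs_list, axis=0), axis=0, weights=weights)
    return probs.argmax(axis=-1)

# preds = average_ensemble([model_a_probs, model_b_probs, model_c_probs])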

Pseudo-labeling

Use the model's predictions on unlabeled (e.g., test) data as labels and retrain on the enlarged training set.
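A common variant keeps only high-confidence predictions; a minimal sketch, where the data format and the confidence threshold are assumptions:

import numpy as np

# test_probs: (num_samples, num_classes) softmax outputs on unlabeled texts.
def make_pseudo_labels(test_texts, test_probs, threshold=0.95):
    confidence = test_probs.max(axis=-1)
    labels = test_probs.argmax(axis=-1)
    keep = confidence >= threshold
    return [(t, int(l)) for t, l, k in zip(test_texts, labels, keep) if k]

# train_set = train_set + make_pseudo_labels(test_texts, test_probs)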

EMA

Exponential moving average: enable it once training is close to convergence; keeping a weighted average of the model weights tends to improve generalization.

Source: Lukan, https://www.zhihu.com/question/265357659/answer/3048520994 (Zhihu)
class EMA():
    def __init__(self, model, decay):
        self.model = model
        self.decay = decay
        self.shadow = {}
        self.backup = {}

    def register(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.shadow
                new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
                self.shadow[name] = new_average.clone()

    def apply_shadow(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.shadow
                self.backup[name] = param.data
                param.data = self.shadow[name]

    def restore(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

# Initialization
ema = EMA(model, 0.999)
ema.register()

# During training, update the shadow weights right after each parameter update
def train():
    optimizer.step()
    ema.update()

# Before evaluation, apply the shadow weights; after evaluation, restore the original parameters
def evaluate():
    ema.apply_shadow()
    # evaluate
    ema.restore()

Pooling strategies

On top of the last hidden layer of a BERT model you can apply mean pooling or max pooling; the code below instead uses a learned attention gate to weight the tokens.

import torch
import torch.nn as nn

class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        )

    def forward(self, last_hidden_state, attention_mask):
        # Score each token, mask out padding, and take a softmax-weighted sum
        w = self.attention(last_hidden_state).float()
        w[attention_mask == 0] = float('-inf')
        w = torch.softmax(w, 1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings
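For comparison, a mask-aware mean pooling over the last hidden state, a common and simpler alternative to the attention pooling above:

import torch
import torch.nn as nn

class MeanPooling(nn.Module):
    def forward(self, last_hidden_state, attention_mask):
        # Zero out padding positions, then divide by the number of real tokens.
        mask = attention_mask.unsqueeze(-1).float()
        summed = torch.sum(last_hidden_state * mask, dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts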