小广告尺寸_丹阳疫情最新消息今天新增_深圳网络推广系统_百度搜索排行榜风云榜

1 加载预训练模型对应的分词器

2 加载数据集

3 数据预处理

4 构建数据加载器DataLoader

5 定义下游任务模型

6 测试代码

7 训练代码

#做（中文与英文的）分类任务，Bert模型比较合适，用cls向下游任务传输数据，做分类任务
#Bert模型要求一般传一个句子对（两句话）

1 加载预训练模型对应的分词器

from transformers import AutoTokenizer#use_fast=True 表示使用RUST语言写的分词器，速度比python写的快
tokenizer = AutoTokenizer.from_pretrained('../data/model/distilbert-base-uncased/', use_fast=True)tokenizer

DistilBertTokenizerFast(name_or_path='../data/model/distilbert-base-uncased/', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),

#编码试算
tokenizer.batch_encode_plus(['hello, everyone, today is a good day', 'how are you, fine thank you, and you?'])
#编码返回的是'input_ids' 和 'attention_mask'

{'input_ids': [[101, 7592, 1010, 3071, 1010, 2651, 2003, 1037, 2204, 2154, 102], [101, 2129, 2024, 2017, 1010, 2986, 4067, 2017, 1010, 1998, 2017, 1029, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

2 加载数据集

from datasets import load_datasetdataset = load_dataset('../data/datasets/cola/', trust_remote_code=True)
dataset

DatasetDict({train: Dataset({features: ['text', 'label'],num_rows: 8551})test: Dataset({features: ['text', 'label'],num_rows: 527})
})

dataset['train'][0]

{'text': "Our friends won't buy this analysis, let alone the next one we propose.",'label': 1}

3 数据预处理

def f(examples, tokenizer):"""只对传输数据集的句子文本'text'进行编码分词"""return tokenizer.batch_encode_plus(examples['text'], truncation=True)dataset = dataset.map(f,batched=True,batch_size=1000,  #一批有1000个数据#num_proc=1 更快 ,   数据量不多的时候， 创建进程也是需要时间开销num_proc=1,  #8个进程同时处理，cpu是8核remove_columns=['text'],   #原数据集中的['text']不要了，转化成['input_ids']fn_kwargs={'tokenizer': tokenizer})

print(dataset['train'][0])

{'label': 1, 'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

4 构建数据加载器DataLoader

#一批数据传输时，每句话的长度必须相同， 否则无法参与矩阵运算
import torch
#DataCollatorWithPadding 读取数据时，自动补全padding，使句子长度相同
from transformers.data.data_collator import DataCollatorWithPaddingloader = torch.utils.data.DataLoader(dataset=dataset['train'],batch_size=8,#实例化一个匿名的collate_fn ,使数据一批批传输，并自动补全padding，使句子长度相同collate_fn=DataCollatorWithPadding(tokenizer),  shuffle=True,drop_last=True)for data in loader:  break  #for循环赋值， 不输出
#data包含'input_ids'和 'attention_mask' 两部分data

{'input_ids': tensor([[  101,  2043,  3021,  5610,  2015,  1010,  2035,  1996,  2062,  2515,6294,  5223,  2032,  1012,   102,     0,     0,     0,     0,     0,0],[  101,  2057,  4687,  2008,  3021,  2187,  1012,   102,     0,     0,0,     0,     0,     0,     0,     0,     0,     0,     0,     0,0],[  101,  2008,  2008,  2005,  5106, 15721,  2000,  5466,  1037,  4906,2052, 28679,  1996,  4932,  2001,  5793,  2003,  2025,  2995,  1012,102],[  101,  1996,  2214,  3899,  2351,  2035,  1996,  2126,  1012,   102,0,     0,     0,     0,     0,     0,     0,     0,     0,     0,0],[  101,  1045,  2215,  2009,  2000,  3961,  1037,  3595,  2008,  3021,2187,  1012,   102,     0,     0,     0,     0,     0,     0,     0,0],[  101,  2027,  2700,  2032,  2637,  1005,  1055, 17089,  2343,  1012,102,     0,     0,     0,     0,     0,     0,     0,     0,     0,0],[  101,  1996,  2795,  2003,  2936,  2084,  1996,  2341,  2003,  2898,1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,0],[  101,  6294,  9619,  2098,  2000,  3046,  2000,  4025,  2000,  2031,2042,  4782,  1012,   102,     0,     0,     0,     0,     0,     0,0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([1, 0, 1, 0, 1, 1, 1, 1])}

len(loader)

1068

5 定义下游任务模型

from transformers import AutoModelForSequenceClassification, DistilBertModel

#查看模型参数与层结构
model_pretrained_parameters = AutoModelForSequenceClassification.from_pretrained('../data/model/distilbert-base-uncased/', num_labels=2) 
model_pretrained_parameters

DistilBertForSequenceClassification((distilbert): DistilBertModel((embeddings): Embeddings((word_embeddings): Embedding(30522, 768, padding_idx=0)(position_embeddings): Embedding(512, 768)(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)(dropout): Dropout(p=0.1, inplace=False))(transformer): Transformer((layer): ModuleList((0-5): 6 x TransformerBlock((attention): MultiHeadSelfAttention((dropout): Dropout(p=0.1, inplace=False)(q_lin): Linear(in_features=768, out_features=768, bias=True)(k_lin): Linear(in_features=768, out_features=768, bias=True)(v_lin): Linear(in_features=768, out_features=768, bias=True)(out_lin): Linear(in_features=768, out_features=768, bias=True))(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)(ffn): FFN((dropout): Dropout(p=0.1, inplace=False)(lin1): Linear(in_features=768, out_features=3072, bias=True)(lin2): Linear(in_features=3072, out_features=768, bias=True)(activation): GELUActivation())(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)))))(pre_classifier): Linear(in_features=768, out_features=768, bias=True)(classifier): Linear(in_features=768, out_features=2, bias=True)(dropout): Dropout(p=0.2, inplace=False)
)

class Model(torch.nn.Module):def __init__(self):super().__init__()  #继承父类的方法self.model_pretrained = DistilBertModel.from_pretrained('../data/model/distilbert-base-uncased/')#全连接层#Bert模型输出的数据的最后一维度是768，这里输入的第0维度也要是768self.fc = torch.nn.Sequential(torch.nn.Linear(768, 768),torch.nn.ReLU(),torch.nn.Dropout(p=0.2),torch.nn.Linear(768, 2))  #二分类问题，情感分析（积极1/消极0）#加载预训练参数的模型model_pretrained_parameters = AutoModelForSequenceClassification.from_pretrained('../data/model/distilbert-base-uncased/',num_labels=2) #labels的类别数量#让全连接层加载预训练的参数self.fc[0].load_state_dict(model_pretrained_parameters.pre_classifier.state_dict())self.fc[3].load_state_dict(model_pretrained_parameters.classifier.state_dict())#损失函数self.criterion = torch.nn.CrossEntropyLoss()def forward(self, input_ids, attention_mask, labels=None):#将输入数据传入预训练模型，得到一个输出结果#logits是三维的logits = self.model_pretrained(input_ids=input_ids, attention_mask=attention_mask)# ：使logits变成二维数据logits = logits.last_hidden_state[:, 0]  #0就是cls的输出结果，因为cls的位置是固定的（每句话的第一个单词就是），其他位置具有不确定性能拿到数据#将logits传入输出层logits = self.fc(logits)#计算损失loss = None  #先将loss设为空if labels is not None: #若传入了labels数据，不为空了#计算损失loss = self.criterion(logits, labels)return {'loss': loss, 'logits': logits}model = Model()
#查看模型参数量
print(sum(i.numel() for i in model.parameters()))

66955010

#试跑一下下游任务模型
#向模型中传入参数
out = model(**data)   #out是一个字典，包含输出的loss和logits
print(out['loss'], out['logits'], out['logits'].shape)
#out['logits'].shape=torch.Size([8, 2]), 8是一批有8个数据， 2是两个类别的概率（哪个值更大，就归哪个类别）

tensor(0.6448, grad_fn=<NllLossBackward0>) tensor([[-0.0228,  0.0688],[-0.1635, -0.0205],[-0.1123,  0.0630],[-0.0492,  0.0820],[-0.1185,  0.1382],[-0.1488,  0.1725],[-0.0806,  0.0836],[-0.0384,  0.0721]], grad_fn=<AddmmBackward0>) torch.Size([8, 2])

#查看测试数据集的labels是否正常有效（没有-1）
dataset['test'][0]

{'label': 1,'input_ids': [101,1996,11279,8469,1996,9478,3154,1997,1996,5749,1012,102],'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

6 测试代码

def test(model):model.eval()  #测试预测时，调到评估模式#构建数据加载器loader_test = torch.utils.data.DataLoader(dataset=dataset['test'],batch_size=16,  #测试预测是在cpu上进行的，batch_size的值可以大一些，为16#DataCollatorWithPadding(tokenizer)实例化collate_fn，不然会报错collate_fn=DataCollatorWithPadding(tokenizer), #成批输送数据时，自动补全pad，使句子长度一致shuffle=True,drop_last=True)outs = []  #存放计算的最大类别概率labels = []  #存放真实值for i, data in enumerate(loader_test):#进行下游任务模型计算预测时，不进行求导梯度下降with torch.no_grad():#out是一个字典，包含loss和logits，out = model(**data)#out['logits']是一个二维数组，shape=（batch_szie, 类别数量）outs.append(out['logits'].argmax(dim=1))labels.append(data['labels'])if i % 10 ==0:  #每隔10次print(i)if i == 50:break  #到50，停止#将outs和labels分别拼接起来outs = torch.cat(outs)labels = torch.cat(labels)#计算准确度accuracy = (outs == labels).sum().item() / len(labels)print('accuracy:', accuracy)

test(model)

0
10
20
30
accuracy: 0.693359375

7 训练代码

from transformers import AdamW   #AdamW梯度下降的优化算法
from transformers.optimization import get_scheduler  #学习率的衰减计算#设置设备、
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

#训练代码
def train():#模型训练时，需要梯度下降、学习更新模型参数，以达到最好的预测效果#定义优化器optimizer = AdamW(model.parameters(),betas=(0.9, 0.999), eps=1e-8, lr=2e-5)  #betas/eps/lr都是默认值#学习率衰减计划scheduler = get_scheduler(name='linear',num_warmup_steps=0,  #无预热缓冲区，从一开始就衰减num_training_steps=len(loader),optimizer=optimizer)#将模型发送到设备上model.to(device)model.train()  #模型训练模式for i,data in enumerate(loader):#接收需要输入的数据input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']#将数据传到设备上input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)#将这些数据传到设备上的模型，获取输出值out（一个字典，包含loss和logits（类别概率））out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)#从out中获取lossloss = out['loss']  #字典key索引#用损失函数进行反向传播loss.backward()#为了梯度下降的稳定性，使用梯度裁剪torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  #公式中的c=1.0#梯度更新optimizer.step()scheduler.step()  #学习率衰减计划更新#梯度清零optimizer.zero_grad()model.zero_grad()if i% 50 == 0:lr = optimizer.state_dict()['param_groups'][0]['lr']#计算预测类别概率的最大值out = out['logits'].argmax(dim=1)#计算准确率accuracy = (labels==out).sum().item() / 8 #batch_size=8print(i, loss.item(), lr, accuracy)print()

train()

0 0.6603636145591736 1.9981273408239703e-05 0.7550 0.6770923733711243 1.9044943820224723e-05 0.625100 0.5856966972351074 1.810861423220974e-05 0.75150 0.5937663316726685 1.7172284644194758e-05 0.75200 0.5329931974411011 1.6235955056179777e-05 0.75250 0.47660014033317566 1.5299625468164797e-05 0.875300 0.22391566634178162 1.4363295880149814e-05 0.875350 0.2534029185771942 1.3426966292134834e-05 1.0400 0.5150715112686157 1.2490636704119851e-05 0.75450 0.5376325845718384 1.155430711610487e-05 0.75500 0.48840606212615967 1.0617977528089888e-05 0.875550 0.40059715509414673 9.681647940074908e-06 0.875600 0.679754376411438 8.745318352059925e-06 0.75650 0.21557165682315826 7.808988764044945e-06 0.875700 0.6123908758163452 6.872659176029963e-06 0.75750 0.4683417081832886 5.936329588014982e-06 0.75800 0.38990333676338196 5e-06 0.875850 0.43256130814552307 4.063670411985019e-06 0.75900 0.32022809982299805 3.1273408239700374e-06 0.875950 0.9173805713653564 2.1910112359550564e-06 0.6251000 0.42855364084243774 1.2546816479400751e-06 0.8751050 0.4637509882450104 3.183520599250937e-07 0.75

#训练完模型，再次测试
test(model.to('cpu'))  #因为测试的数据都在cpu上，需要把在gpu上训练的模型发到cpu上

0
10
20
30
accuracy: 0.779296875

小广告尺寸_丹阳疫情最新消息今天新增_深圳网络推广系统_百度搜索排行榜风云榜

1 加载预训练模型对应的分词器

2 加载数据集

3 数据预处理

4 构建数据加载器DataLoader

5 定义下游任务模型

6 测试代码

7 训练代码

最新新闻

热搜词