# Activate the BERT tokenizer.
# The model files must be downloaded from Hugging Face beforehand into
# model/bert-base-chinese; do_lower_case=True converts upper-case letters
# to lower case.
tokenizer = BertTokenizer.from_pretrained(r'model/bert-base-chinese', do_lower_case=True)

# Encode every sentence into vocabulary ids, letting the tokenizer add the
# special tokens itself.
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]

print("Tokenize the first sentence:")
print(tokenized_texts[0])
# Load the pre-trained BERT sequence classifier (two labels), wrap it for
# data-parallel execution, and send the wrapped model to the device.
model = nn.DataParallel(
    BertForSequenceClassification.from_pretrained(
        'model/bert-base-chinese',
        num_labels=2,
    )
)
model.to(device)
# Next, prepare the main parameters for the optimizer.
param_optimizer = list(model.named_parameters())

# A parameter whose name contains any of these substrings gets no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [
    # Parameters matching none of the no_decay patterns: decayed at 0.1.
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.1,
    },
    # Parameters matching at least one no_decay pattern: no decay.
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
]
# The parameters are now prepared and cleaned up. Next we set the
# hyperparameters for the training loop.
#@title The Hyperparameters for the Training Loop
# (An earlier version used
#  BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1) instead of AdamW.)

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8,  # args.adam_epsilon - default is 1e-8.
)

# Total number of training steps is number of batches * number of epochs.
# `train_dataloader` contains batched data so `len(train_dataloader)` gives
# us the number of batches.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)
# Creating the Accuracy Measurement Function
def flat_accuracy(preds: np.ndarray, labels: np.ndarray) -> float:
    """Return the fraction of predictions that agree with the labels.

    Args:
        preds: 2-D array of per-class scores, one row per example. The
            predicted class of a row is the index of its largest score
            (argmax along axis 1).
        labels: Array of class labels; it is flattened before comparison,
            so any shape with one label per example works.

    Returns:
        The number of positions where prediction and label are equal,
        divided by the number of labels.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
#@title The Training Loop
# Store the per-batch training loss for plotting
train_loss_set = []

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # ---- Training ----

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_steps = 0

    # Train the data for one epoch
    for batch in train_dataloader:
        # Move every tensor of the batch to the target device
        batch = tuple(tensor.to(device) for tensor in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # The model is wrapped in nn.DataParallel: with several devices the
        # per-replica scalar losses are gathered into a vector, which neither
        # backward() nor item() accepts. Averaging reduces it to a scalar and
        # leaves a single-device scalar loss unchanged.
        loss = outputs['loss'].mean()
        batch_loss = loss.item()
        train_loss_set.append(batch_loss)
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
        # Update tracking variables
        tr_loss += batch_loss
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ---- Validation ----

    # Put model in evaluation mode to evaluate on the validation set
    model.eval()

    # Tracking variables
    eval_accuracy = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Move every tensor of the batch to the target device
        batch = tuple(tensor.to(device) for tensor in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory
        # and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Move logits and labels to CPU
        logits = logits['logits'].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the per-batch accuracy; it is averaged over batches below
        eval_accuracy += flat_accuracy(logits, label_ids)
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
#@title Training Evaluation
# Draw the recorded per-batch training loss on a single 15x8 figure.
loss_fig, loss_ax = plt.subplots(figsize=(15, 8))
loss_ax.set_title("Training loss")
loss_ax.set_xlabel("Batch")
loss_ax.set_ylabel("Loss")
loss_ax.plot(train_loss_set)
plt.show()
#@title Predicting and Evaluating Using the Holdout Dataset
df = pd.read_csv(
    "out_of_domain_dev.tsv",
    delimiter='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)

# Create sentence and label lists
sentences = df.sentence.values

# We need to add special tokens at the beginning and end of each sentence
# for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

MAX_LEN = 128

# Use the BERT tokenizer to convert the tokens to their index numbers in the
# BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad our input tokens (and truncate longer sequences) to MAX_LEN at the end
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks: 1.0 for every position holding a token id greater
# than zero, 0.0 for the padding positions
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

batch_size = 32

# Iterate the holdout set in its original order, batch_size examples at a time
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
# Prediction on test set
# Put model in evaluation mode
model.eval()

# Tracking variables: one array of logits / labels per batch
predictions, true_labels = [], []

# Predict
for batch in prediction_dataloader:
    # Move every tensor of the batch to the target device
    batch = tuple(tensor.to(device) for tensor in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = logits['logits'].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
#@title Evaluating Using Matthew's Correlation Coefficient
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import matthews_corrcoef

# One coefficient per prediction batch; the predicted class of each example
# is the argmax of its logits.
matthews_set = [
    matthews_corrcoef(batch_labels, np.argmax(batch_logits, axis=1).flatten())
    for batch_labels, batch_logits in zip(true_labels, predictions)
]

#@title Matthew's Evaluation on the Whole Dataset
# Flatten the predictions and true values for aggregate Matthew's evaluation
# on the whole dataset
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

# Print the aggregate score: a bare expression is only displayed by a
# notebook and is silently discarded when the file runs as a script.
overall_matthews = matthews_corrcoef(flat_true_labels, flat_predictions)
print("Matthews correlation coefficient: {}".format(overall_matthews))