广州百度搜索排名优化_建站多少钱一个_上海app开发公司_广告最多的网站

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
# python == 3.8.6
# torch == 1.10.0+cu102
# transformers == 4.36.2

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `!nvidia-smi` is IPython shell-escape syntax and a SyntaxError in a plain
# Python file. Call the tool through subprocess instead, and only when it is
# actually installed, so CPU-only machines do not fail here. The output is
# captured and printed so it is shown both in a terminal and in a notebook cell.
import shutil
import subprocess

if shutil.which('nvidia-smi') is not None:
    print(subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=False).stdout)

在这里插入图片描述

# Dataset download: https://nyu-mll.github.io/CoLA/
# Load the in-domain training split. The TSV has no header row, so the four
# column names are supplied explicitly:
#   sentence_source - where the sentence comes from
#   label           - 0 = unacceptable, 1 = acceptable
#   label_notes     - the original author's annotation
#   sentence        - the sentence to classify
column_names = ['sentence_source', 'label', 'label_notes', 'sentence']
df = pd.read_csv(
    r'dataset/cola_public/raw/in_domain_train.tsv',
    encoding='utf-8',
    delimiter='\t',
    header=None,
    names=column_names,
)
df.shape
# The original author's note records the shape as (8551, 4).

# Show 10 random rows as a sanity check.
df.sample(10)

# Build the sentence and label arrays.
# The sentences are deliberately NOT wrapped in "[CLS]" / "[SEP]" by hand:
# `tokenizer.encode(..., add_special_tokens=True)` below already adds both
# markers, and wrapping the text as well would feed the model a doubled
# [CLS] ... [SEP] pair.
sentences = df['sentence'].values
labels = df['label'].values
sentences

# Load the BERT tokenizer from a local directory (download the checkpoint from
# Hugging Face beforehand) and lowercase the input text.
# NOTE(review): CoLA is an English corpus while this path names a Chinese
# checkpoint - confirm that 'model/bert-base-chinese' is the intended model.
tokenizer = BertTokenizer.from_pretrained(r'model/bert-base-chinese', do_lower_case=True)

# Encode every sentence to token ids; the tokenizer adds the special tokens.
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
print("Tokenize the first sentence:")
print(tokenized_texts[0])

# Every sequence is brought to exactly MAX_LEN token ids: shorter ones are
# right-padded with the tokenizer's pad id, longer ones are cut at MAX_LEN.
MAX_LEN = 128
padded_rows = []
for tokens in tokenized_texts:
    if len(tokens) < MAX_LEN:
        row = tokens + [tokenizer.pad_token_id] * (MAX_LEN - len(tokens))
    else:
        row = tokens[:MAX_LEN]
    padded_rows.append(row)
input_ids = torch.tensor(padded_rows)

# Check the shape of the padded input.
print("填充后的输入形状：", input_ids.shape)
print(input_ids)

# Build the attention masks so the model does not attend to padding tokens:
# 1.0 for every token id greater than 0, 0.0 otherwise.
# The comparison is done once on the whole `input_ids` tensor rather than
# element by element in Python; `.tolist()` keeps the original result type
# (a list of lists of floats), which the later split/convert code expects.
attention_masks = (input_ids > 0).float().tolist()

# Split the data 90% / 10% into training and validation sets.
# Passing inputs, labels and masks to a single call applies one shared shuffle
# (fixed by random_state) to all three, so rows stay aligned.
(
    train_inputs, validation_inputs,
    train_labels, validation_labels,
    train_masks, validation_masks,
) = train_test_split(input_ids, labels, attention_masks, test_size=0.1, random_state=2018)

# Convert every split to a torch tensor for use in TensorDataset.
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

# Choose the batch size and build the batch iterators.
batch_size = 32

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order, validation batches in order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)
validation_dataloader = DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=batch_size,
)

# BERT model configuration.
from transformers import BertModel, BertConfig

# Build a BertModel from a default BertConfig, then read the configuration
# back from the model so it can be inspected.
model = BertModel(BertConfig())
configuration = model.config
# The printout lists the main Hugging Face parameters; the library is updated
# often, so the exact set of parameters may differ between versions.
print(configuration)
# 下面将讲解这些主要参数:
# attention_probs_dropout_prob:对注意力概率应用的 dropout 率，这里设置为0.1。
# hidden_act:编码器中的非线性激活函数，这里使用 gelu。gelu是高斯误差线性单元(Gaussian Error Linear Units)激活函数的简称，它对输入按幅度加权，使其成为非线性。
# hidden_dropout_prob:应用于全连接层的 dropout 概率。嵌入、编码器和汇聚器层中都有全连接。输出不总是对序列内容的良好反映。汇聚隐藏状态的序列可改善输出序列。这里设置为0.1。
# hidden_size:编码器层的维度，也是汇聚层的维度，这里设置为768。
# initializer_range:初始化权重矩阵时的标准偏差值，这里设置为0.02。
# intermediate_size:编码器前馈层的维度，这里设置为3072。
# layer_norm_eps:层规范化层的 epsilon 值，这里设置为1e-12。
# max_position_embeddings:模型使用的最大长度，这里设置为512。
# model_type:模型的名称，这里设置为 bert。
# num_attention_heads:注意力头数，这里设置为12。
# num_hidden_layers:层数，这里设置为12。
# pad_token_id:使用0作为填充词元的ID，以避免对填充词元进行训练。
# type_vocab_size:token_type_ids的大小用于标识序列。例如，“the dog[SEP] The cat.[SEP]”可用词元 ID [0,0,0,1,1,1]表示。
# vocab_size:模型用于表示 input_ids 的不同词元数量。换句话说，这是模型可以识别和处理的不同词元或单词的总数。在训练过程中，模型会根据给定的词表将文本输入转换为对应的词元序列，其中包含的词元数量是vocab_size。通过使用这个词表，模型能够理解和表示更广泛的语言特征。这里设置为 30522。
# 讲解完这些参数后，接下来将加载预训练模型。

# Load the pretrained BERT checkpoint with a 2-class classification head,
# wrap it for data-parallel execution, and move it to the selected device.
model = nn.DataParallel(
    BertForSequenceClassification.from_pretrained(
        'model/bert-base-chinese',
        num_labels=2,
    )
)
model.to(device)

# Prepare the optimizer's parameter groups.
param_optimizer = list(model.named_parameters())
param_optimizer

# Parameters whose name contains one of these fragments get no weight decay;
# every other parameter is decayed with a rate of 0.1.
no_decay = ['bias', 'LayerNorm.weight']
decay_params, no_decay_params = [], []
for name, param in param_optimizer:
    if any(fragment in name for fragment in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.1},
    {'params': no_decay_params, 'weight_decay': 0.0},
]
# 现在参数已经准备好并进行了清理。接下来我们设置训练循环的超参数。

#@title The Hyperparameters for the Training Loop
# Number of training epochs (the BERT authors recommend between 2 and 4).
epochs = 4

# `transformers.AdamW` is deprecated in favour of the PyTorch implementation,
# so the optimizer is built with `torch.optim.AdamW`. Weight decay is set per
# parameter group in `optimizer_grouped_parameters`, so it is not passed here.
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=2e-5,   # run_glue.py default is 5e-5; this notebook uses 2e-5
    eps=1e-8,  # Adam epsilon, same as the run_glue.py default
)

# Total number of training steps is number of batches * number of epochs.
# `train_dataloader` yields batches, so `len(train_dataloader)` is the number
# of batches per epoch.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate decay over the whole run, without warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # default value in run_glue.py
    num_training_steps=total_steps,
)

# Creating the Accuracy Measurement Function
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: array-like of integer class ids; it is flattened before the
            comparison, so any shape with one entry per example works.

    Returns:
        Number of correct predictions divided by the number of labels.
    """
    # np.asarray lets callers pass plain lists as well as numpy arrays.
    pred_flat = np.argmax(np.asarray(preds), axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

#@title The Training Loop
t = []  # kept from the original; nothing in this section appends to it
train_loss_set = []  # per-batch training loss, used for plotting afterwards

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # ---- Training ----
    # Set the model to training mode (as opposed to evaluation mode).
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train on the data for one epoch.
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the selected device and unpack it.
        batch = tuple(tensor.to(device) for tensor in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Clear out the gradients (by default they accumulate).
        optimizer.zero_grad()

        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # The model is wrapped in nn.DataParallel: with more than one GPU the
        # loss comes back as one value per replica, so reduce it to a scalar.
        # On a scalar loss .mean() is a no-op.
        loss = outputs['loss'].mean()
        loss_value = loss.item()  # read once; .item() forces a device sync
        train_loss_set.append(loss_value)

        # Backward pass
        loss.backward()
        # Update parameters using the computed gradients.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

        # Update tracking variables
        tr_loss += loss_value
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ---- Validation ----
    # Put the model in evaluation mode to evaluate on the validation set.
    model.eval()

    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate on the data for one epoch.
    for batch in validation_dataloader:
        batch = tuple(tensor.to(device) for tensor in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for validation; this saves memory and time.
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Move logits and labels to CPU.
        logits = logits['logits'].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch's accuracy by its size so that a smaller final
        # batch does not count as much as a full one.
        batch_examples = len(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy * batch_examples
        nb_eval_examples += batch_examples
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_examples))

#@title Training Evaluation
# Plot the per-batch training loss recorded in `train_loss_set`.
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title("Training loss")
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")
ax.plot(train_loss_set)
plt.show()

在这里插入图片描述

#@title Predicting and Evaluating Using the Holdout Dataset
# NOTE(review): the training split is read from 'dataset/cola_public/raw/',
# while this file is read from the working directory - confirm the path.
df = pd.read_csv("out_of_domain_dev.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

# Create sentence and label lists.
sentences = df.sentence.values
# Special tokens are added by hand here because the sentences are processed
# with tokenize() + convert_tokens_to_ids(), which do not add them.
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

MAX_LEN = 128

# Use the BERT tokenizer to convert the tokens to their index numbers in the
# BERT vocabulary.
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad / truncate every sequence to exactly MAX_LEN ids, both at the end.
# The original called `pad_sequences`, which is never imported or defined in
# this file; this is the same "post" padding and truncation done directly.
# A non-positive repeat count yields an empty list, so long sequences are
# only truncated.
pad_id = tokenizer.pad_token_id
input_ids = [ids[:MAX_LEN] + [pad_id] * (MAX_LEN - len(ids)) for ids in input_ids]

# Create attention masks: 1.0 for each real token, 0.0 for padding.
attention_masks = [[float(token_id != pad_id) for token_id in seq] for seq in input_ids]

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

batch_size = 32

prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# Prediction on the holdout set.
# Evaluation mode first, then collect one logits array and one label array
# per batch.
model.eval()
predictions, true_labels = [], []

for batch in prediction_dataloader:
    # Move the batch to the selected device and unpack it.
    b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)

    # Inference only: skip gradient bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Bring logits and labels back to the CPU as numpy arrays and store them.
    predictions.append(outputs['logits'].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

#@title Evaluating Using Matthew's Correlation Coefficient
# Score every holdout batch separately with Matthew's correlation coefficient:
# each batch's arg-max class predictions are compared with its true labels.
from sklearn.metrics import matthews_corrcoef

matthews_set = [
    matthews_corrcoef(batch_labels, np.argmax(batch_logits, axis=1).flatten())
    for batch_labels, batch_logits in zip(true_labels, predictions)
]

#@title Matthew's Evaluation on the Whole Dataset
# Merge the per-batch results into one sequence each, then compute a single
# Matthew's correlation coefficient over the entire holdout set.
all_logit_rows = [row for batch_logits in predictions for row in batch_logits]
flat_predictions = np.argmax(all_logit_rows, axis=1).flatten()
flat_true_labels = [label for batch_labels in true_labels for label in batch_labels]
matthews_corrcoef(flat_true_labels, flat_predictions)

广州百度搜索排名优化_建站多少钱一个_上海app开发公司_广告最多的网站

最新新闻

热搜词