Sentiment Analysis with EDA 📊🧠 (bonus at the end)
Step 1: Import the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import re
import nltk
from nltk import ngrams
import spacy
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
plt.style.use('fivethirtyeight')
color = ['#E36149', '#49AF72']
color_palette = ["#FF6F61", "#6B5B95"]
Step 2: Load the data
df = pd.read_csv('/kaggle/input/vs-sentiment-analysis/Beginner_Reviews_dataset.csv')
pd.set_option('display.max_colwidth', None)
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Unnamed: 0  1000 non-null   int64
 1   sentence    1000 non-null   object
 2   label       1000 non-null   int64
dtypes: int64(2), object(1)
memory usage: 23.6+ KB
df.isna().sum()
Unnamed: 0    0
sentence      0
label         0
dtype: int64
Step 3: Data cleaning
df['label'] = df['label'].astype('category')
df.drop(columns = 'Unnamed: 0', inplace = True)
df['sentence'].head(15)
0     Wow... Loved this place.
1     Crust is not good.
2     Not tasty and the texture was just nasty.
3     Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.
4     The selection on the menu was great and so were the prices.
5     Now I am getting angry and I want my damn pho.
6     Honeslty it didn't taste THAT fresh.)
7     The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.
8     The fries were great too.
9     A great touch.
10    Service was very prompt.
11    Would not go back.
12    The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.
13    I tried the Cape Cod ravoli, chicken,with cranberry...mmmm!
14    I was disgusted because I was pretty sure that was human hair.
Name: sentence, dtype: object
Cleaning target, using the first review as an example:
Wow... Loved this place.  --->>>  wow loved this place
re.sub(r'[^A-Za-z0-9\s]+', '', 'Wow... Loved this place.')
'Wow Loved this place'
def remove_special_characters(text):
    return re.sub(r'[^A-Za-z0-9\s]+', '', text)
def lower_case(text):
    return text.lower()
df['sentence'] = df['sentence'].apply(remove_special_characters)
df['sentence'] = df['sentence'].apply(lower_case)
df['sentence'].head()
0    wow loved this place
1    crust is not good
2    not tasty and the texture was just nasty
3    stopped by during the late may bank holiday off rick steve recommendation and loved it
4    the selection on the menu was great and so were the prices
Name: sentence, dtype: object
df['sentence'].sample(20)
852    so flavorful and has just the perfect amount of heat
882    we definately enjoyed ourselves
946    it was a bit too sweet not really spicy enough and lacked flavor
534    terrible management
220    the shower area is outside so you can only rinse not take a full shower unless you dont mind being nude for everyone to see
894    if you stay in vegas you must get breakfast here at least once
351    he was terrible
786    we will not be coming back
244    like the other reviewer said you couldnt pay me to eat at this place again
885    i had about two bites and refused to eat anymore
561    after two i felt disgusting
729    as for the service i thought it was good
494    their daily specials are always a hit with my group
727    a fly was in my apple juice a fly
675    a fantastic neighborhood gem
845    this place deserves no stars
531    this place is two thumbs upway up
874    weird vibe from owners
73     it took over 30 min to get their milkshake which was nothing more than chocolate milk
734    they also now serve indian naan bread with hummus and some spicy pine nut sauce that was out of this world
Name: sentence, dtype: object
sentence_with_stopwords = 'i personally love the hummus pita baklava falafels and baba ganoush its amazing what they do with eggplant'
clean_words = ''
for word in sentence_with_stopwords.split():
    if word not in stopwords.words('english'):
        clean_words += ' ' + word
print(clean_words)
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)
df['sentence'].head(10)
0    wow loved this place
1    crust is not good
2    not tasty and the texture was just nasty
3    stopped by during the late may bank holiday off rick steve recommendation and loved it
4    the selection on the menu was great and so were the prices
5    now i am getting angry and i want my damn pho
6    honeslty it didnt taste that fresh
7    the potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer
8    the fries were great too
9    a great touch
Name: sentence, dtype: object
# Load the spaCy English model used for lemmatization (this step is not shown in the original notebook; en_core_web_sm is assumed)
nlp = spacy.load('en_core_web_sm')

def lemmatize_sentence(text):
    doc = nlp(text)
    lemmatized_sentence = ' '.join([token.lemma_ for token in doc])
    return lemmatized_sentence
df['sentence'] = df['sentence'].apply(lemmatize_sentence)
df['sentence'].head(10)
0    wow love this place
1    crust be not good
2    not tasty and the texture be just nasty
3    stop by during the late may bank holiday off rick steve recommendation and love it
4    the selection on the menu be great and so be the price
5    now I be get angry and I want my damn pho
6    honeslty it do not taste that fresh
7    the potato be like rubber and you could tell they have be make up ahead of time be keep under a warmer
8    the fry be great too
9    a great touch
Name: sentence, dtype: object
df['sentence'] = df['sentence'].apply(remove_stopwords)
df.head()
Step 4: Exploratory data analysis
background_color = '#f0f0f0'
plt.figure(figsize=(10, 5), facecolor=background_color)
sns.set_palette("pastel")
p = sns.countplot(x="label", data=df, edgecolor='black', linewidth=2, width=0.7, palette=color_palette, hue="label")
for container in p.containers:
    p.bar_label(container, label_type="center", color="black", fontsize=17, weight='bold', padding=6,
                bbox={"boxstyle": "round", "pad": 0.3, "facecolor": "white", "edgecolor": "black", "linewidth": 1.5, "alpha": 0.9})
plt.title("Label Distribution", fontweight='bold', fontsize=20, color='#333333')
plt.ylabel("Count", fontsize=15, color='#333333')
plt.xlabel("Labels", fontsize=15, color='#333333')
p.set_facecolor(background_color)
p.tick_params(colors='#333333', labelsize=12)
plt.show()
all_text = ' '.join(df['sentence'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud for All Sentences')
plt.axis('off')
plt.show()
Step 5: Keywords by label
positive_text = ' '.join(df[df['label'] == 1]['sentence'])
wordcloud_positive = WordCloud(width=800, height=400, background_color='white').generate(positive_text)
negative_text = ' '.join(df[df['label'] == 0]['sentence'])
wordcloud_negative = WordCloud(width=800, height=400, background_color='black').generate(negative_text)

plt.figure(figsize=(20, 15))
plt.subplot(1, 2, 1)
plt.imshow(wordcloud_positive, interpolation='bilinear')
plt.title('Word Cloud for Positive Reviews', fontsize=20, fontweight = 'bold')
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(wordcloud_negative, interpolation='bilinear')
plt.title('Word Cloud for Negative Reviews', fontsize=20, fontweight = 'bold')
plt.axis('off')
plt.show()
# Extract the review sentences from the dataframe
reviews = df['sentence']
# Join all reviews into one long string
all_reviews_text = ' '.join(reviews)
# Split the string into a list of words
all_words = all_reviews_text.split()
# Count how often each word occurs
word_freq = Counter(all_words)
# Take the 20 most frequent words and their counts
most_common_words = word_freq.most_common(20)
# Convert the result into a dataframe for plotting
common_words_df = pd.DataFrame(most_common_words, columns=['word', 'count'])
# Set the figure size
plt.figure(figsize=(14, 7))
# Draw a horizontal bar chart with seaborn: x = word frequency, y = word
p = sns.barplot(x='count', y='word', data=common_words_df, palette='viridis')
# Label each bar with its frequency
for container in p.containers:
    p.bar_label(container, label_type='center', color='black', fontsize=12, weight='bold', padding=3, fmt='%d')
# Set the chart title and its font properties
plt.title('Top 20 Most Frequent Words in Reviews', fontsize=16, fontweight='bold')
# Set the x-axis label
plt.xlabel('Frequency', fontsize=14)
# Set the y-axis label
plt.ylabel('Words', fontsize=14)
# Display the chart
plt.show()
# Extract the reviews labelled as positive (label == 1) from the dataframe
positive_reviews = df[df['label'] == 1]['sentence']
# Join all positive reviews into one long string
positive_text = ' '.join(positive_reviews)
# Split the string into a list of words
positive_words = positive_text.split()
# Generate the list of bigrams (each bigram is a pair of adjacent words)
positive_bigrams = list(ngrams(positive_words, 2))
# Count how often each bigram occurs
bigram_freq = Counter(positive_bigrams)
# Take the 20 most frequent bigrams and their counts
most_common_bigrams = bigram_freq.most_common(20)
# Convert the result into a dataframe for plotting
common_bigrams_df = pd.DataFrame(most_common_bigrams, columns=['bigram', 'count'])
# Convert each bigram tuple into a space-separated string
common_bigrams_df['bigram'] = common_bigrams_df['bigram'].apply(lambda x: ' '.join(x))
# Set the figure size
plt.figure(figsize=(14, 7))
# Draw a horizontal bar chart with seaborn: x = bigram frequency, y = bigram
p = sns.barplot(x='count', y='bigram', data=common_bigrams_df, palette='viridis')
# Label each bar with its frequency
for container in p.containers:
    p.bar_label(container, label_type='center', color='black', fontsize=12, weight='bold', padding=3, fmt='%d')
# Set the chart title and its font properties
plt.title('Top 20 Most Frequent Bigrams in Positive Reviews', fontsize=16, fontweight='bold')
# Set the x-axis label
plt.xlabel('Frequency', fontsize=14)
# Set the y-axis label
plt.ylabel('Bigrams', fontsize=14)
# Display the chart
plt.show()
Code explanation:

- Data extraction and processing:
  - Filter the positive reviews (rows where label is 1) from the dataframe.
  - Join these reviews into one long string and split it into a list of words.
  - Use the ngrams function to generate all bigrams.
- Frequency counting:
  - Use Counter to count how often each bigram occurs.
  - Keep the 20 most frequent bigrams.
- Data preparation and visualization:
  - Convert the frequency results into a dataframe.
  - Format each bigram tuple as a string (e.g. ('word1', 'word2') becomes 'word1 word2').
  - Draw a Seaborn bar chart of the 20 most frequent bigrams and their frequencies.
  - Set the chart title and axis labels, then display the chart.
# Extract the reviews labelled as negative (label == 0) from the dataframe
negative_reviews = df[df['label'] == 0]['sentence']
# Join all negative reviews into one long string
negative_text = ' '.join(negative_reviews)
# Split the string into a list of words
negative_words = negative_text.split()
# Generate the list of bigrams (each bigram is a pair of adjacent words)
negative_bigrams = list(ngrams(negative_words, 2))
# Count how often each bigram occurs
bigram_freq = Counter(negative_bigrams)
# Take the 20 most frequent bigrams and their counts
most_common_bigrams = bigram_freq.most_common(20)
# Convert the result into a dataframe for plotting
common_bigrams_df = pd.DataFrame(most_common_bigrams, columns=['bigram', 'count'])
# Convert each bigram tuple into a space-separated string
common_bigrams_df['bigram'] = common_bigrams_df['bigram'].apply(lambda x: ' '.join(x))
# Set the figure size
plt.figure(figsize=(14, 7))
# Draw a horizontal bar chart with seaborn: x = bigram frequency, y = bigram
p = sns.barplot(x='count', y='bigram', data=common_bigrams_df, palette='viridis')
# Label each bar with its frequency
for container in p.containers:
    p.bar_label(container, label_type='center', color='black', fontsize=12, weight='bold', padding=3, fmt='%d')
# Set the chart title and its font properties
plt.title('Top 20 Most Frequent Bigrams in Negative Reviews', fontsize=16, fontweight='bold')
# Set the x-axis label
plt.xlabel('Frequency', fontsize=14)
# Set the y-axis label
plt.ylabel('Bigrams', fontsize=14)
# Display the chart
plt.show()
Step 6: Model building, compiling, and training
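The model definition below uses word_index, and model.fit uses x_train, x_test, y_train, and y_test, but the notebook does not show how these were produced. Here is a minimal sketch of the assumed tokenization, padding, and train/test split, consistent with the Tokenizer, pad_sequences, and train_test_split imports above and with the maxlen=20 used later in preprocess_input; padding='post', test_size=0.2, and random_state=42 are assumptions.

# Build the vocabulary from the cleaned sentences (assumed step; not shown in the original notebook)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['sentence'])
word_index = tokenizer.word_index

# Convert each sentence into a sequence of word ids and pad/truncate to a fixed length of 20
sequences = tokenizer.texts_to_sequences(df['sentence'])
padded_sequences = pad_sequences(sequences, maxlen=20, padding='post', truncating='post')

# Split into training and test sets (an 80/20 split is assumed)
labels = df['label'].astype('int32').values
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)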
model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=12))
model.add(SimpleRNN(22, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.1)))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=20, callbacks=[early_stopping], batch_size=26)
Epoch 1/20   31/31 ━━━━━━━━━━━━━━━━━━━━ 3s 17ms/step - accuracy: 0.4791 - loss: 0.8647 - val_accuracy: 0.5482 - val_loss: 0.8319
Epoch 2/20   31/31 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.5884 - loss: 0.8136 - val_accuracy: 0.5787 - val_loss: 0.7969
Epoch 3/20   31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7518 - loss: 0.7022 - val_accuracy: 0.5533 - val_loss: 0.8105
Epoch 4/20   31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.8447 - loss: 0.5550 - val_accuracy: 0.5787 - val_loss: 0.8409
Epoch 5/20   31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.9206 - loss: 0.4198 - val_accuracy: 0.5381 - val_loss: 0.9473
Epoch 6/20   31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.9367 - loss: 0.3320 - val_accuracy: 0.5685 - val_loss: 0.9977
Epoch 7/20   31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.9692 - loss: 0.2848 - val_accuracy: 0.5939 - val_loss: 1.0118
Epoch 8/20   31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.9816 - loss: 0.2479 - val_accuracy: 0.5787 - val_loss: 1.0377
Epoch 9/20   31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.9826 - loss: 0.2215 - val_accuracy: 0.5736 - val_loss: 1.0454
Epoch 10/20  31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.9843 - loss: 0.2049 - val_accuracy: 0.5838 - val_loss: 1.0615
Epoch 11/20  31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.9887 - loss: 0.1872 - val_accuracy: 0.5888 - val_loss: 1.0682
Epoch 12/20  31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.9845 - loss: 0.1887 - val_accuracy: 0.5838 - val_loss: 1.0929
Step 7: Visualizing model loss and accuracy
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sns.lineplot(ax=axes[0], data=history.history['loss'], label='training loss')
sns.lineplot(ax= axes[0], data = history.history['val_loss'], label = 'testing loss')
axes[0].set_title('Loss')
axes[0].set_xlabel('Epochs')
axes[0].set_ylabel('Loss')
axes[0].legend()

sns.lineplot(ax=axes[1], data=history.history['accuracy'], label='training accuracy')
sns.lineplot(ax = axes[1], data = history.history['val_accuracy'], label = 'testing accuracy')
axes[1].set_title('Accuracy')
axes[1].set_xlabel('Epochs')
axes[1].set_ylabel('Accuracy')
axes[1].legend()

plt.tight_layout()
plt.show()
Step 8: Prediction
def preprocess_input(sentence, tokenizer, maxlen=20):
    removed_spec_char = remove_special_characters(sentence)
    print(removed_spec_char)
    removed_stop_words = remove_stopwords(removed_spec_char)
    print(removed_stop_words)
    lemma_sentence = lemmatize_sentence(removed_stop_words)
    print(lemma_sentence)
    tokenized_sequence = tokenizer.texts_to_sequences([lemma_sentence])
    padded_sequence = pad_sequences(tokenized_sequence, maxlen=maxlen, padding='post', truncating='post')
    return padded_sequence
sentence = 'The food was amazing and the service was excellent'
padded_input = preprocess_input(sentence, tokenizer, maxlen=20)
prediction = model.predict(padded_input)
print(f"Raw prediction value: {prediction[0][0]}")# Determine if it's positive or negative
threshold = 0.5
if prediction[0][0] > threshold:
    print("Prediction: Positive")
else:
    print("Prediction: Negative")
The food was amazing and the service was excellent
food amazing service excellent
food amazing service excellent
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 162ms/step
Raw prediction value: 0.9252229928970337
Prediction: Positive
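Beyond checking single sentences, the whole held-out split can be scored in one call. A minimal sketch, assuming the x_test and y_test arrays from the tokenization and split step in Step 6:

# Score the trained model on the held-out test set (x_test / y_test assumed from the split above)
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} - Test accuracy: {test_accuracy:.4f}")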
Step 9 (bonus): Facial emotion recognition visualization system
For details, see the earlier article: https://blog.csdn.net/weixin_42380711/article/details/133078875