# Cross-validation: pick the k that maximizes the mean 10-fold accuracy of a
# KNN classifier.
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): `heart` is not created in this section; presumably it is a
# DataFrame loaded from heart.csv earlier in the file -- confirm.
X = heart.iloc[:, 0:5]
y = heart.loc[:, 'y']

k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

# Select the optimal k. Looking the value up in k_range (instead of adding 1
# to the list index) keeps this correct if the range start is ever changed.
best_score = max(k_scores)
k = k_range[k_scores.index(best_score)]
print('Optimal k: %d' % k)

# Plot the accuracy curve once and mark the optimal k on it.
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.scatter(k, best_score, color='red')
# Annotate the marker with its (k, accuracy) coordinates.
plt.text(k, best_score, '(%d, %.2f)' % (k, best_score), ha='center', va='bottom')
plt.show()
# Output: Optimal k: 22

# ---- KNN分类器 (KNN classifier) ----
# Build a KNN classifier with the optimal k and evaluate it on a hold-out split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize and fit the KNN classifier
# NOTE(review): k was selected by cross-validation on the full X, y, so this
# test split was already seen during model selection; the accuracy printed
# below may be optimistic.
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

# Predict and print accuracy
y_pred = knn.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

# Plot the decision regions
from matplotlib.colors import ListedColormap
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot a classifier's decision regions on the first two principal components.

    X is projected to 2D with PCA. A grid over that plane is mapped back to
    the original feature space with ``pca.inverse_transform`` and passed to
    ``classifier.predict``; the predictions are drawn as filled contours and
    the projected samples are scattered on top, one marker/color per class.
    Drawing happens on the current matplotlib axes; nothing is returned.

    Args:
        X: Feature matrix the classifier can predict on (2D array-like).
        y: Class labels aligned with the rows of X. At most five distinct
            classes are supported (one marker/color each).
        classifier: Fitted estimator exposing ``predict``.
        test_idx: Optional row indices of X to highlight as test samples.
        resolution: Grid step, in PCA units, along both axes.
    """
    # Reduce dimensionality to 2D using PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface
    # NOTE(review): the grid holds (x1 span / resolution) * (x2 span / resolution)
    # points; if the features are unscaled this can be very large -- consider
    # passing a coarser resolution.
    x1_min, x1_max = X_pca[:, 0].min() - 1, X_pca[:, 0].max() + 1
    x2_min, x2_max = X_pca[:, 1].min() - 1, X_pca[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    grid = np.array([xx1.ravel(), xx2.ravel()]).T
    Z = classifier.predict(pca.inverse_transform(grid))
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(classes):
        plt.scatter(x=X_pca[y == cl, 0], y=X_pca[y == cl, 1],
                    alpha=0.8, c=[cmap(idx)],
                    marker=markers[idx], label=cl)

    # highlight test samples
    # Compare against None and check the size explicitly: a bare
    # `if test_idx:` raises for NumPy index arrays with more than one element.
    if test_idx is not None:
        test_rows = np.asarray(test_idx)
        if test_rows.size:
            X_highlight = X_pca[test_rows, :2]
            plt.scatter(X_highlight[:, 0], X_highlight[:, 1],
                        alpha=1.0, linewidth=1, marker='o',
                        s=55, label='test set')


# Plot decision regions using PCA-transformed features
# Stack the training rows first, then the test rows.
X_combined = np.vstack([X_train, X_test])
y_combined = np.hstack([y_train, y_test])
# The test samples therefore sit at the tail of the stacked arrays.
n_train = len(y_train)
plot_decision_regions(
    X=X_combined,
    y=y_combined,
    classifier=knn,
    test_idx=range(n_train, n_train + len(y_test)),
)
plt.ylabel('Principal Component 2')
plt.xlabel('Principal Component 1')
plt.legend(loc='upper left')
plt.show()
# Output: Accuracy: 0.69

# ---- 朴素贝叶斯分类器 (Naive Bayes classifier) ----
# Naive Bayes classifier
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from matplotlib.colors import ListedColormap

# Load the dataset
heart = pd.read_csv(r"heart.csv", sep=',')

# Select features and target: the first five columns and the 'y' column
X = heart.iloc[:, 0:5]
y = heart.loc[:, 'y']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize and fit the Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict and print accuracy
y_pred = gnb.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

# Define the function to plot decision regions
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot a classifier's decision regions on the first two principal components.

    X is projected to 2D with PCA. A grid over that plane is mapped back to
    the original feature space with ``pca.inverse_transform`` and passed to
    ``classifier.predict``; the predictions are drawn as filled contours and
    the projected samples are scattered on top, one marker/color per class.
    Drawing happens on the current matplotlib axes; nothing is returned.

    Args:
        X: Feature matrix the classifier can predict on (2D array-like).
        y: Class labels aligned with the rows of X. At most five distinct
            classes are supported (one marker/color each).
        classifier: Fitted estimator exposing ``predict``.
        test_idx: Accepted so existing calls keep working, but not used:
            test-sample highlighting is disabled in this version.
        resolution: Grid step, in PCA units, along both axes.
    """
    # Reduce dimensionality to 2D using PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface
    # NOTE(review): the grid holds (x1 span / resolution) * (x2 span / resolution)
    # points; if the features are unscaled this can be very large -- consider
    # passing a coarser resolution.
    x1_min, x1_max = X_pca[:, 0].min() - 1, X_pca[:, 0].max() + 1
    x2_min, x2_max = X_pca[:, 1].min() - 1, X_pca[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    grid = np.array([xx1.ravel(), xx2.ravel()]).T
    Z = classifier.predict(pca.inverse_transform(grid))
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(classes):
        plt.scatter(x=X_pca[y == cl, 0], y=X_pca[y == cl, 1],
                    alpha=0.8, c=[cmap(idx)],
                    marker=markers[idx], label=cl)


# Plot decision regions using PCA-transformed features
# Stack the training rows first, then the test rows.
X_combined = np.vstack([X_train, X_test])
y_combined = np.hstack([y_train, y_test])
# The test samples therefore sit at the tail of the stacked arrays.
n_train = len(y_train)
plot_decision_regions(
    X=X_combined,
    y=y_combined,
    classifier=gnb,
    test_idx=range(n_train, n_train + len(y_test)),
)
plt.ylabel('Principal Component 2')
plt.xlabel('Principal Component 1')
plt.legend(loc='upper left')
plt.show()
# Output: Accuracy: 0.70

# ---- SVM分类器 (SVM classifier) ----
# Classification with an SVM
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Load the dataset
heart = pd.read_csv(r"heart.csv", sep=',')

# Select features and target: the first five columns and the 'y' column
X = heart.iloc[:, 0:5]
y = heart.loc[:, 'y']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize and fit the SVM classifier
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train, y_train)

# Predict and print accuracy
y_pred = svm.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
# Output: Accuracy: 0.66
# Draw the SVM decision regions in the PCA-projected feature space.
# Stack the training rows first, then the test rows.
X_combined = np.vstack([X_train, X_test])
y_combined = np.hstack([y_train, y_test])
# The test samples therefore sit at the tail of the stacked arrays.
n_train = len(y_train)
plot_decision_regions(
    X=X_combined,
    y=y_combined,
    classifier=svm,
    test_idx=range(n_train, n_train + len(y_test)),
)
plt.ylabel('Principal Component 2')
plt.xlabel('Principal Component 1')
plt.legend(loc='upper left')
plt.show()
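# ---- 随机森林分类 (random forest classification) ----
# NOTE: despite this heading, the code below fits a single
# DecisionTreeClassifier, not a random forest.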
# Import necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image, display

# Load the dataset
heart = pd.read_csv(r"heart.csv", sep=',')

# Select features and target: the first five columns and the 'y' column
X = heart.iloc[:, 0:5]
y = heart.loc[:, 'y']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize and fit the Decision Tree classifier
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X_train, y_train)

# Predict and print accuracy
y_pred = tree.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

# Export the decision tree to a file
export_graphviz(tree, out_file='tree.dot', feature_names=X.columns)

# Convert the dot file to a png
graph = pydotplus.graph_from_dot_file('tree.dot')
# A bare Image(...) expression is only rendered when it is the last statement
# of a notebook cell; display() shows it explicitly from the middle of the code.
display(Image(graph.create_png()))

# Plot decision regions using PCA-transformed features
# Stack the training rows first, then the test rows.
X_combined = np.vstack([X_train, X_test])
y_combined = np.hstack([y_train, y_test])
# The test samples therefore sit at the tail of the stacked arrays.
n_train = len(y_train)
plot_decision_regions(
    X=X_combined,
    y=y_combined,
    classifier=tree,
    test_idx=range(n_train, n_train + len(y_test)),
)
plt.ylabel('Principal Component 2')
plt.xlabel('Principal Component 1')
plt.legend(loc='upper left')
plt.show()