     Doctor |
   told you |
       have |
   diabetes |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |      5,531       87.49       87.49
          1 |        791       12.51      100.00
------------+-----------------------------------
      Total |      6,322      100.00
接下来,我们绘制原始数据,以探查不同特征能否区分患者是否患有糖尿病。为此,我们调用 python 的 matplotlib 模块进行绘图。在 stata 中调用 python 的方法可以参见 stata 的相关文档;安装好 python 之后,在命令窗口内输入 python 即可进入 python 环境,并以 end 作为 python 代码的结束。图中以糖化血红蛋白(HbA1c)作为 y 轴、年龄作为 x 轴,蓝色点为未患有糖尿病的样本点,红色点为患有糖尿病的样本点。
python:
# Scatter plot of the raw data: HbA1c (y axis) against age (x axis),
# coloured by diabetes status through a two-colour map (navy / darkred).

# Import the necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Read the Stata dataset into Python
data = pd.read_stata('diabetes.dta',
                     convert_categoricals=False,
                     preserve_dtypes=True,
                     convert_missing=False)

# Define the feature matrix (independent variables)
# and the target variable (dependent variable)
X = data[['age', 'HbA1c']]
y = data['diabetes']

# Plot the raw data
plt.scatter(X['age'], X['HbA1c'], c=y,
            cmap=mcolors.ListedColormap(["navy", "darkred"]))
plt.xlabel('Age (years)')
plt.ylabel('HbA1c')
plt.xticks((12, 20, 30, 40, 50, 60, 70, 80))
plt.yticks((4, 6, 8, 10, 12, 14, 16))
plt.title('Diabetes status by Age and HbA1c')

# Save the graph BEFORE showing it: once plt.show() returns, pyplot may have
# discarded the figure, and a later savefig() would write an empty canvas.
plt.savefig("scatterplot.png")
plt.show()
end
我们使用“k-折交叉验证”技术:把训练组划分为 k 个子组,在其中 k-1 个子组上训练 SVM 模型,在剩下的第 k 个子组上测试模型;重复 k 次,使每个子组都作为一次测试组,然后对 k 次的结果取平均值,选择拟合度最好的模型参数作为最终拟合所用的参数。调用 python 的 sklearn 模块的代码如下:
python:
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# NOTE: X_train, y_train, X_test and y_test are not created in this snippet;
# they must already exist in the Python session (e.g. from train_test_split).

# Do a grid search for the parameters "degree" and "C" using 10-fold
# cross-validation
model = svm.SVC(kernel='poly')
parameters = {'degree': [1, 2, 3], 'C': [1, 2, 3]}
poly_svc = GridSearchCV(model, parameters, cv=10,
                        scoring='accuracy').fit(X_train, y_train)

# Display the parameters that yield the best-fitting model.
# The search object was already fitted above, so it is not fitted a second
# time here (that would only repeat the whole grid search).
print(poly_svc.best_params_)
end

python:
# Fit the SVM model using the parameters selected from the grid search.
# degree=3 and C=3 are hard-coded; presumably they are the values printed by
# the grid search above -- verify against your own output.
poly_svc = svm.SVC(kernel='poly', degree=3, C=3).fit(X_train, y_train)

# NOTE(review): cross_val_score() refits clones of the estimator on folds of
# the data it is given, so this reports 10-fold cross-validated accuracy
# within the test set rather than the score of the model fitted just above.
scores = cross_val_score(poly_svc, X_test, y_test, cv=10, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
end
// Keep only the three analysis variables, drop observations with a missing
// value in any of them, and save the result as diabetes.dta.
keep diabetes age HbA1c
drop if missing(diabetes, age, HbA1c)
save diabetes, replace

// Remove age.dta and glucose.dta from disk (presumably intermediate files
// created earlier in the do-file -- not visible in this section).
erase age.dta
erase glucose.dta

// Descriptive statistics: list the first five observations and tabulate
// the outcome. The range qualifier needs a space after "in".
list in 1/5
tabulate diabetes
// Plot the data with Python
python
# Scatter plot of the raw data: HbA1c (y axis) against age (x axis),
# coloured by diabetes status through a two-colour map (navy / darkred).

# Import necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Read the Stata dataset into Python
data = pd.read_stata('diabetes.dta',
                     convert_categoricals=False,
                     preserve_dtypes=True,
                     convert_missing=False)

# Define the feature matrix (independent variables)
# and the target variable (dependent variable)
X = data[['age', 'HbA1c']]
Y = data['diabetes']

# Plot the raw data
plt.scatter(X['age'], X['HbA1c'], c=Y,
            cmap=mcolors.ListedColormap(["navy", "darkred"]))
plt.xlabel('Age(years)')
plt.ylabel('HbA1c')
plt.xticks((12, 20, 30, 40, 50, 60, 70, 80))
plt.yticks((4, 6, 8, 10, 12, 14, 16))
plt.title('Diabetes status by Age and HbA1c')

# Save the graph BEFORE showing it: once plt.show() returns, pyplot may have
# discarded the figure, and a later savefig() would write an empty canvas.
plt.savefig("scatterplot.png")
plt.show()
end
// Split the data into training and testing datasets
python
from sklearn.model_selection import train_test_split

# Hold out 40% of the observations as the test set; the fixed random_state
# makes the split reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=0,
)
end
// Use cross-validation to choose parameters for the SVM model
python
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Do a grid search for the parameters "degree" and "C" using 10-fold
# cross-validation on the training data
model = svm.SVC(kernel='poly')
parameters = {'degree': [1, 2, 3], 'C': [1, 2, 3]}
poly_svc = GridSearchCV(model, parameters, cv=10,
                        scoring='accuracy').fit(X_train, Y_train)

# Display the parameters that yield the best-fitting model.
# The search object was already fitted above, so it is not fitted a second
# time here (that would only repeat the whole grid search).
print(poly_svc.best_params_)
end
// Test the model on the testing dataset
python
# Build the polynomial-kernel SVM with degree=3 and C=3 (hard-coded here;
# presumably the values reported by the grid search -- verify) and fit it
# on the training data. fit() returns the estimator itself.
poly_svc = svm.SVC(kernel='poly', degree=3, C=3)
poly_svc = poly_svc.fit(X_train, Y_train)

# NOTE(review): cross_val_score() refits clones of the estimator on folds of
# the data it is given, so this reports 10-fold cross-validated accuracy
# within the test set rather than the score of the model fitted just above.
scores = cross_val_score(
    poly_svc,
    X_test,
    Y_test,
    cv=10,
    scoring='accuracy',
)

# Mean accuracy and twice the standard deviation across the 10 folds
print("Accuracy: %0.2f(+/- %0.2f)" % (scores.mean(), scores.std() * 2))
end
// Plot the results of the SVM model
// (numpy's meshgrid is used to draw the two-dimensional picture)
python
import numpy as np

# Create a mesh on which to plot the results of the SVM model: a grid with
# step h covering the observed age / HbA1c range padded by 1 on each side.
h = 0.1
x_min, x_max = X['age'].min() - 1, X['age'].max() + 1
y_min, y_max = X['HbA1c'].min() - 1, X['HbA1c'].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Plot the predicted decision boundary: predict the class at every mesh
# node, then reshape the flat predictions back to the shape of the grid.
Z = poly_svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z,
             cmap=mcolors.ListedColormap(["dodgerblue", "red"]),
             alpha=0.8)

# Save the graph BEFORE showing it: once plt.show() returns, pyplot may have
# discarded the figure, and a later savefig() would write an empty canvas.
plt.savefig("boundaryplot.png")
plt.show()
end
// Scatterplot
python
# Plot the raw data on the predicted decision boundary.
# xx, yy and Z come from the mesh block run earlier in the same Python
# session. The boundary is redrawn on a fresh figure because the figure that
# held it may no longer exist after the earlier plt.show(); without this the
# points would not be drawn on top of the boundary.
plt.figure()
plt.contourf(xx, yy, Z,
             cmap=mcolors.ListedColormap(["dodgerblue", "red"]),
             alpha=0.8)
plt.scatter(X['age'], X['HbA1c'], c=Y,
            cmap=mcolors.ListedColormap(["navy", "darkred"]))
plt.xlabel('Age(years)')
plt.ylabel('HbA1c')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks((12, 20, 30, 40, 50, 60, 70, 80))
# Evenly spaced ticks, matching the raw-data scatter plot (8, not 7)
plt.yticks((4, 6, 8, 10, 12, 14, 16))
plt.title('Diabetes status by Age and HbA1c')

# Save the graph BEFORE showing it: once plt.show() returns, pyplot may have
# discarded the figure, and a later savefig() would write an empty canvas.
# The output file name is kept exactly as in the original script.
plt.savefig("coutourplot.png")
plt.show()
end