本文共 1964 字,大约阅读时间需要 6 分钟。
Here we do evaluation only; the model itself can be built separately afterwards.
"""Evaluate a decision-tree classifier on the diabetes dataset via 10-fold CV."""
import pandas as pd
from sklearn import tree
from sklearn.model_selection import cross_validate
import sklearn.metrics as mt

# Load the data and one-hot encode the categorical predictors.
frame = pd.read_csv("diabetes_data_upload.csv")
features = pd.get_dummies(frame.drop(columns="class"))
labels = frame["class"]

# Two scoring metrics: F1 requires an explicit positive label for this dataset;
# accuracy can be requested by its built-in name.
scoring_metrics = {
    'f1': mt.make_scorer(mt.f1_score, pos_label="Positive"),
    'accuracy': 'accuracy',
}

# 10-fold cross-validation with a decision-tree classifier.
classifier = tree.DecisionTreeClassifier()
fold_scores = cross_validate(classifier, features, labels, cv=10,
                             scoring=scoring_metrics)
print(fold_scores)
Parameter tuning with nested cross-validation: a grid search inside each outer fold.
"""Nested cross-validation for a decision tree: tune hyperparameters with
GridSearchCV inside each outer fold, then accumulate the outer-fold
confusion matrices into one aggregate 2x2 matrix."""
import numpy as np
import pandas as pd
import sklearn.tree as tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

# Load the data and one-hot encode the categorical predictors.
df = pd.read_csv("diabetes_data_upload.csv")
X = pd.get_dummies(df.drop(columns="class"))
y = df["class"]

# Hyperparameter grid for the inner search.
parameters = {
    'min_impurity_decrease': [0.05 * i for i in range(3)],
    'criterion': ["gini", "entropy"],
}
dtc = tree.DecisionTreeClassifier()

# Outer 10-fold CV; shuffle to avoid any ordering bias in the source CSV.
kf = KFold(n_splits=10, shuffle=True)

# Fixed label order so every per-fold confusion matrix has the same shape,
# even if an outer test fold happens to contain only one class.
class_labels = np.unique(y)

# Aggregate confusion matrix. np.zeros replaces the deprecated np.matrix.
matrix = np.zeros((len(class_labels), len(class_labels)), dtype=int)

for train_index, test_index in kf.split(X):  # one outer CV fold per iteration
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Inner 10-fold grid search on the training portion only.
    gs_dtc = GridSearchCV(dtc, parameters, scoring="accuracy", cv=10)
    gs_dtc.fit(X_train, y_train)
    # best_params_ is the documented accessor for the winning parameter set —
    # equivalent to argmin over cv_results_['rank_test_score'], but clearer.
    best_param = gs_dtc.best_params_

    # Refit on the full training fold with the best parameters, then
    # evaluate on the held-out outer fold.
    dtc_cv = tree.DecisionTreeClassifier(**best_param)
    dtc_cv.fit(X_train, y_train)
    y_pred = dtc_cv.predict(X_test)
    matrix += confusion_matrix(y_test, y_pred, labels=class_labels)

print(matrix)
转载地址:http://qaygf.baihongyu.com/