The goal of this article is to demonstrate how to build a reasonably complete machine learning workflow.
1. Start with the basic parameter configuration for the project
# Basic modeling configuration
SCORE_EVA = 'roc_auc'
random_state_clf = 1
n_jobs = 4
cv_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_split2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
X, y = data_of_features, label
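The snippets throughout this post assume the usual scientific Python stack plus scikit-learn, XGBoost and scikit-optimize. A minimal sketch of the imports the later code relies on (module names inferred from the code below; adjust to your own environment):

# Imports assumed by the rest of this workflow (sketch)
import time
import timeit
from collections import Counter

import numpy as np
import pandas as pd

from sklearn import (ensemble, gaussian_process, linear_model, naive_bayes,
                     neighbors, svm, tree, discriminant_analysis, model_selection)
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from skopt import BayesSearchCV          # Bayesian hyperparameter search
from skopt.space import Integer, Real    # search-space types used below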
2. Evaluate each ML model with its default parameters to understand its baseline ability on the current task, and shortlist the promising models for hyperparameter tuning
# Machine Learning Algorithm (MLA) Selection and Initialization
# (estimators that do not accept random_state are constructed with defaults)
MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(random_state=random_state_clf),
    ensemble.BaggingClassifier(random_state=random_state_clf),
    ensemble.ExtraTreesClassifier(random_state=random_state_clf),
    ensemble.GradientBoostingClassifier(random_state=random_state_clf),
    ensemble.RandomForestClassifier(random_state=random_state_clf),
    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(random_state=random_state_clf),
    # GLM
    linear_model.LogisticRegressionCV(random_state=random_state_clf),
    linear_model.PassiveAggressiveClassifier(random_state=random_state_clf),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(random_state=random_state_clf),
    linear_model.Perceptron(random_state=random_state_clf),
    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    # SVM
    svm.SVC(probability=True, random_state=random_state_clf),
    svm.NuSVC(probability=True, random_state=random_state_clf),
    svm.LinearSVC(random_state=random_state_clf),
    # Trees
    tree.DecisionTreeClassifier(random_state=random_state_clf),
    tree.ExtraTreeClassifier(random_state=random_state_clf),
    # Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),
    # xgboost
    XGBClassifier(random_state=random_state_clf)
]

MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Metric Mean',
               'MLA Test Metric Mean', 'MLA Test Metric 3*STD', 'MLA Time']
MLA_compare = pd.DataFrame(columns=MLA_columns)

row_index = 0
for alg in MLA:
    # set name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

    # score model with cross validation:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
    cv_results = model_selection.cross_validate(alg, X, y, cv=cv_split,
                                                return_train_score=True, scoring=SCORE_EVA)
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Metric Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Metric Mean'] = cv_results['test_score'].mean()
    # if this is a non-biased random sample, then +/-3 standard deviations (std) from the mean
    # should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Metric 3*STD'] = cv_results['test_score'].std() * 3  # let's know the worst that can happen!
    row_index += 1

MLA_compare = MLA_compare.sort_values(by='MLA Test Metric Mean', ascending=False)  # sort descending
MLA_compare
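To make the comparison table easier to read, the mean test scores can also be plotted. An illustrative snippet (not part of the original workflow) using seaborn:

import seaborn as sns
import matplotlib.pyplot as plt

# Ensure the score column is numeric before plotting
MLA_compare['MLA Test Metric Mean'] = MLA_compare['MLA Test Metric Mean'].astype(float)

plt.figure(figsize=(8, 10))
sns.barplot(x='MLA Test Metric Mean', y='MLA Name', data=MLA_compare, color='m')
plt.title('Machine Learning Algorithm ROC AUC Comparison')
plt.xlabel('Mean test ROC AUC (5-fold CV)')
plt.show()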
3. Take the better-performing models and, combining cross-validation with recursive feature elimination, perform hyperparameter tuning and feature selection at the same time
# The function can use either RFE or SelectKBest for feature selection, combined with Bayesian or grid search for cross-validated optimization
def SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid, cv_split, SCORE_EVA='roc_auc',
                                   Search_method='Bayes', feature_method='ref',
                                   bayes_n_iter=10, verbose=0, n_jobs=1):
    # 'ref' selects recursive feature elimination (RFE); anything else falls back to SelectKBest
    if feature_method == 'ref':
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('feature_selector', RFE(estimator=clf_model)),
                         ('model', clf_model)])
    else:
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('feature_selector', SelectKBest(f_classif)),
                         ('model', clf_model)])
    if Search_method == 'grid':
        grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv_split, verbose=verbose,
                                   scoring=SCORE_EVA, n_jobs=n_jobs)
    else:
        grid_search = BayesSearchCV(pipe, search_spaces=param_grid, verbose=verbose,
                                    scoring=SCORE_EVA, cv=cv_split, n_iter=bayes_n_iter, n_jobs=n_jobs)
    grid_search.fit(X, y)
    return grid_search
# This function repeats the search across multiple cross-validation rounds to find the best feature set and
# hyperparameters, taking the most frequently occurring features and parameter values as the final choice.
# Note: with the Bayesian method the hyperparameters found in each round may all be unique, so every value
# can end up with a count of 1.
def mutil_times_SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid, cv_outter=10, cv_inner=5,
                                               SCORE_EVA='roc_auc', Search_method='Bayes', feature_method='ref',
                                               bayes_n_iter=2, verbose=0, n_jobs=1):
    start_time = timeit.default_timer()
    inner_cv = StratifiedKFold(n_splits=cv_inner, shuffle=True, random_state=1)
    if cv_outter == 1:
        grid_search_result = SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid, inner_cv, SCORE_EVA,
                                                            Search_method, feature_method, bayes_n_iter,
                                                            verbose, n_jobs)
        end_time = timeit.default_timer()
        print(f"Function runtime: {(end_time - start_time)/60} minutes")
        print("Best score found: ", grid_search_result.best_score_)
        print("Best parameters found: ", grid_search_result.best_params_)
        print("Selected features:",
              np.array(features43)[grid_search_result.best_estimator_.named_steps['feature_selector'].support_])
        return grid_search_result
    else:
        outer_cv = StratifiedKFold(n_splits=cv_outter, shuffle=True, random_state=0)
        roc = []
        best_params_history = []
        selected_features_history = []
        # Run the hyperparameter optimization once per outer fold
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            search = SearchCV_Feature_and_Parameter(X_train, y_train, clf_model, param_grid, inner_cv, SCORE_EVA,
                                                    Search_method, feature_method, bayes_n_iter, verbose, n_jobs)
            best_params_history.append(search.best_params_)
            best_model = search.best_estimator_
            selected_features = best_model.named_steps['feature_selector'].get_support()
            selected_features_history.append(selected_features)
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.predict_proba(X_test)[:, 1]
            roc.append(roc_auc_score(y_test, y_pred_proba))

        best_params_history_df = pd.DataFrame([dict(ordered_dict) for ordered_dict in best_params_history])
        best_params_history_df[SCORE_EVA] = roc
        print(f"Mean ROC over {cv_outter} rounds of {cv_inner}-fold CV: {np.mean(roc):.4f} std: {np.std(roc):.4f}, "
              f"{[round(meta, 3) for meta in roc]}")
        # for i, selected_features in enumerate(selected_features_history, start=1):
        #     print(f"Features selected in CV round {i}: {np.array(features)[selected_features]}")

        # Take the most frequent value of each hyperparameter across rounds as the overall best
        param_names = best_params_history[0].keys()
        overall_best_params = {}
        for param_name in param_names:
            value_counts = Counter([params[param_name] for params in best_params_history])
            most_common_value = value_counts.most_common(1)[0][0]
            overall_best_params[param_name] = most_common_value
        print("Overall best hyperparameters: ", overall_best_params)

        # Count how many times each feature was selected across rounds
        total_features = X.shape[1]
        feature_selection_counts = np.zeros(total_features)
        for selected_features in selected_features_history:
            feature_selection_counts += selected_features.astype(int)
        # Keep features selected in more than half of the rounds as the overall best feature set
        threshold = len(selected_features_history) // 2
        overall_best_features = feature_selection_counts > threshold
        print("Overall best feature set: ", np.array(features)[overall_best_features])

        end_time = timeit.default_timer()
        print(f"Function runtime: {(end_time - start_time)/60} minutes")
        return best_params_history_df, selected_features_history, roc
def model_evaluate(X, y, model, n_times, test_size=0.3):
    scores = []
    # Repeat random train/test splits
    for i in range(n_times):
        # split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=i)
        # fit and evaluate; each call to fit re-initializes the model
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        scores.append(score)
    return scores
# Build the ML workflow: apply RFE feature selection and hyperparameter tuning to the better-performing model, which will serve as the core model for this task
grid_n_estimator = Integer(1, 300)
grid_ratio = Real(0.01, 1.0, 'log-uniform')
grid_learn = Real(0.01, 1.0, 'log-uniform')
grid_max_depth = Integer(1, 15)
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]

# Define the hyperparameter search space
param_grid = {
    # 'feature_selector__k': Integer(5, 15),
    'feature_selector__n_features_to_select': Integer(5, 15),
    'model__learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'model__max_depth': Integer(1, 50),
    'model__n_estimators': Integer(50, 200),
    'model__random_state': grid_seed
}
clf_model = XGBClassifier(scale_pos_weight=2, objective='binary:logistic', seed=0)
grid_search_result = mutil_times_SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid,
                                                                cv_outter=1, cv_inner=5, SCORE_EVA='roc_auc',
                                                                Search_method='Bayes', feature_method='ref',
                                                                bayes_n_iter=10, verbose=0, n_jobs=n_jobs)
# After obtaining the optimal feature subset and hyperparameters, evaluate overall and generalization performance with repeated data splits; generalization is reflected by the std of the scores
X_best = X[np.array(features43)[grid_search_result.best_estimator_.named_steps['feature_selector'].support_]]
X_best = StandardScaler().fit_transform(X_best)
clf_model.set_params(**{k.replace('model__', ''): v
                        for k, v in grid_search_result.best_params_.items() if k.startswith('model__')})
scores = model_evaluate(X_best, y, clf_model, n_times=100, test_size=0.3)
mean_score = round(np.mean(scores), 3)
std_score = round(np.std(scores), 3)
print('Best model:', mean_score, std_score)
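Once the tuned model and the selected feature subset look acceptable, it is worth persisting them so the evaluation can be reproduced later. A minimal sketch using joblib (the file names are only illustrative):

import joblib

# Refit the tuned model on the full selected-and-scaled feature matrix and save it
clf_model.fit(X_best, y)
joblib.dump(clf_model, 'best_xgb_model.joblib')

# Save the feature-selection mask so new data can be prepared the same way
selected_mask = grid_search_result.best_estimator_.named_steps['feature_selector'].support_
joblib.dump(selected_mask, 'selected_feature_mask.joblib')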
4. After feature selection is complete, tune the hyperparameters of the candidate models on the selected feature subset using Bayesian cross-validation
# Ensemble several models on the optimal feature subset
# why choose one model, when you can pick them all with voting classifier
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
# removed models w/o attribute 'predict_proba' required for vote classifier and models with a 1.0 correlation to another model
vote_est = [
    # Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
    ('ada', ensemble.AdaBoostClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('xgb', XGBClassifier())
    # ('bc', ensemble.BaggingClassifier()),
    # ('etc', ensemble.ExtraTreesClassifier()),
    # Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
    # ('gpc', gaussian_process.GaussianProcessClassifier()),
    # GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
    # ('lr', linear_model.LogisticRegressionCV()),
    # Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
    # ('bnb', naive_bayes.BernoulliNB()),
    # ('gnb', naive_bayes.GaussianNB()),
    # Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
    # ('knn', neighbors.KNeighborsClassifier()),
    # SVM: http://scikit-learn.org/stable/modules/svm.html
    # ('svc', svm.SVC(probability=True)),
    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    # ('xgb', XGBClassifier())
]

#WARNING: Running this is very computationally intensive and time expensive.
#Code is written for experimental/developmental purposes and not production ready!
#Hyperparameter Tune with GridSearchCV: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
grid_param = [
    [{
        # AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
        'n_estimators': grid_n_estimator,  # default=50
        'learning_rate': grid_learn,       # default=1
        # 'algorithm': ['SAMME', 'SAMME.R'],  # default='SAMME.R'
        'random_state': grid_seed
    }],
    [{
        # RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        'n_estimators': grid_n_estimator,  # default=10
        'criterion': grid_criterion,       # default='gini'
        'max_depth': grid_max_depth,       # default=None
        'oob_score': [True],               # default=False; fixed to reduce runtime
        'random_state': grid_seed
    }],
    [{
        # GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
        # 'loss': ['deviance', 'exponential'],  # default='deviance'
        'learning_rate': [.05],   # default=0.1; fixed to reduce runtime
        'n_estimators': [300],    # default=100; fixed to reduce runtime
        # 'criterion': ['friedman_mse', 'mse', 'mae'],  # default='friedman_mse'
        'max_depth': grid_max_depth,  # default=3
        'random_state': grid_seed
    }],
    [{
        # XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
        'learning_rate': grid_learn,       # default=.3
        'max_depth': [1, 2, 4, 6, 8, 10],  # default=2
        'n_estimators': grid_n_estimator,
        'seed': grid_seed
    }],
    # The remaining search spaces belong to the models currently commented out in vote_est;
    # zip() below only consumes as many entries as there are estimators.
    [{
        # ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
        'n_estimators': grid_n_estimator,  # default=10
        'criterion': grid_criterion,       # default='gini'
        'max_depth': grid_max_depth,       # default=None
        'random_state': grid_seed
    }],
    [{
        # BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
        'n_estimators': grid_n_estimator,  # default=10
        'max_samples': grid_ratio,         # default=1.0
        'random_state': grid_seed
    }],
    [{
        # GaussianProcessClassifier
        'max_iter_predict': grid_n_estimator,  # default=100
        'random_state': grid_seed
    }],
    [{
        # LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
        'fit_intercept': grid_bool,  # default=True
        # 'penalty': ['l1', 'l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # default='lbfgs'
        'random_state': grid_seed
    }],
    [{
        # BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html
        'alpha': grid_ratio  # default=1.0
    }],
    # GaussianNB - nothing to tune
    [{}],
    [{
        # KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
        'n_neighbors': [1, 2, 3, 4, 5, 6, 7],  # default=5
        'weights': ['uniform', 'distance'],    # default='uniform'
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }],
    [{
        # SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
        # http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
        # 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 2, 3, 4, 5],  # default=1.0
        'gamma': grid_ratio,   # default='auto'
        'decision_function_shape': ['ovo', 'ovr'],  # default='ovr'
        'probability': [True],
        'random_state': grid_seed
    }]
]

start_total = time.perf_counter()  # https://docs.python.org/3/library/time.html#time.perf_counter
for clf, param in zip(vote_est, grid_param):  # https://docs.python.org/3/library/functions.html#zip
    # vote_est is a list of tuples: index 0 is the name and index 1 is the algorithm
    start = time.perf_counter()
    # best_search = model_selection.GridSearchCV(estimator=clf[1], param_grid=param, cv=cv_split, scoring=SCORE_EVA)
    best_search = BayesSearchCV(clf[1], search_spaces=param, scoring=SCORE_EVA, cv=cv_split, n_iter=50, n_jobs=16)
    best_search.fit(X_best, y)
    run = time.perf_counter() - start

    best_param = best_search.best_params_
    clf[1].set_params(**best_param)
    # Evaluate the tuned model several times to assess its generalization ability
    scores = model_evaluate(X_best, y, clf[1], 10, test_size=0.3)
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds, scoring is {:.3f}, std: {:.3f}'.format(
        clf[1].__class__.__name__, best_param, run, np.mean(scores), np.std(scores)))

run_total = time.perf_counter() - start_total
print('Total optimization time was {:.2f} minutes.'.format(run_total / 60))
print('-' * 10)
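The earlier comment about dropping models whose predictions are perfectly correlated can be checked explicitly before building the ensemble. A rough sketch (the single hold-out split is an assumption) that compares the tuned models' predicted probabilities:

# Check how correlated the tuned models' predicted probabilities are;
# highly correlated members add little diversity to a voting ensemble
X_tr, X_te, y_tr, y_te = train_test_split(X_best, y, test_size=0.3, random_state=0)
pred_df = pd.DataFrame()
for name, est in vote_est:
    est.fit(X_tr, y_tr)
    pred_df[name] = est.predict_proba(X_te)[:, 1]
print(pred_df.corr())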
5. After each model's hyperparameters are tuned, combine the models, either by voting (ensemble) or by stacking
# Ensemble multiple models via voting
# Soft Vote or weighted probabilities w/ tuned hyperparameters
vote = ensemble.VotingClassifier(estimators=vote_est, voting='soft')  # or voting='hard'
vote_cv = model_selection.cross_validate(vote, X_best, y, cv=cv_split, scoring=SCORE_EVA,
                                         return_train_score=True, n_jobs=16)
print("Soft Voting Training w/bin score mean: {:.2f}".format(vote_cv['train_score'].mean() * 100))
print("Soft Voting Test w/bin score mean: {:.2f}".format(vote_cv['test_score'].mean() * 100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}".format(vote_cv['test_score'].std() * 100 * 3))
print('-' * 10)

# Stacking
meta_learner = LogisticRegression()
stacking_model = StackingClassifier(estimators=vote_est, final_estimator=meta_learner)
stacking_cv = model_selection.cross_validate(stacking_model, X_best, y, cv=cv_split, scoring=SCORE_EVA,
                                             return_train_score=True, n_jobs=16)
print("Stacking Training w/bin score mean: {:.2f}".format(stacking_cv['train_score'].mean() * 100))
print("Stacking Test w/bin score mean: {:.2f}".format(stacking_cv['test_score'].mean() * 100))
print("Stacking Test w/bin score 3*std: +/- {:.2f}".format(stacking_cv['test_score'].std() * 100 * 3))
print('-' * 10)
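Whichever combiner scores better in cross-validation can then be refit on all of the training data and used for new samples. A minimal sketch (new_data_scaled is a placeholder for a feature matrix prepared with the same feature selection and scaling as X_best):

# Refit the chosen ensemble on all available data
final_model = stacking_model  # or `vote`, whichever cross-validated better
final_model.fit(X_best, y)

# Score new samples prepared with the same preprocessing as X_best
# new_probabilities = final_model.predict_proba(new_data_scaled)[:, 1]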
Reference: A Data Science Framework: To Achieve 99% Accuracy | Kaggle