RStudio提供了许多功能强大的工具#xff0c;包括代码编辑器… RStudio是一个开源的集成开发环境IDE专门用于R编程语言的开发和数据分析。R语言是一种流行的统计计算和数据可视化语言广泛用于数据科学、统计学和机器学习领域。
RStudio provides many powerful tools, including a code editor, a debugger, data visualization tools, and integrated help documentation. For machine learning work, RStudio supports data preprocessing, model training, evaluation, and deployment.
R offers many packages for machine learning, such as caret, tidymodels, and tensorflow; these packages provide a rich set of algorithms and tools to help you build and deploy machine learning models.
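As a quick illustration, here is a minimal sketch of fitting a boosted regression tree through the tidymodels/parsnip interface (an assumed, self-contained example on the built-in mtcars data; it is not part of the Ames workflow that follows):

library(tidymodels)

# minimal sketch (assumed example): a boosted tree for regression via the xgboost engine
spec <- boost_tree(trees = 500, learn_rate = 0.05) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

fit_obj <- fit(spec, mpg ~ ., data = mtcars)    # train on the built-in mtcars data
predict(fit_obj, new_data = mtcars) %>% head()  # predictions on the training data

The remainder of this tutorial works directly with the gbm and xgboost packages instead.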
library(rlang)
library(gbm)
library(xgboost)
library(vip)
library(pdp)
library(ggplot2)
library(tidyverse)
library(recipes)
rm(list = ls())
s <- Sys.time()

# data
# prereqs-data
# ames data
# install.packages("AmesHousing")
library(AmesHousing)
ames <- AmesHousing::make_ames()

# split data
set.seed(123)
split <- rsample::initial_split(ames, strata = "Sale_Price")
ames_train <- rsample::training(split)
ames_test <- rsample::testing(split)

## GBM
## * Very popular
## * Very accurate
## * Similar to random forests, but fundamentally different

## First GBM model
# shrinkage: the learning rate, default 0.1
# basic-gbm
tick <- proc.time()
set.seed(123)
ames_gbm <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian",  # or bernoulli, multinomial, etc.
  n.trees = 5000,
  shrinkage = 0.1,
  interaction.depth = 1,
  n.minobsinnode = 10,
  cv.folds = 5
)
tock <- proc.time() - tick; tock

# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm$cv.error)

# get MSE and compute RMSE
sqrt(ames_gbm$cv.error[min_MSE])

# check for overfitting
gbm.perf(ames_gbm, method = "cv")  # or OOB

## Tuning
## * Start with the default learning rate of 0.1, or something in 0.05-0.2
## * Make sure the number of trees does not overfit
## * Tune the learning rate
## * Tune the tree-specific parameters
## * Possibly lower the learning rate and increase the number of trees

## Fix the tree parameters
## Tree depth of 3
## Default min obs per node
## Set the learning rate to 0.01
## More CV folds give a better error estimate
## ~1 minute
## Uses up all of the trees
# tune1
set.seed(123)
tick <- proc.time()
ames_gbm1 <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian",  # or bernoulli, multinomial, etc.
  n.trees = 5000,
  shrinkage = 0.01,
  interaction.depth = 3,
  n.minobsinnode = 10,
  cv.folds = 10
)
tock <- proc.time() - tick; tock

# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm1$cv.error)

# get MSE and compute RMSE
sqrt(ames_gbm1$cv.error[min_MSE])
gbm.perf(ames_gbm1, method = "cv")

## Increase the number of trees
## Try a learning rate of 0.001
## Not accurate
## ~4 minutes
# tune2
set.seed(123)
tick <- proc.time()
ames_gbm1 <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian",  # or bernoulli, multinomial, etc.
  n.trees = 20000,
  shrinkage = 0.001,
  interaction.depth = 3,
  n.minobsinnode = 10,
  cv.folds = 10
)
tock <- proc.time() - tick; tock

# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm1$cv.error)

# get MSE and compute RMSE
sqrt(ames_gbm1$cv.error[min_MSE])
gbm.perf(ames_gbm1, method = "cv")

# Tune the tree parameters (~30 minutes)
# assess 3 values for tree depth
# assess 3 values for min obs
# tune12
# set eval = FALSE or cache = TRUE so this long-running chunk is not re-run when knitting
# search grid
hyper_grid <- expand.grid(
  n.trees = 6000,
  shrinkage = .01,
  interaction.depth = c(3, 5, 7),
  n.minobsinnode = c(5, 10, 15)
)

model_fit <- function(n.trees, shrinkage, interaction.depth, n.minobsinnode) {
  set.seed(123)
  m <- gbm(
    formula = Sale_Price ~ .,
    data = ames_train,
    distribution = "gaussian",
    n.trees = n.trees,
    shrinkage = shrinkage,
    interaction.depth = interaction.depth,
    n.minobsinnode = n.minobsinnode,
    cv.folds = 10
  )
  # compute RMSE
  sqrt(min(m$cv.error))
}

tick <- proc.time()
hyper_grid$rmse <- purrr::pmap_dbl(
  hyper_grid,
  ~ model_fit(
    n.trees = ..1,
    shrinkage = ..2,
    interaction.depth = ..3,
    n.minobsinnode = ..4
  )
)
tock <- proc.time() - tick; tock

arrange(hyper_grid, rmse)
## n.trees shrinkage interaction.depth n.minobsinnode rmse
## 1 6000 0.01 7 5 20683.65
## 2 6000 0.01 5 5 20746.36
## 3 6000 0.01 7 10 21114.62
## 4 6000 0.01 3 5 21126.33
## 5 6000 0.01 5 10 21228.68
## 6 6000 0.01 7 15 21246.73
## 7 6000 0.01 5 15 21272.91
## 8 6000 0.01 3 10 21767.08
## 9 6000 0.01 3 15 21862.35

# Add stochasticity (bag.fraction)
# * 0.5-0.8
# * May need to zoom in on this range
# * ~10 minutes
# stochastic
bag_frac <- c(.5, .65, .8)

tick <- proc.time()
for (i in bag_frac) {
  set.seed(123)
  m <- gbm(
    formula = Sale_Price ~ .,
    data = ames_train,
    distribution = "gaussian",
    n.trees = 6000,
    shrinkage = 0.01,
    interaction.depth = 7,
    n.minobsinnode = 5,
    bag.fraction = i,
    cv.folds = 10
  )
  # compute RMSE
  print(sqrt(min(m$cv.error)))
}
tock <- proc.time() - tick; tock

## [1] 20683.65
## [1] 20398.93
## [1] 20287.76

## XGBoost
## XGBoost requires a numeric (one-hot style) encoding
## caret can do the one-hot encoding automatically (see the short caret sketch after this feature-prep chunk)
## * Lump low-frequency levels into "Other"
## * Integer-encode ordered variables to improve speed and performance
# xgb-feature-prep
library(rlang)
xgb_prep <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_integer(all_nominal()) %>%
  prep(training = ames_train, retain = TRUE) %>%
  juice()

X <- as.matrix(xgb_prep[setdiff(names(xgb_prep), "Sale_Price")])
Y <- xgb_prep$Sale_Price
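As noted in the bullets above, caret can build the one-hot encoding automatically. A minimal sketch of that alternative (an assumed illustration using caret::dummyVars(); it is not used in the pipeline here, which relies on the integer encoding above):

library(caret)

# assumed illustration: one-hot encode all nominal predictors with caret
dv <- dummyVars(Sale_Price ~ ., data = ames_train, fullRank = TRUE)
X_onehot <- predict(dv, newdata = ames_train)  # numeric matrix of dummy columns
dim(X_onehot)                                  # far wider than the integer-encoded X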
# First XGBoost model
# Takes roughly a few seconds
# Early stopping rounds: stop when the CV RMSE has not improved for 50 trees
# https://xgboost.readthedocs.io/en/latest/parameter.html
# default eta = 0.3
# basic-xgboost
tick <- proc.time()
set.seed(123)
ames_xgb <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 5000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50,
  nfold = 10,
  verbose = 0
)
tock <- proc.time() - tick; tock

ames_xgb$evaluation_log %>% tail()
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1: 195 600.2057 55.89026 24382.85 1725.531
## 2: 196 591.9463 55.26891 24381.77 1725.488
## 3: 197 582.9737 53.65047 24382.20 1724.203
## 4: 198 572.7220 52.89840 24382.79 1723.236
## 5: 199 561.9686 49.76037 24382.41 1724.374
## 6: 200 554.3745 48.24520 24382.83 1723.857

# xgboost-eta
set.seed(123)
ames_xgb <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 6000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50,
  nfold = 10,
  verbose = 0,
  params = list(eta = .05)
)

ames_xgb$evaluation_log %>% tail()
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1: 1023 1034.059 51.96559 21879.08 1248.379
## 2: 1024 1032.371 52.17706 21879.89 1248.430
## 3: 1025 1030.903 52.01260 21879.53 1248.890
## 4: 1026 1029.302 51.47370 21879.29 1249.095
## 5: 1027 1027.053 50.80923 21879.43 1248.593
## 6: 1028 1025.107 50.38078 21879.46 1249.280

## - eta .3 (default): 24,382 w/200 trees (< 1 min)
## - eta .1: 22,333 w/398 trees (< 1 min)
## - eta .05: 21,877 w/978 trees (1.5 min)
## - eta .01: 22,094 w/2843 trees (4 min)
## - one-hot encoded: 30 min

# * Learning rate: 0.05
# * 6000 trees
# * Early stopping
# * Tune:
# - tree depth
# - Child weight
# ~30 minutes
# xgboost-tree-specific
# grid
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = c(1, 3, 5, 7, 9),
  min_child_weight = c(1, 3, 5, 7, 9),
  rmse = 0  # a place to dump results
)

# grid search
tick <- proc.time()
for (i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50,
    nfold = 10,
    verbose = 0,
    params = list(
      eta = hyper_grid$eta[i],
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i]
    )
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick; tock

arrange(hyper_grid, rmse)
## eta max_depth min_child_weight rmse
## 1 0.05 3 3 20989.27
## 2 0.05 3 1 21062.92
## 3 0.05 3 5 21453.00
## 4 0.05 5 1 21685.04
## 5 0.05 5 3 21748.12
## 6 0.05 3 7 22058.70
## 7 0.05 7 1 22110.09
## 8 0.05 3 9 22181.64
## 9 0.05 5 5 22185.64
## 10 0.05 7 3 22468.97
## 11 0.05 9 1 22632.80
## 12 0.05 9 3 22664.75
## 13 0.05 7 5 22721.41
## 14 0.05 5 7 22801.02
## 15 0.05 7 7 22951.21
## 16 0.05 5 9 22970.42
## 17 0.05 9 5 22987.65
## 18 0.05 9 7 23212.10
## 19 0.05 1 1 23252.66
## 20 0.05 7 9 23452.08
## 21 0.05 9 9 23714.29
## 22 0.05 1 3 24147.17
## 23 0.05 1 5 25089.55
## 24 0.05 1 7 26534.20
## 25 0.05 1 9 26784.77

# * Learning rate: 0.05
# * 6000 trees
# * Early stopping
# * tree depth: 3
# * Child weight: 3
# * Tune the stochastic parameters:
# - subsampling rows for each tree
# - subsampling columns for each tree
# ~12 minutes
# xgb-stochastic-grid
# grid
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = 3,
  min_child_weight = 3,
  subsample = c(.5, .65, .8, 1),
  colsample_bytree = c(.5, .65, .8, 1),
  rmse = 0  # a place to dump results
)

# grid search
tick <- proc.time()
for (i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50,
    nfold = 10,
    verbose = 0,
    params = list(
      eta = hyper_grid$eta[i],
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i],
      subsample = hyper_grid$subsample[i],
      colsample_bytree = hyper_grid$colsample_bytree[i]
    )
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick; tock

arrange(hyper_grid, rmse)
## eta max_depth min_child_weight subsample colsample_bytree rmse
## 1 0.05 3 3 0.80 1.00 20732.22
## 2 0.05 3 3 0.50 1.00 20752.65
## 3 0.05 3 3 1.00 1.00 20989.27
## 4 0.05 3 3 0.65 1.00 21018.13
## 5 0.05 3 3 1.00 0.80 21425.00
## 6 0.05 3 3 0.80 0.80 21444.68
## 7 0.05 3 3 1.00 0.65 21597.04
## 8 0.05 3 3 0.65 0.80 21890.11
## 9 0.05 3 3 0.80 0.65 21907.55
## 10 0.05 3 3 0.65 0.65 22047.80
## 11 0.05 3 3 0.50 0.80 22211.92
## 12 0.05 3 3 0.50 0.65 22460.97
## 13 0.05 3 3 0.80 0.50 26071.94
## 14 0.05 3 3 1.00 0.50 26555.12
## 15 0.05 3 3 0.65 0.50 26577.24
## 16 0.05 3 3 0.50 0.50 26952.51

# xgb-regularize
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = 3,
  min_child_weight = 3,
  subsample = .8,
  colsample_bytree = 1,
  # gamma = c(1, 100, 1000, 10000),
  # lambda = c(1e-2, 0.1, 1, 100, 1000, 10000),
  alpha = c(1e-2, 0.1, 1, 100, 1000, 10000),
  rmse = 0  # a place to dump results
)

# grid search
tick <- proc.time()
for (i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50,
    nfold = 10,
    verbose = 0,
    params = list(
      eta = hyper_grid$eta[i],
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i],
      subsample = hyper_grid$subsample[i],
      colsample_bytree = hyper_grid$colsample_bytree[i],
      # gamma = hyper_grid$gamma[i],
      # lambda = hyper_grid$lambda[i],
      alpha = hyper_grid$alpha[i]
    )
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick; tock

arrange(hyper_grid, rmse)
## eta max_depth min_child_weight subsample colsample_bytree alpha rmse
## 1 0.05 3 3 0.8 1 1e+02 20581.31
## 2 0.05 3 3 0.8 1 1e+03 20605.30
## 3 0.05 3 3 0.8 1 1e+04 20615.19
## 4 0.05 3 3 0.8 1 1e-01 20732.23
## 5 0.05 3 3 0.8 1 1e-02 20732.23
## 6 0.05 3 3 0.8 1 1e+00 20732.23

# final-xgb-model
# parameter list
params <- list(
  eta = .05,
  max_depth = 3,
  min_child_weight = 3,
  subsample = .8,
  colsample_bytree = 1,
  alpha = 100
)

# final cv fit
set.seed(123)
tick <- proc.time()
final_cv <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 6000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50,
  nfold = 10,
  verbose = 0,
  params = params
)
tock <- proc.time() - tick; tock

final_cv$evaluation_log %>%
  ggplot(aes(x = iter, y = test_rmse_mean)) +
  geom_line()

# train final model
ames_final_xgb <- xgboost(
  data = X,
  label = Y,
  nrounds = final_cv$best_iteration,
  objective = "reg:squarederror",
  params = params,
  verbose = 0
)

# xgb-vip
vip::vip(ames_final_xgb, num_features = 25)

# xgb-pdp
ames_final_xgb %>%
  pdp::partial(
    pred.var = "Gr_Liv_Area",
    n.trees = ames_final_xgb$niter,
    grid.resolution = 50,
    train = X
  ) %>%
  autoplot(rug = TRUE, train = X)

# top-bottom-vip
ames_vi <- vi(ames_final_xgb, feature_names = colnames(X))
feats <- c(head(ames_vi, n = 4)$Variable, tail(ames_vi, n = 4)$Variable)
pds <- lapply(feats, FUN = function(x) {
  pd <- cbind(x, pdp::partial(ames_final_xgb, pred.var = x, train = X))
  names(pd) <- c("xvar", "xval", "yhat")
  pd
})
pds <- do.call(rbind, pds)
pds$xvar <- factor(pds$xvar, levels = feats)
ggplot(pds, aes(x = xval, y = yhat)) +
  geom_line(size = 1.5) +
  geom_hline(yintercept = mean(ames$Sale_Price), linetype = 2, col = "red2") +
  facet_wrap(~ xvar, scales = "free_x", nrow = 2) +
  labs(x = NULL, y = "Partial dependence") +
  theme_light()

## Predict on the test data
# recipe
test <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_integer(all_nominal()) %>%
  prep(training = ames_train) %>%
  bake(new_data = ames_test)

# obtain X for test data
xgtest <- xgb.DMatrix(as.matrix(test %>% select(-Sale_Price)))

# make prediction and calculate error
test <- test %>%
  mutate(prediction = predict(ames_final_xgb, xgtest)) %>%
  mutate(error = (prediction - Sale_Price)^2)

# plot
test %>% ggplot(aes(Sale_Price, prediction)) +
  geom_point()

# test error
test %>%
  summarize(mean(error)) %>%
  sqrt()

save.image("6_boosting_CCB_cohort2.RData")
e1 <- Sys.time()
print(e1 - s)

Overall, RStudio is a powerful tool for machine learning work, helping users move more efficiently through data analysis and model development.