当前位置: 首页 > news > 正文

沈阳网站订制，廉洁文化建设网站

沈阳网站订制,廉洁文化建设网站,哪家公司做网站不错,广州高端优秀网站改版设计公司本章节的目的是 【明确目标用户群】 #xff0c;以更好的服务现有用户。 【知识点】 1.作图 显示中文plt.rcParams[font.sans-serif] [SimHei] # 步骤一#xff08;替换sans-serif字体#xff09; plt.rcParams[axes.unicode_minus] False # 步骤二#xff08;解决坐标轴… 本章节的目的是 【明确目标用户群】 以更好的服务现有用户。 【知识点】 1.作图 显示中文plt.rcParams[font.sans-serif] [SimHei] # 步骤一替换sans-serif字体 plt.rcParams[axes.unicode_minus] False # 步骤二解决坐标轴负数的负号显示问题 2.数据库操作 sqlalchemy 引擎engine create_engine(mysqlpymysql://root:123456localhost:3306/datascience) 3.批量读取文件 os.wolk()、os.path.join()用法for root, dirs, files in os.walk(path): for file in files:rfile os.path.join(root,file)if rfile.split(.)[-1] tsv:rdf pd.read_csv(rfile, sep\t)df df.append(rdf) 4.groupby()以及agg() 的联合使用应对不同列使用不同的函数 按月统计affc {payment:sum, log_date:count} dfm df.groupby([log_month, user_id]).agg(affc).reset_index() 修改列明renam {log_date:access_days} dfm.rename(columnsrenam, inplaceTrue) 5.KMeans 聚类的使用 单列的聚类需要将单列应用 reshape-1,1格式化为1列from sklearn.cluster import KMeans a47 action[A47].reshape(-1, 1) kms KMeans(n_clusters3).fit(a47) 聚类的标签 labels_ 属性cluster kms.labels_ 将标签添加至源数据中运用groupby查看分组情况action[cluster] cluster action.groupby([cluster])[user_id].count() 可视化分组snsdf action[[user_id,A47,cluster]].sort_values(byA47,ascendingFalse) plt.figure(figsize(8,5)) snsdf1 snsdf.reset_index() snsdf1[snsdf1[cluster]2][A47].plot(colorr,label2:重度用户) snsdf1[snsdf1[cluster]1][A47].plot(colorg,label1:中度用户) snsdf1[snsdf1[cluster]0][A47].plot(colorb,label0:轻度用户) plt.legend() plt.xlabel(用户分布) plt.ylabel(排行榜得分) 6.主成分分析 数据预处理提取要进行主成分分析的列paction acc.iloc[:,3:(len(acc.columns)-1)]删掉0值较多的列cc paction[paction0].count(axis0)/len(paction) cc.plot() dd cc[cc.9] #删掉该列中90%以上都是0值的列 paction paction[dd.index] paction.head()删掉相关性较强的列 # 数据概览 corp paction.corr() sns.heatmap(corp) mask np.array(corp) mask[np.tril_indices_from(mask)] False # 画下三角heatmap的方法 sns.heatmap(corp,maskmask)# 通过下三角矩阵的方式删掉相关性较强的数据列 coll corp.columns corp pd.DataFrame(np.tril(corp, -1)) # 应用 np.tril(m, -1) 函数获取下三角上三角数据全部置为0 corp.columns coll pac2 
paction.loc[:,(corp.abs().8).all()] # 任何一个数都小于 0.8 的数据 all() 函数 pac2.head()进行主成分分析 from sklearn.decomposition import PCA pca PCA() pca.fit(pac2)redio pca.explained_variance_ratio_ # pca.explained_variance_ratio_ 是PCA降维后的矩阵课解释性比率 print(redio) print(pca.singular_values_) # singular_values_ 是奇异值矩阵主成分的课解释性曲线 recu redio.cumsum() # 应用 cumsum() 函数进行逐数据累加 plt.plot(recu)获取降维后的数据以进行下一步 pca.set_params(n_components10) # 设置 维度 为 10 pac3 pd.DataFrame(pca.fit_transform(pac2)) # 使用fit_transform()函数训练并获得降维后的数据 pac3.head()继续应用 KMENAS 进行聚类 得到所有用户的 分类 然后再 平均 每个分类的每个行为的所有用户的值继续应用相关性 删除 相关性强的列 获得最后 主要观察指标对主要观察指标进行 雷达图 展示 # 首先对数据进行标准化处理 from sklearn.preprocessing import scale ccccc pd.DataFrame(scale(cccc)) ccccc.columns cccc.columns# 画图 plt.figure(figsize(8,8)) N ccccc.shape[1] # 极坐标的分割分数 angles np.linspace(0, 2*np.pi, N, endpointFalse) # 设置雷达图的角度用于平分切开一个圆面 angles np.concatenate((angles,[angles[0]])) # 使雷达图一圈封闭起来 for i in range(len(ccccc)):values ccccc.loc[i,:] # 构造数据values np.concatenate((values,[values[0]])) # 为了使雷达图一圈封闭起来plt.polar(angles, values, o-, linewidth2) # 绘制 plt.legend(ccccc.index, loclower right) plt.thetagrids(angles * 180/np.pi, labelslist(ccccc.columns)) # 添加极坐标的标签 plt.title(重要指标雷达图呈现) 一、库导入以及matplotlib显示中文 import pandas as pd import numpy as np import pymysql from sqlalchemy import create_engine import matplotlib.pyplot as plt import seaborn as sns import missingno as msno import osplt.rcParams[font.sans-serif] [SimHei] # 步骤一替换sans-serif字体 plt.rcParams[axes.unicode_minus] False # 步骤二解决坐标轴负数的负号显示问题 %matplotlib inline 数据库引擎 engine create_engine(mysqlpymysql://root:123456localhost:3306/datascience) 二、批量读取文件 def read_files(path):df pd.DataFrame()for root, dirs, files in os.walk(path):for file in files:rfile os.path.join(root,file)if rfile.split(.)[-1] tsv:rdf pd.read_csv(rfile, sep\t)df df.append(rdf)return df action_path data/sample-data/section8/daily/action/ dau_path data/sample-data/section8/daily/dau/ dpu_path data/sample-data/section8/daily/dpu/action 
read_files(action_path) dau read_files(dau_path) dpu read_files(dpu_path) 查看数据完整性以及头部信息 print(action.isnull().sum().sum()) print(action.shape) # print(action.info()) action.head() 0 (2653, 57) log_dateapp_nameuser_idA1A2A3A4A5A6A7...A45A46A47A48A49A50A51A52A53A5402013-10-31game-016541330000000...003802565500000.04612013-10-31game-014255300000101233...19201805433473622400.07122013-10-31game-017095960000000...004162481700000.0232013-10-31game-015250470200900...2222352006412210000.010942013-10-31game-017969080000000...29293882544410000.0645 rows × 57 columns print(dau.isnull().sum().sum()) print(dau.shape) print(dau.info()) dau.head() 0 (509754, 3) class pandas.core.frame.DataFrame Int64Index: 509754 entries, 0 to 2410 Data columns (total 3 columns): log_date 509754 non-null object app_name 509754 non-null object user_id 509754 non-null int64 dtypes: int64(1), object(2) memory usage: 15.6 MB None log_dateapp_nameuser_id02013-05-01game-0160880112013-05-01game-0171245322013-05-01game-0177685332013-05-01game-0182348642013-05-01game-01113600 print(dpu.isnull().sum().sum()) print(dpu.shape) print(dpu.info()) dpu.head() 0 (3532, 4) class pandas.core.frame.DataFrame Int64Index: 3532 entries, 0 to 7 Data columns (total 4 columns): log_date 3532 non-null object app_name 3532 non-null object user_id 3532 non-null int64 payment 3532 non-null int64 dtypes: int64(2), object(2) memory usage: 138.0 KB None log_dateapp_nameuser_idpayment02013-05-01game-0180400557112013-05-01game-017935378122013-05-01game-013177178132013-05-01game-013177178142013-05-01game-01426525324 # 写入数据库# action.to_sql(s8_action, engine, indexFalse) # dau.to_sql(s8_dau, engine, indexFalse) # dpu.to_sql(s8_dpu, engine, indexFalse) 三、数据预处理 1.合并 DAU DPU df pd.merge(dau, dpu[[log_date,user_id,payment]], howleft, on[user_id,log_date]) df.head() log_dateapp_nameuser_idpayment02013-05-01game-01608801NaN12013-05-01game-01712453NaN22013-05-01game-01776853NaN32013-05-01game-01823486NaN42013-05-01game-01113600NaN # 
将无消费记录的消费额设为 0 print(df.payment.isnull().sum()) df[payment].fillna(0, inplaceTrue) print(df.payment.isnull().sum()) 507151 0 # 添加消费额标志位 df[is_pay] df[payment].apply( lambda x: 1 if x0 else 0 ) df.head() log_dateapp_nameuser_idpaymentis_pay02013-05-01game-016088010.0012013-05-01game-017124530.0022013-05-01game-017768530.0032013-05-01game-018234860.0042013-05-01game-011136000.00 2.按月统计 # 增加月份列 df[log_month] df[log_date].apply(lambda x: x[0:7]) df.head() log_dateapp_nameuser_idpaymentis_paylog_month02013-05-01game-016088010.002013-0512013-05-01game-017124530.002013-0522013-05-01game-017768530.002013-0532013-05-01game-018234860.002013-0542013-05-01game-011136000.002013-05 巧妙运用 groupby 以及 agg 函数统计出用户按月份的 消费情况 和 登陆次数 # 按月统计 affc {payment:sum, log_date:count} dfm df.groupby([log_month, user_id]).agg(affc).reset_index() # 修改列明 renam {log_date:access_days} dfm.rename(columnsrenam, inplaceTrue) dfm.head() log_monthuser_idpaymentaccess_days02013-05650.0112013-051150.0122013-051940.0132013-054260.0442013-055390.01 4.使用 Kmeans 进行分类 得到排名靠前的用户即 重度用户/中度用户/轻度用户 A47 列即是排行榜得分 从分布图上看出大部分用户得分很低符合幂律曲线 # action[A47].hist(bins50, figsize(6,4)) matplotlib.axes._subplots.AxesSubplot at 0x1c21d894240 sns.distplot(action[A47],bins50,kdeTrue) matplotlib.axes._subplots.AxesSubplot at 0x1c21af07a58 对 A47 列进行聚类分为3类 from sklearn.cluster import KMeansa47 action[A47].reshape(-1, 1)kms KMeans(n_clusters3).fit(a47) D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) 
insteadThis is separate from the ipykernel package so we can avoid doing imports until cluster kms.labels_ kms.cluster_centers_ array([[ 9359.84787792],[ 69386.11297071],[185857.17948718]]) action[cluster] cluster action.head() log_dateapp_nameuser_idA1A2A3A4A5A6A7...A46A47A48A49A50A51A52A53A54cluster02013-10-31game-016541330000000...03802565500000.046012013-10-31game-014255300000101233...201805433473622400.071222013-10-31game-017095960000000...04162481700000.02032013-10-31game-015250470200900...22352006412210000.0109042013-10-31game-017969080000000...293882544410000.06405 rows × 58 columns action.groupby([cluster])[user_id].count() cluster 0 2096 1 479 2 78 Name: user_id, dtype: int64 图上显示通过聚类分解后用户分为3个类 0 表示轻度用户排行榜得分最少 1 表示中度用户排行版得分居中 2 表示重度用户排行版得分较高而且用户数量较少符合实际情况。 snsdf action[[user_id,A47,cluster]].sort_values(byA47,ascendingFalse) snsdf[user] range(len(snsdf)) sns.scatterplot(xuser,yA47,huecluster,datasnsdf, paletterainbow, alpha.2) matplotlib.axes._subplots.AxesSubplot at 0x1c21b9bf898 snsdf action[[user_id,A47,cluster]].sort_values(byA47,ascendingFalse) snsdf[user] range(len(snsdf))plt.figure(figsize(8,5)) snsdf1 snsdf.reset_index() snsdf1[snsdf1[cluster]2][A47].plot(colorr,label2:重度用户) snsdf1[snsdf1[cluster]1][A47].plot(colorg,label1:中度用户) snsdf1[snsdf1[cluster]0][A47].plot(colorb,label0:轻度用户) plt.legend() plt.xlabel(用户分布) plt.ylabel(排行榜得分) Text(0,0.5,排行榜得分) 限定排名靠前的用户即得分较高的重度和中度用户以便接下来进行分析 acc action[action[cluster]1] acc.head() log_dateapp_nameuser_idA1A2A3A4A5A6A7...A46A47A48A49A50A51A52A53A54cluster12013-10-31game-014255300000101233...201805433473622400.071252013-10-31game-017761200000900...381422146843715000.0312272013-10-31game-0127619700007058...15546024226150800.095182013-10-31game-012215720000100...2439891579240000.021192013-10-31game-016924330000600...28507064549168000.015415 rows × 58 columns 5.主成分分析 获取关键的参数 paction acc.iloc[:,3:(len(acc.columns)-1)] paction.indexacc.user_id paction.head() 
A1A2A3A4A5A6A7A8A9A10...A45A46A47A48A49A50A51A52A53A54user_id425530000010123358.25288230...19201805433473622400.07177612000009000.00325195...19381422146843715000.0312276197000070587.25150100...1515546024226150800.09522157200001000.004014...242439891579240000.02169243300006000.0010295...1528507064549168000.01545 rows × 54 columns 1.删掉 0 值比较多的列 cc paction[paction0].count(axis0)/len(paction) print(cc.head()) cc.plot() A1 1.000000 A2 0.926391 A3 1.000000 A4 0.994614 A5 0.055655 dtype: float64matplotlib.axes._subplots.AxesSubplot at 0x1c21bbb1470 # cc[cc.8] dd cc[cc.95] paction paction[dd.index] paction.head() A2A5A6A7A8A9A10A11A12A13...A45A46A47A48A49A50A51A52A53A54user_id425530010123358.2528823019219...19201805433473622400.07177612009000.0032519538819...19381422146843715000.0312276197070587.2515010015311...1515546024226150800.09522157201000.004014003...242439891579240000.02169243306000.0010295002...1528507064549168000.01545 rows × 32 columns 2.删掉相关性较强的列 corp paction.corr() plt.figure(figsize(15,8)) sns.heatmap(corp) matplotlib.axes._subplots.AxesSubplot at 0x1c21bc094a8 画下三角heatmap使用到的函数 mask np.array(corp) mask[np.tril_indices_from(mask)] False fig,ax plt.subplots() fig.set_size_inches(15,8) sns.heatmap(corp,maskmask) matplotlib.axes._subplots.AxesSubplot at 0x1c21bc09400 获取矩阵的下三角如果要获取上三角的话 np.tril(m, 1) coll corp.columns corp pd.DataFrame(np.tril(corp, -1)) corp.columns coll corp.head() A2A5A6A7A8A9A10A11A12A13...A45A46A47A48A49A50A51A52A53A5400.0000000.0000000.0000000.0000000.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.010.0697440.0000000.0000000.0000000.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.020.0761850.1788330.0000000.0000000.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.030.1587350.2193950.3713600.0000000.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.040.1672000.1861240.2420250.8031610.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.05 rows × 32 columns pac2 paction.loc[:,(corp.abs().7).all()] # 任何一个数都小于0.7 的数据 pac2.head() 
A2A11A12A13A20A23A24A43A44A46A48A49A50A51A53A54user_id425530019219000.5230.9217420347362240.071776120038819000.0200.9025638684371500.0312276197015311000.0100.9200015422615080.0952215720003000.020.857142457924000.0216924330002000.0110.7368428454916800.0154 进行主成分分析 from sklearn.decomposition import PCA pca PCA() pca.fit(pac2) PCA(copyTrue, iterated_powerauto, n_componentsNone, random_stateNone,svd_solverauto, tol0.0, whitenFalse) redio pca.explained_variance_ratio_ print(redio) print(pca.singular_values_) [9.97843804e-01 1.92024564e-03 1.20120771e-04 5.57014208e-052.67905481e-05 1.54533752e-05 9.31262940e-06 4.38846214e-063.02317261e-06 8.36725295e-07 1.31874979e-07 9.78197162e-083.86464536e-08 2.94647596e-08 1.82272465e-08 7.54580333e-09] [3.96183910e04 1.73797668e03 4.34684952e02 2.96004755e022.05284590e02 1.55911168e02 1.21032418e02 8.30848288e016.89599635e01 3.62791414e01 1.44027941e01 1.24044853e017.79687146e00 6.80796010e00 5.35458829e00 3.44523057e00] recu redio.cumsum() print(recu) x np.arange(len(recu)) plt.plot(recu, colorr) [0.9978438 0.99976405 0.99988417 0.99993987 0.99996666 0.999982120.99999143 0.99999582 0.99999884 0.99999968 0.99999981 0.999999910.99999994 0.99999997 0.99999999 1. 
][matplotlib.lines.Line2D at 0x1c21dadada0] 得到降维后的数据 pca.set_params(n_components10) pac3 pd.DataFrame(pca.fit_transform(pac2)) pacsse pac3.copy() pac3.head() 012345678902706.266005-100.824346-1.874787-1.57753612.481591-2.3943209.7708787.8075350.021273-2.16959612373.811140147.314930-16.386795-8.42865510.019577-3.0047256.0097710.961469-1.5985312.1446152-1171.733361-5.4930810.7449950.542033-0.785251-5.756412-1.012336-1.7780677.2568840.3432773-2738.903900-50.4684872.3284912.965415-5.79434711.8912892.965366-1.1824130.0656191.2453584-1493.64261858.686385-10.80761211.7779737.6646929.3129684.3764291.994214-1.5680500.426246 6.KMeans 进行聚类 from sklearn.cluster import KMeanskm KMeans(n_clusters5) km.fit(pac3) KMeans(algorithmauto, copy_xTrue, initk-means, max_iter300,n_clusters5, n_init10, n_jobs1, precompute_distancesauto,random_stateNone, tol0.0001, verbose0) clu km.labels_ pac3[clu] clu pac3.head() 0123456789clu02706.266005-100.824346-1.874787-1.57753612.481591-2.3943209.7708787.8075350.021273-2.169596012373.811140147.314930-16.386795-8.42865510.019577-3.0047256.0097710.961469-1.5985312.14461502-1171.733361-5.4930810.7449950.542033-0.785251-5.756412-1.012336-1.7780677.2568840.34327713-2738.903900-50.4684872.3284912.965415-5.79434711.8912892.965366-1.1824130.0656191.24535844-1493.64261858.686385-10.80761211.7779737.6646929.3129684.3764291.994214-1.5680500.4262461 pac3.groupby(clu)[2].count() clu 0 90 1 113 2 122 3 109 4 123 Name: 2, dtype: int64 #### palette 的颜色风格 Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, 
Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r plt.figure(figsize(13,7)) sns.scatterplot(x0, y1, datapac3,styleclu,hueclu, paletteautumn) matplotlib.axes._subplots.AxesSubplot at 0x1c21db35438 将分类后的类别添加至原数据中 pac4 pac2.copy() pac4[cluster] list(pac3.clu) pac4.head() A2A11A12A13A20A23A24A43A44A46A48A49A50A51A53A54clusteruser_id425530019219000.5230.9217420347362240.0710776120038819000.0200.9025638684371500.03120276197015311000.0100.9200015422615080.09512215720003000.020.857142457924000.02146924330002000.0110.7368428454916800.01541 # 计算每个类的平均值 clu5 pac4.groupby(cluster).mean() # 删除相关性较高的列 clu5.drop(columnsA53,inplaceTrue) c5cor clu5.corr() plt.figure(figsize(15,8)) sns.heatmap(c5cor,annotTrue) matplotlib.axes._subplots.AxesSubplot at 0x1c21d92a780 ccrp pd.DataFrame(np.tril(c5cor,-1)) ccrp.columns clu5.columns cccc clu5.loc[:,(ccrp.abs().95).all()] cccc 
A2A20A23A24A44A46A50A51A54cluster00.0222220.3222220.6555560.1676910.85819327.60000010.6666672.011111166.71111110.0796460.2743360.3628320.0952310.84402720.1592923.0088501.469027102.10619520.0737700.3770490.3360660.0706280.84934324.7377054.2868851.844262121.90983630.0183490.2293580.2844040.0982520.84598124.1192665.2660551.733945146.87156040.2032520.2926830.2439020.0636860.77507618.9837402.1300810.97561084.032520 from sklearn.preprocessing import scaleccccc pd.DataFrame(scale(cccc))ccccc.columns cccc.columns ccccc A2A20A23A24A44A46A50A51A540-0.8555900.4688591.9184001.8620200.7858821.4229701.8677731.1184571.42428210.002962-0.503392-0.094337-0.1049610.315530-0.940402-0.688647-0.381093-0.7466722-0.0848841.582038-0.278379-0.7728260.4920380.513827-0.2619980.656909-0.0812003-0.913505-1.416613-0.633601-0.0229440.3803870.3173940.0648790.3517420.75760241.851016-0.130892-0.912083-0.961289-1.973837-1.313789-0.982007-1.746015-1.354012 plt.figure(figsize(8,8)) # 极坐标的分割分数 N ccccc.shape[1] # 设置雷达图的角度用于平分切开一个圆面 angles np.linspace(0, 2*np.pi, N, endpointFalse) # 使雷达图一圈封闭起来 angles np.concatenate((angles,[angles[0]])) for i in range(len(ccccc)):# 构造数据values ccccc.loc[i,:]# 为了使雷达图一圈封闭起来values np.concatenate((values,[values[0]]))# 绘制plt.polar(angles, values, o-, linewidth2) plt.legend(ccccc.index, loclower right) # 添加极坐标的标签 plt.thetagrids(angles * 180/np.pi, labelslist(ccccc.columns)) plt.title(重要指标雷达图呈现) Text(0.5,1.05,重要指标雷达图呈现) 不进行预处理的降维 dfp acc.iloc[:,3:(len(acc.columns)-1)] dfp.indexacc.user_id dfp.head() A1A2A3A4A5A6A7A8A9A10...A45A46A47A48A49A50A51A52A53A54user_id425530000010123358.25288230...19201805433473622400.07177612000009000.00325195...19381422146843715000.0312276197000070587.25150100...1515546024226150800.09522157200001000.004014...242439891579240000.02169243300006000.0010295...1528507064549168000.01545 rows × 54 columns from sklearn.decomposition import PCApca PCA(whitenFalse) pca.fit(dfp) PCA(copyTrue, iterated_powerauto, n_componentsNone, random_stateNone,svd_solverauto, 
tol0.0, whitenFalse) retio pca.explained_variance_ratio_ # print(retio) # print(pca.singular_values_) rec retio.cumsum() print(rec) x np.arange(len(rec)) plt.plot(rec, colorr) [0.9996008 0.99995245 0.99997489 0.99999016 0.9999933 0.999995640.99999759 0.99999838 0.99999897 0.9999995 0.99999962 0.999999720.99999979 0.99999986 0.9999999 0.99999993 0.99999996 0.999999970.99999997 0.99999998 0.99999998 0.99999999 0.99999999 0.999999990.99999999 1. 1. 1. 1. 1.1. 1. 1. 1. 1. 1.1. 1. 1. 1. 1. 1.1. 1. 1. 1. 1. 1.1. 1. 1. 1. 1. 1. ][matplotlib.lines.Line2D at 0x1c21f406780] pca.set_params(n_components10) pacsse pd.DataFrame(pca.fit_transform(dfp)) pacsse.head() 0123456789094938.293061-342.891655-161.442878-199.6162101.83069273.107938153.124982124.440657-34.37161246.548951156613.313155-960.580156-38.560364-45.83657113.67016690.767620-145.846645-40.25513410.50820316.2878632-31060.195159388.005529-6.932692-0.948812-5.33272818.23729311.39346714.689011-7.99490932.3985323-45806.2524431579.357883-81.812845-96.488345-18.477649-90.05921731.377291-22.865193-19.72483716.2936404-34963.135693611.858506-18.187490-16.454233-5.597209-9.722257-63.112236-3.9432667.222725-10.889839 手肘法获取最优 K 值 from sklearn.cluster import KMeansdf_features pacsse # 读入数据 # 利用SSE选择k SSE [] # 存放每次结果的误差平方和 for k in range(1,9):estimator KMeans(n_clustersk) # 构造聚类器estimator.fit(df_features)SSE.append(estimator.inertia_) X range(1,9) plt.xlabel(k) plt.ylabel(SSE) plt.plot(X,SSE,o-)[matplotlib.lines.Line2D at 0x1c2211cac50] 显然先标准化数据是不合适的 # 显然先标准化数据是不合适的df_features pd.DataFrame(scale(pacsse)) SSE [] for k in range(1,9):estimator KMeans(n_clustersk) estimator.fit(df_features)SSE.append(estimator.inertia_) X range(1,9) plt.xlabel(k) plt.ylabel(SSE) plt.plot(X,SSE,o-) [matplotlib.lines.Line2D at 0x1c2213bc438] km KMeans(n_clusters4) km.fit(pacsse) clu km.labels_ pacsse[clu] clu pacsse.head() 
0123456789clu094938.293061-342.891655-161.442878-199.6162101.83069273.107938153.124982124.440657-34.37161246.5489512156613.313155-960.580156-38.560364-45.83657113.67016690.767620-145.846645-40.25513410.50820316.28786302-31060.195159388.005529-6.932692-0.948812-5.33272818.23729311.39346714.689011-7.99490932.39853213-45806.2524431579.357883-81.812845-96.488345-18.477649-90.05921731.377291-22.865193-19.72483716.29364014-34963.135693611.858506-18.187490-16.454233-5.597209-9.722257-63.112236-3.9432667.222725-10.8898391 pacsse.groupby(clu)[2].count() clu 0 153 1 344 2 54 3 6 Name: 2, dtype: int64 plt.figure(figsize(13,7)) sns.scatterplot(x0, y1, datapacsse,styleclu,hueclu, paletteautumn) matplotlib.axes._subplots.AxesSubplot at 0x1c22118b668 显然不进行预处理的数据聚类是有问题的 第一主成分和第二主成分 显然是相关的 pac4 pac2.copy() pac4[cluster] list(pacsse.clu) pac4.head()clu5 pac4.groupby(cluster).mean() clu5.drop(columnsA53,inplaceTrue) c5cor clu5.corr() plt.figure(figsize(15,8)) sns.heatmap(c5cor,annotTrue) matplotlib.axes._subplots.AxesSubplot at 0x1c22145a4e0 ccrp pd.DataFrame(np.tril(c5cor,-1)) ccrp.columns clu5.columns cccc clu5.loc[:,(ccrp.abs().95).all()] cccc A12A20A51A54cluster03.3986930.2287581.810458146.28758211.9389530.3168601.433140101.53197724.5925930.4074071.870370169.77777832.1666670.1666671.666667213.833333 from sklearn.preprocessing import scaleccccc pd.DataFrame(scale(cccc))ccccc.columns cccc.columns ccccc A12A20A51A5400.352533-0.5627840.684599-0.2852291-1.0217050.406288-1.555764-1.38855721.4765021.4022491.0403380.2938583-0.807330-1.245753-0.1691731.379928 plt.figure(figsize(8,8)) # 极坐标的分割分数 N ccccc.shape[1] # 设置雷达图的角度用于平分切开一个圆面 angles np.linspace(0, 2*np.pi, N, endpointFalse) # 使雷达图一圈封闭起来 angles np.concatenate((angles,[angles[0]])) for i in range(len(ccccc)):# 构造数据values ccccc.loc[i,:]# 为了使雷达图一圈封闭起来values np.concatenate((values,[values[0]]))# 绘制plt.polar(angles, values, o-, linewidth2) plt.legend(ccccc.index, loclower right) # 添加极坐标的标签 plt.thetagrids(angles * 180/np.pi, 
labelslist(ccccc.columns)) plt.title(重要指标雷达图呈现) Text(0.5,1.05,重要指标雷达图呈现) 转载于:https://www.cnblogs.com/cvlas/p/9537532.html
http://www.pierceye.com/news/489613/

相关文章:

  • 龙武工会网站怎么做5173游戏交易网站源码
  • 网站建设设计时代创信好海南城乡和住房建设厅网站
  • 大连最好的做网站的公司崇义做网站
  • 圣弘建设股份有限公司网站上海图文设计有限公司
  • gta5资产网站正在建设零基础自学设计
  • 深圳专业制作网站公司吗网站信息化建设报送
  • 苏州网站建设运营推广网站一年多少钱?
  • WordPress国外主机湖北短视频seo营销
  • 南通网站建设电话设计一个网站要多少钱
  • 好的模板网站建设网站规划 时间
  • 昆明seocn整站优化网站建设如何报价
  • 网页设计模板免费网站WordPress生成网站地图
  • 做网站 侵权做外贸怎么看外国网站
  • 网站建设知识点的总结普通网站建设是什么
  • 杭州网站建设费用多少合肥高新城建设计院网站
  • 炫酷特效网站asa8.4 做网站映射
  • 郑州租赁房网站建设九江快乐城
  • 手机网站建站教育模板微信网站 教程
  • 网站的结构犀牛云做网站多少钱
  • 网站服务器用什么配置公司网站建设的视频教程
  • idea做网站网络营销与网站推广的区别
  • 建一家网站多少钱微信小程序在哪里查找
  • 东阳网站推广英文网站源码下载
  • 介绍湛江网站高端网站建设网站定制
  • 网站的特征包括哪些方面wordpress缓存插件 w3
  • 东莞专业网站营销wordpress新建页面模板
  • 做外贸学习网站智慧团建网页电脑版登录网站
  • 如何免费做一个网站攻略常州网站推广软件
  • 手机网站建站 服务器网站名称收录
  • 网站根 html网站建设 永灿 竞争