"""k-Nearest-Neighbor (kNN) classifier demo on the CIFAR-10 data set.

Reconstructed from a garbled blog-post extraction (the scrape dropped every
``=`` sign, every newline inside code, and converted quotes to typographic
ones).  Summary of the accompanying article:

- kNN "training" just memorises the training set; prediction compares each
  test image against every stored image and takes the closest ones.
- The hyper-parameter k is tuned on a held-out validation split (commonly
  10%-30% of the training data), or with k-fold cross-validation when data
  is scarce; larger fold counts are better but more expensive.
- Classification is by majority vote (optionally weighted) among the k
  nearest neighbours.
- Pros: simple, zero training cost.  Cons: expensive at test time (distance
  to every training point), weak when class distributions are imbalanced or
  classes overlap heavily.
"""

import os
import pickle

import numpy as np


class kNearestNeighbor:
    """Nearest-neighbour classifier using the L1 (Manhattan) distance."""

    def __init__(self):
        # No state until train() is called.
        pass

    def train(self, X, y):
        """Memorise the training data.

        X: (num_train, D) array of flattened images.
        y: (num_train,) array of integer labels.
        """
        self.Xtr = X
        self.ytr = y

    def predict(self, X, k=1):
        """Predict a label for every row of X by majority vote among the
        k nearest training examples (L1 distance).

        Bug fixed from the original post: neighbour labels were looked up
        in a *global* ``y_train`` instead of ``self.ytr``, which silently
        mislabels whenever the memorised set differs from that global
        (e.g. during cross-validation).
        """
        num_test = X.shape[0]
        Ypred = np.zeros(num_test, dtype=self.ytr.dtype)
        for i in range(num_test):
            # L1 distance from test image i to every stored training image.
            distances = np.sum(np.abs(self.Xtr - X[i, :]), axis=1)
            closest_y = self.ytr[np.argsort(distances)[:k]]
            # Majority vote; ties break toward the smaller label value.
            u, indices = np.unique(closest_y, return_inverse=True)
            Ypred[i] = u[np.argmax(np.bincount(indices))]
        return Ypred


def load_CIFAR_batch(filename):
    """Load a single CIFAR-10 batch file.

    Returns (X, Y): X as a (10000, 32, 32, 3) float image array, Y as a
    (10000,) int label array.
    """
    with open(filename, 'rb') as f:
        datadict = pickle.load(f, encoding='latin1')
    X = datadict['data']
    Y = datadict['labels']
    # Stored layout is (N, channel, row, col); move channels last for display.
    X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    Y = np.array(Y)
    return X, Y


def load_CIFAR10(ROOT):
    """Load all of CIFAR-10 from directory ROOT.

    Returns (Xtr, Ytr, Xte, Yte): the five training batches concatenated,
    plus the test batch.
    """
    xs = []
    ys = []
    for b in range(1, 6):
        f = os.path.join(ROOT, 'data_batch_%d' % (b))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)  # stack the five batches into one array
    Ytr = np.concatenate(ys)
    del X, Y
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte


if __name__ == "__main__":
    # matplotlib is only needed for the demo plot; keep the import local so
    # importing this module never requires it.
    import matplotlib.pyplot as plt

    Xtr, Ytr, Xte, Yte = load_CIFAR10('cifar10')
    Xtr_rows = Xtr.reshape(Xtr.shape[0], 32 * 32 * 3)
    Xte_rows = Xte.reshape(Xte.shape[0], 32 * 32 * 3)

    # The full data set is slow on a laptop, so subsample:
    # 5000 training and 500 test images.
    num_training = 5000
    num_test = 500
    x_train = Xtr_rows[:num_training, :]
    y_train = Ytr[:num_training]
    x_test = Xte_rows[:num_test, :]
    y_test = Yte[:num_test]

    knn = kNearestNeighbor()
    knn.train(x_train, y_train)
    y_predict = knn.predict(x_test, k=7)
    acc = np.mean(y_predict == y_test)
    print('accuracy : %f' % (acc))  # the post reports ~0.302

    # Which k works best?  5-fold cross-validation over a grid of k values.
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    x_train_folds = np.array_split(x_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)
    k_to_accuracies = {}
    for k_val in k_choices:
        print('k = ' + str(k_val))
        k_to_accuracies[k_val] = []
        for i in range(num_folds):
            # Train on every fold except fold i; validate on fold i.
            x_train_cycle = np.concatenate(
                [f for j, f in enumerate(x_train_folds) if j != i])
            y_train_cycle = np.concatenate(
                [f for j, f in enumerate(y_train_folds) if j != i])
            x_val_cycle = x_train_folds[i]
            y_val_cycle = y_train_folds[i]
            knn = kNearestNeighbor()
            knn.train(x_train_cycle, y_train_cycle)
            y_val_pred = knn.predict(x_val_cycle, k_val)
            num_correct = np.sum(y_val_cycle == y_val_pred)
            k_to_accuracies[k_val].append(
                float(num_correct) / float(len(y_val_cycle)))

    for k in sorted(k_to_accuracies):
        for accuracy in k_to_accuracies[k]:
            print('k = %d, accuracy = %f' % (int(k), accuracy))

    # Visualise the cross-validation results: one scatter point per fold,
    # plus mean +/- std error bars per k.
    plt.rcParams['figure.figsize'] = (10.0, 8.0)
    plt.rcParams['image.interpolation'] = 'nearest'
    plt.rcParams['image.cmap'] = 'gray'
    for k in k_choices:
        accuracies = k_to_accuracies[k]
        plt.scatter([k] * len(accuracies), accuracies)
    accuracies_mean = np.array(
        [np.mean(v) for k, v in sorted(k_to_accuracies.items())])
    accuracies_std = np.array(
        [np.std(v) for k, v in sorted(k_to_accuracies.items())])
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.show()