diff --git "a/\344\275\234\344\270\2323-\344\275\225\350\231\216\344\274\237" "b/\344\275\234\344\270\2323-\344\275\225\350\231\216\344\274\237" new file mode 100644 index 0000000000000000000000000000000000000000..211b823a4a99784eb48e7dd53ae86ccdf0396131 --- /dev/null +++ "b/\344\275\234\344\270\2323-\344\275\225\350\231\216\344\274\237" @@ -0,0 +1,102 @@ +import pandas as pd +import random +import numpy as np +import matplotlib.pyplot as plt +import math + +path = "dataset_circles.csv" +dataset = pd.read_csv(path) +data_array = np.array(dataset)# 首先将pandas读取的数据转化为array +data_list =data_array.tolist()# 然后转化为list形式 +data_list=np.delete(data_list,2,axis=1) +a=b=n=0 +for i in range(len(data_list)): + a=data_list[i][0]+a + b=data_list[i][1]+b + n=n+1 +avx=a/n +avy=b/n +av=[avx,avy] +def change(X): + n=np.shape(X)[0] + A=np.zeros((n,2)) + for i in range(n): + A[i][0]=math.sqrt(X[i][0]*X[i][0]+X[i][1]*X[i][1]) + A[i][1]=math.atan(X[i][1]/X[i][0]) + return A +def change1(X): + n=np.shape(X)[0] + A=np.zeros((n,2)) + for i in range(n): + A[i][0]=math.sqrt(pow(X[i][0]-avx,2)+pow(X[i][1]-avy,2)) + A[i][1]=math.atan(((X[i][1]-avy)/(X[i][0]-avx))) +def distance(x1, x2): + return abs(x2[0]-x1[0])+abs(x2[1]-x1[1]) +def rand_cluster_cents(X, k): + m=np.shape(X)[0] # 样本数 + dataIndex=list(range(m)) # 生成随机下标列表 + random.shuffle(dataIndex)#打乱列表 + centroidsIndex = dataIndex[:k] + return X[centroidsIndex, :]# 返回随机的聚类中心 +def kmeans(X, k): + m = np.shape(X)[0]# 样本总数 + clusterAssment = np.zeros((m, 2)) # 分配样本到最近的簇:存[簇序号,距离的平方] (m行 x 2 列) + centroids = rand_cluster_cents(X, k)# step1: 通过随机产生的样本点初始化聚类中心 + print('最初的中心=', centroids) + iterN = 0 + while True: + clusterChanged = False + for i in range(m):# step2:分配到最近的聚类中心对应的簇中 + minDist = np.inf# minDist值表示+∞ + minIndex = -1 + for j in range(k):# 计算第i个样本到第j个中心点的距离 + distJI = distance(centroids[j], X[i]) + if distJI < minDist: + minDist = distJI + minIndex = j + if clusterAssment[i, 0] != minIndex:# 样本上次分配结果跟本次不一样,标志位clusterChanged置True + clusterChanged = True + clusterAssment[i, :] = minIndex, minDist ** 2 # 分配样本到最近的簇 + iterN += 1 + sse = sum(clusterAssment[:, 1]) + print('the SSE of %d' % iterN + 'th iteration is %f' % sse) + for cent in range(k): # step3:更新聚类中心 # 样本分配结束后,重新计算聚类中心 + ptsInClust = X[clusterAssment[:, 0] == cent, :] + centroids[cent, :] = np.mean(ptsInClust, axis=0) + if not clusterChanged: # 如果聚类重心没有发生改变,则退出迭代 + break + return centroids, clusterAssment +""" +# 进行k-means聚类 +""" +k = 2 # 定义聚类数 +def datashow(dataSet, k, centroids, clusterAssment): # 二维空间显示聚类结果 + num, dim = np.shape(dataSet) # 样本数num ,维数dim + if dim != 2: + print('sorry,the dimension of your dataset is not 2!') + return 1 + marksamples = ['or', 'ob', 'og', 'ok', '^r', '^b', ' len(marksamples): + print('sorry,your k is too large,please add length of the marksample!') + return 1 + for i in range(num):# 绘所有样本 + markindex = int(clusterAssment[i, 0]) # 矩阵形式转为int值, 簇序号 + plt.plot(dataSet[i, 0], dataSet[i, 1], marksamples[markindex], markersize=6) # 特征维对应坐标轴x,y;样本图形标记及大小 + markcentroids = ['o', '^'] # 绘中心点,聚类中心图形标记 + label = ['0', '1'] + c = ['yellow', 'red'] + for i in range(k): + plt.plot(centroids[i, 0], centroids[i, 1], markcentroids[i], markersize=15, label=label[i], c=c[i]) + plt.legend(loc='upper left') #图例 + plt.xlabel('feature 1') + plt.ylabel('feature 2') + plt.title(' result') # 标题 + plt.show() +C=change(data_list) +D=change(data_list) +mycentroids, clusterAssment = kmeans(C, k) +datashow(C, k, mycentroids, clusterAssment) +mycentroids, clusterAssment = kmeans(data_list, k) 
+datashow(data_list, k, mycentroids, clusterAssment) +mycentroids, clusterAssment = kmeans(D, k) +datashow(D, k, mycentroids, clusterAssment)
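
Note on the input data: dataset_circles.csv is referenced by the script but is not part of this diff, and the code drops column index 2, so the file is assumed to carry two feature columns plus one label column. A minimal sketch for generating a compatible file with scikit-learn's make_circles; the column names x, y, label and the sample count are assumptions, not taken from the original:

    # Sketch only: writes a two-ring dataset in the layout the script above expects
    # (assumed columns: x, y, label; requires pandas and scikit-learn).
    import pandas as pd
    from sklearn.datasets import make_circles

    X, y = make_circles(n_samples=400, noise=0.05, factor=0.4, random_state=0)
    pd.DataFrame({"x": X[:, 0], "y": X[:, 1], "label": y}).to_csv("dataset_circles.csv", index=False)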
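
Design note on the metric: distance() uses the Manhattan (L1) distance, while the centroid update in kmeans() takes the arithmetic mean, which minimises the summed squared Euclidean distance, so the assignment step and the update step are not strictly consistent with each other. A sketch of a Euclidean drop-in, where euclidean_distance is a hypothetical helper name not present in the diff:

    import numpy as np

    def euclidean_distance(x1, x2):
        # Euclidean (L2) distance between two points, matching the mean-based centroid update.
        diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
        return float(np.sqrt(np.dot(diff, diff)))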