본 포스팅에서는 군집분석 실습 시 사용했던 파이썬 코드를 공유드립니다.
import pandas import pylab as pl from sklearn.cluster import KMeans from sklearn.decomposition import PCA #PCA : Principal Component Analysis
In [116]:
# Load the practice dataset ('파일명.csv' looks like a placeholder file name --
# point it at your own CSV).
variables = pandas.read_csv('/Users/Desktop/파일명.csv', engine='python')

# Double brackets keep each selection as a single-column DataFrame rather
# than a Series.
X = variables[['1_m']]
Y = variables[['f']]
In [117]:
# Elbow curve: fit one K-Means model per candidate cluster count (1..19) on Y
# and plot each model's score against the number of clusters.
Nc = range(1, 20)
kmeans = [KMeans(n_clusters=i) for i in Nc]

# Iterate over the models directly instead of indexing via range(len(...)).
# fit() returns the estimator, so fit and score can be chained per model.
score = [model.fit(Y).score(Y) for model in kmeans]

pl.plot(Nc, score)
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()
![](https://it-bite.com/wp-content/uploads/2023/07/image-15.png)
In [118]:
# Fit a one-component PCA on Y, then project both Y and X with that model.
#
# The raw arrays (.values) are used instead of the DataFrames: a model fitted
# on a DataFrame remembers its column names, and recent scikit-learn releases
# refuse to transform a frame whose column names differ ('1_m' vs 'f').
# Working on plain arrays yields the same numbers without that check.
#
# NOTE(review): pca_c is X projected with the model fitted on Y, i.e. X is
# centered with Y's mean -- confirm this is intended rather than a separate
# fit on X.
pca = PCA(n_components=1).fit(Y.values)
pca_d = pca.transform(Y.values)
pca_c = pca.transform(X.values)
In [133]:
# Cluster Y into three groups; kmeansoutput is the fitted estimator.
kmeans = KMeans(n_clusters=3)
kmeansoutput = kmeans.fit(Y)

# Scatter the projected X values against the projected Y values, colouring
# each point by the cluster label it was assigned.
pl.figure('3 Cluster K-Means')
pl.scatter(
    pca_c[:, 0],
    pca_d[:, 0],
    c=kmeansoutput.labels_,
)
pl.xlabel('1_m')
pl.ylabel('f')
pl.title('3 Cluster K-Means')
pl.show()
![](https://it-bite.com/wp-content/uploads/2023/07/image-16.png)
In [120]:
#K-Mean클러스터링 알고리즘 import pandas as pd import numpy as np from sklearn.cluster import KMeans import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline
In [121]:
# Create an empty frame with 'x' and 'y' columns to hold the data points.
df = pd.DataFrame(columns=["x", "y"])
In [122]:
# Twelve sample points forming three visually separate groups of four.
points = [
    [3, 1], [4, 1], [3, 2], [4, 2],
    [10, 5], [10, 6], [11, 5], [11, 6],
    [15, 1], [15, 2], [16, 1], [16, 2],
]

# Append the points one row at a time under integer labels 0..11.
for row_label, point in enumerate(points):
    df.loc[row_label] = point
In [123]:
# Preview the data points (head(20) shows at most the first 20 rows).
df.head(20)
Out[123]:
| | x | y |
---|---|---|
0 | 3 | 1 |
1 | 4 | 1 |
2 | 3 | 2 |
3 | 4 | 2 |
4 | 10 | 5 |
5 | 10 | 6 |
6 | 11 | 5 |
7 | 11 | 6 |
8 | 15 | 1 |
9 | 15 | 2 |
10 | 16 | 1 |
11 | 16 | 2 |
In [124]:
# Visualize the data points on a 2D plot.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form sns.lmplot('x', 'y', ...) fails there.
sns.lmplot(
    x='x',                    # column for the x-axis
    y='y',                    # column for the y-axis
    data=df,
    fit_reg=False,            # scatter only, no regression line
    scatter_kws={"s": 200},   # marker size
)

# Title and axis labels.
plt.title('kmean plot')
plt.xlabel('x')
plt.ylabel('y')
Out[124]:
Text(16.3,0.5,'y')
![](https://it-bite.com/wp-content/uploads/2023/07/image-16.png)
In [125]:
#k-mean clustering # convert dataframe to numpy array data_points = df.values
In [126]:
# Fit a 3-cluster K-Means model on the data points.
# NOTE(review): no random_state is set, so the numbering of the cluster ids
# may presumably differ between runs -- confirm if reproducibility matters.
kmeans = KMeans(n_clusters=3).fit(data_points)
In [127]:
# cluster id for each data point kmeans.labels_
Out[127]:
array([1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0], dtype=int32)
In [128]:
# this is final centroids position kmeans.cluster_centers_
Out[128]:
array([[ 15.5, 1.5], [ 3.5, 1.5], [ 10.5, 5.5]])
In [129]:
# Store each point's cluster id in a new 'cluster_id' column.
df['cluster_id'] = kmeans.labels_
In [130]:
# Show the first 12 rows, now including the 'cluster_id' column.
df.head(12)
Out[130]:
| | x | y | cluster_id |
---|---|---|---|
0 | 3 | 1 | 1 |
1 | 4 | 1 | 1 |
2 | 3 | 2 | 1 |
3 | 4 | 2 | 1 |
4 | 10 | 5 | 2 |
5 | 10 | 6 | 2 |
6 | 11 | 5 | 2 |
7 | 11 | 6 | 2 |
8 | 15 | 1 | 0 |
9 | 15 | 2 | 0 |
10 | 16 | 1 | 0 |
11 | 16 | 2 | 0 |
In [131]:
# Plot the points again, coloured by the cluster each one was assigned to.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form sns.lmplot('x', 'y', ...) fails there.
sns.lmplot(
    x='x',                    # column for the x-axis
    y='y',                    # column for the y-axis
    data=df,
    fit_reg=False,            # scatter only, no regression line
    scatter_kws={"s": 150},   # marker size
    hue="cluster_id",         # one colour per cluster
)

# Title.
plt.title('after kmean clustering')
Out[131]:
Text(0.5,1,'after kmean clustering')
![](https://it-bite.com/wp-content/uploads/2023/07/image-17.png)