군집분석 파이썬 코드 공유

본 포스팅에서는 군집분석 실습에 사용했던 파이썬 코드를 공유합니다.

import pandas 
import pylab as pl
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
#PCA : Principal Component Analysis

In [116]:

# Load the input table from CSV.
# NOTE(review): engine='python' was presumably chosen to cope with the
# non-ASCII file name on older pandas versions — confirm before changing it.
variables = pandas.read_csv(
    '/Users/Desktop/파일명.csv',
    engine='python',
)

# Double brackets keep each selection as a one-column DataFrame (2-D),
# which is the input layout the scikit-learn estimators below are given.
X = variables[['1_m']]
Y = variables[['f']]

In [117]:

# Elbow method: fit K-Means once per candidate cluster count (k = 1..19)
# and plot the resulting model score against k.
Nc = range(1, 20)

# One unfitted estimator per candidate cluster count.
kmeans = [KMeans(n_clusters=i) for i in Nc]

# Fit each estimator on Y and record its score on the same data.
# KMeans.score is the negative of the K-Means objective, so the values are
# <= 0 and rise toward 0 as k grows; the "elbow" is where the gain flattens.
score = [model.fit(Y).score(Y) for model in kmeans]

pl.plot(Nc, score)
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()

In [118]:

# Project each one-column frame onto its own first principal component so
# that both can be used as plot coordinates further down.
pca = PCA(n_components=1).fit(Y)

pca_d = pca.transform(Y)

# X holds a different column ('1_m') than the one `pca` was fitted on ('f').
# Re-using `pca` here would centre X with Y's mean, and newer scikit-learn
# releases reject the call outright because the feature names do not match.
# Fit a separate single-component PCA for X instead.
pca_c = PCA(n_components=1).fit_transform(X)

In [133]:

# Fit the final 3-cluster model on Y, then draw the projected '1_m' values
# against the projected 'f' values, coloured by assigned cluster.
kmeans = KMeans(n_clusters=3)
kmeansoutput = kmeans.fit(Y)
kmeansoutput

pl.figure('3 Cluster K-Means')
pl.scatter(
    x=pca_c[:, 0],
    y=pca_d[:, 0],
    c=kmeansoutput.labels_,  # one colour per cluster label
)

# Titles and axis labels are independent of each other.
pl.title('3 Cluster K-Means')
pl.xlabel('1_m')
pl.ylabel('f')

pl.show()

In [120]:

# K-Means clustering algorithm
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [121]:

# Create an empty DataFrame with columns 'x' and 'y' to hold the sample data points
df = pd.DataFrame(columns=['x', 'y'])

In [122]:

# Twelve sample points, appended one row at a time under labels 0..11.
# They are laid out as three groups of four neighbouring points.
sample_points = [
    [3, 1], [4, 1], [3, 2], [4, 2],
    [10, 5], [10, 6], [11, 5], [11, 6],
    [15, 1], [15, 2], [16, 1], [16, 2],
]
for row_label, point in enumerate(sample_points):
    df.loc[row_label] = point

In [123]:

# Display up to the first 20 rows; only 12 rows were added above, so this shows all of them
df.head(20)

Out[123]:

	x	y
0	3	1
1	4	1
2	3	2
3	4	2
4	10	5
5	10	6
6	11	5
7	11	6
8	15	1
9	15	2
10	16	1
11	16	2

In [124]:

# Visualize the raw data points on a 2-D scatter plot.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the old positional form lmplot('x', 'y', ...) raises a
# TypeError there. Keywords work on older releases as well.
sns.lmplot(x='x', y='y', data=df,
           fit_reg=False,            # scatter only, no regression line
           scatter_kws={"s": 200})   # marker size

# title
plt.title('kmean plot')

# x-axis label
plt.xlabel('x')

# y-axis label
plt.ylabel('y')

Out[124]:

Text(16.3,0.5,'y')

In [125]:

# k-means clustering
# Convert the DataFrame to a NumPy array (one row per point, columns x and y)
data_points = df.values

In [126]:

# Fit a 3-cluster K-Means model on the sample points; fit() returns the fitted estimator
kmeans = KMeans(n_clusters=3).fit(data_points)

In [127]:

# Cluster id assigned to each data point, in the same order as data_points
kmeans.labels_

Out[127]:

array([1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0], dtype=int32)

In [128]:

# Final centroid positions: one row per cluster, columns in the same order as data_points (x, y)
kmeans.cluster_centers_

Out[128]:

array([[ 15.5,   1.5],
       [  3.5,   1.5],
       [ 10.5,   5.5]])

In [129]:

# Attach each point's cluster label to the frame as a new 'cluster_id' column
df['cluster_id'] = kmeans.labels_

In [130]:

# Display the 12 rows together with their assigned cluster_id
df.head(12)

Out[130]:

	x	y	cluster_id
0	3	1	1
1	4	1	1
2	3	2	1
3	4	2	1
4	10	5	2
5	10	6	2
6	11	5	2
7	11	6	2
8	15	1	0
9	15	2	0
10	16	1	0
11	16	2	0

In [131]:

# Scatter the points again, this time coloured by their assigned cluster.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the old positional form lmplot('x', 'y', ...) raises a
# TypeError there. Keywords work on older releases as well.
sns.lmplot(x='x', y='y', data=df,
           fit_reg=False,            # scatter only, no regression line
           scatter_kws={"s": 150},   # marker size
           hue="cluster_id")         # one colour per cluster

# title
plt.title('after kmean clustering')

Out[131]:

Text(0.5,1,'after kmean clustering')