7장. K-Means

In [1]:

# Widen the notebook page container via injected CSS so wide tables and plots fit.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display
display(HTML("<style>.container { width:1200px !important; }</style>"))

K-Means를 이용한 붓꽃(Iris) 데이터 셋 Clustering

In [8]:

from sklearn.preprocessing import scale
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# Load the iris dataset and wrap its feature matrix in a DataFrame with
# short snake_case column names for easier handling.
iris = load_iris()
irisDF = pd.DataFrame(
    data=iris.data,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
# Preview the first three rows.
irisDF.head(3)

Out[8]:

	sepal_length	sepal_width	petal_length	petal_width
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2

붓꽃 데이터 세트를 3개 그룹으로 군집화: n_clusters=3, init='k-means++' (초기 중심 설정 방식), max_iter=300 (최대 반복 횟수)

In [9]:

# Cluster the iris measurements into 3 groups with K-Means.
# Fit only on the four measurement columns: later cells add 'target',
# 'cluster' and PCA columns to irisDF, so fitting on the whole frame would
# make a re-run of this cell cluster on those extra columns as well.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the resulting labels reproducible.
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10,
                random_state=0).fit(irisDF[feature_cols])
# fit() returns the fitted estimator, so `kmeans` now holds the clustering
# result; the per-sample cluster assignment is available as kmeans.labels_.

In [10]:

# Show the cluster index assigned to each sample by the fitted model.
cluster_assignments = kmeans.labels_
print(cluster_assignments)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0
 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0
 0 2]

In [11]:

# Check how well the clustering matches the actual iris species labels.
# The dataset's target values go into a 'target' column and the K-Means
# labels_ into a 'cluster' column; grouping by (target, cluster) and counting
# rows shows how each true class is distributed over the clusters.
irisDF['target'] = iris.target
irisDF['cluster'] = kmeans.labels_

by_target_and_cluster = irisDF.groupby(['target', 'cluster'])
iris_result = by_target_and_cluster['sepal_length'].count()
print(iris_result)

target  cluster
0       1          50
1       0           2
        2          48
2       0          36
        2          14
Name: sepal_length, dtype: int64

In [12]:

# In the count table above, every target-0 sample fell into cluster 1, while
# targets 1 and 2 are spread over clusters 0 and 2.

# For visualization: the iris data has 4 features, so reduce it to 2
# dimensions with PCA to plot it on a plane.

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_transformed = pca.fit_transform(iris.data)

# Store the two principal-component coordinates as separate columns.
irisDF['pca_x'], irisDF['pca_y'] = pca_transformed[:, 0], pca_transformed[:, 1]
irisDF.head(3)

Out[12]:

	sepal_length	sepal_width	petal_length	petal_width	cluster	pca_x	pca_y
0	5.1	3.5	1.4	0.2	1	-2.684126	0.319397
1	4.9	3.0	1.4	0.2	1	-2.714142	-0.177001
2	4.7	3.2	1.3	0.2	1	-2.888991	-0.144949

In [13]:

# Plot the K-Means clusters on the two PCA axes (pca_x, pca_y), one marker
# shape per cluster.
# A single matplotlib scatter call draws only one marker style, so each
# cluster is drawn with its own scatter call inside the loop.
cluster_markers = {0: 'o', 1: 's', 2: '^'}

for cluster_id, marker in cluster_markers.items():
    # Boolean mask selecting the rows assigned to this cluster.
    in_cluster = irisDF['cluster'] == cluster_id
    plt.scatter(x=irisDF.loc[in_cluster, 'pca_x'], y=irisDF.loc[in_cluster, 'pca_y'],
                marker=marker, label='cluster %d' % cluster_id)

plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('3 Clusters Visualization by 2 PCA Components')
# The legend maps each marker back to its cluster id so the figure stands alone.
plt.legend()
plt.show()

Clustering 알고리즘 테스트를 위한 데이터 생성

In [14]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
%matplotlib inline

# Generate a synthetic dataset for testing clustering: 200 samples with
# 2 features drawn around 3 centers. X is the feature matrix and y holds the
# center (target) index of each sample.
X, y = make_blobs(
    n_samples=200,
    n_features=2,
    centers=3,
    cluster_std=0.8,
    random_state=0,
)
print(X.shape, y.shape)

# Inspect how the samples are distributed over the target values.
unique, counts = np.unique(y, return_counts=True)
print(unique, counts)

(200, 2) (200,)
[0 1 2] [67 67 66]

In [15]:

import pandas as pd

# Put the generated features into a DataFrame and attach the generating
# center index as the 'target' column.
clusterDF = pd.DataFrame(data=X, columns=['ftr1', 'ftr2']).assign(target=y)
clusterDF.head(3)

Out[15]:

	ftr1	ftr2	target
0	-1.692427	3.622025	2
1	0.697940	4.428867	0
2	1.100228	4.606317	0

In [16]:

# Distinct target values present in y; the data was generated around
# 3 centers, so this is [0, 1, 2].
target_list = np.unique(y)
# Marker shapes for the scatter plots, indexed by target value.
markers = ['o', 's', '^', 'P', 'D', 'H', 'x']

# Draw one scatter plot per target value so each gets its own marker.
for target in target_list:
    is_target = clusterDF['target'] == target
    plt.scatter(
        x=clusterDF.loc[is_target, 'ftr1'],
        y=clusterDF.loc[is_target, 'ftr2'],
        edgecolor='k',
        marker=markers[target],
    )
plt.show()

In [17]:

# Run K-Means clustering on X with a KMeans estimator.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the resulting labels reproducible.
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=200, n_init=10,
                random_state=0)
cluster_labels = kmeans.fit_predict(X)
clusterDF['kmeans_label'] = cluster_labels

# cluster_centers_ holds the center coordinates of each cluster; extracted
# here so the centers can be drawn on the plot.
centers = kmeans.cluster_centers_
unique_labels = np.unique(cluster_labels)
markers = ['o', 's', '^', 'P', 'D', 'H', 'x']

# Iterate over the cluster labels and draw one scatter plot per label so that
# each cluster gets its own marker.
for label in unique_labels:
    label_cluster = clusterDF[clusterDF['kmeans_label'] == label]
    center_x_y = centers[label]
    plt.scatter(x=label_cluster['ftr1'], y=label_cluster['ftr2'], edgecolor='k',
                marker=markers[label])

    # Mark the cluster center: a large white marker with a black edge, then
    # the cluster number drawn on top of it as a mathtext marker.
    plt.scatter(x=center_x_y[0], y=center_x_y[1], s=200, color='white',
                alpha=0.9, edgecolor='k', marker=markers[label])
    plt.scatter(x=center_x_y[0], y=center_x_y[1], s=70, color='k', edgecolor='k',
                marker='$%d$' % label)

plt.show()

In [18]:

# Count, for each generated target, how many samples fell into each K-Means label.
labels_by_target = clusterDF.groupby('target')['kmeans_label']
print(labels_by_target.value_counts())

target  kmeans_label
0       0               66
        2                1
1       1               67
2       2               65
        1                1
Name: kmeans_label, dtype: int64

In [13]:

저작자표시 비영리 변경금지 (새창열림)

'머신러닝' 카테고리의 다른 글

[파이썬 머신러닝] 7장. 평균 이동 (0)	2021.02.07
[파이썬 머신러닝] 7장. 군집 평가 (0)	2021.02.07
[파이썬 머신러닝] 6장.차원 축소 - SVD, NMF (0)	2021.02.01
[파이썬 머신러닝] 6장.차원 축소 - PCA, LDA (0)	2021.02.01
[파이썬 머신러닝] 5장. 회귀 - 캐글 주택 가격 (1)	2021.01.28

dlsalfkd11 코딩코딩

[파이썬 머신러닝] 7장. K-Means

K-Means를 이용한 붓꽃(Iris) 데이터 셋 Clustering¶

'머신러닝' 카테고리의 다른 글

티스토리툴바

[파이썬 머신러닝] 7장. K-Means

K-Means를 이용한 붓꽃(Iris) 데이터 셋 Clustering¶

'머신러닝' 카테고리의 다른 글

'머신러닝' Related Articles

티스토리툴바