In [1]:

# Widen the notebook container to 1200px so wide tables and figures fit.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:1200px !important; }</style>"))

차원 축소¶

피처가 많을 경우 상대적으로 적은 차원에서 학습된 모델보다 예측 신뢰도가 떨어진다. 또한 개별 피처간에 상관관계가 높을 가능성이 크다.

선형모델에서는 입력 변수 간의 상관관계가 높을 경우 이로 인한 다중 공선성 문제로 모델의 예측 성능이 저하 > 차원 축소 필요

피처 선택 : 특정 피처에 종속성이 강한 불필요한 피처는 아예 제거. 데이터의 특징을 잘 나타내는 주요 피처만 선택
피처 추출 : 기존 피처를 저차원의 중요 피처로 압축해서 추출. 기존 피처와 완전 다른 값을 가지게 된다. 피처를 함축적으로 더 잘 설명할 수 있는 또 다른 공간으로 매핑해 추출하는 것. 즉, 기존 피처가 전혀 인지하기 어려웠던 잠재적인 요소를 추출하는 것.

PCA, SVD, NMF > 주로 이미지 데이터 축소, 텍스트 데이터 축소(시맨틱 토픽 모델링)

PCA(주성분 분석)¶

여러 변수 간에 존재하는 상관관계를 이용해 이를 대표하는 주성분을 추출해 차원을 축소.

기존 데이터의 정보 유실이 최소화된다 > 가장 높은 분산을 가지는 데이터의 축을 찾아 이 축으로 차원을 축소 > 이것이 PCA의 주성분

입력 데이터의 공분산 행렬이 고유벡터와 고유값으로 분해될 수 있으며, 이렇게 분해된 고유벡터를 이용해 입력 데이터를 선형 변환하는 방식이 PCA

입력 데이터 세트의 공분산 행렬을 생성
공분산 행렬의 고유벡터와 고유값을 계산
고유값이 가장 큰 순으로 K개(PCA변환 차수)만큼 고유벡터를 추출
고유값이 가장 큰 순으로 추출된 고유벡터를 이용해 새롭게 입력 데이터를 변환.

In [1]:

from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 사이킷런 내장 데이터 셋 API 호출
iris = load_iris()

# 넘파이 데이터 셋을 Pandas DataFrame으로 변환
columns = ['sepal_length','sepal_width','petal_length','petal_width']
irisDF = pd.DataFrame(iris.data , columns=columns)
irisDF['target']=iris.target
irisDF.head(3)

Out[1]:

	sepal_length	sepal_width	petal_length	petal_width
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2

In [2]:

# One marker per class: triangle = setosa, square = versicolor, circle = virginica.
markers = ['^', 's', 'o']

# Target codes are 0 (setosa), 1 (versicolor), 2 (virginica); each class is
# drawn separately so it gets its own marker shape and legend entry.
for target_code, marker in enumerate(markers):
    class_rows = irisDF['target'] == target_code
    plt.scatter(
        irisDF.loc[class_rows, 'sepal_length'],
        irisDF.loc[class_rows, 'sepal_width'],
        marker=marker,
        label=iris.target_names[target_code],
    )

plt.legend()
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.show()

In [3]:

# PCA combines the values of several attributes, so it is sensitive to their
# scale. Standardize every attribute to a common scale before compressing.

from sklearn.preprocessing import StandardScaler

# Every column except the last one (`target`) is a feature.
iris_features = irisDF.iloc[:, :-1]
iris_scaled = StandardScaler().fit_transform(iris_features)

In [4]:

from sklearn.decomposition import PCA

# n_components is the number of dimensions kept after the PCA transform.
# fit() returns the fitted estimator, so construction and fitting are chained.
pca = PCA(n_components=2).fit(iris_scaled)

# Project the scaled data onto the fitted components and confirm the shape.
iris_pca = pca.transform(iris_scaled)
print(iris_pca.shape)

(150, 2)

In [5]:

# Name the two PCA output columns pca_component_1 and pca_component_2, and
# attach the class labels so the projection can be inspected per class.
pca_columns = ['pca_component_1', 'pca_component_2']
irisDF_pca = pd.DataFrame(data=iris_pca, columns=pca_columns).assign(target=iris.target)
irisDF_pca.head(3)

Out[5]:

	pca_component_1	pca_component_2
0	-2.264703	0.480027
1	-2.080961	-0.674134
2	-2.364229	-0.341908

In [6]:

# One marker per class: triangle = setosa, square = versicolor, circle = virginica.
markers = ['^', 's', 'o']

# Scatter plot with pca_component_1 on the x axis and pca_component_2 on the
# y axis, one series per target code.
for target_code, marker in enumerate(markers):
    class_rows = irisDF_pca['target'] == target_code
    plt.scatter(
        irisDF_pca.loc[class_rows, 'pca_component_1'],
        irisDF_pca.loc[class_rows, 'pca_component_2'],
        marker=marker,
        label=iris.target_names[target_code],
    )

plt.legend()
plt.xlabel('pca_component_1')
plt.ylabel('pca_component_2')
plt.show()

In [7]:

# Show the fraction of the total variance explained by each PCA component

print(pca.explained_variance_ratio_)

[0.72962445 0.22850762]

In [9]:

# Baseline: classify the original (untransformed) iris features with a
# random forest and evaluate it with 3-fold cross-validation.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

rcf = RandomForestClassifier(random_state=156)
scores = cross_val_score(estimator=rcf, X=iris.data, y=iris.target, scoring='accuracy', cv=3)
print("원본 데이터 교차 검증 개별 정확도 : ", scores)
print("원본 데이터 교차 검증 평균 정확도 : ", scores.mean())

원본 데이터 교차 검증 개별 정확도 :  [0.98 0.94 0.96]
원본 데이터 교차 검증 평균 정확도 :  0.96

In [10]:

# Same classifier and 3-fold CV as the baseline, but trained only on the two
# PCA components.
# NOTE(review): the scaler and the PCA were fitted on the full dataset before
# the folds are split, so validation rows influenced the transform. Consider
# wrapping scaler + PCA + classifier in a Pipeline if a leak-free estimate is
# needed.
pca_feature_names = ['pca_component_1', 'pca_component_2']
pca_X = irisDF_pca[pca_feature_names]
scores_pca = cross_val_score(estimator=rcf, X=pca_X, y=iris.target, scoring='accuracy', cv=3)
print("PCA 데이터 교차 검증 개별 정확도 : ", scores_pca)
print("PCA 데이터 교차 검증 평균 정확도 : ", scores_pca.mean())

PCA 데이터 교차 검증 개별 정확도 :  [0.88 0.88 0.88]
PCA 데이터 교차 검증 평균 정확도 :  0.88

신용카드 고객데이터 세트(캐글)¶

In [11]:

# Mount Google Drive at /content/drive so the data file read in the next cell
# is reachable. The google.colab package is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

In [12]:

import pandas as pd

# Read the 'Data' sheet, taking column names from the sheet's second row
# (header=1), then keep every row but drop the first column
# (presumably a row-ID column; verify against the workbook).
df = (
    pd.read_excel(
        '/content/drive/MyDrive/project/data/credit_card.xls',
        sheet_name='Data',
        header=1,
    )
    .iloc[:, 1:]
)
print(df.shape)
df.head(3)

(30000, 24)

Out[12]:

	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	default payment next month
0	20000	2	2	1	24	2	2	-1	-1	-2	-2	3913	3102	689	0	0	0	0	689	0	0	0	0	1
1	120000	2	2	2	26	-1	2	0	0	0	2	2682	1725	2682	3272	3455	3261	0	1000	1000	1000	0	2000	1
2	90000	2	2	2	34	0	0	0	0	0	0	29239	14027	13559	14331	14948	15549	1518	1500	1000	1000	1000	5000	0

In [13]:

# Normalize column names: PAY_0 becomes PAY_1 so the PAY_* columns are
# numbered consistently with PAY_2..PAY_6, and the label column
# 'default payment next month' is shortened to 'default'.
# rename() returns a new frame that is rebound to `df` rather than mutating
# it with inplace=True. Re-running the cell is harmless because rename()
# ignores keys that are no longer present.
df = df.rename(columns={'PAY_0': 'PAY_1', 'default payment next month': 'default'})

# Split into the label vector and the feature matrix.
y_target = df['default']
X_features = df.drop(columns='default')

In [14]:

# Print the column names, non-null counts and dtypes of the feature frame.
X_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 23 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   LIMIT_BAL  30000 non-null  int64
 1   SEX        30000 non-null  int64
 2   EDUCATION  30000 non-null  int64
 3   MARRIAGE   30000 non-null  int64
 4   AGE        30000 non-null  int64
 5   PAY_1      30000 non-null  int64
 6   PAY_2      30000 non-null  int64
 7   PAY_3      30000 non-null  int64
 8   PAY_4      30000 non-null  int64
 9   PAY_5      30000 non-null  int64
 10  PAY_6      30000 non-null  int64
 11  BILL_AMT1  30000 non-null  int64
 12  BILL_AMT2  30000 non-null  int64
 13  BILL_AMT3  30000 non-null  int64
 14  BILL_AMT4  30000 non-null  int64
 15  BILL_AMT5  30000 non-null  int64
 16  BILL_AMT6  30000 non-null  int64
 17  PAY_AMT1   30000 non-null  int64
 18  PAY_AMT2   30000 non-null  int64
 19  PAY_AMT3   30000 non-null  int64
 20  PAY_AMT4   30000 non-null  int64
 21  PAY_AMT5   30000 non-null  int64
 22  PAY_AMT6   30000 non-null  int64
dtypes: int64(23)
memory usage: 5.3 MB

In [15]:

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

corr = X_features.corr()
plt.figure(figsize=(14,14))
sns.heatmap(corr, annot=True, fmt='.1g')

Out[15]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f3646bbd828>

In [16]:

# BILL_AMT1 ~ BILL_AMT6 are highly correlated with each other

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Build the six attribute names BILL_AMT1 .. BILL_AMT6.
cols_bill = ['BILL_AMT{0}'.format(month) for month in range(1, 7)]
print('대상 속성명:',cols_bill)

# Standardize the six columns, then fit a 2-component PCA. Only fit() is
# needed here, because the goal is to read explained_variance_ratio_.
scaler = StandardScaler()
df_cols_scaled = scaler.fit_transform(X_features[cols_bill])
pca = PCA(n_components=2).fit(df_cols_scaled)

print('PCA Component별 변동성:', pca.explained_variance_ratio_)

대상 속성명: ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
PCA Component별 변동성: [0.90555253 0.0509867 ]

In [18]:

# Reducing the 6 BILL_AMT columns to 2 components keeps over 90% of the variance (about 0.906 from the first component, about 0.957 from both)

In [17]:

# Baseline for the credit-card data: random forest on all original features,
# evaluated with 3-fold cross-validation.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rcf = RandomForestClassifier(n_estimators=300, random_state=156)
scores = cross_val_score(estimator=rcf, X=X_features, y=y_target, scoring='accuracy', cv=3)

print('CV=3 인 경우의 개별 Fold세트별 정확도:',scores)
print('평균 정확도:{0:.4f}'.format(scores.mean()))

CV=3 인 경우의 개별 Fold세트별 정확도: [0.8083 0.8196 0.8232]
평균 정확도:0.8170

In [19]:

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the full original feature set first.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(X_features)

# Compress the features into 6 principal components, then run the same
# 3-fold cross-validated classification on the compressed data.
# NOTE(review): the scaler and the PCA are fitted on all rows before the folds
# are split, so validation rows influence the transform. Consider a Pipeline
# inside cross_val_score if a leak-free estimate is needed.
pca = PCA(n_components=6)
df_pca = pca.fit_transform(df_scaled)
scores_pca = cross_val_score(estimator=rcf, X=df_pca, y=y_target, scoring='accuracy', cv=3)

print('CV=3 인 경우의 PCA 변환된 개별 Fold세트별 정확도:',scores_pca)
print('PCA 변환 데이터 셋 평균 정확도:{0:.4f}'.format(scores_pca.mean()))

# The 23 features are reduced to 6 components

CV=3 인 경우의 PCA 변환된 개별 Fold세트별 정확도: [0.7922 0.7965 0.8049]
PCA 변환 데이터 셋 평균 정확도:0.7979

LDA(LinearDiscriminantAnalysis)¶

입력 데이터 세트를 저차원 공간에 투영해 차원을 축소하는 기법.

지도학습의 분류에서 사용하기 쉽도록 개별 클래스를 분별할 수 있는 기준을 최대한 유지하면서 차원 축소.

PCA는 입력 데이터의 변동성의 가장 큰 축을 기준 / LDA는 입력 데이터의 결정 값 클래스를 최대한으로 분리할 수 있는 축을 찾는다

즉, 클래스 간 분산은 최대한 크게 가져가고, 클래스 내부의 분산은 최대한 작게 가져간다.

클래스 내부와 클래스 간 분산 행렬을 구한다. 이 두 개의 행렬은 입력 데이터의 결정 값 클래스별로 개별 피처의 평균 벡터를 기반으로 구한다.
클래스 내부 분산 행렬과 클래스 간 분산 행렬을 고유벡터로 분해
고유값이 가장 큰 순으로 K개(LDA변환 차수만큼) 추출
고유값이 가장 큰 순으로 추출된 고유벡터를 이용해 새롭게 입력 데이터 변환

In [20]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

# Reload iris and standardize its features ahead of the LDA transform.
iris = load_iris()
iris_scaler = StandardScaler()
iris_scaled = iris_scaler.fit_transform(iris.data)

In [21]:

# Fit LDA on the standardized iris features. Unlike PCA, LDA is supervised,
# so fit() also needs the class labels.
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(iris_scaled, iris.target)

# Project the data onto the two discriminant axes and confirm the shape.
iris_lda = lda.transform(iris_scaled)
print(iris_lda.shape)

(150, 2)

In [22]:

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

lda_columns=['lda_component_1','lda_component_2']
irisDF_lda = pd.DataFrame(iris_lda,columns=lda_columns)
irisDF_lda['target']=iris.target

#setosa는 세모, versicolor는 네모, virginica는 동그라미로 표현
markers=['^', 's', 'o']

#setosa의 target 값은 0, versicolor는 1, virginica는 2. 각 target 별로 다른 shape으로 scatter plot
for i, marker in enumerate(markers):
    x_axis_data = irisDF_lda[irisDF_lda['target']==i]['lda_component_1']
    y_axis_data = irisDF_lda[irisDF_lda['target']==i]['lda_component_2']

    plt.scatter(x_axis_data, y_axis_data, marker=marker,label=iris.target_names[i])

plt.legend(loc='upper right')
plt.xlabel('lda_component_1')
plt.ylabel('lda_component_2')
plt.show()

In [ ]:

저작자표시 비영리 변경금지 (새창열림)

'머신러닝' 카테고리의 다른 글

[파이썬 머신러닝] 7장. K-Means (0)	2021.02.07
[파이썬 머신러닝] 6장.차원 축소 - SVD, NMF (0)	2021.02.01
[파이썬 머신러닝] 5장. 회귀 - 캐글 주택 가격 (1)	2021.01.28
[파이썬 머신러닝] 5장. 회귀 - 자전거 대여 수요 예측 (0)	2021.01.28
[파이썬 머신러닝] 5장. 로지스틱 회귀, 회귀 트리 (0)	2021.01.27

dlsalfkd11 코딩코딩

[파이썬 머신러닝] 6장.차원 축소 - PCA, LDA

차원 축소¶

PCA(주성분 분석)¶

신용카드 고객데이터 세트(캐글)¶

LDA(LinearDiscriminantAnalysis)¶

'머신러닝' 카테고리의 다른 글

티스토리툴바

[파이썬 머신러닝] 6장.차원 축소 - PCA, LDA

차원 축소¶

PCA(주성분 분석)¶

신용카드 고객데이터 세트(캐글)¶

LDA(LinearDiscriminantAnalysis)¶

'머신러닝' 카테고리의 다른 글

'머신러닝' Related Articles

티스토리툴바