In [1]:

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:1200px !important; }</style>"))

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

titanic_df = pd.read_csv('./data/titanic/train.csv')
titanic_df.head(3)

Out[1]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S

In [2]:

print('\n 학습 데이터 정보')
print(titanic_df.info())

 학습 데이터 정보
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None

In [3]:

# 결측치 처리
# Age = 평균값 / Cabin,Embarked =N
titanic_df['Age']=titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Cabin']=titanic_df['Cabin'].fillna('N')
titanic_df['Embarked']=titanic_df['Embarked'].fillna('N')

titanic_df.isnull().sum()

Out[3]:

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [4]:

# object(문자열) 객체 중 'Sex, Cabin, Embarked'피처들의 값 분류 확인
print('Sex 값 분포 : \n', titanic_df['Sex'].value_counts())
print('\n Cabin 값 분포 : \n', titanic_df['Cabin'].value_counts())
print('\n Embarked 값 분포 : \n', titanic_df['Embarked'].value_counts())

Sex 값 분포 : 
 male      577
female    314
Name: Sex, dtype: int64

 Cabin 값 분포 : 
 N              687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
A16              1
E34              1
C90              1
E58              1
B94              1
Name: Cabin, Length: 148, dtype: int64

 Embarked 값 분포 : 
 S    644
C    168
Q     77
N      2
Name: Embarked, dtype: int64

In [5]:

# Cabin 에 'N'(null) 값이 너무 많음 
# Cabin 에 C23 C25 C27 = 4 _ B96 B98 = 4 처럼 분류가 제대로 안되어있음-> 앞 문자가 선실 등급을 의미하므로 등급이 중요하다 판단> 앞 문자만 추출
titanic_df['Cabin'] =titanic_df['Cabin'].str[0]
print(titanic_df.head(5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500     N        S  
1      0          PC 17599  71.2833     C        C  
2      0  STON/O2. 3101282   7.9250     N        S  
3      0            113803  53.1000     C        S  
4      0            373450   8.0500     N        S

In [6]:

# 데이터 탐색
# 1. 어떤유형 승객이 생존 확률이 높았는지_성별 나이 선실등급 등
# 성별에 따른 생존자
titanic_df.groupby(['Sex','Survived'])['Survived'].count()

Out[6]:

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64

In [7]:

# 시각화
sns.barplot(x='Sex',y='Survived', data=titanic_df)

Out[7]:

<AxesSubplot:xlabel='Sex', ylabel='Survived'>

In [8]:

# 부(객실 등급별)에 따른 성별 생존자(hue='Sex')
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=titanic_df)

Out[8]:

<AxesSubplot:xlabel='Pclass', ylabel='Survived'>

In [9]:

# 나이에 따른 생존
# 나이는 범위별
# 입력값(나이)에 따라 구분 값을 반환하는 함수 설정 
def get_category(age):
    cat=''
    if age <= -1 : cat = 'Unknown'
    elif age <=5 : cat = 'Baby'
    elif age <=12 : cat = 'Child'
    elif age <=18 : cat = 'Teenager'
    elif age <=25 : cat = 'Student'
    elif age <=35 : cat = 'Young Adult'
    elif age <=60 : cat = 'Adult'
    else : cat = 'Ederly'
    
    return cat

# 막대그래프 크기 설정
plt.figure(figsize=(10,6))

# X 축 값을 순차적으로 표시
group_names = ['Unknown','Baby','Child','Teenager','Student','Young Adult','Adult','Ederly']

# lambda 식 생성 : get_category 반환값으로 지정
# get_category() 는 입력값으로 'Age'칼럼 값을 받아서 해당하는 cat 반환
titanic_df['Age_cat']=titanic_df['Age'].apply(lambda x : get_category(x))

# 시각화
sns.barplot(x='Age_cat', y='Survived', hue='Sex', data=titanic_df, order=group_names)

Out[9]:

<AxesSubplot:xlabel='Age_cat', ylabel='Survived'>

In [ ]:

# 위 시각화를 통해 Sex Age Pclass 가 생존에 영향을 준다는 것을 확인

In [11]:

# 문자열(object)컬럼('Cabin','Sex','Embarked') 피처를 숫자형으로 변환- 레이블인코딩
# 컬럼이 여러개이므로 함수를 생성
from sklearn import preprocessing

def encode_features(data_df):
    features = ['Cabin','Sex','Embarked']
    for feature in features:
        le = preprocessing.LabelEncoder() # 선언
        le = le.fit(data_df[feature]) # 피처에 대한 데이터프레임으로 인코딩생성
        data_df[feature] = le.transform(data_df[feature]) # 피처에 인코딩한것으로 변환
        
    return data_df

titanic_df = encode_features(titanic_df)
titanic_df.head(5)

Out[11]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	Age_cat
0	1	0	3	Braund, Mr. Owen Harris	1	22.0	1	A/5 21171	7.2500	7	3	Student
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	0	38.0	1	PC 17599	71.2833	2	0	Adult
2	3	1	3	Heikkinen, Miss. Laina	0	26.0	0	STON/O2. 3101282	7.9250	7	3	Young Adult
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	0	35.0	1	113803	53.1000	2	3	Young Adult
4	5	0	3	Allen, Mr. William Henry	1	35.0	0	373450	8.0500	7	3	Young Adult

In [12]:

# 위의 과정을 함수 하나로 묶기 > 데이터 전처리 과정에 대한 함수 생성

# 결측치 처리 함수
def fillna(df):    
    df['Age']=df['Age'].fillna(df['Age'].mean())
    df['Cabin']=df['Cabin'].fillna('N')
    df['Embarked']=df['Embarked'].fillna('N')
    df['Fare']=df['Fare'].fillna(0)
    
    return df

# 알고리즘에 불필요한 컬럼 제거 함수
def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    
    return df

# 레이블 인코딩
def format_features(df):
    df['Cabin'] =df['Cabin'].str[0]
    features = ['Cabin','Sex','Embarked']
    for feature in features:
        le = LabelEncoder() # 선언
        le = le.fit(df[feature]) # 피처에 대한 데이터프레임으로 인코딩생성
        df[feature] = le.transform(df[feature]) # 피처에 인코딩한것으로 변환
        
    return df

# 위 함수들을 통합
def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    
    return df

In [13]:

###### 다시 원본 데이터 가공하여 새로운 피처 데이터 세트 만들기
titanic_df = pd.read_csv('./data/titanic/train.csv')

y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)

X_titanic_df = transform_features(X_titanic_df)

In [14]:

# 가공한 학습테이터 셋 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11 )

In [15]:

# 머신러닝 알고리즘 : 의사결정트리, 랜덤포레스트, 로지스틱회귀 사용
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [16]:

# 분류 모델 클래스 생성
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression()

# 결정트리 학습/예측 평가
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print('의사결정트리 정확도: {0:.4f}'.format(accuracy_score(y_test, dt_pred)))

# 랜덤포레스트 학습/예측 평가
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print('랜덤포레스트 정확도: {0:.4f}'.format(accuracy_score(y_test, rf_pred)))

# 로지스틱회귀 학습/예측 평가
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print('로지스틱회귀 정확도: {0:.4f}'.format(accuracy_score(y_test, lr_pred)))

의사결정트리 정확도: 0.7877
랜덤포레스트 정확도: 0.8547
로지스틱회귀 정확도: 0.8492

C:\Users\LG\anaconda3\envs\pjt\lib\site-packages\sklearn\linear_model\_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

In [25]:

# 교차 검증 후 모델 평가
# 교차 검증 : KFold(), cross_val_score(), GridSearchCV()
import warnings

warnings.filterwarnings(action='ignore') 

In [26]:

# 교차 검증 : KFold()
from sklearn.model_selection import KFold


def exec_kfold(clf, folds=5):
    # 폴드 셋은 5개인 kfold 객체 생성
    kfold = KFold(n_splits=folds)
    # 폴드 수 만큼 예측된 결과를 저장하기 위한 리스트 객체 생성
    scores = []
    
    # KFold 교차 검증
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)):
        
        # X_titanic_df 데이터에서 교차 검증별 학습과 검증 데이터를 가리키는 index 생성
        X_train, X_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]
        
        # 분류 모델 학습, 예측, 정확도 계산
        clf.fit(X_train, y_train)
        
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        
        # 점수들 socre 리스트에 넣기
        scores.append(accuracy)
        
        print('교차 검증 {0} 정확도: {1:.4f}'.format(iter_count, accuracy))
        
    # 정확도 점수들 평균
    mean_score = np.mean(scores)
    print('평균 정확도: {0:.4f}'.format(mean_score))
        
# 교차검증 함수 호출
print('의사결정트리')
print(exec_kfold(dt_clf,folds=5))
print('랜덤포레스트')
print(exec_kfold(rf_clf,folds=5))
print('로지스틱회귀')
print(exec_kfold(lr_clf,folds=5))

의사결정트리
교차 검증 0 정확도: 0.7542
교차 검증 1 정확도: 0.7809
교차 검증 2 정확도: 0.7865
교차 검증 3 정확도: 0.7697
교차 검증 4 정확도: 0.8202
평균 정확도: 0.7823
None
랜덤포레스트
교차 검증 0 정확도: 0.7933
교차 검증 1 정확도: 0.8090
교차 검증 2 정확도: 0.8371
교차 검증 3 정확도: 0.7753
교차 검증 4 정확도: 0.8596
평균 정확도: 0.8148
None
로지스틱회귀
교차 검증 0 정확도: 0.8045
교차 검증 1 정확도: 0.7809
교차 검증 2 정확도: 0.7753
교차 검증 3 정확도: 0.7472
교차 검증 4 정확도: 0.8146
평균 정확도: 0.7845
None

In [27]:

# 교차 검증 : cross_val_score() > StratifiedKFold 를 이용
from sklearn.model_selection import cross_val_score

def cross_score(clf, Cv=5):
    scores = cross_val_score(clf, X_titanic_df, y_titanic_df, cv=Cv)
    for iter_count, accuracy in enumerate(scores):
        print('교차 검증 {0} 정확도: {1:.4f}'.format(iter_count, accuracy))
        
    print('평균 정확도: {0:.4f}'.format(np.mean(scores)))
    
    
# 교차검증 함수 호출
print('의사결정트리')
print(cross_score(dt_clf,Cv=5))
print('랜덤포레스트')
print(cross_score(rf_clf,Cv=5))
print('로지스틱회귀')
print(cross_score(lr_clf,Cv=5))
    

의사결정트리
교차 검증 0 정확도: 0.7430
교차 검증 1 정확도: 0.7753
교차 검증 2 정확도: 0.7921
교차 검증 3 정확도: 0.7865
교차 검증 4 정확도: 0.8427
평균 정확도: 0.7879
None
랜덤포레스트
교차 검증 0 정확도: 0.7933
교차 검증 1 정확도: 0.7978
교차 검증 2 정확도: 0.8483
교차 검증 3 정확도: 0.7640
교차 검증 4 정확도: 0.8652
평균 정확도: 0.8137
None
로지스틱회귀
교차 검증 0 정확도: 0.7989
교차 검증 1 정확도: 0.7697
교차 검증 2 정확도: 0.7809
교차 검증 3 정확도: 0.7753
교차 검증 4 정확도: 0.7978
평균 정확도: 0.7845
None

In [30]:

# 교차 검증 : GridSearchCV() 
# 참고 : https://skasha.tistory.com/82   &   https://m.blog.naver.com/PostView.nhn?blogId=gustn3964&logNo=221431933811&proxyReferer=https:%2F%2Fwww.google.com%2F 

from sklearn.model_selection import GridSearchCV

####################### 의사결정트리  참고 : https://injo.tistory.com/30
dt_parameters = {'max_depth':[2,3,5,10]
              ,'min_samples_split':[2,3,5]
              ,'min_samples_leaf':[1,5,8]}

grid_dt_clf = GridSearchCV(dt_clf, param_grid=dt_parameters, scoring='accuracy', cv=5)
grid_dt_clf.fit(X_train, y_train)

print('의사결정트리 하이퍼 파라미터 : ', grid_dt_clf.best_params_)
print('의사결정트리 최고 예측 정확도 : {0:.4f}'.format(grid_dt_clf.best_score_))
best_dt_clf = grid_dt_clf.best_estimator_

#최적으로 학습된 estimator 로 예측
dt_predictions = best_dt_clf.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)

print('의사결정트리 테스트 최종 정확도 : {0:.4f}'.format(dt_accuracy))

####################### 랜덤포레스트  참고 : https://dsbook.tistory.com/135
rf_parameters = { 'n_estimators' : [10, 100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 18],
           'min_samples_split' : [8, 16, 20]
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
grid_rf_clf = GridSearchCV(rf_clf, param_grid = rf_parameters, cv = 5, n_jobs = -1)
grid_rf_clf.fit(X_train, y_train)

print('랜덤포레스트 하이퍼 파라미터: ', grid_rf_clf.best_params_)
print('랜덤포레스트 최고 예측 정확도: {0:.4f}'.format(grid_rf_clf.best_score_))
best_rf_clf = grid_rf_clf.best_estimator_

#최적으로 학습된 estimator 로 예측
rf_predictions = best_rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

print('랜덤포레스트 테스트 최종 정확도 : {0:.4f}'.format(rf_accuracy))

####################### 로지스틱회귀  참고 :https://ariz1623.tistory.com/213
lr_parameters ={'penalty':['l2', 'l1'],
        'C':[0.01, 0.1, 1, 1, 5, 10]}

grid_lr_clf = GridSearchCV(lr_clf, param_grid=lr_parameters, scoring='accuracy', cv=5 )
grid_lr_clf.fit(X_train, y_train)
print('로지스틱회귀 하이퍼 파라미터:', grid_lr_clf.best_params_)
print('로지스틱회구 최고 예측 정확도:{0:.4f}'.format(grid_lr_clf.best_score_))
best_lr_clf = grid_lr_clf.best_estimator_

#최적으로 학습된 estimator 로 예측
lr_predictions = best_lr_clf.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)

print('로지스틱회귀 테스트 최종 정확도 : {0:.4f}'.format(lr_accuracy))

의사결정트리 하이퍼 파라미터 :  {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}
의사결정트리 최고 예측 정확도 : 0.7992
의사결정트리 테스트 최종 정확도 : 0.8715
랜덤포레스트 하이퍼 파라미터:  {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}
랜덤포레스트 최고 예측 정확도: 0.8104
랜덤포레스트 테스트 최종 정확도 : 0.8659
로지스틱회귀 하이퍼 파라미터: {'C': 5, 'penalty': 'l2'}
로지스틱회구 최고 예측 정확도:0.7837
로지스틱회귀 테스트 최종 정확도 : 0.8492

In [ ]:

저작자표시 비영리 변경금지

'머신러닝' 카테고리의 다른 글

[파이썬 머신러닝] 4장. 신용카드 사기 검출 (0)	2021.01.19
[파이썬 머신러닝] 4장. 산탄데르 고객 예측 (0)	2021.01.19
[파이썬 머신러닝] 4장. 결정트리 (0)	2021.01.14
[파이썬 머신러닝] 3장. 당뇨병 예측 (0)	2021.01.14
[파이썬 머신러닝] 2장. 사이킷런 (0)	2021.01.14

dlsalfkd11 코딩코딩

[파이썬 머신러닝] 2장. 타이타닉 생존자 예측

'머신러닝' 카테고리의 다른 글

티스토리툴바

[파이썬 머신러닝] 2장. 타이타닉 생존자 예측

'머신러닝' 카테고리의 다른 글

'머신러닝' Related Articles

티스토리툴바