5_9_회귀_자전거_대여_수요_예측

In [1]:

# Widen the notebook container to 1200px so wide outputs are easier to read.
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:1200px !important; }</style>"))

In [1]:

# Mount Google Drive at /content/drive so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

In [2]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Read the training CSV from the mounted Drive folder, then report its
# dimensions and preview the first three rows.
TRAIN_CSV_PATH = '/content/drive/MyDrive/project/data/bike_train.csv'
bike_df = pd.read_csv(TRAIN_CSV_PATH)
print(bike_df.shape)
bike_df.head(3)

(10886, 12)

Out[2]:

	datetime	season	weather	temp	atemp	humidity	casual	registered	count
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32

In [3]:

# Show column names, non-null counts and dtypes of the freshly loaded frame.
bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB

In [ ]:

# 결정값은 count '대여횟수'

데이터 클렌징 및 가공

In [4]:

# Convert the string column to a datetime column with a single vectorized
# call instead of applying pd.to_datetime to every row.
bike_df['datetime'] = pd.to_datetime(bike_df['datetime'])

# Derive year, month, day and hour features through the .dt accessor
# (vectorized replacement for the per-row lambda calls).
bike_df['year'] = bike_df['datetime'].dt.year
bike_df['month'] = bike_df['datetime'].dt.month
bike_df['day'] = bike_df['datetime'].dt.day
bike_df['hour'] = bike_df['datetime'].dt.hour
bike_df.head(3)

Out[4]:

	datetime	season	weather	temp	atemp	humidity	casual	registered	count	year	month	day	hour
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16	2011	1	1	0
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40	2011	1	1	1
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32	2011	1	1	2

In [5]:

# Re-inspect column names, non-null counts and dtypes of bike_df.
bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
 12  year        10886 non-null  int64         
 13  month       10886 non-null  int64         
 14  day         10886 non-null  int64         
 15  hour        10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(12)
memory usage: 1.3 MB

In [7]:

#    datetime    10886 non-null  datetime64[ns] : object에서 datetime으로 바뀐것 확인
# 12~15 년, 월, 일, 시간 컬럼 생긴것 확인

In [8]:

# Remove 'datetime' along with 'casual' and 'registered'; the latter two add
# up to the target 'count', so they must not be used as features.
drop_columns = ['datetime','casual','registered']
# Rebind instead of mutating in place, and ignore already-missing columns,
# so re-running this cell does not raise a KeyError.
bike_df = bike_df.drop(columns=drop_columns, errors='ignore')

로그 변환, 피처 인코딩, 모델 학습/예측/평가

In [ ]:

# 캐글에서 요구한 성능 평가 방법=RMSLE(오류 값의 로그에 대한 RMSE)
#> 사이킷런에서는 제공하지 않기 때문에 성능 평가 함수를 직접 만들어야 한다. 

In [9]:

from sklearn.metrics import mean_squared_error, mean_absolute_error

# RMSLE is computed with log1p() rather than log() to avoid NaN-type issues
# when taking the logarithm.
def rmsle(y, pred):
    """Return the root mean squared logarithmic error between y and pred.

    Both inputs are passed through np.log1p before differencing; values
    transformed with log1p can later be restored with np.expm1.
    """
    log_gap = np.log1p(y) - np.log1p(pred)
    return np.sqrt(np.mean(log_gap ** 2))

# RMSE built on top of scikit-learn's mean_squared_error().
def rmse(y,pred):
    """Return the root mean squared error between y and pred."""
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)

# Compute and print RMSLE, RMSE and MAE together.
def evaluate_regr(y,pred):
    """Print RMSLE, RMSE and MAE for the given true values and predictions.

    Parameters
    ----------
    y : true target values.
    pred : predicted values, in the same order as ``y``.

    Returns
    -------
    None. The three metrics are printed on one line, each with three
    decimal places.
    """
    rmsle_val = rmsle(y,pred)
    rmse_val = rmse(y,pred)
    # This value comes from mean_absolute_error(), so it is the MAE and is
    # labelled as such (not "MSE").
    mae_val = mean_absolute_error(y,pred)
    print('RMSLE: {0:.3f}, RMSE: {1:.3F}, MAE: {2:.3F}'.format(rmsle_val, rmse_val, mae_val))

회귀 모델을 적용하기 전에 데이터 세트에 대해서 먼저 처리해야 할 사항

결과값이 정규 분포로 되어 있는지 확인
카테고리형 피처는 회귀 모델에 넣기 전에 원-핫 인코딩으로 인코딩

In [10]:

# 먼저 회귀 예측
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import LinearRegression , Ridge , Lasso

# Separate the target column ('count') from the feature columns.
y_target = bike_df['count']
X_features = bike_df.drop(columns=['count'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.3, random_state=0
)

# Fit a plain linear regression on the raw counts and score it on the hold-out rows.
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

evaluate_regr(y_test, pred)

RMSLE: 1.165, RMSE: 140.900, MSE: 105.924

In [11]:

# 실제 값과 오류 값이 얼마나 차이 나는지 DF으로 확인


def get_top_error_data(y_test, pred, n_tops = 5):
    """Print the n_tops rows with the largest absolute prediction error.

    A comparison table is built with the actual rental count ('real_count'),
    the prediction rounded with np.round ('predicted_count') and their
    absolute difference ('diff'); rows are sorted by 'diff' in descending
    order before the leading n_tops rows are printed.
    """
    comparison = pd.DataFrame(y_test.values, columns=['real_count'])
    comparison = comparison.assign(predicted_count=np.round(pred))
    comparison = comparison.assign(
        diff=(comparison['real_count'] - comparison['predicted_count']).abs()
    )
    worst_first = comparison.sort_values('diff', ascending=False)
    print(worst_first.iloc[:n_tops])
    
# Show the five test rows with the largest absolute prediction error.
get_top_error_data(y_test,pred,n_tops=5)

      real_count  predicted_count   diff
1618         890            322.0  568.0
3151         798            241.0  557.0
966          884            327.0  557.0
412          745            194.0  551.0
2817         856            310.0  546.0

In [12]:

# The prediction errors are large.
# Check whether the target distribution is skewed; a roughly normal distribution is preferable.

y_target.hist()

Out[12]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f76561cb8d0>

In [13]:

# Apply the log1p transform to the target and plot its histogram.

y_log_transform = np.log1p(y_target)
y_log_transform.hist()

Out[13]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f7656105198>

In [14]:

# Retrain on a log-scaled target: log1p-transform the 'count' target column.
y_target_log = np.log1p(y_target)

# Re-split the data using the log-transformed target, then fit and predict.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target_log, test_size=0.3, random_state=0
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Both the hold-out target and the predictions are on the log1p scale, so
# map them back to the original scale with expm1 before scoring.
y_test_exp = np.expm1(y_test)
pred_exp = np.expm1(pred)

evaluate_regr(y_test_exp, pred_exp)

RMSLE: 1.017, RMSE: 162.594, MSE: 109.286

In [15]:

# Pair each fitted coefficient with its feature name, sort in descending
# order and draw the values as a bar plot.
coef = pd.Series(lr_reg.coef_, index=X_features.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(x=coef_sort.values, y=coef_sort.index)

Out[15]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f7655bf7518>

year의 회귀 계수가 큰 이유는?

year 피처는 연도를 뜻하므로 카테고리형 피처지만, 숫자형(2011,2012) 값으로 되어 있다.

사이킷런은 카테고리만을 위한 데이터 타입이 없으며, 모두 숫자로 변환해야 한다. 이처럼 숫자형 카테고리 값을 선형 회귀에 사용할 경우 회귀 계수를 연산할 때 이 숫자형 값에 크게 영향을 받는 경우가 발생할 수 있다.

따라서 선형 회귀에서는 이러한 피처 인코딩에 원핫 인코딩을 적용해 변환한다.

In [16]:

# One-hot encode the 'year', 'month', 'hour', 'holiday', 'workingday',
# 'season' and 'weather' features.
X_features_ohe = pd.get_dummies(X_features, columns=['year','month','hour', 'holiday',
                                              'workingday','season','weather'])

In [17]:

# Split the one-hot-encoded features and the log-transformed target into
# train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X_features_ohe, y_target_log,
                                                    test_size=0.3, random_state=0)

# Fit a model on the given train/test split and print its evaluation metrics.
def get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False):
    """Fit ``model``, predict on ``X_test`` and print the evaluation metrics.

    When ``is_expm1`` is true, both ``y_test`` and the predictions are passed
    through np.expm1 before evaluation. The model's class name is printed as
    a header, followed by the output of evaluate_regr. Nothing is returned.
    """
    model.fit(X_train, y_train)
    actual, predicted = y_test, model.predict(X_test)
    if is_expm1:
        actual, predicted = np.expm1(actual), np.expm1(predicted)
    print('###',model.__class__.__name__,'###')
    evaluate_regr(actual, predicted)

# Evaluate each linear model in turn: plain, Ridge (alpha=10) and Lasso (alpha=0.01).
lr_reg = LinearRegression()
ridge_reg = Ridge(alpha=10)
lasso_reg = Lasso(alpha=0.01)

# is_expm1=True: the target was log1p-transformed, so metrics are computed after expm1.
for model in [lr_reg, ridge_reg, lasso_reg]:
    get_model_predict(model,X_train, X_test, y_train, y_test,is_expm1=True)

### LinearRegression ###
RMSLE: 0.589, RMSE: 97.483, MSE: 63.106
### Ridge ###
RMSLE: 0.589, RMSE: 98.407, MSE: 63.648
### Lasso ###
RMSLE: 0.634, RMSE: 113.031, MSE: 72.658

In [18]:

# Re-plot the regression coefficients: keep the 20 largest after a
# descending sort and draw them as a bar plot.
coef = pd.Series(lr_reg.coef_ , index=X_features_ohe.columns)
coef_sort = coef.sort_values(ascending=False)[:20]
sns.barplot(x=coef_sort.values , y=coef_sort.index)

Out[18]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f7655bfae80>

회귀 트리를 이용한 예측

In [19]:

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


# Evaluate Random Forest, GBM, XGBoost and LightGBM regressors in turn.
# A fixed random_state (0, the same seed passed to train_test_split) keeps
# the ensemble results repeatable from run to run.
rf_reg = RandomForestRegressor(n_estimators=500, random_state=0)
gbm_reg = GradientBoostingRegressor(n_estimators=500, random_state=0)
xgb_reg = XGBRegressor(n_estimators=500, random_state=0)
lgbm_reg = LGBMRegressor(n_estimators=500, random_state=0)

for model in [rf_reg, gbm_reg, xgb_reg, lgbm_reg]:
    # The DataFrames are passed through unchanged.
    # NOTE(review): some XGBoost versions reportedly fail on DataFrame input;
    # if that happens, pass X_train.values / X_test.values instead.
    get_model_predict(model,X_train, X_test, y_train, y_test,is_expm1=True)

### RandomForestRegressor ###
RMSLE: 0.354, RMSE: 50.786, MSE: 31.423
### GradientBoostingRegressor ###
RMSLE: 0.341, RMSE: 55.851, MSE: 34.375
[13:05:39] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
### XGBRegressor ###
RMSLE: 0.346, RMSE: 56.474, MSE: 34.917
### LGBMRegressor ###
RMSLE: 0.316, RMSE: 46.473, MSE: 28.777

In [ ]:

저작자표시 비영리 변경금지 (새창열림)

'머신러닝' 카테고리의 다른 글

[파이썬 머신러닝] 6장.차원 축소 - PCA, LDA (0)	2021.02.01
[파이썬 머신러닝] 5장. 회귀 - 캐글 주택 가격 (1)	2021.01.28
[파이썬 머신러닝] 5장. 로지스틱 회귀, 회귀 트리 (0)	2021.01.27
[파이썬 머신러닝] 5장. 규제 선형 모델 - 릿지, 라쏘, 엘라스틱넷 (0)	2021.01.27
[파이썬 머신러닝] 5장. 다항회귀 (0)	2021.01.25

dlsalfkd11 코딩코딩

[파이썬 머신러닝] 5장. 회귀 - 자전거 대여 수요 예측

'머신러닝' 카테고리의 다른 글

티스토리툴바

[파이썬 머신러닝] 5장. 회귀 - 자전거 대여 수요 예측

'머신러닝' 카테고리의 다른 글

'머신러닝' Related Articles

티스토리툴바