In [1]:
# Widen the notebook's main container to 1200px by injecting a CSS override.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:1200px !important; }</style>"))
In [1]:
import numpy as np
import pandas as pd
In [2]:
# Load the Titanic training data from a relative CSV path into a DataFrame.
titanic_df = pd.read_csv('./data/titanic/train.csv')
# Show the first three rows as a quick sanity check of the loaded columns.
titanic_df.head(3)
Out[2]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
In [3]:
# Confirm that read_csv produced a pandas DataFrame.
print('titanic 변수 type : ', type(titanic_df))
titanic 변수 type : <class 'pandas.core.frame.DataFrame'>
In [4]:
# shape is a (rows, columns) tuple.
print('titanic DataFrame 크기 : ', titanic_df.shape)
titanic DataFrame 크기 : (891, 12)
In [5]:
# Print a per-column summary: non-null counts and dtypes, plus index range and memory usage.
titanic_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
titanic_df.describe()
Out[6]:
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
In [7]:
# Count the number of rows for each distinct Pclass value.
value_counts = titanic_df['Pclass'].value_counts()
print(value_counts)
3 491 1 216 2 184 Name: Pclass, dtype: int64
In [8]:
# Selecting a single column with [] returns a Series, not a DataFrame.
titanic_pclass = titanic_df['Pclass']
print(type(titanic_pclass))
<class 'pandas.core.series.Series'>
In [9]:
# First five values of the Pclass Series, shown together with their row index.
titanic_pclass.head(5)
Out[9]:
0 3 1 1 2 3 3 1 4 3 Name: Pclass, dtype: int64
In [10]:
# value_counts() itself returns a Series: the distinct Pclass values form the index
# and the counts are the values.
print(value_counts)
print(type(value_counts))
3 491 1 216 2 184 Name: Pclass, dtype: int64 <class 'pandas.core.series.Series'>
In [11]:
import numpy as np
In [12]:
# A flat list of three values and a single column label, reused by the following cells.
list1 = [1, 2, 3]
col1_name = ['컬럼1']
# Converting the flat list yields a 1-D ndarray.
array1 = np.array(list1)
print('array1 shape : ', array1.shape)
array1 shape : (3,)
In [13]:
# Build a DataFrame from the 1-D Python list, using col1_name as the single column label.
df_list1 = pd.DataFrame(list1, columns=col1_name)
print('1차원 리스트로 만든 DataFrame : \n', df_list1)
1차원 리스트로 만든 DataFrame : 컬럼1 0 1 1 2 2 3
In [14]:
# Build a DataFrame from the 1-D NumPy ndarray, using col1_name as the single column label.
df_array1 = pd.DataFrame(array1, columns=col1_name)
print('1차원 ndarray로 만든 DataFrame : \n', df_array1)
1차원 ndarray로 만든 DataFrame : 컬럼1 0 1 1 2 2 3
In [15]:
# Three column labels are needed because every row below holds three values.
col2_name = ['컬럼1', '컬럼2', '컬럼3']
# A 2x3 nested list and the equivalent ndarray; both are turned into DataFrames in later cells.
list2 = [
    [1, 3, 5],
    [12, 14, 16],
]
array2 = np.array(list2)
print('array2 : \n', array2)
print('array2 shape : \n', array2.shape)
array2 : [[ 1 3 5] [12 14 16]] array2 shape : (2, 3)
In [16]:
# Build a DataFrame from the 2-D nested list; each inner list becomes one row.
df_list2 = pd.DataFrame(list2, columns=col2_name)
print('2차원 list 로 만든 DataFrame : \n', df_list2)
2차원 list 로 만든 DataFrame : 컬럼1 컬럼2 컬럼3 0 1 3 5 1 12 14 16
In [17]:
# Build a DataFrame from the 2-D ndarray; each array row becomes one DataFrame row.
df_array2 = pd.DataFrame(array2, columns=col2_name)
print('2차원 ndarray로 만든 DataFrame : \n', df_array2)
2차원 ndarray로 만든 DataFrame : 컬럼1 컬럼2 컬럼3 0 1 3 5 1 12 14 16
In [18]:
# Build a DataFrame from a dictionary.
# Each key becomes a column name and each value (a list here) becomes that column's data.
# The mapping is named `dict1` rather than `dict` so that the built-in `dict` type is not
# shadowed for the rest of the kernel session.
dict1 = {'컬럼1':[1, 14],'컬럼2':[2, 15],'컬럼3':[3,16]}
df_dict1 = pd.DataFrame(dict1)
print('딕셔너리로 만든 DataFrame : \n', df_dict1)
딕셔너리로 만든 DataFrame : 컬럼1 컬럼2 컬럼3 0 1 2 3 1 14 15 16
In [19]:
# Extract the DataFrame's data as a NumPy ndarray through the `.values` attribute,
# then report its type and shape before printing the array itself.
array3 = df_dict1.values
print(
    'df_dict1.values 타입 : ', type(array3),
    'df_dict1.values 의 shape : ', array3.shape,
)
print(array3)
df_dict1.values 타입 : <class 'numpy.ndarray'> df_dict1.values 의 shape : (2, 3) [[ 1 2 3] [14 15 16]]
In [20]:
# Convert the DataFrame's values to a nested Python list (one inner list per row).
list3 = df_dict1.values.tolist()
print('df_dict1.values.tolist() 의 타입: ', type(list3) )
# Convert the DataFrame to a dictionary; with the 'list' orient each column name
# maps to a list of that column's values.
dict3 = df_dict1.to_dict('list')
print('\n df_dict1.to_dict() 의 타입' , type(dict3))
print(dict3)
df_dict1.values.tolist() 의 타입: <class 'list'> df_dict1.to_dict() 의 타입 <class 'dict'> {'컬럼1': [1, 14], '컬럼2': [2, 15], '컬럼3': [3, 16]}
In [21]:
# Create a new column by assigning a scalar; the value 0 is applied to every row.
titanic_df['Age_0']=0
titanic_df.head(3)
Out[21]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 |
In [22]:
# Derive new columns from existing ones with element-wise Series arithmetic.
# Age_by_10: Age multiplied by 10 (rows with a missing Age stay missing).
titanic_df['Age_by_10'] = titanic_df['Age'].mul(10)
# Family_No: SibSp + Parch + 1 (presumably the +1 counts the passenger themself).
titanic_df['Family_No'] = titanic_df['SibSp'].add(titanic_df['Parch']).add(1)
titanic_df.head(4)
Out[22]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_0 | Age_by_10 | Family_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 | 220.0 | 2 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 380.0 | 2 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 260.0 | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 | 350.0 | 2 |
In [23]:
# Overwrite the existing 'Age_by_10' column by assigning to the same column name.
# The new value is derived from 'Age' (Age * 10 + 100) rather than from the current
# contents of 'Age_by_10', so re-running this cell does not add another 100 each time.
titanic_df['Age_by_10'] = titanic_df['Age']*10 + 100
titanic_df.head(4)
Out[23]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_0 | Age_by_10 | Family_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 | 320.0 | 2 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 480.0 | 2 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 360.0 | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 | 450.0 | 2 |
In [24]:
# drop() returns a new DataFrame without the 'Age_0' column; titanic_df itself is left unchanged.
titanic_drop_df = titanic_df.drop(columns='Age_0')
titanic_drop_df.head(3)
Out[24]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_by_10 | Family_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 320.0 | 2 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 480.0 | 2 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 360.0 | 1 |
In [25]:
# The original DataFrame still contains 'Age_0': the earlier drop() call returned a new frame.
titanic_df.head(3)
Out[25]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_0 | Age_by_10 | Family_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 | 320.0 | 2 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 480.0 | 2 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 360.0 | 1 |
In [26]:
# Disabled example of dropping several columns in place (inplace=True would modify titanic_df itself).
# drop_result = titanic_df.drop(['Age_0','Age_by_10','Family_No'], axis=1, inplace=True)
# Because the call above is commented out, the three columns are still present in the preview.
titanic_df.head(3)
Out[26]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_0 | Age_by_10 | Family_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 | 320.0 | 2 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 480.0 | 2 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 360.0 | 1 |
In [27]:
# reset_index() with inplace=False returns a new DataFrame in which the old index
# is kept as a regular column named 'index'.
titanic_df_reset = titanic_df.reset_index(inplace=False)
titanic_df_reset.head(3)
Out[27]:
index | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_0 | Age_by_10 | Family_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 | 320.0 | 2 |
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 480.0 | 2 |
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 360.0 | 1 |
In [ ]:
## df.iloc[number, number]
## df.loc[label (or number), label (or number)]
In [28]:
## 불린인덱싱
titanic_df[
(titanic_df['Age']>60) & (titanic_df['Pclass']==1)&(titanic_df['Sex']=='female')
]
Out[28]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_0 | Age_by_10 | Family_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
275 | 276 | 1 | 1 | Andrews, Miss. Kornelia Theodosia | female | 63.0 | 1 | 0 | 13502 | 77.9583 | D7 | S | 0 | 730.0 | 2 |
829 | 830 | 1 | 1 | Stone, Mrs. George Nelson (Martha Evelyn) | female | 62.0 | 0 | 0 | 113572 | 80.0000 | B28 | NaN | 0 | 720.0 | 1 |
In [29]:
# Same kind of filter, with each condition stored in its own boolean Series first
# and then combined with & (element-wise AND).
cond1 = titanic_df['Age']>60
cond2 = titanic_df['Pclass']==1
cond3 = titanic_df['Sex']=='female'
titanic_df[cond1&cond2&cond3]
Out[29]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_0 | Age_by_10 | Family_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
275 | 276 | 1 | 1 | Andrews, Miss. Kornelia Theodosia | female | 63.0 | 1 | 0 | 13502 | 77.9583 | D7 | S | 0 | 730.0 | 2 |
829 | 830 | 1 | 1 | Stone, Mrs. George Nelson (Martha Evelyn) | female | 62.0 | 0 | 0 | 113572 | 80.0000 | B28 | NaN | 0 | 720.0 | 1 |
In [35]:
# groupby() on its own returns a DataFrameGroupBy object; no aggregation has been applied yet.
titanic_groupby = titanic_df.groupby('Pclass')
titanic_groupby
Out[35]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F35919AB08>
In [36]:
## Build a {'column': 'aggregation function'} dictionary and pass it to agg() after groupby.
# Here: the maximum Age and the mean Fare for each Pclass.
agg_form={'Age':'max','Fare':'mean'}
titanic_df.groupby('Pclass').agg(agg_form)
Out[36]:
Age | Fare | |
---|---|---|
Pclass | ||
1 | 80.0 | 84.154687 |
2 | 70.0 | 20.662183 |
3 | 74.0 | 13.675550 |
In [ ]:
In [30]:
# Count the non-null values of every other column within each Pclass group.
titanic_groupby = titanic_df.groupby('Pclass').count()
titanic_groupby
Out[30]:
PassengerId | Survived | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_0 | Age_by_10 | Family_No | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Pclass | ||||||||||||||
1 | 216 | 216 | 216 | 216 | 186 | 216 | 216 | 216 | 216 | 176 | 214 | 216 | 186 | 216 |
2 | 184 | 184 | 184 | 184 | 173 | 184 | 184 | 184 | 184 | 16 | 184 | 184 | 173 | 184 |
3 | 491 | 491 | 491 | 491 | 355 | 491 | 491 | 491 | 491 | 12 | 491 | 491 | 355 | 491 |
In [31]:
# Restrict the per-Pclass count to the 'PassengerId' and 'Survived' columns.
titanic_groupby = titanic_df.groupby('Pclass')[['PassengerId','Survived']].count()
In [32]:
# Display the per-Pclass counts computed in the previous cell.
titanic_groupby
Out[32]:
PassengerId | Survived | |
---|---|---|
Pclass | ||
1 | 216 | 216 |
2 | 184 | 184 |
3 | 491 | 491 |
In [34]:
# Selecting with a list of one column ([['PassengerId']]) keeps the result a DataFrame.
titanic_groupby = titanic_df.groupby('Pclass')[['PassengerId']].count()
titanic_groupby
Out[34]:
PassengerId | |
---|---|
Pclass | |
1 | 216 |
2 | 184 |
3 | 491 |
In [37]:
## Handling missing values
# .isna().sum() > find the number of missing values per column
# titanic_df['column'] = titanic_df['column'].fillna(replacement)
# titanic_df['column'].fillna(replacement, inplace=True)
# NOTE(review): the column-level inplace form may not update the parent frame in recent
# pandas versions (chained assignment) - verify before relying on it.
# Fill missing Age values with the column mean and missing Embarked values with 'S'.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')
# 'Age_by_10' still reports missing values here because it was computed from 'Age'
# in an earlier cell, before the missing ages were filled.
titanic_df.isna().sum()
Out[37]:
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 0 Age_0 0 Age_by_10 177 Family_No 0 dtype: int64
In [39]:
# Store the character length of each passenger name in a new 'Name_len' column.
titanic_df['Name_len'] = titanic_df['Name'].apply(len)
titanic_df[['Name', 'Name_len']].head(3)
Out[39]:
Name | Name_len | |
---|---|---|
0 | Braund, Mr. Owen Harris | 23 |
1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 51 |
2 | Heikkinen, Miss. Laina | 22 |
In [41]:
# Label each passenger with apply + a conditional lambda: ages up to and including 15
# become 'Child', every other value becomes 'Adult'.
titanic_df['Child_Adult'] = titanic_df['Age'].apply(lambda x : 'Child' if x<=15 else 'Adult')
titanic_df[['Name','Age','Child_Adult']].head(6)
Out[41]:
Name | Age | Child_Adult | |
---|---|---|---|
0 | Braund, Mr. Owen Harris | 22.000000 | Adult |
1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.000000 | Adult |
2 | Heikkinen, Miss. Laina | 26.000000 | Adult |
3 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.000000 | Adult |
4 | Allen, Mr. William Henry | 35.000000 | Adult |
5 | Moran, Mr. James | 29.699118 | Adult |
'코드잇' 카테고리의 다른 글
[코드잇] 대학교 강의실 배정하기1 (0) | 2021.02.03 |
---|---|
[코드잇] 대학교 수강신청 준비하기 (0) | 2021.02.03 |
[코드잇 & 머신러닝] 넘파이 (0) | 2021.02.02 |
[코드잇] 숫자 야구 (0) | 2021.02.01 |
[코드잇] 로또 시뮬레이션 (0) | 2021.02.01 |