In [1]:

# Widen the notebook container so wide DataFrame previews are not clipped.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:1200px !important; }</style>"))

In [1]:

import numpy as np
import pandas as pd

In [2]:

# Read the Titanic training CSV into a DataFrame and preview its first rows.
train_csv = './data/titanic/train.csv'
titanic_df = pd.read_csv(train_csv)
titanic_df.head(n=3)

Out[2]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S

In [3]:

# Show the Python type of the object returned by read_csv.
titanic_type = type(titanic_df)
print('titanic 변수 type : ', titanic_type)

titanic 변수 type :  <class 'pandas.core.frame.DataFrame'>

In [4]:

# Report the (rows, columns) shape of the loaded frame.
rows_and_cols = titanic_df.shape
print('titanic DataFrame 크기 : ', rows_and_cols)

titanic DataFrame 크기 :  (891, 12)

In [5]:

# Print each column's non-null count and dtype, plus the frame's memory usage.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

In [6]:

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
titanic_df.describe()

Out[6]:

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

In [7]:

# Count how many rows carry each distinct Pclass value.
pclass_column = titanic_df['Pclass']
value_counts = pclass_column.value_counts()
print(value_counts)

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [8]:

# Selecting a single column with [] yields a Series, not a DataFrame.
titanic_pclass = titanic_df['Pclass']
pclass_type = type(titanic_pclass)
print(pclass_type)

<class 'pandas.core.series.Series'>

In [9]:

# Preview the first five values of the Pclass Series (the index is shown on the left).
titanic_pclass.head(5)

Out[9]:

0    3
1    1
2    3
3    1
4    3
Name: Pclass, dtype: int64

In [10]:

# Show the value_counts() result and then its type (it is also a Series).
for shown in (value_counts, type(value_counts)):
    print(shown)

3    491
1    216
2    184
Name: Pclass, dtype: int64
<class 'pandas.core.series.Series'>

In [11]:

import numpy as np

In [12]:

# One column name plus matching 1-D data, held both as a list and as an ndarray.
col1_name = ['컬럼1']
list1 = [1, 2, 3]
array1 = np.asarray(list1)
print('array1 shape : ', array1.shape)

array1 shape :  (3,)

In [13]:

# Build a DataFrame from the 1-D Python list.
df_list1 = pd.DataFrame(data=list1, columns=col1_name)
print('1차원 리스트로 만든 DataFrame : \n', df_list1)

1차원 리스트로 만든 DataFrame : 
    컬럼1
0    1
1    2
2    3

In [14]:

# Build a DataFrame from the 1-D NumPy ndarray.
df_array1 = pd.DataFrame(data=array1, columns=col1_name)
print('1차원 ndarray로 만든 DataFrame : \n', df_array1)

1차원 ndarray로 만든 DataFrame : 
    컬럼1
0    1
1    2
2    3

In [15]:

# Three column names are needed for the three-column data below.
col2_name = ['컬럼1', '컬럼2', '컬럼3']

# Create a 2x3 nested list and the equivalent ndarray.
list2 = [[1, 3, 5], [12, 14, 16]]
array2 = np.asarray(list2)
print('array2 : \n', array2)
print('array2 shape : \n', array2.shape)

array2 : 
 [[ 1  3  5]
 [12 14 16]]
array2 shape : 
 (2, 3)

In [16]:

# Build a DataFrame from the 2-D nested list.
df_list2 = pd.DataFrame(data=list2, columns=col2_name)
print('2차원 list 로 만든 DataFrame : \n', df_list2)

2차원 list 로 만든 DataFrame : 
    컬럼1  컬럼2  컬럼3
0    1    3    5
1   12   14   16

In [17]:

# Build a DataFrame from the 2-D ndarray.
df_array2 = pd.DataFrame(data=array2, columns=col2_name)
print('2차원 ndarray로 만든 DataFrame : \n', df_array2)

2차원 ndarray로 만든 DataFrame : 
    컬럼1  컬럼2  컬럼3
0    1    3    5
1   12   14   16

In [18]:

# Convert a dictionary into a DataFrame:
# each key becomes a column name and each value (list/ndarray) becomes that column's data.
# The mapping is named `col_data` rather than `dict` so the built-in `dict` type
# is not shadowed for the rest of the session.
col_data = {'컬럼1': [1, 14], '컬럼2': [2, 15], '컬럼3': [3, 16]}
df_dict1 = pd.DataFrame(col_data)
print('딕셔너리로 만든 DataFrame : \n', df_dict1)

딕셔너리로 만든 DataFrame : 
    컬럼1  컬럼2  컬럼3
0    1    2    3
1   14   15   16

In [19]:

# Convert the DataFrame's values to an ndarray and report its type and shape.
array3 = df_dict1.values
values_type = type(array3)
values_shape = array3.shape
print('df_dict1.values 타입 : ', values_type, 'df_dict1.values 의 shape : ', values_shape)
print(array3)

df_dict1.values 타입 :  <class 'numpy.ndarray'> df_dict1.values 의 shape :  (2, 3)
[[ 1  2  3]
 [14 15 16]]

In [20]:

# Convert the DataFrame's values to a nested Python list.
list3 = df_dict1.values.tolist()
print('df_dict1.values.tolist() 의 타입: ', type(list3))

# Convert the DataFrame to a dictionary whose values are lists (orient='list').
dict3 = df_dict1.to_dict(orient='list')
print('\n df_dict1.to_dict() 의 타입', type(dict3))
print(dict3)

df_dict1.values.tolist() 의 타입:  <class 'list'>

 df_dict1.to_dict() 의 타입 <class 'dict'>
{'컬럼1': [1, 14], '컬럼2': [2, 15], '컬럼3': [3, 16]}

In [21]:

# Create a new column by assigning a scalar; the value is broadcast to every row.
fill_value = 0
titanic_df['Age_0'] = fill_value
titanic_df.head(n=3)

Out[21]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S

In [22]:

# Derive new columns from existing ones with element-wise Series arithmetic.
titanic_df['Age_by_10'] = titanic_df['Age'].mul(10)
titanic_df['Family_No'] = titanic_df['SibSp'].add(titanic_df['Parch']).add(1)
titanic_df.head(n=4)

Out[22]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	Age_by_10	Family_No
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S	220.0	2
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C	380.0	2
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S	260.0	1
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S	350.0	2

In [23]:

# Overwrite an existing column with updated values.
# The value is derived from 'Age' (Age * 10 + 100) rather than from the current
# 'Age_by_10', so re-running this cell does not keep adding another 100.
titanic_df['Age_by_10'] = titanic_df['Age'] * 10 + 100
titanic_df.head(4)

Out[23]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	Age_by_10	Family_No
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S	320.0	2
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C	480.0	2
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S	360.0	1
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S	450.0	2

In [24]:

# drop() returns a new frame without the 'Age_0' column.
titanic_drop_df = titanic_df.drop(columns='Age_0')
titanic_drop_df.head(n=3)

Out[24]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	Age_by_10	Family_No
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S	320.0	2
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C	480.0	2
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S	360.0	1

In [25]:

# Preview the original frame again after the non-inplace drop was stored in titanic_drop_df.
titanic_df.head(3)

Out[25]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	Age_by_10	Family_No
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S	320.0	2
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C	480.0	2
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S	360.0	1

In [26]:

# Disabled example of dropping several columns in place:
# drop_result = titanic_df.drop(['Age_0','Age_by_10','Family_No'], axis=1, inplace=True)
# (with inplace=True, drop() modifies titanic_df itself and returns None,
#  so drop_result would be None)
titanic_df.head(3)

Out[26]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	Age_by_10	Family_No
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S	320.0	2
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C	480.0	2
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S	360.0	1

In [27]:

# reset_index() returns a new frame in which the previous index becomes an 'index' column.
titanic_df_reset = titanic_df.reset_index()
titanic_df_reset.head(n=3)

Out[27]:

	index	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	Age_by_10	Family_No
0	0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S	320.0	2
1	1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C	480.0	2
2	2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S	360.0	1

In [ ]:

## df.iloc[number, number]              -> position-based indexing
## df.loc[label(number), label(number)] -> label-based indexing

In [28]:

## Boolean indexing

# Combine the three element-wise conditions into one boolean mask, then filter rows with it.
over_60_first_class_female = (
    titanic_df['Age'].gt(60)
    & titanic_df['Pclass'].eq(1)
    & titanic_df['Sex'].eq('female')
)
titanic_df[over_60_first_class_female]

Out[28]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	Age_0	Age_by_10	Family_No
275	276	1	1	Andrews, Miss. Kornelia Theodosia	female	63.0	1	0	13502	77.9583	D7	S	0	730.0	2
829	830	1	1	Stone, Mrs. George Nelson (Martha Evelyn)	female	62.0	0	0	113572	80.0000	B28	NaN	0	720.0	1

In [29]:

# Same filter as the inline boolean-indexing cell, with each condition named separately.
cond1 = titanic_df['Age'] > 60
cond2 = titanic_df['Pclass'] == 1
cond3 = titanic_df['Sex'] == 'female'

combined_mask = cond1 & cond2 & cond3
titanic_df[combined_mask]

Out[29]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	Age_0	Age_by_10	Family_No
275	276	1	1	Andrews, Miss. Kornelia Theodosia	female	63.0	1	0	13502	77.9583	D7	S	0	730.0	2
829	830	1	1	Stone, Mrs. George Nelson (Martha Evelyn)	female	62.0	0	0	113572	80.0000	B28	NaN	0	720.0	1

In [35]:

# groupby() on its own returns a DataFrameGroupBy object; nothing is aggregated yet.
titanic_groupby = titanic_df.groupby(by='Pclass')
titanic_groupby

Out[35]:

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F35919AB08>

In [36]:

## Build a {'column': 'aggregation function'} dictionary, then aggregate per group.

agg_form = {'Age': 'max', 'Fare': 'mean'}
pclass_groups = titanic_df.groupby('Pclass')
pclass_groups.agg(agg_form)

Out[36]:

	Age	Fare
Pclass
1	80.0	84.154687
2	70.0	20.662183
3	74.0	13.675550

In [ ]:

In [30]:

# Count the non-null values of every column within each Pclass group.
pclass_groups = titanic_df.groupby('Pclass')
titanic_groupby = pclass_groups.count()
titanic_groupby

Out[30]:

	PassengerId	Survived	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	Age_0	Age_by_10	Family_No
Pclass
1	216	216	216	216	186	216	216	216	216	176	214	216	186	216
2	184	184	184	184	173	184	184	184	184	16	184	184	173	184
3	491	491	491	491	355	491	491	491	491	12	491	491	355	491

In [31]:

# Count non-null values per Pclass group for only the selected columns.
counted_columns = ['PassengerId', 'Survived']
titanic_groupby = titanic_df.groupby('Pclass')[counted_columns].count()

In [32]:

# Display the most recently assigned titanic_groupby result.
titanic_groupby

Out[32]:

	PassengerId	Survived
Pclass
1	216	216
2	184	184
3	491	491

In [34]:

# Selecting with a one-element list keeps the result a DataFrame with a single column.
counted_columns = ['PassengerId']
titanic_groupby = titanic_df.groupby('Pclass')[counted_columns].count()
titanic_groupby

Out[34]:

	PassengerId
Pclass
1	216
2	184
3	491

In [37]:

## Handling missing values
# .isna().sum()  -> count the missing entries in each column
# titanic_df['column'] = titanic_df['column'].fillna(replacement)
# titanic_df['column'].fillna(replacement, inplace=True)


# Fill missing ages with the mean age and missing embarkation values with 'S'.
mean_age = titanic_df['Age'].mean()
titanic_df['Age'] = titanic_df['Age'].fillna(mean_age)
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')

# Remaining missing-value count per column.
titanic_df.isna().sum()

Out[37]:

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Age_0            0
Age_by_10      177
Family_No        0
dtype: int64

In [39]:

# Length of each passenger name, computed with the vectorized string accessor
# instead of calling a Python lambda once per row.
titanic_df['Name_len'] = titanic_df['Name'].str.len()
titanic_df[['Name', 'Name_len']].head(3)

Out[39]:

	Name	Name_len
0	Braund, Mr. Owen Harris	23
1	Cumings, Mrs. John Bradley (Florence Briggs Th...	51
2	Heikkinen, Miss. Laina	22

In [41]:

# Label each passenger 'Child' (Age <= 15) or 'Adult' with one vectorized comparison
# instead of a per-row lambda. A NaN age compares False and is therefore labelled
# 'Adult', the same result the `x<=15` lambda produced.
titanic_df['Child_Adult'] = np.where(titanic_df['Age'] <= 15, 'Child', 'Adult')
titanic_df[['Name', 'Age', 'Child_Adult']].head(6)

Out[41]:

	Name	Age	Child_Adult
0	Braund, Mr. Owen Harris	22.000000	Adult
1	Cumings, Mrs. John Bradley (Florence Briggs Th...	38.000000	Adult
2	Heikkinen, Miss. Laina	26.000000	Adult
3	Futrelle, Mrs. Jacques Heath (Lily May Peel)	35.000000	Adult
4	Allen, Mr. William Henry	35.000000	Adult
5	Moran, Mr. James	29.699118	Adult

[코드잇] 대학교 강의실 배정하기1 (0)	2021.02.03
[코드잇] 대학교 수강신청 준비하기 (0)	2021.02.03
[코드잇 & 머신러닝] 넘파이 (0)	2021.02.02
[코드잇] 숫자 야구 (0)	2021.02.01
[코드잇] 로또 시뮬레이션 (0)	2021.02.01

dlsalfkd11 코딩코딩

[코드잇 & 머신러닝] 판다스

'코드잇' 카테고리의 다른 글

티스토리툴바

[코드잇 & 머신러닝] 판다스

'코드잇' 카테고리의 다른 글

'코드잇' Related Articles

티스토리툴바