import pandas as pd
import seaborn as sns
# Load seaborn's bundled 'titanic' dataset.
df = sns.load_dataset('titanic')
# Print the (rows, columns) shape, then preview the first two rows.
print(df.shape)
df.head(2)
(891, 15)
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
# Count the rows in each (class, survived) group; the result is indexed by
# both grouping keys.
df.groupby(['class', 'survived']).size()
class   survived
First   0            80
        1           136
Second  0            97
        1            87
Third   0           372
        1           119
dtype: int64
unstack()까지 해서 survived의 그룹들을 column으로 표현
unstack()을 호출하면 row의 멀티인덱스 중 마지막 인덱스를 column으로 펼쳐 줌
df.groupby(['class', 'survived']).size().unstack()
survived | 0 | 1 |
---|---|---|
class | ||
First | 80 | 136 |
Second | 97 | 87 |
Third | 372 | 119 |
# Build a per-class table of survival counts:
#   1. count rows per (class, survived) pair,
#   2. pivot the 'survived' index level into columns (0 and 1),
#   3. turn the 'class' index back into an ordinary column.
df_grouped = (
    df.groupby(['class', 'survived'])
    .size()
    .unstack()
    .reset_index()
)
df_grouped
survived | class | 0 | 1 |
---|---|---|---|
0 | First | 80 | 136 |
1 | Second | 97 | 87 |
2 | Third | 372 | 119 |
# Check the column names: 0 and 1 are plain integers, not strings.
df_grouped.columns
Index(['class', 0, 1], dtype='object', name='survived')
# Survival ratio per class = count in column 1 / (column 0 + column 1).
# The count columns are keyed by the integers 0 and 1, not by strings.
total_per_class = df_grouped[0] + df_grouped[1]
df_grouped['survived_ratio'] = df_grouped[1] / total_per_class
df_grouped
survived | class | 0 | 1 | survived_ratio |
---|---|---|---|---|
0 | First | 80 | 136 | 0.629630 |
1 | Second | 97 | 87 | 0.472826 |
2 | Third | 372 | 119 | 0.242363 |
# Sort if needed: ascending by survival ratio. The sorted result is only
# displayed here; it is not assigned back to df_grouped.
df_grouped.sort_values(by='survived_ratio', ascending=True)
survived | class | 0 | 1 | survived_ratio |
---|---|---|---|---|
2 | Third | 372 | 119 | 0.242363 |
1 | Second | 97 | 87 | 0.472826 |
0 | First | 80 | 136 | 0.629630 |