import pandas as pd
import seaborn as sns
# Load seaborn's 'titanic' example dataset as a DataFrame.
df = sns.load_dataset(name='titanic')
# Report the (rows, columns) shape, then preview the first two records.
print(df.shape)
df.head(n=2)
(891, 15)
| | survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
# Count the rows in every (class, survived) combination; the result is a
# Series indexed by a two-level MultiIndex.
df.groupby(by=['class', 'survived']).size()
class survived
First 0 80
1 136
Second 0 97
1 87
Third 0 372
1 119
dtype: int64
unstack()까지 적용해서 survived의 그룹들을 column으로 표현
unstack()을 호출하면 row 멀티인덱스의 마지막 레벨을 column으로 펼쳐 준다.
df.groupby(['class', 'survived']).size().unstack()
| survived | 0 | 1 |
|---|---|---|
| class | ||
| First | 80 | 136 |
| Second | 97 | 87 |
| Third | 372 | 119 |
# Build a wide table: one row per class, one column per survived value.
df_grouped = (
    df.groupby(by=['class', 'survived'])
    .size()
    .unstack(level=-1)  # pivot the innermost index level ('survived') into columns
    .reset_index()      # turn 'class' back into an ordinary column
)
df_grouped
| survived | class | 0 | 1 |
|---|---|---|---|
| 0 | First | 80 | 136 |
| 1 | Second | 97 | 87 |
| 2 | Third | 372 | 119 |
# Check the column labels: 0 and 1 are plain integers, not strings.
df_grouped.columns
Index(['class', 0, 1], dtype='object', name='survived')
# Survival ratio per class = survivors / (non-survivors + survivors).
# Columns 0 and 1 are addressed by their integer labels.
df_grouped['survived_ratio'] = df_grouped[1].div(
    df_grouped[0].add(df_grouped[1])
)
df_grouped
| survived | class | 0 | 1 | survived_ratio |
|---|---|---|---|---|
| 0 | First | 80 | 136 | 0.629630 |
| 1 | Second | 97 | 87 | 0.472826 |
| 2 | Third | 372 | 119 | 0.242363 |
# Sort if needed: order the rows from lowest to highest survival ratio
# (ascending is the default sort order).
df_grouped.sort_values('survived_ratio')
| survived | class | 0 | 1 | survived_ratio |
|---|---|---|---|---|
| 2 | Third | 372 | 119 | 0.242363 |
| 1 | Second | 97 | 87 | 0.472826 |
| 0 | First | 80 | 136 | 0.629630 |