In [1]:
import pandas as pd
import seaborn as sns

# Load the 'titanic' example dataset through seaborn's dataset loader.
titanic = sns.load_dataset(name='titanic')

# Print the (rows, columns) shape, then end the cell with a two-row preview
# so the notebook renders it as the cell output.
print(titanic.shape)
titanic.head(n=2)
(891, 15)
Out[1]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
In [2]:
# Columns this walkthrough works with; every row of `titanic` is kept.
columns_of_interest = ['age', 'sex', 'class', 'fare', 'survived']
df = titanic.loc[:, columns_of_interest]

# Print the shape of the subset, then preview its first rows as the cell output.
print(df.shape)
df.head()
(891, 5)
Out[2]:
age sex class fare survived
0 22.0 male Third 7.2500 0
1 38.0 female First 71.2833 1
2 26.0 female Third 7.9250 1
3 35.0 female First 53.1000 1
4 35.0 male Third 8.0500 0
In [3]:
# Inspect the distinct values of the 'class' column and how often each occurs.
class_counts = df['class'].value_counts()
class_counts
Out[3]:
Third     491
First     216
Second    184
Name: class, dtype: int64
In [4]:
# Group the rows by the value of the 'class' column.
#
# The key is passed as a scalar rather than a one-element list: with a
# length-1 list, pandas >= 2.0 yields 1-tuple keys such as ('First',) when
# iterating over the groups, and get_group() then expects a tuple as well.
# A scalar key keeps the group keys as plain labels ('First', 'Second', ...).
#
# 'class' is a categorical column, so `observed` is set explicitly.
# observed=False matches the historical default (all categories are reported)
# and avoids the FutureWarning newer pandas emits when it is left unspecified.
grouped = df.groupby('class', observed=False)
grouped
Out[4]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000241F1D1AD08>
In [5]:
# Count the number of rows that fall into each group.
rows_per_group = grouped.size()
rows_per_group
Out[5]:
class
First     216
Second    184
Third     491
dtype: int64
In [6]:
# Iterating a GroupBy object yields (key, sub-DataFrame) pairs.
# For each group, print its key and shape on separate lines,
# then render the first two rows of that group's DataFrame.
for key, group in grouped:
    print(f'key: {key}', f'shape: {group.shape}', sep='\n')
    display(group.head(n=2))
key: First
shape: (216, 5)
age sex class fare survived
1 38.0 female First 71.2833 1
3 35.0 female First 53.1000 1
key: Second
shape: (184, 5)
age sex class fare survived
9 14.0 female Second 30.0708 1
15 55.0 female Second 16.0000 1
key: Third
shape: (491, 5)
age sex class fare survived
0 22.0 male Third 7.250 0
2 26.0 female Third 7.925 1
In [7]:
# Compute the per-group mean of every numeric column.
#
# numeric_only=True is required on pandas >= 2.0: the frame also contains the
# string column 'sex', and newer pandas raises a TypeError for non-numeric
# columns instead of silently dropping them. With the flag set, the result
# contains only the numeric columns (age, fare, survived).
grouped.mean(numeric_only=True)
Out[7]:
age fare survived
class
First 38.233441 84.154687 0.629630
Second 29.877630 20.662183 0.472826
Third 25.140620 13.675550 0.242363
In [8]:
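# Other built-in pandas methods that can be applied to a GroupBy object
# - mean(), max(), min(), sum(), count(), size()
# - var(), std(), describe(), first(), last(), etc.
# Note: info() is a DataFrame method, not a GroupBy method; call it on an
# individual group, e.g. grouped.get_group('Third').info().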
In [9]:
# Pull out the sub-DataFrame for a single group by its key and preview it.
third_class = grouped.get_group('Third')
third_class.head(n=2)
Out[9]:
age sex class fare survived
0 22.0 male Third 7.250 0
2 26.0 female Third 7.925 1