DataFrame의 각 열이 어떤 데이터들을 가지고 있는지 간편하게 살펴보자.
- 아래의 `inspect_columns()` 함수로 각 열의 데이터를 확인할 수 있음
def inspect_columns(df):
df_describe = df.describe()
df_result = pd.DataFrame(
data={
'dtype': df.dtypes,
'unique': (df.nunique() == len(df)) + 0, # 0: False, 1: True
'nunique': df.nunique().map('{:,d}'.format),
'null_count': df.isna().sum().map('{:,d}'.format),
'null_pct': ((df.isnull().sum() / len(df)) * 100).map('{:,.2f}'.format),
'|': '▮',
'min': df_describe.loc['min'].map('{:,.2f}'.format),
'mean': df_describe.loc['mean'].map('{:,.2f}'.format),
'median': df_describe.loc['50%'].map('{:,.2f}'.format),
'max': df_describe.loc['max'].map('{:,.2f}'.format),
'std': df_describe.loc['std'].map('{:,.2f}'.format),
'||': '▮',
'1st_row': df.iloc[0],
'random_row': df.iloc[np.random.randint(low=0, high=len(df))],
'last_row': df.iloc[-1],
},
index=df.columns,
)
return df_result
import numpy as np
import pandas as pd
import seaborn as sns
def inspect_columns(df):
df_describe = df.describe()
df_result = pd.DataFrame(
data={
'dtype': df.dtypes,
'unique': (df.nunique() == len(df)) + 0, # 0: False, 1: True
'nunique': df.nunique().map('{:,d}'.format),
'null_count': df.isna().sum().map('{:,d}'.format),
'null_pct': ((df.isnull().sum() / len(df)) * 100).map('{:,.2f}'.format),
'|': '▮',
'min': df_describe.loc['min'].map('{:,.2f}'.format),
'mean': df_describe.loc['mean'].map('{:,.2f}'.format),
'median': df_describe.loc['50%'].map('{:,.2f}'.format),
'max': df_describe.loc['max'].map('{:,.2f}'.format),
'std': df_describe.loc['std'].map('{:,.2f}'.format),
'||': '▮',
'1st_row': df.iloc[0],
'random_row': df.iloc[np.random.randint(low=0, high=len(df))],
'last_row': df.iloc[-1],
},
index=df.columns,
)
return df_result
# Load seaborn's 'titanic' example dataset into a DataFrame.
df_titanic = sns.load_dataset('titanic')
# Print the (rows, columns) shape of the loaded frame.
print(df_titanic.shape)
# Preview the first rows; kept as the trailing bare expression so a notebook renders it.
df_titanic.head()
(891, 15)
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone |
---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
---|
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
---|
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
---|
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
---|
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
---|
# Build the per-column overview table for the Titanic DataFrame.
inspect_columns(df_titanic)
| dtype | unique | nunique | null_count | null_pct | | | min | mean | median | max | std | || | 1st_row | random_row | last_row |
---|
survived | int64 | 0 | 2 | 0 | 0.00 | ▮ | 0.00 | 0.38 | 0.00 | 1.00 | 0.49 | ▮ | 0 | 0 | 0 |
---|
pclass | int64 | 0 | 3 | 0 | 0.00 | ▮ | 1.00 | 2.31 | 3.00 | 3.00 | 0.84 | ▮ | 3 | 3 | 3 |
---|
sex | object | 0 | 2 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | male | male | male |
---|
age | float64 | 0 | 88 | 177 | 19.87 | ▮ | 0.42 | 29.70 | 28.00 | 80.00 | 14.53 | ▮ | 22.0 | 27.0 | 32.0 |
---|
sibsp | int64 | 0 | 7 | 0 | 0.00 | ▮ | 0.00 | 0.52 | 0.00 | 8.00 | 1.10 | ▮ | 1 | 0 | 0 |
---|
parch | int64 | 0 | 7 | 0 | 0.00 | ▮ | 0.00 | 0.38 | 0.00 | 6.00 | 0.81 | ▮ | 0 | 0 | 0 |
---|
fare | float64 | 0 | 248 | 0 | 0.00 | ▮ | 0.00 | 32.20 | 14.45 | 512.33 | 49.69 | ▮ | 7.25 | 7.8958 | 7.75 |
---|
embarked | object | 0 | 3 | 2 | 0.22 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | S | S | Q |
---|
class | category | 0 | 3 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | Third | Third | Third |
---|
who | object | 0 | 3 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | man | man | man |
---|
adult_male | bool | 0 | 2 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | True | True | True |
---|
deck | category | 0 | 7 | 688 | 77.22 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | NaN | NaN | NaN |
---|
embark_town | object | 0 | 3 | 2 | 0.22 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | Southampton | Southampton | Queenstown |
---|
alive | object | 0 | 2 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | no | no | no |
---|
alone | bool | 0 | 2 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | False | True | True |
---|
# Load seaborn's 'iris' example dataset into a DataFrame.
df_iris = sns.load_dataset('iris')
# Print the (rows, columns) shape of the loaded frame.
print(df_iris.shape)
# Preview the first rows; kept as the trailing bare expression so a notebook renders it.
df_iris.head()
(150, 5)
| sepal_length | sepal_width | petal_length | petal_width | species |
---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
---|
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
---|
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
---|
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
---|
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
---|
| dtype | unique | nunique | null_count | null_pct | | | min | mean | median | max | std | || | 1st_row | random_row | last_row |
---|
sepal_length | float64 | 0 | 35 | 0 | 0.00 | ▮ | 4.30 | 5.84 | 5.80 | 7.90 | 0.83 | ▮ | 5.1 | 6.8 | 5.9 |
---|
sepal_width | float64 | 0 | 23 | 0 | 0.00 | ▮ | 2.00 | 3.06 | 3.00 | 4.40 | 0.44 | ▮ | 3.5 | 3.0 | 3.0 |
---|
petal_length | float64 | 0 | 43 | 0 | 0.00 | ▮ | 1.00 | 3.76 | 4.35 | 6.90 | 1.77 | ▮ | 1.4 | 5.5 | 5.1 |
---|
petal_width | float64 | 0 | 22 | 0 | 0.00 | ▮ | 0.10 | 1.20 | 1.30 | 2.50 | 0.76 | ▮ | 0.2 | 2.1 | 1.8 |
---|
species | object | 0 | 3 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | setosa | virginica | virginica |
---|
# Load seaborn's 'diamonds' example dataset into a DataFrame.
df_diamonds = sns.load_dataset('diamonds')
# Print the (rows, columns) shape of the loaded frame.
print(df_diamonds.shape)
# Preview the first rows; kept as the trailing bare expression so a notebook renders it.
df_diamonds.head()
(53940, 10)
| carat | cut | color | clarity | depth | table | price | x | y | z |
---|
0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
---|
1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
---|
2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
---|
3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
---|
4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
---|
# Build the per-column overview table for the diamonds DataFrame.
inspect_columns(df_diamonds)
| dtype | unique | nunique | null_count | null_pct | | | min | mean | median | max | std | || | 1st_row | random_row | last_row |
---|
carat | float64 | 0 | 273 | 0 | 0.00 | ▮ | 0.20 | 0.80 | 0.70 | 5.01 | 0.47 | ▮ | 0.23 | 0.71 | 0.75 |
---|
cut | category | 0 | 5 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | Ideal | Premium | Ideal |
---|
color | category | 0 | 7 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | E | I | D |
---|
clarity | category | 0 | 8 | 0 | 0.00 | ▮ | NaN | NaN | NaN | NaN | NaN | ▮ | SI2 | VS2 | SI2 |
---|
depth | float64 | 0 | 184 | 0 | 0.00 | ▮ | 43.00 | 61.75 | 61.80 | 79.00 | 1.43 | ▮ | 61.5 | 59.3 | 62.2 |
---|
table | float64 | 0 | 127 | 0 | 0.00 | ▮ | 43.00 | 57.46 | 57.00 | 95.00 | 2.23 | ▮ | 55.0 | 59.0 | 55.0 |
---|
price | int64 | 0 | 11,602 | 0 | 0.00 | ▮ | 326.00 | 3,932.80 | 2,401.00 | 18,823.00 | 3,989.44 | ▮ | 326 | 2300 | 2757 |
---|
x | float64 | 0 | 554 | 0 | 0.00 | ▮ | 0.00 | 5.73 | 5.70 | 10.74 | 1.12 | ▮ | 3.95 | 5.89 | 5.83 |
---|
y | float64 | 0 | 552 | 0 | 0.00 | ▮ | 0.00 | 5.73 | 5.71 | 58.90 | 1.14 | ▮ | 3.98 | 5.81 | 5.87 |
---|
z | float64 | 0 | 375 | 0 | 0.00 | ▮ | 0.00 | 3.54 | 3.53 | 31.80 | 0.71 | ▮ | 2.43 | 3.47 | 3.64 |
---|