Pandas - DataFrame의 각 columns 분석을 위한 유틸리티 함수

DataFrame의 각 열이 어떤 데이터들을 가지고 있는지 간편하게 살펴보자.

/images/logo/pandas.svg
pandas 로고
  • 아래의 inspect_columns() 함수로 각 열의 데이터를 확인할 수 있음
def inspect_columns(df):
    df_describe = df.describe()
    df_result = pd.DataFrame(
        data={
            'dtype': df.dtypes,
            'unique': (df.nunique() == len(df)) + 0,  # 0: False, 1: True
            'nunique': df.nunique().map('{:,d}'.format),
            'null_count': df.isna().sum().map('{:,d}'.format),
            'null_pct': ((df.isnull().sum() / len(df)) * 100).map('{:,.2f}'.format),
            '|': '▮',
            'min': df_describe.loc['min'].map('{:,.2f}'.format),
            'mean': df_describe.loc['mean'].map('{:,.2f}'.format),
            'median': df_describe.loc['50%'].map('{:,.2f}'.format),
            'max': df_describe.loc['max'].map('{:,.2f}'.format),
            'std': df_describe.loc['std'].map('{:,.2f}'.format),
            '||': '▮',
            '1st_row': df.iloc[0],
            'random_row': df.iloc[np.random.randint(low=0, high=len(df))],
            'last_row': df.iloc[-1],
        },
        index=df.columns,
    )

    return df_result


import numpy as np
import pandas as pd
import seaborn as sns

def inspect_columns(df):
    df_describe = df.describe()
    df_result = pd.DataFrame(
        data={
            'dtype': df.dtypes,
            'unique': (df.nunique() == len(df)) + 0,  # 0: False, 1: True
            'nunique': df.nunique().map('{:,d}'.format),
            'null_count': df.isna().sum().map('{:,d}'.format),
            'null_pct': ((df.isnull().sum() / len(df)) * 100).map('{:,.2f}'.format),
            '|': '▮',
            'min': df_describe.loc['min'].map('{:,.2f}'.format),
            'mean': df_describe.loc['mean'].map('{:,.2f}'.format),
            'median': df_describe.loc['50%'].map('{:,.2f}'.format),
            'max': df_describe.loc['max'].map('{:,.2f}'.format),
            'std': df_describe.loc['std'].map('{:,.2f}'.format),
            '||': '▮',
            '1st_row': df.iloc[0],
            'random_row': df.iloc[np.random.randint(low=0, high=len(df))],
            'last_row': df.iloc[-1],
        },
        index=df.columns,
    )

    return df_result

# Load the 'titanic' example dataset through seaborn, then show its
# (rows, columns) shape and its leading rows.
df_titanic = sns.load_dataset('titanic')
print(df_titanic.shape)
df_titanic.head()
(891, 15)

   survived  pclass     sex   age  sibsp  parch     fare  embarked  class    who  adult_male  deck  embark_town  alive  alone
0         0       3    male  22.0      1      0   7.2500         S  Third    man        True   NaN  Southampton     no  False
1         1       1  female  38.0      1      0  71.2833         C  First  woman       False     C    Cherbourg    yes  False
2         1       3  female  26.0      0      0   7.9250         S  Third  woman       False   NaN  Southampton    yes   True
3         1       1  female  35.0      1      0  53.1000         S  First  woman       False     C  Southampton    yes  False
4         0       3    male  35.0      0      0   8.0500         S  Third    man        True   NaN  Southampton     no   True


# Summarise every column of the titanic frame.
inspect_columns(df_titanic)

                dtype  unique  nunique  null_count  null_pct  |   min   mean  median     max    std  ||      1st_row   random_row    last_row
survived        int64       0        2           0      0.00  ▮  0.00   0.38    0.00    1.00   0.49   ▮            0            0           0
pclass          int64       0        3           0      0.00  ▮  1.00   2.31    3.00    3.00   0.84   ▮            3            3           3
sex            object       0        2           0      0.00  ▮   NaN    NaN     NaN     NaN    NaN   ▮         male         male        male
age           float64       0       88         177     19.87  ▮  0.42  29.70   28.00   80.00  14.53   ▮         22.0         27.0        32.0
sibsp           int64       0        7           0      0.00  ▮  0.00   0.52    0.00    8.00   1.10   ▮            1            0           0
parch           int64       0        7           0      0.00  ▮  0.00   0.38    0.00    6.00   0.81   ▮            0            0           0
fare          float64       0      248           0      0.00  ▮  0.00  32.20   14.45  512.33  49.69   ▮         7.25       7.8958        7.75
embarked       object       0        3           2      0.22  ▮   NaN    NaN     NaN     NaN    NaN   ▮            S            S           Q
class        category       0        3           0      0.00  ▮   NaN    NaN     NaN     NaN    NaN   ▮        Third        Third       Third
who            object       0        3           0      0.00  ▮   NaN    NaN     NaN     NaN    NaN   ▮          man          man         man
adult_male       bool       0        2           0      0.00  ▮   NaN    NaN     NaN     NaN    NaN   ▮         True         True        True
deck         category       0        7         688     77.22  ▮   NaN    NaN     NaN     NaN    NaN   ▮          NaN          NaN         NaN
embark_town    object       0        3           2      0.22  ▮   NaN    NaN     NaN     NaN    NaN   ▮  Southampton  Southampton  Queenstown
alive          object       0        2           0      0.00  ▮   NaN    NaN     NaN     NaN    NaN   ▮           no           no          no
alone            bool       0        2           0      0.00  ▮   NaN    NaN     NaN     NaN    NaN   ▮        False         True        True

# Load the 'iris' example dataset through seaborn, then show its
# (rows, columns) shape and its leading rows.
df_iris = sns.load_dataset('iris')
print(df_iris.shape)
df_iris.head()
(150, 5)

   sepal_length  sepal_width  petal_length  petal_width  species
0           5.1          3.5           1.4          0.2   setosa
1           4.9          3.0           1.4          0.2   setosa
2           4.7          3.2           1.3          0.2   setosa
3           4.6          3.1           1.5          0.2   setosa
4           5.0          3.6           1.4          0.2   setosa


# Summarise every column of the iris frame.
inspect_columns(df_iris)

                dtype  unique  nunique  null_count  null_pct  |   min  mean  median   max   std  ||  1st_row  random_row  last_row
sepal_length  float64       0       35           0      0.00  ▮  4.30  5.84    5.80  7.90  0.83   ▮      5.1         6.8       5.9
sepal_width   float64       0       23           0      0.00  ▮  2.00  3.06    3.00  4.40  0.44   ▮      3.5         3.0       3.0
petal_length  float64       0       43           0      0.00  ▮  1.00  3.76    4.35  6.90  1.77   ▮      1.4         5.5       5.1
petal_width   float64       0       22           0      0.00  ▮  0.10  1.20    1.30  2.50  0.76   ▮      0.2         2.1       1.8
species        object       0        3           0      0.00  ▮   NaN   NaN     NaN   NaN   NaN   ▮   setosa   virginica  virginica

# Load the 'diamonds' example dataset through seaborn, then show its
# (rows, columns) shape and its leading rows.
df_diamonds = sns.load_dataset('diamonds')
print(df_diamonds.shape)
df_diamonds.head()
(53940, 10)

   carat      cut  color  clarity  depth  table  price     x     y     z
0   0.23    Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75


# Summarise every column of the diamonds frame.
inspect_columns(df_diamonds)

            dtype  unique  nunique  null_count  null_pct  |     min      mean    median        max       std  ||  1st_row  random_row  last_row
carat     float64       0      273           0      0.00  ▮    0.20      0.80      0.70       5.01      0.47   ▮     0.23        0.71      0.75
cut      category       0        5           0      0.00  ▮     NaN       NaN       NaN        NaN       NaN   ▮    Ideal     Premium     Ideal
color    category       0        7           0      0.00  ▮     NaN       NaN       NaN        NaN       NaN   ▮        E           I         D
clarity  category       0        8           0      0.00  ▮     NaN       NaN       NaN        NaN       NaN   ▮      SI2         VS2       SI2
depth     float64       0      184           0      0.00  ▮   43.00     61.75     61.80      79.00      1.43   ▮     61.5        59.3      62.2
table     float64       0      127           0      0.00  ▮   43.00     57.46     57.00      95.00      2.23   ▮     55.0        59.0      55.0
price       int64       0   11,602           0      0.00  ▮  326.00  3,932.80  2,401.00  18,823.00  3,989.44   ▮      326        2300      2757
x         float64       0      554           0      0.00  ▮    0.00      5.73      5.70      10.74      1.12   ▮     3.95        5.89      5.83
y         float64       0      552           0      0.00  ▮    0.00      5.73      5.71      58.90      1.14   ▮     3.98        5.81      5.87
z         float64       0      375           0      0.00  ▮    0.00      3.54      3.53      31.80      0.71   ▮     2.43        3.47      3.64




Related Content