import pandas as pd
import seaborn as sns
# Load seaborn's 'iris' example dataset as a DataFrame.
df = sns.load_dataset('iris')

# Print the (rows, columns) shape, then display the first two rows.
print(df.shape)
df.iloc[:2]
(150, 5)
 | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
# Split into train and test sets.
from sklearn.model_selection import train_test_split

# Features: every column except the last; target: 'species', kept as a
# one-column DataFrame by the double brackets. 80% of rows go to training,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df[['species']],
    train_size=0.8,
    random_state=42,
)

# Shapes of the training pair, then of the test pair.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(120, 4) (120, 1)
(30, 4) (30, 1)
# Check the mean and standard deviation of each training column
# (before scaling), rounded to two decimals.
(
    X_train
    .agg(['mean', 'std'])
    .round(2)
)
 | sepal_length | sepal_width | petal_length | petal_width |
---|---|---|---|---|
mean | 5.81 | 3.06 | 3.73 | 1.18 |
std | 0.82 | 0.45 | 1.75 | 0.75 |
fit_transform()
from sklearn.preprocessing import StandardScaler
# Create the scaler, then fit it on the training features and scale them in
# one step; the fitted statistics are reused later for the test set.
scaler = StandardScaler()
arr_scaled = scaler.fit_transform(X_train) # standardize using X_train as the reference
# Show the first three scaled rows (the result is a NumPy array, not a DataFrame).
arr_scaled[:3]
array([[-1.47393679, 1.20365799, -1.56253475, -1.31260282], [-0.13307079, 2.99237573, -1.27600637, -1.04563275], [ 1.08589829, 0.08570939, 0.38585821, 0.28921757]])
# Wrap the scaled array in a DataFrame so it is easier to work with.
# NOTE: only the column labels are carried over, so the rows get a fresh
# 0..n-1 index rather than X_train's original row labels.
X_train_scaled = pd.DataFrame(
    data=arr_scaled,
    columns=X_train.columns,
)
X_train_scaled.iloc[:3]
 | sepal_length | sepal_width | petal_length | petal_width |
---|---|---|---|---|
0 | -1.473937 | 1.203658 | -1.562535 | -1.312603 |
1 | -0.133071 | 2.992376 | -1.276006 | -1.045633 |
2 | 1.085898 | 0.085709 | 0.385858 | 0.289218 |
# Check each column's mean and standard deviation after scaling;
# they should come out at roughly 0 and 1 respectively.
(
    X_train_scaled
    .agg(['mean', 'std'])
    .round(2)
)
 | sepal_length | sepal_width | petal_length | petal_width |
---|---|---|---|---|
mean | 0.0 | 0.0 | 0.0 | -0.0 |
std | 1.0 | 1.0 | 1.0 | 1.0 |
# Inspect the attributes the fitted scaler learned from X_train.
# The Korean text in the first two prints reads "same as the mean in Out[3]"
# and "same as the std in Out[3]".
# NOTE(review): per the scikit-learn docs StandardScaler uses the population
# standard deviation (ddof=0), whereas pandas' .std() defaults to ddof=1, so
# scale_ presumably matches the earlier std only after rounding — confirm.
print('mean_', scaler.mean_.round(2), 'Out[3] 에서의 mean과 같음')
print('scale_', scaler.scale_.round(2), 'Out[3] 에서의 std와 같음') # per-feature standard deviation
print('var_', scaler.var_.round(2))
print('feature_names_in_', scaler.feature_names_in_)
print('n_samples_seen_', scaler.n_samples_seen_)
print('n_features_in_', scaler.n_features_in_)
mean_ [5.81 3.06 3.73 1.18] Out[3] 에서의 mean과 같음
scale_ [0.82 0.45 1.75 0.75] Out[3] 에서의 std와 같음
var_ [0.67 0.2 3.05 0.56]
feature_names_in_ ['sepal_length' 'sepal_width' 'petal_length' 'petal_width']
n_samples_seen_ 120
n_features_in_ 4
# Display the first three raw (unscaled) test rows, for comparison with the
# scaled values computed next.
X_test.head(3)
 | sepal_length | sepal_width | petal_length | petal_width |
---|---|---|---|---|
73 | 6.1 | 2.8 | 4.7 | 1.2 |
18 | 5.7 | 3.8 | 1.7 | 0.3 |
118 | 7.7 | 2.6 | 6.9 | 2.3 |
# Scale the test features with the already-fitted scaler (transform only,
# no refit on the test data).
# NOTE: this rebinds arr_scaled, which previously held the scaled training array.
arr_scaled = scaler.transform(X_test)
arr_scaled[:3] # the result is a NumPy ndarray
array([[ 0.35451684, -0.58505976, 0.55777524, 0.02224751], [-0.13307079, 1.65083742, -1.16139502, -1.17911778], [ 2.30486738, -1.0322392 , 1.8185001 , 1.49058286]])
# Wrap the scaled test array in a DataFrame so it is easier to work with.
# NOTE: only the column labels are carried over, so the rows are re-indexed
# from 0 and no longer carry X_test's original row labels.
X_test_scaled = pd.DataFrame(
    data=arr_scaled,
    columns=X_test.columns,
)
X_test_scaled.iloc[:2].round(2)
 | sepal_length | sepal_width | petal_length | petal_width |
---|---|---|---|---|
0 | 0.35 | -0.59 | 0.56 | 0.02 |
1 | -0.13 | 1.65 | -1.16 | -1.18 |
# Take the first scaled test row to try the inverse transform on; the list
# indexer keeps the result a one-row DataFrame rather than a Series.
sample = X_test_scaled.iloc[[0]]
sample
 | sepal_length | sepal_width | petal_length | petal_width |
---|---|---|---|---|
0 | 0.354517 | -0.58506 | 0.557775 | 0.022248 |
# Undo the scaling to recover the original feature values.
scaler.inverse_transform(sample) # same as the first row of Out [8], i.e. the values before scaling
array([[6.1, 2.8, 4.7, 1.2]])