rangedata-7cofy5fmoNM. deal with range data, one hot encoding
# @
# Data can be divided into 'numerical data', 'text data', and 'categorical data' (called 'range data' in these notes)
# Machine learning and deep learning algorithms can only understand 'numerical data'
# So we have to convert the other kinds of data into a numerical form the computer understands
# We convert 'categorical data' into numbers with one hot encoding
# @
# Fruits
# apple
# pear
# persimmon
# apple
# Fruits fruit-apple fruit-pear fruit-persimmon
# apple 1 0 0
# pear 0 1 0
# persimmon 0 0 1
# apple 1 0 0
# @
# Download test data and train data: https://www.kaggle.com/c/titanic/data
import pandas as pd
import numpy as np
# Library versions the recorded outputs in this walkthrough were produced with.
print(pd.__version__, np.__version__, sep='\n')
# < 0.21.0
# < 1.14.0
# Load the Titanic train and test splits from the local data directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# Rows x columns of each split; train has one more column than test.
print(train.shape, test.shape, sep='\n')
# < (891, 12)
# < (418, 11)
print(train.dtypes)
# < PassengerId int64
# < Survived int64
# < Pclass int64
# < Name object
# < Sex object
# < Age float64
# < SibSp int64
# < Parch int64
# < Ticket object
# < Fare float64
# < Cabin object
# < Embarked object
# < dtype: object
print(test.dtypes)
# < PassengerId int64
# < Pclass int64
# < Name object
# < Sex object
# < Age float64
# < SibSp int64
# < Parch int64
# < Ticket object
# < Fare float64
# < Cabin object
# < Embarked object
# < dtype: object
print(train.columns)
# < Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
# < 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
# < dtype='object')
# @
# Numerical data
train.describe()
# < PassengerId Survived Pclass Age SibSp Parch Fare
# < count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
# < mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
# < std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
# < min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
# < 25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
# < 50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
# < 75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
# < max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
# Pull out the object-dtype columns of the training data.
# The categorical columns are among them, and those are the ones we encode.
# One hot encoding, TF and TF-IDF are all candidates,
# so we have to consider which encoding fits each column best.
obj_df = train.select_dtypes(include=['object'])
# Work on an independent copy so later edits cannot touch `train`.
obj_df = obj_df.copy()
obj_df.head()
# < Name Sex Ticket Cabin Embarked
# < 0 Braund, Mr. Owen Harris male A/5 21171 NaN S
# < 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female PC 17599 C85 C
# < 2 Heikkinen, Miss. Laina female STON/O2. 3101282 NaN S
# < 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 113803 C123 S
# < 4 Allen, Mr. William Henry male 373450 NaN S
# We use isnull() to show the rows that have at least one missing value
# Most of the missing values are in the Cabin column
obj_df[obj_df.isnull().any(axis=1)].head(5)
# < Name Sex Ticket Cabin Embarked
# < 0 Braund, Mr. Owen Harris male A/5 21171 NaN S
# < 2 Heikkinen, Miss. Laina female STON/O2. 3101282 NaN S
# < 4 Allen, Mr. William Henry male 373450 NaN S
# < 5 Moran, Mr. James male 330877 NaN Q
# < 7 Palsson, Master. Gosta Leonard male 349909 NaN S
# We want to check whether Cabin can be used as categorical data
# It turns out Cabin is a poor categorical feature because so many of its values are missing
obj_df["Cabin"].value_counts().head(5)
# < G6 4
# < C23 C25 C27 4
# < B96 B98 4
# < F2 3
# < F33 3
# < Name: Cabin, dtype: int64
# @
# Copy both splits before preprocessing so that the originals and the copies
# can be encoded in different ways and compared afterwards.
train_c_df = train.copy()
# The test copy must be taken from `test`; copying `train` here would make
# every later step on test_c_df silently run on training data.
test_c_df = test.copy()
train['Sex'].value_counts()
# < male 577
# < female 314
# < Name: Sex, dtype: int64
# Encode the Sex column with plain pandas: male -> 0, female -> 1.
train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
test.loc[test["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1
# Alternative spelling of the same encoding using apply:
# train['Sex'] = train['Sex'].apply(lambda s: 1 if s == 'female' else 0)
# test['Sex'] = test['Sex'].apply(lambda s: 1 if s == 'female' else 0)
train.head()
# < PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
# < 0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 NaN S
# < 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C
# < 2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
# < 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S
# < 4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 NaN S
# @
# We can also use LabelEncoder of scikit-learn to encode categorical data
# Note that LabelEncoder maps each category to one integer label (label encoding); it does not produce one hot columns
from sklearn.preprocessing import LabelEncoder
# We encode sex data into 0 and 1
def gender_to_int(data):
    """Return a copy of ``data`` with its "Sex" column label-encoded.

    A scikit-learn ``LabelEncoder`` is fitted on the two labels "male" and
    "female". LabelEncoder numbers its classes in sorted order, so the
    result is "female" -> 0 and "male" -> 1 (the opposite of the manual
    pandas encoding used elsewhere in this walkthrough).

    Args:
        data: DataFrame with a "Sex" column holding "male" / "female".

    Returns:
        A new DataFrame; the frame passed in is left unmodified.

    Raises:
        ValueError: raised by ``LabelEncoder.transform`` when the column
            contains a label the encoder was not fitted on.
    """
    le = LabelEncoder()
    le.fit(["male", "female"])
    # Encode on a copy so the caller's frame is not changed as a side effect.
    encoded = data.copy()
    encoded["Sex"] = le.transform(encoded["Sex"])
    return encoded
# Label-encode the Sex column of the copy kept in train_c_df
train_c_df = gender_to_int(train_c_df)
# Label-encode the Sex column of the copy kept in test_c_df
test_c_df = gender_to_int(test_c_df)
# In the output below male is 1 and female is 0, i.e. the reverse of the
# manual pandas encoding applied to `train` above.
train_c_df.head()
# < PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
# < 0 1 0 3 Braund, Mr. Owen Harris 1 22.0 1 0 A/5 21171 7.2500 NaN S
# < 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 PC 17599 71.2833 C85 C
# < 2 3 1 3 Heikkinen, Miss. Laina 0 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
# < 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.0 1 0 113803 53.1000 C123 S
# < 4 5 0 3 Allen, Mr. William Henry 1 35.0 0 0 373450 8.0500 NaN S
# @
train['Embarked'].value_counts()
# < S 644
# < C 168
# < Q 77
# < Name: Embarked, dtype: int64
# Manual one hot encoding: one boolean column per Embarked value (C, S, Q).
train_c_df["Embarked_C"] = train_c_df["Embarked"].eq("C")
train_c_df["Embarked_S"] = train_c_df["Embarked"].eq("S")
train_c_df["Embarked_Q"] = train_c_df["Embarked"].eq("Q")
# The copy gained three columns compared with the untouched `train`.
print(train.shape, train_c_df.shape, sep='\n')
# < (891, 12)
# < (891, 15)
train_c_df[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head(10)
# < Embarked Embarked_C Embarked_S Embarked_Q
# < 0 S False True False
# < 1 C True False False
# < 2 S False True False
# < 3 S False True False
# < 4 S False True False
# < 5 Q False False True
# < 6 S False True False
# < 7 S False True False
# < 8 S False True False
# < 9 C True False False
# @
# We can use get_dummies of pandas to perform one hot encoding
# so that the machine can understand categorical data
def dummy_data(data, columns):
    """One-hot encode the given columns of ``data`` with ``pd.get_dummies``.

    Each listed column is replaced by its indicator columns, named
    ``<column>_<value>``. The untouched columns keep their original order
    and the indicator columns are appended after them, grouped in the
    order given by ``columns``.

    Args:
        data: source DataFrame; it is not modified.
        columns: iterable of column names to encode.

    Returns:
        A new DataFrame with the encoded columns, or ``data`` itself when
        ``columns`` is empty.

    Raises:
        KeyError: if a name in ``columns`` is not a column of ``data``.
    """
    columns = list(columns)
    if not columns:
        return data
    # Build every indicator block first, then concatenate once instead of
    # re-copying the whole frame on every iteration.
    indicator_blocks = [
        pd.get_dummies(data[column], prefix=column) for column in columns
    ]
    remaining = data.drop(columns, axis=1)
    return pd.concat([remaining] + indicator_blocks, axis=1)
# Columns to one hot encode in both splits.
dummy_columns = ["Sex", "Pclass", "Embarked"]
train_dummy, test_dummy = (
    dummy_data(frame, dummy_columns) for frame in (train, test)
)
# Shapes before encoding (the printed label means "shape before one hot encoding").
print('원핫인코딩 전 shape', train.shape, test.shape, sep='\n')
# Output, after the label line:
# (891, 12)
# (418, 11)
# Shapes after encoding (the printed label means "shape after one hot encoding with get_dummies").
print('get_dummies로 원핫인코딩 후 shape', train_dummy.shape, test_dummy.shape, sep='\n')
# Output, after the label line:
# (891, 17)
# (418, 16)
train_dummy.head()
# PassengerId Survived Name Age SibSp Parch Ticket Fare Cabin Sex_0 Sex_1 Pclass_1 Pclass_2 Pclass_3 Embarked_C Embarked_Q Embarked_S
# 0 1 0 Braund, Mr. Owen Harris 22.0 1 0 A/5 21171 7.2500 NaN 1 0 0 0 1 0 0 1
# 1 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 PC 17599 71.2833 C85 0 1 1 0 0 1 0 0
# 2 3 1 Heikkinen, Miss. Laina 26.0 0 0 STON/O2. 3101282 7.9250 NaN 0 1 0 0 1 0 0 1
# 3 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 113803 53.1000 C123 0 1 1 0 0 0 0 1
# 4 5 0 Allen, Mr. William Henry 35.0 0 0 373450 8.0500 NaN 1 0 0 0 1 0 0 1
# When you build the feature set from the encoded data, it is better to drop the columns you are not going to use
def drop_not_concerned(data, columns):
    """Return ``data`` without the columns named in ``columns``.

    The input frame is left as it is; a new DataFrame is returned.
    """
    return data.drop(labels=columns, axis="columns")
# Columns that are not used as features.
not_concerned_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
# Survived exists only in the training split, so it is removed there as well
# to leave the same feature columns in X_train and X_test.
X_train = drop_not_concerned(train_dummy, not_concerned_columns).drop('Survived', axis=1)
X_test = drop_not_concerned(test_dummy, not_concerned_columns)
X_train.head()
# Age SibSp Parch Fare Sex_0 Sex_1 Pclass_1 Pclass_2 Pclass_3 Embarked_C Embarked_Q Embarked_S
# 0 22.0 1 0 7.2500 1 0 0 0 1 0 0 1
# 1 38.0 1 0 71.2833 0 1 1 0 0 1 0 0
# 2 26.0 0 0 7.9250 0 1 0 0 1 0 0 1
# 3 35.0 1 0 53.1000 0 1 1 0 0 0 0 1
# 4 35.0 0 0 8.0500 1 0 0 0 1 0 0 1