# rangedata-7cofy5fmoNM. Dealing with range (categorical) data: one-hot encoding
# @
# Data can be divided into 'numerical data', 'text data', and 'range data' (categorical data)
# Machine learning and deep learning algorithms can only understand 'numerical data'
# So we should convert the data into a form that the computer understands
# We convert 'range data' into a one-hot encoding
# @
# Fruits
# apple
# pear
# persimmon
# apple
# Fruits          fruit-apple     fruit-pear     fruit-persimmon
# apple           1               0               0
# pear            0               1               0
# persimmon       0               0               1
# apple           1               0               0
# @
# Download test data and train data: https://www.kaggle.com/c/titanic/data
import pandas as pd
import numpy as np
print(pd.__version__)
# < 0.21.0
print(np.__version__)
# < 1.14.0
# Load the Kaggle Titanic CSV files. Both paths are relative to the current
# working directory, so data/train.csv and data/test.csv must exist there.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print(train.shape)
# < (891, 12)
print(test.shape)
# < (418, 11)
print(train.dtypes)
# < PassengerId      int64
# < Survived         int64
# < Pclass           int64
# < Name            object
# < Sex             object
# < Age            float64
# < SibSp            int64
# < Parch            int64
# < Ticket          object
# < Fare           float64
# < Cabin           object
# < Embarked        object
# < dtype: object
print(test.dtypes)
# < PassengerId      int64
# < Pclass           int64
# < Name            object
# < Sex             object
# < Age            float64
# < SibSp            int64
# < Parch            int64
# < Ticket          object
# < Fare           float64
# < Cabin           object
# < Embarked        object
# < dtype: object
print(train.columns)
# < Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
# <        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
# <       dtype='object')
# @
# Numerical data
train.describe()
# <                 PassengerId	Survived	Pclass	    Age     	SibSp      	Parch	    Fare
# < count	        891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
# < mean	        446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
# < std         	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
# < min         	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
# < 25%         	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
# < 50%         	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
# < 75%         	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
# < max         	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200
# We extract object type data
# We find categorical data from them and then we encode them
# We can apply one hot encoding and TF, TF-IDF
# We should consider which encoding is more appropriate
obj_df = train.select_dtypes(include=['object']).copy()
obj_df.head()
# <     Name	                Sex	    Ticket	Cabin	Embarked
# < 0	Braund, Mr. Owen Harris	male	A/5     21171	NaN	S
# < 1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	PC 17599	C85	C
# < 2	Heikkinen, Miss. Laina	female	STON/O2. 3101282	NaN	S
# < 3	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	113803	C123	S
# < 4	Allen, Mr. William Henry	male	373450	NaN	S
# We use isnull() to show the rows that contain at least one missing value
# Most of the missing values come from the Cabin column
obj_df[obj_df.isnull().any(axis=1)].head(5)
# <     Name	                Sex	Ticket	Cabin	Embarked
# < 0	Braund, Mr. Owen Harris	male	A/5 21171	NaN	S
# < 2	Heikkinen, Miss. Laina	female	STON/O2. 3101282	NaN	S
# < 4	Allen, Mr. William Henry	male	373450	NaN	S
# < 5	Moran, Mr. James	male	330877	NaN	Q
# < 7	Palsson, Master. Gosta Leonard	male	349909	NaN	S
# We want to check whether this column can be used as categorical data
# It turns out it is not a good categorical feature because it has many missing values
obj_df["Cabin"].value_counts().head(5)
# < G6             4
# < C23 C25 C27    4
# < B96 B98        4
# < F2             3
# < F33            3
# < Name: Cabin, dtype: int64
# @
# We copy data to compare before and after preprocessing
# Take the copies before "Sex" is overwritten below, so that the copies still
# hold the raw "male"/"female" strings.
train_c_df = train.copy()
# The test copy must come from `test`, not `train`; otherwise test_c_df is
# just a second copy of the training data.
test_c_df = test.copy()
train['Sex'].value_counts()
# < male      577
# < female    314
# < Name: Sex, dtype: int64
# We encode sex data by using pandas
# Apply the same replacement to both frames, in place:
# "male" becomes 0 and "female" becomes 1 in the "Sex" column.
for frame in (train, test):
    frame.loc[frame["Sex"] == "male", "Sex"] = 0
    frame.loc[frame["Sex"] == "female", "Sex"] = 1
# train['Sex'] = train['Sex'].apply(lambda s: 1 if s == 'female' else 0)
# test['Sex'] = test['Sex'].apply(lambda s: 1 if s == 'female' else 0)
train.head()
# <     PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
# < 0	1	0	3	Braund, Mr. Owen Harris	0	22.0	1	0	A/5 21171	7.2500	NaN	S
# < 1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	1	38.0	1	0	PC 17599	71.2833	C85	C
# < 2	3	1	3	Heikkinen, Miss. Laina	1	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
# < 3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	1	35.0	1	0	113803	53.1000	C123	S
# < 4	5	0	3	Allen, Mr. William Henry	0	35.0	0	0	373450	8.0500	NaN	S
# @
# We can use scikit-learn's LabelEncoder to encode categorical data as integer labels
# (this is label encoding: each category becomes one integer, not a one-hot vector)
from sklearn.preprocessing import LabelEncoder
# We encode sex data into 0 and 1
def gender_to_int(data):
    """Replace the "Sex" column of `data` with integer labels.

    A LabelEncoder is fitted on the two labels "male" and "female" and then
    used to transform the column. LabelEncoder numbers its classes in sorted
    order, so "female" becomes 0 and "male" becomes 1.

    The frame is modified in place and the same object is returned.
    """
    encoder = LabelEncoder().fit(["male", "female"])
    data["Sex"] = encoder.transform(data["Sex"])
    return data
# train data
train_c_df = gender_to_int(train_c_df)
# test data
test_c_df = gender_to_int(test_c_df)
train_c_df.head()
# <     PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
# < 0	1	0	3	Braund, Mr. Owen Harris	1	22.0	1	0	A/5 21171	7.2500	NaN	S
# < 1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	0	38.0	1	0	PC 17599	71.2833	C85	C
# < 2	3	1	3	Heikkinen, Miss. Laina	0	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
# < 3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	0	35.0	1	0	113803	53.1000	C123	S
# < 4	5	0	3	Allen, Mr. William Henry	1	35.0	0	0	373450	8.0500	NaN	S
# @
train['Embarked'].value_counts()
# < S    644
# < C    168
# < Q     77
# < Name: Embarked, dtype: int64
# Build one boolean indicator column per "Embarked" value by comparing the
# column against that value. Rows whose "Embarked" is missing compare as
# False in all three columns.
for indicator, code in (("Embarked_C", "C"), ("Embarked_S", "S"), ("Embarked_Q", "Q")):
    train_c_df[indicator] = train_c_df["Embarked"] == code
print(train.shape)
# < (891, 12)
print(train_c_df.shape)
# < (891, 15)
train_c_df[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head(10)
# <     Embarked	Embarked_C	Embarked_S	Embarked_Q
# < 0	S	False	True	False
# < 1	C	True	False	False
# < 2	S	False	True	False
# < 3	S	False	True	False
# < 4	S	False	True	False
# < 5	Q	False	False	True
# < 6	S	False	True	False
# < 7	S	False	True	False
# < 8	S	False	True	False
# < 9	C	True	False	False
# @
# We can use pandas' get_dummies to perform one-hot encoding
# so that the machine can understand categorical data
def dummy_data(data, columns):
    """One-hot encode the given columns of `data` with pandas.get_dummies.

    For every name in `columns`, indicator columns prefixed with that name
    are appended on the right of the frame and the source column is removed.
    The input frame is not modified; the encoded frame is returned.
    """
    encoded = data
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        # Remove the source column first, then append its indicators on the right.
        remaining = encoded.drop(name, axis=1)
        encoded = pd.concat([remaining, indicators], axis=1)
    return encoded
# Columns to replace with one-hot indicator columns
dummy_columns = ["Sex", "Pclass", "Embarked"]
# NOTE: train and test are encoded independently of each other, so the two
# results have matching indicator columns only when both frames contain the
# same set of values in each encoded column.
train_dummy = dummy_data(train, dummy_columns)
test_dummy = dummy_data(test, dummy_columns)
print('원핫인코딩 전 shape')
print(train.shape)
print(test.shape)
# 원핫인코딩 전 shape
# (891, 12)
# (418, 11)
print('get_dummies로 원핫인코딩 후 shape')
print(train_dummy.shape)
print(test_dummy.shape)
# get_dummies로 원핫인코딩 후 shape
# (891, 17)
# (418, 16)
train_dummy.head()
# PassengerId	Survived	Name	Age	SibSp	Parch	Ticket	Fare	Cabin	Sex_0	Sex_1	Pclass_1	Pclass_2	Pclass_3	Embarked_C	Embarked_Q	Embarked_S
# 0	1	0	Braund, Mr. Owen Harris	22.0	1	0	A/5 21171	7.2500	NaN	1	0	0	0	1	0	0	1
# 1	2	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	38.0	1	0	PC 17599	71.2833	C85	0	1	1	0	0	1	0	0
# 2	3	1	Heikkinen, Miss. Laina	26.0	0	0	STON/O2. 3101282	7.9250	NaN	0	1	0	0	1	0	0	1
# 3	4	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	35.0	1	0	113803	53.1000	C123	0	1	1	0	0	0	0	1
# 4	5	0	Allen, Mr. William Henry	35.0	0	0	373450	8.0500	NaN	1	0	0	0	1	0	0	1
# When building the feature set from the encoded data, drop the columns that will not be used
def drop_not_concerned(data, columns):
    """Return a copy of `data` without the columns named in `columns`.

    The input frame is left unchanged.
    """
    trimmed = data.drop(labels=columns, axis="columns")
    return trimmed
# Columns that are removed from both frames before they are used as features
not_concerned_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
# Only the training frame has a 'Survived' column, so it is dropped there in
# addition to the shared list; the test frame needs just the shared list.
X_train = drop_not_concerned(train_dummy, not_concerned_columns).drop('Survived', axis=1)
X_test = drop_not_concerned(test_dummy, not_concerned_columns)
X_train.head()
#   Age	SibSp	Parch	Fare	Sex_0	Sex_1	Pclass_1	Pclass_2	Pclass_3	Embarked_C	Embarked_Q	Embarked_S
# 0	22.0	1	0	7.2500	1	0	0	0	1	0	0	1
# 1	38.0	1	0	71.2833	0	1	1	0	0	1	0	0
# 2	26.0	0	0	7.9250	0	1	0	0	1	0	0	1
# 3	35.0	1	0	53.1000	0	1	1	0	0	0	0	1
# 4	35.0	0	0	8.0500	1	0	0	0	1	0	0	1