# rangedata: handling categorical ("range") data with one-hot encoding.
#
# Data can be divided into numerical, text and categorical ("range") data.
# Machine-learning / deep-learning algorithms only understand numbers, so
# categorical values must be converted into a numeric representation the
# computer understands.  Here we turn them into one-hot encodings, e.g.:
#
#   Fruits      fruit-apple  fruit-pear  fruit-persimmon
#   apple            1           0             0
#   pear             0           1             0
#   persimmon        0           0             1
#   apple            1           0             0
#
# Train/test data: https://www.kaggle.com/c/titanic/data

import pandas as pd
import numpy as np

# This walkthrough was written against pandas < 0.21.0 and numpy < 1.14.0.
print(pd.__version__)
print(np.__version__)

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# train is (891, 12); test is (418, 11) — test lacks the 'Survived' column.
for frame in (train, test):
    print(frame.shape)

# Name / Sex / Ticket / Cabin / Embarked are 'object' (string) columns;
# everything else is int64 or float64.
for frame in (train, test):
    print(frame.dtypes)
print(train.columns)

# Summary statistics (count / mean / std / min / quartiles / max) of the
# numerical columns only — e.g. Age has just 714 non-null values out of 891.
train.describe()

# Extract only the object-typed columns: these are the candidates for
# categorical encoding.  One-hot, TF and TF-IDF are all options; which
# encoding fits best should be decided per column.
obj_df = train.select_dtypes(include=['object']).copy()
obj_df.head()

# Show rows with at least one missing value via isnull(); most of the
# missing data is concentrated in the 'Cabin' column.
obj_df[obj_df.isnull().any(axis=1)].head(5)
# 'Cabin' has many distinct, sparsely-populated values on top of the many
# NaNs seen above, so it is a poor candidate for categorical encoding.
obj_df["Cabin"].value_counts().head(5)

# Keep pristine copies so we can compare the data before and after
# preprocessing.
train_c_df = train.copy()
test_c_df = test.copy()  # BUG FIX: was `train.copy()` — copied the wrong frame

train['Sex'].value_counts()  # male 577 / female 314

# Encode Sex in place with plain pandas: male -> 0, female -> 1.
train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
test.loc[test["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1
# Equivalent one-liner form:
# train['Sex'] = train['Sex'].apply(lambda s: 1 if s == 'female' else 0)
train.head()

# Alternatively, scikit-learn's LabelEncoder maps string categories to ints.
from sklearn.preprocessing import LabelEncoder


def gender_to_int(data):
    """Encode the 'Sex' column of `data` to integers in place.

    Returns the same DataFrame for convenient chaining.

    NOTE: LabelEncoder orders its classes alphabetically, so this yields
    female -> 0, male -> 1 — the *opposite* of the manual pandas mapping
    above (visible in the train_c_df.head() output below).
    """
    le = LabelEncoder()
    le.fit(["male", "female"])
    data["Sex"] = le.transform(data["Sex"])
    return data


# The copies were taken before the .loc encoding above, so their 'Sex'
# columns still hold the original 'male'/'female' strings.
train_c_df = gender_to_int(train_c_df)
test_c_df = gender_to_int(test_c_df)
train_c_df.head()
# Distribution of the embarkation port: S 644 / C 168 / Q 77.
train['Embarked'].value_counts()

# Hand-rolled one-hot encoding: one boolean indicator column per port value.
train_c_df["Embarked_C"] = train_c_df["Embarked"] == "C"
train_c_df["Embarked_S"] = train_c_df["Embarked"] == "S"
train_c_df["Embarked_Q"] = train_c_df["Embarked"] == "Q"

# Three indicator columns were appended: (891, 12) -> (891, 15).
print(train.shape)
print(train_c_df.shape)
train_c_df[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head(10)


# pandas' get_dummies performs the same one-hot encoding for us, so the
# machine can understand the categorical columns.
def dummy_data(data, columns):
    """Return `data` with each column in `columns` one-hot encoded.

    Each listed column is replaced by indicator columns named
    '<column>_<value>' (get_dummies with `prefix`); the original
    categorical column is dropped.
    """
    for col in columns:
        indicators = pd.get_dummies(data[col], prefix=col)
        data = pd.concat([data, indicators], axis=1).drop(col, axis=1)
    return data


dummy_columns = ["Sex", "Pclass", "Embarked"]
train_dummy = dummy_data(train, dummy_columns)
test_dummy = dummy_data(test, dummy_columns)

print('원핫인코딩 전 shape')  # "shape before one-hot encoding"
print(train.shape)   # (891, 12)
print(test.shape)    # (418, 11)

print('get_dummies로 원핫인코딩 후 shape')  # "shape after one-hot encoding via get_dummies"
# 3 columns dropped, 2 + 3 + 3 indicator columns added: 12 -> 17, 11 -> 16.
print(train_dummy.shape)
print(test_dummy.shape)
train_dummy.head()
# After encoding, drop the columns that will not be used as features.
def drop_not_concerned(data, columns):
    """Return a new DataFrame equal to `data` without the given columns."""
    return data.drop(columns, axis=1)


not_concerned_columns = ["PassengerId", "Name", "Ticket", "Cabin"]

# The label column 'Survived' is also removed from the training features.
X_train = drop_not_concerned(train_dummy, not_concerned_columns).drop('Survived', axis=1)
X_test = drop_not_concerned(test_dummy, not_concerned_columns)
X_train.head()