022. Dealing with mushroom data: converting character data into numbers, and one-hot encoding
# 022. Deal with mushroom data: convert character data into numbers, one-hot encoding

# @
import pandas as pd

# Location of the mushroom data set. It is read with header=None, so every row
# is treated as data and column 0 is used as the label.
CSV_PATH = "D://chromedown//mushroom.csv"

# Number of slots in each one-hot vector. The original notes used a fixed
# 12-slot vector for every feature column, so 12 stays the default.
ONE_HOT_WIDTH = 12


def encode_char_codes(frame):
    """Split *frame* into labels and character-code feature rows.

    Column 0 of every row becomes the label. Every remaining cell must be a
    single character and is converted with ``ord``:

        ord('A') == 65, ord('Z') == 90, ord('a') == 97, ord('z') == 122

    Args:
        frame: pandas DataFrame whose first column holds the label and whose
            remaining columns hold single-character feature symbols.

    Returns:
        ``(label, data)`` where ``label`` is a list with one entry per row and
        ``data`` is a list of lists of integer character codes.

    Raises:
        TypeError: if a feature cell is not a single-character string
            (raised by ``ord``).
    """
    label = []
    data = []
    # itertuples yields plain tuples, so positional access replaces the
    # ``Series.ix`` indexer that was removed from pandas.
    for row in frame.itertuples(index=False, name=None):
        label.append(row[0])
        data.append([ord(v) for v in row[1:]])
    return label, data


def encode_one_hot(frame, width=ONE_HOT_WIDTH):
    """Split *frame* into labels and one-hot encoded feature rows.

    Each feature column keeps its own dictionary that maps a symbol to the
    slot it occupies; slots are handed out in order of first appearance.
    A cell is then written as a ``width``-long vector with a single 1, e.g.
    with ``width=12``:

        first symbol seen in a column  -> 1,0,0,0,0,0,0,0,0,0,0,0
        second symbol seen in a column -> 0,1,0,0,0,0,0,0,0,0,0,0

    The vectors of all feature columns are concatenated, so every output row
    has ``width * (number of feature columns)`` entries.

    Args:
        frame: pandas DataFrame whose first column holds the label and whose
            remaining columns hold categorical feature symbols.
        width: number of slots reserved for each feature column.

    Returns:
        ``(label, data, attr_list)`` where ``label`` is a list with one entry
        per row, ``data`` is a list of flat 0/1 lists, and ``attr_list`` holds
        one ``{symbol: slot}`` dictionary per feature column.

    Raises:
        ValueError: if a column contains more than ``width`` distinct symbols.
    """
    label = []
    data = []
    # One symbol->slot dictionary per feature column, created up front. The
    # file has no header row, so nothing special is done for the first row.
    attr_list = [{} for _ in range(max(frame.shape[1] - 1, 0))]
    for row in frame.itertuples(index=False, name=None):
        label.append(row[0])
        exdata = []
        for col, (seen, v) in enumerate(zip(attr_list, row[1:])):
            # A new symbol gets the next free slot; a known one keeps its slot.
            idx = seen.setdefault(v, len(seen))
            if idx >= width:
                raise ValueError(
                    "feature column %d has more than %d distinct values"
                    % (col, width)
                )
            d = [0] * width
            d[idx] = 1
            exdata += d
        data.append(exdata)
    return label, data, attr_list


def train_and_report(data, label):
    """Train a random forest on a random split and print its evaluation.

    Args:
        data: list of numeric feature rows.
        label: list of labels, one per feature row.

    Returns:
        The accuracy score measured on the held-out test split.
    """
    # scikit-learn is imported here so that the encoding helpers above can be
    # used (and tested) without scikit-learn being installed.
    from sklearn import metrics
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    # Create train data and test data from the entire data set
    data_train, data_test, label_train, label_test = train_test_split(data, label)

    # For learning
    clf = RandomForestClassifier()
    clf.fit(data_train, label_train)

    # For predicting
    predict = clf.predict(data_test)

    # For evaluating
    ac_score = metrics.accuracy_score(label_test, predict)
    cl_report = metrics.classification_report(label_test, predict)
    print("accuracy:", ac_score)
    print("report:\n", cl_report)
    return ac_score


if __name__ == "__main__":
    # Load the mushroom.csv file
    mr = pd.read_csv(CSV_PATH, header=None)

    # Convert every feature character into its character code
    label, data = encode_char_codes(mr)

    # Display the first 5 labels and rows as a check
    print(label[0:5])
    # < e.g. ['p','e','e','p','e']
    print("---")
    print(data[0:5])
    # < Characters are converted into their character codes, so this prints
    # < a list of 5 lists of integers (one integer per feature column)

    train_and_report(data, label)

    # @
    # There is a precaution when you convert data into numbers.
    # You should consider whether the feature is a "classification" feature,
    # that is, one whose values have no high/low order and no continuity.
    # An example of such a feature is color.
    # Colors have no high/low relation to each other.
    # For example, we can assign a number to each color like
    # red=1, blue=2, green=3, white=4
    # However, each number has nothing to do with the other numbers.
    # It doesn't make sense that multiplying 2 by 3 should mean blue*green.
    # So, in this case, we'd better use one-hot encoding as a list like the following
    # red=[1,0,0,0]
    # blue=[0,1,0,0]
    # green=[0,0,1,0]
    # white=[0,0,0,1]

    # @
    # Convert the mushroom data into one-hot encoding.
    # The features of the entire mushroom data set become
    # [
    #   [<12 slots for column 1>, <12 slots for column 2>, ...],
    #   [<12 slots for column 1>, <12 slots for column 2>, ...],
    #   ...
    # ]
    # where each group of 12 slots contains exactly one 1.
    label, data, attr_list = encode_one_hot(mr)

# @
# Let's talk about cross validation.
# Suppose we have 3 sets of data.
# Normally we let the model train with one part of the data
# and we evaluate its performance with another part of the data.
# But someone can claim that
# "you can only get a good result with that specific test data,
# so I can't trust your evaluation".
# To resolve this issue, we can use the "cross validation" methodology:
# we validate our model "k" times, each time holding out a different part.
# 3 sets of data => 0, 1, 2
# 3-fold cross validation
# For learning fit(0,1)
# For predicting predict(2)
# accuracy : 0.85
# For learning fit(0,2)
# For predicting predict(1)
# accuracy : 0.80
# For learning fit(1,2)
# For predicting predict(0)
# accuracy : 0.75
# mean of accuracy : 0.80