# 015. iris data with scikit-learn
# @
# iris.csv data
# github.com/pandas-dev/pandas/blob/master/pandas/tests/data/iris.csv
# It contains 150 samples
# Click "raw" to download it as a csv file
# @
# The labels are stored as text (flower names).
# One way to turn them into numbers is to replace each name with 0, 1, 2
# by using "replace" in a text editor.
# NOTE: scikit-learn classifiers also accept string labels, so this
# conversion is optional.
# @
# We will use pandas to deal with the iris.csv file
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split

# Load the iris.csv data file
csv = pd.read_csv("D://chromdown//iris.csv")

# Extract the feature columns and the label column by name
data = csv[["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]
label = csv["Name"]
print(data)
print(label)

# All the data we need has been extracted;
# what remains is to apply a learning method.

# Classifier
clf = svm.SVC()

# Learning: fit on the whole data set
clf.fit(data, label)

# Predicting: the sample is wrapped in a DataFrame that carries the same
# column names as the training data, so scikit-learn does not warn about
# missing feature names.
sample = pd.DataFrame([[5.1, 3.0, 1.3, 0.2]], columns=data.columns)
results = clf.predict(sample)
print(results)
# Expected output: [0] when the labels were recoded to 0/1/2 (0 means
# Iris-setosa); with the unmodified csv it is ['Iris-setosa'].

# @
# Application examples:
# 1. The user inputs measurements, and the application predicts the kind
#    of flower.
# 2. The user takes a photo of a flower,
#    the application measures the length of its parts,
#    feeds the measured data into the model,
#    and predicts the kind of that flower.

# @
# So far we have no precise way to evaluate this prediction model:
# we cannot evaluate it on unknown data, because the true labels are
# not available for such data.
# The usual methodology is to hold back part of the labelled data.
# In general the data is divided into 3 parts:
# train data / validation data / test data.
# Here we only make a 2-way split: train data and test data.

# @
# Create train data and test data.
# train_test_split(entire_data, label) returns a tuple:
# (train_data, test_data, train_label, test_label).
# NOTE: the split is shuffled and no random_state is given, so the
# accuracy printed below can differ from run to run.
train_data, test_data, train_label, test_label = train_test_split(data, label)

clf = svm.SVC()

# Learning: fit on the training part only
clf.fit(train_data, train_label)

# Predicting: classify the held-out test part
results = clf.predict(test_data)

# accuracy_score expects (y_true, y_pred): true labels first
score = metrics.accuracy_score(test_label, results)
print("accuracy : ", score)