015. iris data with scikit-learn
# @
# iris.csv data
# github.com/pandas-dev/pandas/blob/master/pandas/tests/data/iris.csv
# It contains 150 samples
# Click "Raw" to download it as a CSV file
# @
# The labels are stored as text, so we need a way to convert them into numbers
# Let's simply convert each name into 0, 1, 2 by using "replace" in a text editor
# @
# We will use pandas to deal with iris.csv file
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
# Load the iris.csv data file (adjust the path to wherever the file was saved)
csv = pd.read_csv("D://chromdown//iris.csv")
# Extract the four measurement columns (features) and the class column (label) by name
data = csv[["SepalLength","SepalWidth","PetalLength","PetalWidth"]]
label = csv["Name"]
print(data)
print(label)
# The features and labels are extracted, so all that is left is to train a model
# Classifier: support vector classifier with scikit-learn's default parameters
clf = svm.SVC()
# Learn from the whole data set
clf.fit(data, label)
# Predict the class of one new flower.
# The model was fitted on a DataFrame, so the sample is passed as a DataFrame with
# the same column names; a bare nested list makes recent scikit-learn versions
# warn "X does not have valid feature names".
results = clf.predict(
    pd.DataFrame([[5.1, 3.0, 1.3, 0.2]], columns=data.columns)
)
print(results)
# output : [0] (0 stands for Iris-setosa, assuming the names in the CSV
# were replaced with 0, 1, 2)
# @
# Application examples:
# 1. The user inputs measurements, and the application predicts the kind of flower
# 2. The user takes a photo of a flower
# The application measures the lengths of its parts,
# feeds the measured data into the model,
# and predicts the kind of that flower
# @
# But we don't yet have a precise way to evaluate this predictive model
# We can't evaluate the model on unknown data, because the correct labels are not known
# So the way to evaluate the model's accuracy is to hold out part of the labelled data
# We divide the data into 3 parts: train data / validation data / test data
# (the code below uses only a train/test split)
# @
# I will create train data and test data
from sklearn.model_selection import train_test_split
# train_test_split(features, labels) returns a 4-tuple:
# train features, test features, train labels, test labels.
# NOTE: no random_state is given, so the split, and therefore the score,
# changes from run to run.
train_data, test_data, train_label, test_label = train_test_split(data, label)
clf = svm.SVC()
# Learn from the training part only
clf.fit(train_data, train_label)
# Predict labels for the held-out test part
results = clf.predict(test_data)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
score = metrics.accuracy_score(test_label, results)
print("accuracy : ", score)