# 016. Arguments of fit()
# @
# We will talk about which arguments we should pass to clf.fit(), and in what form
# @
# We will try the MNIST dataset
# We will convert the handwritten digit images into CSV files
# Then we will pass the CSV data to fit() so that the model can be trained
# @
import urllib.request as req
import gzip, os, os.path
# Local folder that receives the downloaded archives and the extracted files
savepath = "D://chromedown//mnist"
# Base URL the archive names below are appended to
baseurl = "http://yann.lecun.com/exdb/mnist"
# The four gzip archives: training images/labels and test (t10k) images/labels
files = [
    "train-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
    "t10k-images-idx3-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz",
]

# Now, we're going to download the data
# Create the target folder (including any missing parent folder) if it is absent
os.makedirs(savepath, exist_ok=True)
for f in files:
    url = baseurl + "/" + f
    loc = savepath + "/" + f
    print("download : ", url)
    # Only download when the archive file is not already on disk
    if not os.path.exists(loc):
        req.urlretrieve(url, loc)

# Unzip every GZip archive next to itself, dropping the ".gz" suffix
for f in files:
    gz_file = savepath + "/" + f
    raw_file = savepath + "/" + f.replace(".gz", "")
    print("gzip : ", f)
    with gzip.open(gz_file, "rb") as fp:
        body = fp.read()
    with open(raw_file, "wb") as w:
        w.write(body)
print("ok")
# @
# Run this python file
# You can see the mnist folder created
# You need to explore that folder
# @
# Next, I need to convert the MNIST binary files to CSV
import struct
def to_csv(name, maxdata, base_dir="D://chromedown//mnist//"):
    """Convert one MNIST label/image file pair into a CSV file.

    Reads ``<name>-labels-idx1-ubyte`` and ``<name>-images-idx3-ubyte`` from
    *base_dir* and writes ``<name>.csv`` into the same folder. Every CSV line
    holds the label followed by the pixel values of one image, separated by
    commas and terminated by CRLF.

    For the first 10 images a plain-text PGM preview named
    ``<name>-<index>-<label>.pgm`` is also written to *base_dir*, so the
    conversion can be checked by eye.

    Args:
        name: Dataset prefix of the input files, e.g. "train" or "t10k".
        maxdata: Maximum number of images to convert; at most this many
            CSV rows are written.
        base_dir: Folder holding the input files and receiving the output.

    Raises:
        OSError: If an input file cannot be opened or an output file cannot
            be created.
        struct.error: If a header or label cannot be read in full.
    """
    lbl_path = os.path.join(base_dir, name + "-labels-idx1-ubyte")
    img_path = os.path.join(base_dir, name + "-images-idx3-ubyte")
    csv_path = os.path.join(base_dir, name + ".csv")
    # newline="" keeps the explicit "\r\n" as written; without it, text mode
    # on Windows would expand it to "\r\r\n"
    with open(lbl_path, "rb") as lbl_f, \
            open(img_path, "rb") as img_f, \
            open(csv_path, "w", encoding="utf-8", newline="") as csv_f:
        # Label header: magic number and item count (big-endian 32-bit each)
        _, lbl_count = struct.unpack(">II", lbl_f.read(8))
        # Image header: magic number, item count, rows and columns
        _, img_count, rows, cols = struct.unpack(">IIII", img_f.read(16))
        pixels = rows * cols
        # Never read past the shorter of the two files
        for idx in range(min(lbl_count, img_count)):
            # ">=" so that exactly maxdata rows are written, not maxdata + 1
            if idx >= maxdata:
                break
            label = struct.unpack("B", lbl_f.read(1))[0]
            bdata = img_f.read(pixels)
            sdata = [str(value) for value in bdata]
            csv_f.write(str(label) + "," + ",".join(sdata) + "\r\n")
            # Dump the first 10 images as PGM so the conversion can be verified
            if idx < 10:
                # PGM header: format tag, width, height, maximum gray value
                pgm_text = "P2 {0} {1} 255\n".format(cols, rows) + " ".join(sdata)
                iname = os.path.join(
                    base_dir, "{0}-{1}-{2}.pgm".format(name, idx, label)
                )
                with open(iname, "w", encoding="utf-8") as f:
                    f.write(pgm_text)
# Convert both datasets: the training files with maxdata=1000
# and the test (t10k) files with maxdata=500
for dataset, limit in (("train", 1000), ("t10k", 500)):
    to_csv(dataset, limit)
# Run this Python file
# The created files are train.csv and t10k.csv, plus one <name>-<index>-<label>.pgm preview for each of the first 10 images of both datasets
# @
# Open the train csv
# The data in the first column is the label
# The remaining columns hold the pixel values of one digit image, written out as numbers
# One row of the image is represented by 28 numbers
# There are 28 such rows in one image
# So, each line of the csv holds 28*28=784 pixel values
# @
# We will look at one data row
# I copy one entire row, excluding its first value (the label);
# the text below is a shortened placeholder for the 784 pixel values
row_text = "0 0 0 1 0 0...0"
# I want to split the string by white space
pixel_values = row_text.split(" ")
# Count the values from 1 so that "every 28th value" is a plain modulo test
for position, value in enumerate(pixel_values, start=1):
    # Each value is left-aligned in a 3-character field and followed by one space
    # For example, "0" is printed as 0vvv and "212" as 212v (v is white space)
    print("{:3}".format(value), end=" ")
    # After every 28th value one image row is complete, so start a new line
    # to lay the values out as a 28 by 28 matrix
    if position % 28 == 0:
        print()
# @
# Now, what we need to do is apply the learning method
# by passing in the vectorized image data and their labels
# svm is imported here because this example runs before the sklearn import
# further down in the file
from sklearn import svm

f = svm.SVC()
# For learning
# 1st argument : list of feature vectors, one equal-length vector per sample
#                (shortened to 6 values here; a real row holds 28*28=784 values)
# 2nd argument : list of labels, one per feature vector
f.fit(
    [
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 3, 0],
        [0, 2, 0, 0, 0, 0],
    ],
    [
        4,
        5,
        6,
    ],
)
# @
# We want to use pandas to load csv file
from sklearn import model_selection, svm, metrics
import pandas
# These files have no header row, so we specify header=None
# Both CSV files are read from the mnist folder, which is where to_csv() writes them
# Train dataset
train_csv = pandas.read_csv("D://chromedown//mnist//train.csv", header=None)
# Test dataset
tk_csv = pandas.read_csv("D://chromedown//mnist//t10k.csv", header=None)
# I will extract each data
# iloc[]
# 1st argument : range of rows you want to extract
# 2nd argument : range of columns you want to extract
# In this example, we should exclude the label column (column 0)
# We need to scale the pixel values from the range 0-255 down to the range 0-1
# We will define test(l) to do this by dividing every value by 256
# This function is passed to map()
def test(l):
output = []
for i in l:
output.append(float(i)/256)
return output
# Features: every row, columns 1 onward (the pixel values), scaled by test()
train_csv_data = list(map(test, train_csv.iloc[:, 1:].values))
tk_csv_data = list(map(test, tk_csv.iloc[:, 1:].values))
# Labels: column 0 of each dataset
# The test labels get their own name so they do not overwrite the test features
train_csv_label = train_csv[0].values
tk_csv_label = tk_csv[0].values
clf = svm.SVC()
# For learning
clf.fit(train_csv_data, train_csv_label)
# For predicting
predict = clf.predict(tk_csv_data)
# Compare the predictions against the true test labels
score = metrics.accuracy_score(tk_csv_label, predict)
print("accuracy : ", score)