018. nlp 1
# @
from sklearn import svm, metrics
import glob, os.path, re, json
# I load all files in the specified directory
# I can specify an extension such as .txt in the glob pattern
files = glob.glob("d://chromedown//train//*.txt")
train_data = []
train_label = []
for file_name in files:
    # I take only the file name, without the full path,
    # e.g. en-1.txt
    basename = os.path.basename(file_name)
    # I split the name by "-"
    # and take the first element, which is the language label (e.g. "en")
    lang = basename.split("-")[0]
    # I open the file stream
    file = open(file_name, "r", encoding="utf-8")
    # I load the contents of the file
    text = file.read()
    # I convert all text into lowercase
    text = text.lower()
    # I close the file stream
    # If you use "with open()",
    # you don't need to call close()
    file.close()
    # I count how many times each alphabet character appears
    # ord("a") -> 97
    # ord("가") -> 44032
    code_a = ord("a")
    code_z = ord("z")
    count = [0] * 26
    # The commented-out line below gives the same result
    # count = [0 for n in range(0, 26)]
    for character in text:
        code_current = ord(character)
        if code_a <= code_current <= code_z:
            # "a"97 - "a"97 = 0
            # "b"98 - "a"97 = 1
            count[code_current - code_a] += 1
    # I normalize the counts so that each value lies between 0 and 1
    total = sum(count)
    # A plain list is built here; in Python 3, map() returns an iterator,
    # which svm.SVC cannot consume as a feature row
    count = [n / total for n in count]
    train_label.append(lang)
    train_data.append(count)
# For training
clf = svm.SVC()
clf.fit(train_data,train_label)
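# The code above only builds train_data and train_label; test_data and
# test_label are used below but never created in this note. The sketch
# here builds them the same way as the training set. The path
# "d://chromedown//test//*.txt" is an assumption; adjust it to wherever
# the test files actually live.
test_files = glob.glob("d://chromedown//test//*.txt")
test_data = []
test_label = []
for file_name in test_files:
    # The label is again the part of the file name before "-", e.g. "en"
    lang = os.path.basename(file_name).split("-")[0]
    with open(file_name, "r", encoding="utf-8") as file:
        text = file.read().lower()
    # Count the frequency of each alphabet character, as done for training
    count = [0] * 26
    for character in text:
        code_current = ord(character)
        if ord("a") <= code_current <= ord("z"):
            count[code_current - ord("a")] += 1
    total = sum(count)
    count = [n / total for n in count]
    test_label.append(lang)
    test_data.append(count)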
# For predicting
predict = clf.predict(test_data)
# For evaluating
score = metrics.accuracy_score(test_label,predict)
print("Accuracy: ",score)
# For a detailed per-class report
report = metrics.classification_report(test_label, predict)
print("---report---")
print(report)
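# As a usage sketch, the same frequency vector can be built for any new
# string and passed to clf.predict. The helper text_to_freq_vector and the
# sample sentence below are made up for illustration and are not part of
# the original code.
def text_to_freq_vector(text):
    # Build the same normalized a-z frequency vector used above
    count = [0] * 26
    for character in text.lower():
        code_current = ord(character)
        if ord("a") <= code_current <= ord("z"):
            count[code_current - ord("a")] += 1
    total = sum(count)
    return [n / total for n in count] if total > 0 else count

sample = "this is a short english sentence for a quick check"
print("Predicted language:", clf.predict([text_to_freq_vector(sample)])[0])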