018. nlp 1

# @
from sklearn import svm, metrics
import glob, os.path

# I load every file matched by the pattern and turn each one into a
# (label, 26-dimensional letter-frequency vector) pair
def load_dataset(pattern):
    data = []
    labels = []
    # I bring in all files in the specified directory;
    # I can restrict the extension, e.g. *.txt
    for file_name in glob.glob(pattern):
        # I keep only the file name without the full path, like en-1.txt
        basename = os.path.basename(file_name)
        # I split the name by "-" and take the first element as the language label
        lang = basename.split("-")[0]

        # With "with open()" I don't need to call close() explicitly
        with open(file_name, "r", encoding="utf-8") as file:
            # I load the contents of the file and convert them to lowercase
            text = file.read().lower()

        # I count how often each alphabet character appears
        # ord("a") -> 97
        # ord("가") -> 44032
        code_a = ord("a")
        code_z = ord("z")
        count = [0] * 26
        # This gives the same result as: count = [0 for n in range(0, 26)]
        for character in text:
            code_current = ord(character)
            if code_a <= code_current <= code_z:
                # "a"(97) - "a"(97) = 0
                # "b"(98) - "a"(97) = 1
                count[code_current - code_a] += 1

        # I normalize the counts so each value lies between 0 and 1.
        # A list comprehension is used instead of map(), because map()
        # returns an iterator in Python 3 and clf.fit() needs concrete lists.
        total = sum(count)
        count = [n / total for n in count]

        labels.append(lang)
        data.append(count)
    return data, labels

train_data, train_label = load_dataset("d://chromedown//train//*.txt")
# The test directory below is an assumption; point it at wherever the
# held-out files actually live
test_data, test_label = load_dataset("d://chromedown//test//*.txt")

# For learning
clf = svm.SVC()
clf.fit(train_data, train_label)

# For predicting
predict = clf.predict(test_data)

# For evaluating
score = metrics.accuracy_score(test_label, predict)
print("Accuracy:", score)

# For a readable per-class report
report = metrics.classification_report(test_label, predict)
print("---report---")
print(report)
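
As a quick follow-up, the fitted classifier can also be applied to an arbitrary sentence by building the same normalized a-z frequency vector for it. This is a minimal sketch, not part of the original notes: the helper name freq_vector is hypothetical, and it assumes clf has already been fitted as above.

# Illustrative sketch (assumes clf is the fitted SVC from above)
def freq_vector(text):
    # I build the normalized a-z frequency vector for one string
    code_a = ord("a")
    count = [0] * 26
    for character in text.lower():
        index = ord(character) - code_a
        if 0 <= index < 26:
            count[index] += 1
    total = sum(count)
    return [n / total for n in count] if total else count

sample = "This is a short English sentence for testing."
# I predict the language label of the sample, e.g. ['en']
print(clf.predict([freq_vector(sample)]))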