# 019. nlp 2
# @
from sklearn import svm, metrics
import glob, os.path, re, json
import matplotlib.pyplot as plt
import pandas as pd
def count_letter_frequencies(text):
    """Return the relative frequency of each letter 'a'..'z' in *text*.

    The text is lower-cased first and every character outside 'a'..'z'
    is ignored.  The result is a list of 26 floats (index 0 is 'a',
    index 25 is 'z') that sums to 1 when at least one letter was found.
    If *text* contains no such letter, a list of 26 zeros is returned
    instead of dividing by zero.
    """
    code_a = ord("a")
    code_z = ord("z")
    counts = [0] * 26
    for character in text.lower():
        code_current = ord(character)
        if code_a <= code_current <= code_z:
            counts[code_current - code_a] += 1
    total = sum(counts)
    if total == 0:
        return [0.0] * 26
    # Build a real list: a lazy ``map`` object could be read only once and
    # is not a usable feature vector.
    return [n / total for n in counts]


# Read every training file and turn it into (label, letter frequencies).
files = glob.glob("d://chromedown//train//*.txt")
train_data = []
train_label = []
for file_name in files:
    # The label is the part of the file name before the first "-".
    basename = os.path.basename(file_name)
    lang = basename.split("-")[0]
    # ``with`` closes the file even if reading or decoding fails.
    with open(file_name, "r", encoding="utf-8") as file:
        text = file.read()
    train_label.append(lang)
    train_data.append(count_letter_frequencies(text))
# Build the data for the chart: one sample per label.
# Labels and samples are walked in parallel; only the first sample seen
# for each label is stored, later samples with the same label are skipped.
graph_dict = {}
for label, data in zip(train_label, train_data):
    if label not in graph_dict:
        graph_dict[label] = data
# Row labels for the chart: the letters 'a' through 'z', wrapped in an
# outer list, i.e. [['a', 'b', ..., 'z']].
# NOTE(review): a nested list as ``index`` presumably makes pandas build a
# one-level MultiIndex rather than a flat index - confirm this is intended.
asclist = [list(map(chr, range(ord("a"), ord("z") + 1)))]
print(asclist)

# One column per key of graph_dict, one row per letter.
df = pd.DataFrame(data=graph_dict, index=asclist)

# Draw with the ggplot style: one bar sub-plot per column, all sharing the
# same y-range of 0 to 0.15.
plt.style.use("ggplot")
df.plot(subplots=True, ylim=(0, 0.15), kind="bar")

# Write the figure to a PNG file.
plt.savefig("lang-plot.png")