# 019. nlp 2
#
# Build per-file letter-frequency vectors (a-z) for the training texts, label
# each one with the language code taken from its file name, and save a bar
# chart comparing the first sample of every language to "lang-plot.png".
from sklearn import svm, metrics
import glob
import os.path
import re
import json
import matplotlib.pyplot as plt
import pandas as pd


def _letter_frequencies(text):
    """Return the relative frequency of each letter a-z in *text*.

    The text is lower-cased first; every character outside a-z is ignored.

    Returns:
        A list of 26 floats (index 0 is 'a', index 25 is 'z'). When the text
        contains at least one a-z letter the values sum to 1; when it
        contains none, all 26 entries are 0.0 instead of raising
        ZeroDivisionError.
    """
    code_a = ord("a")
    counts = [0] * 26
    for character in text.lower():
        offset = ord(character) - code_a
        if 0 <= offset < 26:
            counts[offset] += 1
    total = sum(counts)
    if total == 0:
        return [0.0] * 26
    # Materialise a real list: a lazy map() object can only be consumed once
    # and cannot be used as a column of numbers later on.
    return [n / total for n in counts]


files = glob.glob("d://chromedown//train//*.txt")
train_data = []
train_label = []

for file_name in files:
    # The language code is the part of the file name before the first "-".
    basename = os.path.basename(file_name)
    lang = basename.split("-")[0]
    # The context manager closes the file even if reading fails.
    with open(file_name, "r", encoding="utf-8") as fh:
        text = fh.read()
    train_label.append(lang)
    train_data.append(_letter_frequencies(text))

# Keep only the first frequency vector seen for each language; these are the
# columns of the graph.
graph_dict = {}
for label, data in zip(train_label, train_data):
    if label not in graph_dict:
        graph_dict[label] = data

# Row labels 'a'..'z', wrapped in an outer list: [['a', 'b', ..., 'z']].
# NOTE(review): the nested list is passed unchanged to `index=` exactly as the
# original did; confirm whether a flat list was intended.
asclist = [[chr(n) for n in range(97, 97 + 26)]]
print(asclist)

# One column per language, one row per letter.
df = pd.DataFrame(graph_dict, index=asclist)

# Draw one bar subplot per language with the ggplot style and a shared
# y-range, then write the figure to disk.
plt.style.use('ggplot')
df.plot(kind="bar", subplots=True, ylim=(0, 0.15))
plt.savefig("lang-plot.png")