My first HTML document

# 026. Analyze the frequency of the words that appear in a text
# @
import codecs

from bs4 import BeautifulSoup
from konlpy.tag import Twitter

# Open the file stream. The encoding has to be passed as a string
# ("utf-16"); the `with` block closes the stream as soon as BeautifulSoup
# has finished reading it.
with codecs.open("2BEXXX01.txt", "r", encoding="utf-16") as file:
    # Analyze the file with html.parser
    soup = BeautifulSoup(file, "html.parser")

# Extract the <body> tag that is a direct child of the <text> tag
body = soup.select_one("text > body")
# Extract its inner text
text = body.getText()
print(text)

# Now analyze the frequency of every word that appears
# NOTE(review): recent KoNLPy releases appear to expose this tagger as
# `Okt` and keep `Twitter` as a deprecated alias - confirm before renaming.
twitter = Twitter()
# word_dic stores the frequency count of each noun
word_dic = {}
# Split the extracted text on "\r\n"
lines = text.split("\r\n")

# Analyze the text one line at a time
for line in lines:
    malist = twitter.pos(line)
    # To check how the analysis is going, print(malist) here and break
    # right after the first line (otherwise there will be tons of output);
    # the analyzed data is a list of (taeso, pumsa) tuples.
    for taeso, pumsa in malist:
        # Only count entries whose pumsa (part of speech) is a noun
        if pumsa == "Noun":
            # A taeso that is not in word_dic yet starts from 0;
            # either way its count is increased by 1
            word_dic[taeso] = word_dic.get(taeso, 0) + 1

# Display the frequency count of every word
print(word_dic)
# < {'천년':1, '영화':1, ...}

# Order the (word, count) pairs from the highest frequency to the lowest
keys = sorted(word_dic.items(), key=lambda item: item[1], reverse=True)
# Display the 50 most frequent words, highest frequency first
for word, count in keys[:50]:
    print("{0}({1}) ".format(word, count), end="")
print()
# < 것(319) 그(199) 치수(192) ...
# @
# This technique can be used to classify the genre of a book based on
# the frequency of the words that appear in its text