# 026. Analyze the frequency of the words that appear in a text
# @
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Twitter
# Open the UTF-16 encoded corpus file and parse its markup with html.parser.
# The encoding must be given as a string ("utf-16"); the bare expression
# utf-16 would be evaluated as "utf minus 16" and fail with a NameError.
# The with-statement closes the file as soon as parsing is finished.
with codecs.open("2BEXXX01.txt", "r", encoding="utf-16") as file:
    soup = BeautifulSoup(file, "html.parser")
# Select the <body> element that is a direct child of a <text> element.
# NOTE: select_one returns None when nothing matches, in which case the
# getText() call below raises AttributeError.
body = soup.select_one("text > body")
# Extract the plain text contained in that element
text = body.getText()
print(text)
# Count how often each noun appears in the extracted text.
twitter = Twitter()
# word_dic maps each noun to the number of times it has been seen
word_dic = {}
# Split the extracted text into lines on "\r\n"
lines = text.split("\r\n")
# Analyze the text one line at a time
for line in lines:
    # pos() yields (taeso, pumsa) pairs, i.e. (morpheme, part-of-speech tag).
    # To inspect them while debugging, print(malist) and break after the
    # first line; printing every line produces a huge amount of output.
    malist = twitter.pos(line)
    for taeso, pumsa in malist:
        # Only nouns are counted
        if pumsa == "Noun":
            # Start from 0 the first time a noun is seen, then add 1
            word_dic[taeso] = word_dic.get(taeso, 0) + 1
# Display the frequency of every counted word
print(word_dic)
# < {'천년':1, '영화':1, ...}
# Display the 50 most frequent words, from the highest count downwards.
# Sort the (word, count) pairs by count in descending order first; without
# this step the name `keys` used by the loop below is undefined.
keys = sorted(word_dic.items(), key=lambda item: item[1], reverse=True)
for word, count in keys[:50]:
    # end="" keeps all entries on a single output line
    print("{0}({1}) ".format(word, count), end="")
# Terminate the output line once all entries have been written
print()
# < 것(319) 그(199) 치수(192) ...
# @
# You can use this technique to classify the genre of a book based on the frequency of the words that appear in its text