# 030. create sentence with markov chain
# @
# If I apply a plain Markov chain when creating sentences,
# I can get odd sentences like "Apple eats human".
# So the model should know which words probabilistically follow
# other words around them; a Markov chain captures this local
# context (LSTMs extend the same idea to longer-range context).
# @
# First, let's just use markov chain to create sentence
import codecs
import json
import os
import random
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from konlpy.tag import Twitter
# I create dictionary by using markov model
# words(개,도,닷새,가,되면,주인,을,안다,.) is morphologically analyzed text
def make_dic(words):
    """Build a second-order (trigram) Markov dictionary from morphemes.

    ``words`` is a flat list of morphologically analyzed tokens, e.g.
    ``["개", "도", "닷새", "가", "되면", "주인", "을", "안다", "."]``.
    The result is a nested dict of counts, ``dic[w1][w2][w3] -> int``::

        {
            "@":  {"개": {"도": 1}},
            "개": {"도": {"닷새": 1}},
            "도": {"닷새": {"가": 1}},
            ...
            "을": {"안다": {".": 1}},
        }

    The sentinel "@" marks the start of a sentence, so ``dic["@"]``
    holds the possible opening word pairs.  A period token ends a
    sentence and resets the window back to the sentinel.
    """
    # Sliding window of the last (up to) three tokens; "@" = sentence start.
    tmp = ["@"]
    dic = {}
    for word in words:
        tmp.append(word)
        # Wait until the window holds a full triple.
        if len(tmp) < 3:
            continue
        # Slide the window so it never exceeds three tokens.
        if len(tmp) > 3:
            tmp = tmp[1:]
        # Count the (w1, w2, w3) triple in the nested dict.
        w1, w2, w3 = tmp
        inner = dic.setdefault(w1, {}).setdefault(w2, {})
        inner[w3] = inner.get(w3, 0) + 1
        # A period ends the sentence; restart from the "@" sentinel.
        if word == ".":
            tmp = ["@"]
            continue
    return dic
# I add text data(words) in dic
def set_word3(dic, s3):
    """Record one (w1, w2, w3) morpheme triple in the nested count dict.

    Creates the intermediate levels of ``dic`` on demand and increments
    the count stored at ``dic[w1][w2][w3]``.
    """
    w1, w2, w3 = s3
    # Materialize the two outer levels in one pass, then bump the count.
    inner = dic.setdefault(w1, {}).setdefault(w2, {})
    inner[w3] = inner.get(w3, 0) + 1
# This creates sentence
def make_sentence(dic):
    """Generate one sentence from the Markov dictionary ``dic``.

    Starts from the "@" sentence-start sentinel, repeatedly picks a
    random follow-up word until a period is produced, then sends the
    raw (unspaced) sentence through Naver's web spell checker to get
    proper word spacing back.  Returns the cleaned sentence, or the
    string "no dic" when the model has no sentence starts.
    """
    ret = []
    if "@" not in dic:
        return "no dic"
    top = dic["@"]
    w1 = word_choice(top)
    w2 = word_choice(top[w1])
    ret.append(w1)
    ret.append(w2)
    while True:
        # NOTE(review): this raises KeyError if the corpus ended
        # mid-sentence and (w1, w2) has no recorded successor — confirm
        # the training data always ends sentences with ".".
        w3 = word_choice(dic[w1][w2])
        ret.append(w3)
        if w3 == ".":
            break
        w1, w2 = w2, w3
    ret = "".join(ret)
    # Ask Naver's web spell checker to reinsert the word spacing.
    params = urllib.parse.urlencode({
        "_callback": "",
        "q": ret
    })
    # Close the HTTP response deterministically (the original leaked it).
    with urllib.request.urlopen("https://m.search.naver.com/p/csearch/dcontent/spellchecker.nhn?" + params) as resp:
        # The service returns JSONP; strip the wrapper before parsing.
        data = resp.read().decode("utf-8")[1:-2]
    data = json.loads(data)
    data = data["message"]["result"]["html"]
    data = BeautifulSoup(data, "html.parser").getText()
    return data
def word_choice(sel):
    """Pick one key of the mapping ``sel`` uniformly at random."""
    candidates = list(sel)
    return random.choice(candidates)
# Corpus and cached-model file names.
toji_file = "toji.txt"  # NOTE(review): never used below — kept for compatibility
dict_file = "markov-toji.json"
# Build the Markov dictionary once and cache it as JSON; later runs
# just reload the cache instead of re-analyzing the corpus.
if not os.path.exists(dict_file):
    # Load the Toji corpus (BEXX0003.txt, UTF-16 encoded markup).
    with codecs.open("BEXX0003.txt", "r", encoding="utf-16") as fp:
        soup = BeautifulSoup(fp, "html.parser")
    body = soup.select_one("body > text")
    text = body.getText()
    # Older KoNLPy Twitter taggers treat "…" as a noun rather than
    # punctuation, so strip it before analysis.
    text = text.replace("…", "")
    # Morphological analysis.
    twitter = Twitter()
    malist = twitter.pos(text, norm=True)
    words = []
    for word in malist:
        # Drop punctuation tokens...
        if not word[1] in ["Punctuation"]:
            words.append(word[0])
        # ...but keep the sentence-ending period, which the model needs.
        if word[0] == ".":
            words.append(word[0])
    # Build the trigram dictionary from the processed tokens.
    dic = make_dic(words)
    # Close the cache file deterministically (the original leaked the handle).
    with open(dict_file, "w", encoding="utf-8") as f:
        json.dump(dic, f)
else:
    # Read back with the same encoding it was written in (the original
    # relied on the platform default here, which breaks on Windows).
    with open(dict_file, "r", encoding="utf-8") as f:
        dic = json.load(f)
# With the dictionary in hand, generate and print three sentences.
for i in range(3):
    s = make_sentence(dic)
    print(s)
    print("---")