# 027. word2vec
#
# Word2Vec maps words into a vector space so each word becomes a numeric
# vector, enabling arithmetic such as:
#   father - man + woman  ->  mother
#
# Requires the gensim library:
#   pip3 install gensim

import codecs

from bs4 import BeautifulSoup        # third-party: HTML parsing
from gensim.models import word2vec   # third-party: Word2Vec implementation
from konlpy.tag import Twitter       # third-party: Korean morphological analyzer

# --- Minimal Word2Vec workflow -------------------------------------------
# LineSentence turns a line-oriented text file into an iterable of sentences.
# TODO(review): the corpus path was left empty in the original notes —
# supply a real file path here.
data = word2vec.LineSentence("")
# Train a Word2Vec model on the prepared sentences.
# NOTE(review): `size=` is the gensim < 4.0 keyword; gensim 4.x renamed it
# to `vector_size` — confirm which gensim version is installed.
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
# Persist the trained model to disk.
model.save("filenameyouwant")

# --- Preparing a Korean corpus from text.txt -----------------------------
# Load the source file (UTF-16 encoded).
fp = codecs.open("text.txt", "r", encoding="utf-16")
# Parse it as HTML with BeautifulSoup.
soup = BeautifulSoup(fp, "html.parser")
# Select the <body> descendant of the <text> tag.
body = soup.select_one("text body")
# Extract the plain text from the body tag.
text = body.getText()

# Use the Twitter morphological analyzer to process the text one sentence
# at a time, splitting on \r\n.
twitter = Twitter()
lines = text.split("\r\n")
results = []
for line in lines:
    r = []
    # Morphological analysis: yields (word, pumsa) tuples, where pumsa is
    # the part-of-speech tag.
    malist = twitter.pos(line, norm=True, stem=True)
    for word, pumsa in malist:
        # Keep only content words: drop particles (Josa), verb endings
        # (Eomi) and punctuation.
        # FIX: the original list was ["Josa,Eomi", "Punctuation"], which
        # fused two tags into one string so Josa and Eomi were never
        # filtered out.
        if pumsa not in ["Josa", "Eomi", "Punctuation"]:
            r.append(word)
    # Join the kept words with spaces; strip both ends defensively.
    results.append((" ".join(r)).strip())

# Build the final space-separated ("wakati") corpus text.
output = (" ".join(results)).strip()

# Save the processed corpus.
# FIX: the original passed encoding=utf-8 (a bare name -> NameError);
# the encoding must be the string "utf-8".
with open("toji.wakati", "w", encoding="utf-8") as fp:
    fp.write(output)

# --- Train on the prepared corpus ----------------------------------------
data = word2vec.LineSentence("toji.wakati")
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
# Save the model as "toji.model".
model.save("toji.model")
# You now have two files: toji.wakati (corpus) and toji.model (model).

# --- Using the saved model -----------------------------------------------
model = word2vec.Word2Vec.load("toji.model")
# With the model loaded you can run various queries, e.g. find words
# most similar in meaning to "땅".
model.most_similar(positive=["땅"])

# --- Wikipedia as a corpus ------------------------------------------------
# Steps:
#   1. Download a Wikipedia dump.
#   2. Produce a wakati (space-separated) file from each dump.
#   3. Train a word2vec model on each wakati file.
#   4. Evaluate the models.
# Approximate dump sizes: EN 10 GB, JA 9 GB, KO 2.3 GB.

# Load a model trained on a Wikipedia dump.
model = word2vec.Word2Vec.load("wiki2.model")
model.most_similar(positive="파이썬")
model.most_similar(positive=["파이썬", "Python"])
# Show only the top three results (indices 0..2).
model.most_similar(positive=["왕자", "여성"], negative=["남성"])[0:3]

# Raw vector for "고양이"; its length equals the `size` value the model
# was trained with (e.g. size=100 gives a 100-dimensional array).
model["고양이"]
# -> array([22, 22, 33, .....])