# 004-003. neural network basic - word2vec

import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# This configuration lets Korean text render in matplotlib
font_name=matplotlib.font_manager\
    .FontProperties(fname="/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/font/NanumFont/NanumGothic.ttf")\
    .get_name()
matplotlib.rc('font',family=font_name)

# You will vectorize the following sentences
sentences=["나 고양이 좋다",
           "나 강아지 좋다",
           "나 동물 좋다",
           "강아지 고양이 동물",
           "여자친구 고양이 강아지 좋다",
           "고양이 생선 우유 좋다",
           "강아지 생선 싫다 우유 좋다",
           "강아지 고양이 눈 좋다",
           "나 여자친구 좋다",
           "여자친구 나 싫다",
           "여자친구 나 영화 책 음악 좋다",
           "나 게임 만화 애니 좋다",
           "고양이 강아지 싫다",
           "강아지 고양이 좋다"]

# You merge all sentences into one string, split the words on whitespace,
# and later build a list of the unique words.
# Here you join all elements of "sentences" with the delimiter " ",
# and then split the merged string
word_sequence=" ".join(sentences).split()
# < '나 고양이 좋다 나 강아지 좋다 나 동물 좋다 강아지 고양이 동물 여자친구 고양이 강아지 좋다 고양이 생선 우유 좋다 강아지 생선 싫다 우유 좋다 강아지 고양이 눈 좋다 나 여자친구 좋다 여자친구 나 싫다 여자친구 나 영화 책 음악 좋다 나 게임 만화 애니 좋다 고양이 강아지 싫다 강아지 고양이 좋다'
# < ['나', '고양이', '좋다', '나', '강아지', '좋다', '나', '동물', '좋다',
# <  '강아지', '고양이', '동물', '여자친구', '고양이', '강아지', '좋다', '고양이', '생선',
# <  '우유', '좋다', '강아지', '생선', '싫다', '우유', '좋다', '강아지', '고양이',
# <  '눈', '좋다', '나', '여자친구', '좋다', '여자친구', '나', '싫다', '여자친구',
# <  '나', '영화', '책', '음악', '좋다', '나', '게임', '만화', '애니',
# <  '좋다', '고양이', '강아지', '싫다', '강아지', '고양이', '좋다']

word_list=" ".join(sentences).split()
# < (same 52-word list as shown above, before deduplication)
word_list=list(set(word_list))

# Since it is easier to work with numbers than with strings,
# you build an associative array (dictionary) that maps each word to its index,
# so that a word can be referenced by its position in word_list
word_dict={w: i for i,w in enumerate(word_list)}
# < {'강아지': 9, '게임': 8, '고양이': 2, '나': 0, '눈': 6, '동물': 15,
# <  '만화': 3, '생선': 10, '싫다': 13, '애니': 4, '여자친구': 1, '영화': 14,
# <  '우유': 7, '음악': 5, '좋다': 11, '책': 12}

# You will create skip-gram data with a window size of 1
# For example, for the sentence 나 게임 만화 애니 좋다:
# ([나,만화],게임), ([게임,애니],만화), ([만화,좋다],애니)
# -> (게임,나),(게임,만화),(만화,게임),(만화,애니),(애니,만화),(애니,좋다)
# In the final pairs the center word (e.g. 게임) is the input,
# and each surrounding word (나, 만화) is a label
skip_grams=[]
# len(word_sequence) is 52
for i in range(1,len(word_sequence)-1):
    # (context,target): ([word at i-1, word at i+1], word at i)
    # After building each skip-gram pair, you store word indices instead of the words
    # For i=1, word_sequence[1] is '고양이'
    # target=2
    target=word_dict[word_sequence[i]]
    # word_dict[word_sequence[i-1]]=word_dict['나']=0
    # word_dict[word_sequence[i+1]]=word_dict['좋다']=11
    # context=[0,11]
    context=[word_dict[word_sequence[i-1]],word_dict[word_sequence[i+1]]]
    for w in context:
        # skip_grams=[[2,0],[2,11], ...]
        skip_grams.append([target,w])
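
# @
# (Added illustration, not part of the original tutorial)
# A minimal sanity check, assuming skip_grams and word_list were built as above:
# it decodes the first few [target, context] index pairs back into words, so you
# can confirm that window-size-1 pairs such as (고양이, 나) and (고양이, 좋다) appear.
for target_idx, context_idx in skip_grams[:6]:
    print(word_list[target_idx], "->", word_list[context_idx])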
# This function generates a batch of inputs and labels
# by randomly sampling from the skip-gram data
def random_batch(data,size):
    random_inputs=[]
    random_labels=[]
    random_index=np.random.choice(range(len(data)),size,replace=False)
    for i in random_index:
        # This extracts the target (center word)
        random_inputs.append(data[i][0])
        # This extracts the context word
        random_labels.append([data[i][1]])
    return random_inputs,random_labels

# @
# You will configure the training options
training_epoch=10000
learning_rate=0.1
batch_size=20
# Each word vector is an embedding
# embedding_size is the dimensionality of the embedding
# You use 2-dimensional embeddings so the word vectors
# can be plotted directly on the xy plane
embedding_size=2
# This is the number of negative samples used by nce_loss()
# when training the word2vec model
# This value should be smaller than batch_size
num_sampled=15
# voc_size is the total number of unique words (vocabulary size)
voc_size=len(word_list)

# @
# You will build the neural network model

# This is the placeholder for the inputs
inputs=tf.placeholder(tf.int32,shape=[batch_size])
# When you use tf.nn.nce_loss(), the labels must have shape [batch_size,1]
labels=tf.placeholder(tf.int32,shape=[batch_size,1])

# This variable stores the embedding vectors, the result of the word2vec model
# It is a voc_size x embedding_size matrix
embeddings=tf.Variable(tf.random_uniform([voc_size,embedding_size],-1.0,1.0))
# embeddings     inputs    selected_embed
# [[1,2,3]   ->  [2,3]  -> [[3,4,5]
#  [2,3,4]                  [4,5,6]]
#  [3,4,5]
#  [4,5,6]]
# (embedding_lookup gathers rows by 0-based index)
selected_embed=tf.nn.embedding_lookup(embeddings,inputs)

# You define the parameters used by nce_loss()
nce_weights=tf.Variable(tf.random_uniform([voc_size,embedding_size],-1.0,1.0))
nce_biases=tf.Variable(tf.zeros([voc_size]))

# Implementing NCE loss directly is complex,
# so you simply build the loss function with tf.nn.nce_loss() provided by TensorFlow
loss=tf.reduce_mean(
    tf.nn.nce_loss(nce_weights,nce_biases,labels,selected_embed,num_sampled,voc_size))
# You create the optimizer
train_op=tf.train.AdamOptimizer(learning_rate).minimize(loss)
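
# @
# (Added illustration, not part of the original tutorial)
# A small NumPy sketch of what tf.nn.embedding_lookup does above: it simply
# gathers the rows of the embedding matrix whose 0-based indices appear in the
# inputs. The toy matrix below is hypothetical and unrelated to the model above.
toy_embeddings=np.array([[1,2,3],
                         [2,3,4],
                         [3,4,5],
                         [4,5,6]])
toy_inputs=[2,3]
# NumPy row gathering behaves like tf.nn.embedding_lookup(toy_embeddings,toy_inputs)
print(toy_embeddings[toy_inputs])
# < [[3 4 5]
# <  [4 5 6]]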
# @
# You will train the neural network model

# You open a session
with tf.Session() as sess:
    # You initialize the variables
    init=tf.global_variables_initializer()
    sess.run(init)

    # You train the model
    for step in range(1,training_epoch+1):
        batch_inputs,batch_labels=random_batch(skip_grams,batch_size)
        # train_op is the optimizer
        # loss is the nce_loss function
        # loss_val is the loss value
        _,loss_val=sess.run([train_op,loss],
                            feed_dict={inputs: batch_inputs, labels: batch_labels})
        # You display the loss value every 10 steps
        if step % 10 == 0:
            print("loss at step ",step,": ",loss_val)
            # < loss at step 10 : 4.640656
            # < loss at step 20 : 3.7076352
            # < loss at step 30 : 3.4454236
            # < loss at step 40 : 3.92434
            # < loss at step 50 : 3.4112315
            # < loss at step 60 : 3.3955512
            # < loss at step 70 : 3.7334003
            # < loss at step 80 : 3.1573186
            # < loss at step 90 : 3.304488
            # < loss at step 100 : 3.2934136
            # < loss at step 110 : 3.2376626
            # < loss at step 120 : 3.4271011
            # < loss at step 130 : 3.2045867
            # < loss at step 140 : 3.0137382
            # < loss at step 150 : 3.2337089
            # < loss at step 160 : 3.1725814
            # < loss at step 170 : 3.4625728
            # < loss at step 180 : 3.30521
            # < loss at step 190 : 3.3242996
            # < loss at step 200 : 3.1565833
            # < loss at step 210 : 3.363378
            # < loss at step 220 : 3.2673995
            # < loss at step 230 : 3.2842145
            # < loss at step 240 : 3.1075568
            # < loss at step 250 : 3.4219444
            # < loss at step 260 : 3.293485
            # < loss at step 270 : 3.12632
            # < loss at step 280 : 3.1017227
            # < loss at step 290 : 3.2007794
            # < loss at step 300 : 3.1695514

    # You save the value of embeddings
    # to visualize it with matplotlib
    # Inside the with block you can simply call eval() instead of sess.run()
    trained_embeddings=embeddings.eval()

# @
# You will see the space into which word2vec embedded the words
# The plot shows how closely each word is related to the other words
for i,label in enumerate(word_list):
    x,y=trained_embeddings[i]
    plt.scatter(x,y)
    plt.annotate(label,xy=(x,y),xytext=(5,2),
                 textcoords='offset points',ha='right',va='bottom')
plt.show()
# img 2ea47a3c-a824-4e74-8e93-828715782233 (scatter plot of the 2-D word embeddings)
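
# @
# (Added illustration, not part of the original tutorial)
# A minimal sketch, assuming trained_embeddings holds the learned 2-D vectors:
# the cosine similarity of two word vectors gives a rough measure of how close
# word2vec placed them. A related pair such as 고양이 and 강아지 will often score
# higher than a less related pair, although results vary from run to run on a
# corpus this small.
def cosine_similarity(a,b):
    return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))

cat_vec=trained_embeddings[word_dict['고양이']]
dog_vec=trained_embeddings[word_dict['강아지']]
eye_vec=trained_embeddings[word_dict['눈']]
print('cos(고양이, 강아지):',cosine_similarity(cat_vec,dog_vec))
print('cos(고양이, 눈):',cosine_similarity(cat_vec,eye_vec))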