# 031. Create sentences with an LSTM (a kind of RNN).
#
# If you want to analyze a snapshot (still image), use a CNN.
# If you want to analyze time-series data, use an RNN.
# LSTM is an upgraded version of the plain RNN.
import codecs
import random
import sys

import numpy as np
from bs4 import BeautifulSoup
from keras.layers import LSTM, Activation, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

# Load the corpus: the source file is an HTML document encoded in UTF-16;
# only the visible text of its <body> is used.
fp = codecs.open("./BEXX0003.txt", "r", encoding="utf-16")
soup = BeautifulSoup(fp, "html.parser")
body = soup.select_one("body")
text = body.getText() + " "
print('The length of corpus: ', len(text))

# Build the character vocabulary: convert the text into a set (to deduplicate)
# and then into a sorted list, so every distinct character gets a stable id.
chars = sorted(list(set(text)))
print('The number of used character:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))  # char -> int id, e.g. {'품': 1479, ...}
indices_char = dict((i, c) for i, c in enumerate(chars))  # int id -> char, e.g. {1479: '품', ...}

# Cut the text into overlapping windows of `maxlen` characters and record the
# single character that follows each window, e.g.
#   '\n\n제1편 어둠의 발소리\n서(서)\n1'  ->  next char '8'
# then slide 3 characters further:
#   '1편 어둠의 발소리\n서(서)\n1897'     ->  next char '년'
maxlen = 20  # window length (characters per training sample)
step = 3     # slide the window 3 characters at a time
sentences = []   # input windows
next_chars = []  # the character following each window
# Slice the corpus into (window, next-char) training pairs.
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sentence to be trained: ', len(sentences))

print('I\'m converting text into int ID vector...')
# One-hot encode the data:
#   X[i, t, c] is True iff character-id c appears at position t of window i,
#   y[i, c]    is True iff c is the character that follows window i.
# BUG FIX: `np.bool` was removed in NumPy 1.24 — use the builtin `bool` dtype.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

# Build the LSTM model: one 128-unit LSTM layer followed by a softmax over
# the character vocabulary.
print('Constructing LSTM model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
# BUG FIX: `lr` is deprecated/removed in modern Keras — use `learning_rate`.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


def sample(preds, temperature=1.0):
    """Draw one character index from a predicted probability distribution.

    `temperature` reshapes the distribution before sampling: values below 1
    make the choice more conservative (peaky), values above 1 make it more
    adventurous (flat).
    """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('iteration= ', iteration)
    # BUG FIX: `nb_epoch` was removed in Keras 2 — the keyword is `epochs`.
    model.fit(X, y, batch_size=128, epochs=1)

    # Choose a random starting position in the corpus as the seed window.
    start_index = random.randint(0, len(text) - maxlen - 1)

    # Generate text at several temperatures to show varied output.
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        # BUG FIX: the original printed `d iversity` (stray space inside the
        # identifier), which is a SyntaxError.
        print('---diversity=', diversity)
        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('--- seed= "' + sentence + '"')
        sys.stdout.write(generated)

        # Generate 400 characters, feeding each prediction back as input.
        for i in range(400):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            # Predict the next character.
            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Append it and slide the window forward by one character.
            generated += next_char
            sentence = sentence[1:] + next_char
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()