031. create sentence with lstm and rnn
# @
# If you want to analyze snapshot,you can use cnn
# If you want to analyze time series data,you can use rnn
# @
# LSTM is upgrade version of rnn
# @
import codecs
from bs4 import BeautifulSoup
from keras.models import Sequential
from keras.layers import Dense,Activation,Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random,sys
fp=codecs.open("./BEXX0003.txt","r",encoding="utf-16")
soup=BeautifulSoup(fp,"html.parser")
body=soup.select_one("body")
text=body.getText()+" "
print('The length of corpous: ',len(text))
# I will convert text into set
# and then I will convert set into list
chars=sorted(list(set(text)))
print('The number of used character:',len(chars))
char_indices=dict((c,i) for i,c in enumerate(chars))
# < {...,'품':1479,'풋':1580,...}
indices_char=dict((i,c) for i,c in enumerate(chars))
# < {...,1479:'품',1580:'풋',...}
# I cut text by number of maxlen of word(20)
# and I register next showing one word
# \n\n제1편 어둠의 발소리\n서(서)\n1 => 20 words
maxlen=20
# \n\n제1편 어둠의 발소리\n서(서)\n1 -> 3 step further
# -> 1편 어둠의 발소리\n서(서)\n1897
step=3
# 제1편 어둠의 발소리
# 서
# 1897년의 한가위
# 까치들이 울타리 안 감나무에 와서 아침 인사를 하기도 전에,
sentences=[]
# < ['\n\n제1편 어둠의 발소리\n서(서)\n1','1편 어둠의 발소리\n서(서)\n1897','어둠의 발소리\n서(서)\n1897년의 ',.....]
next_chars=[]
# < ['8','년','한',...]
for i in range(0,len(text)-maxlen,step):
sentences.append(text[i:i+maxlen])
next_chars.append(text[i+maxlen])
print('Number of sentence to be trained: ',len(sentences))
print('I\'m converting text into int ID vector...')
# np.zeros(5)
# < array([0,0,0,0,0])
# np.zeros(3,dtype=np.bool)
# < array([false,false,false])
# np.zeros((2,2))
# < array
# < ([
# < [0,0],
# < [0,0]
# < ])
X=np.zeros((len(sentences),maxlen,len(chars)),dtype=np.bool)
y=np.zeros((len(sentences),len(chars)),dtype=np.bool)
for i,sentence in enumerate(sentences):
for t,char in enumerate(sentence):
X[i,t,char_indices[char]]=1
y[i,char_indices[next_chars[i]]]=1
# I implement lstm model
print('Consructing LSTM model...')
model=Sequential()
model.add(LSTM(128,input_shape=(maxlen,len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer=RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy',optimizer=optimizer)
# I extract candidated predicted value from array
def sample(preds,temperature=1.0):
preds=np.asarray(preds).astype('float64')
preds=np.log(preds) / temperature
exp_preds=np.exp(preds)
preds=exp_preds / np.sum(exp_preds)
probas=np.random.multinomial(1,preds,1)
return np.argmax(probas)
for iteration in range(1,60):
print()
print('-'*50)
print('iteration= ',iteration)
model.fit(X,y,batch_size=128,nb_epoch=1) #
# I choose random starting text
start_index=random.randint(0,len(text)-maxlen-1)
# I create various sentences
for diversity in [0.2,0.5,1.0,1.2]:
print()
print('---diversity=',d iversity)
generated=''
sentence=text[start_index: start_index+maxlen]
generated += sentence
print('--- seed= "'+sentence+'"')
sys.stdout.write(generated)
# I create text automatically by using seed
for i in range(400):
x=np.zeros((1,maxlen,len(chars)))
for t,char in enumerate(sentence):
x[0,t,char_indices[char]]=1.
# I predict next showing word
preds=model.predict(x,verbose=0)[0]
next_index=sample(preds,diversity)
next_char=indices_char[next_index]
# Displaying
generated+=next_char
sentence=sentence[1:]+next_char
sys.stdout.write(next_char)
sys.stdout.flush()
print()