013-006. QnA bot built with seq2seq with attention
# @
# Attention Mechanism
# As the input sequence gets longer, a plain seq2seq model has trouble remembering earlier states,
# because everything must be squeezed into one fixed-size context vector
# So we add a separate (attention) network that emphasizes the encoder vectors the seq2seq model
# should pay attention to at each decoding step (a minimal NumPy sketch of this idea follows the imports below)
# @
# We will apply the attention mechanism to a seq2seq model to build a QnA bot
# img e495538b-9668-4912-85e6-93ef35b6eaf8
# @
# We will use
# Python 3.5, TensorFlow 1.3, KoNLPy (Mecab), Word2Vec (Gensim), matplotlib (graphs)
# @
import matplotlib.pyplot as plt
import re
from collections import Counter
from tqdm import tqdm
import tensorflow as tf
from tensorflow.python.layers.core import Dense
from konlpy.tag import Mecab
# I create a Mecab instance that uses the mecab-ko-dic dictionary
mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
print(tf.__version__)
# 1.3.0
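# A minimal NumPy sketch (added for illustration, not part of the model) of the idea behind
# Luong-style dot-product attention: score each encoder output against the current decoder state,
# softmax the scores into weights, and take the weighted sum as the context vector.
# All shapes and values below are toy assumptions.
import numpy as np
np.random.seed(0)
toy_enc_outputs = np.random.randn(4, 30)  # 4 encoder time steps, hidden size 30 (toy values)
toy_dec_state = np.random.randn(30)       # current decoder hidden state (toy values)
scores = toy_enc_outputs.dot(toy_dec_state)       # one score per encoder step
weights = np.exp(scores - scores.max())           # numerically stable softmax
weights = weights / weights.sum()
context = weights.dot(toy_enc_outputs)            # weighted sum of encoder outputs = context vector
print(weights.round(3), context.shape)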
# @
enc_sentence_length = 10
dec_sentence_length = 10
# I create a list of [question, answer] pairs as training data for the seq2seq model
train_data = [
['안녕', '만나서 반가워'],
['넌누구니', '나는 AI 봇이란다.'],
['피자 주문 할께', '페파로니 주문해줘'],
['음료는 멀로', '콜라로 해줘']
]
all_input_sentences = []
all_target_sentences = []
for row_data in train_data:
# all_input_sentences = [안녕, 넌누구니, ...]
all_input_sentences.append(row_data[0])
# all_target_sentences = [만나서 반가워, 나는 AI 봇이란다, ...]
all_target_sentences.append(row_data[1])
# This function returns the morphologically analyzed sentence as a list of tokens
def tokenizer(sentence):
tokens = mecab.morphs(sentence)
return tokens
tokenizer('피자 주문 할께')
# < ['피자', '주문', '할께']
# @
# Choosing the unit of the vectors built from each input sentence
# 1. Generally, the smaller the processing unit, the smaller the vocabulary (and vector dimension) you can keep,
# and the less you suffer from out-of-vocabulary words
# 1. But as sentences become longer, training becomes harder, so you should find a proper embedding unit
# 1. This trade-off differs for each business domain; you should balance complexity against expressive power
# 1. In the code below, I build the vocabulary at the morpheme (token) level and map each token to an integer index,
# for easy understanding (a short comparison of character-level vs morpheme-level units follows this list)
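# For comparison (added illustration): character-level vs morpheme-level units for the same
# sentence, reusing the mecab instance created above
print(list('피자 주문 할께'))
# < ['피', '자', ' ', '주', '문', ' ', '할', '께']
print(mecab.morphs('피자 주문 할께'))
# < ['피자', '주문', '할께']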
def build_vocab(sentences, is_target=False, max_vocab_size=None):
word_counter = Counter()
vocab = dict()
reverse_vocab = dict()
for sentence in sentences:
# I tokenize each sentence
tokens = tokenizer(sentence)
# I calculate word count
word_counter.update(tokens)
if max_vocab_size is None:
max_vocab_size = len(word_counter)
if is_target:
vocab['_GO'] = 0
vocab['_PAD'] = 1
vocab_idx = 2
for key, value in word_counter.most_common(max_vocab_size):
vocab[key] = vocab_idx
vocab_idx += 1
else:
vocab['_PAD'] = 0
vocab_idx = 1
for key, value in word_counter.most_common(max_vocab_size):
vocab[key] = vocab_idx
vocab_idx += 1
for key, value in vocab.items():
reverse_vocab[value] = key
return vocab, reverse_vocab, max_vocab_size
# Q
# all_input_sentences = [안녕, 넌누구니, ...]
enc_vocab, enc_reverse_vocab, enc_vocab_size = build_vocab(all_input_sentences)
# A
# all_target_sentences = [만나서 반가워, 나는 AI 봇이란다, ...]
dec_vocab, dec_reverse_vocab, dec_vocab_size = build_vocab(all_target_sentences, is_target=True)
print('input vocabulary size:', enc_vocab_size)
# < input vocabulary size: 11
print('target vocabulary size:', dec_vocab_size)
# < target vocabulary size: 17
# Token to index
def token2idx(word, vocab):
# vocab = dict()
# vocab['_GO'] = 0
return vocab[word]
for token in tokenizer('피자 주문 할께'):
print(token, token2idx(token, enc_vocab))
# < 피자 5
# < 주문 6
# < 할께 7
# Sentence to index
def sent2idx(sent, vocab=enc_vocab, max_sentence_length=enc_sentence_length, is_target=False):
# I tokenize input sentence
tokens = tokenizer(sent)
# I find length of tokens
current_length = len(tokens)
# I find padding length
pad_length = max_sentence_length - current_length
if is_target:
return [0] + [token2idx(token, vocab) for token in tokens] + [1] * pad_length, current_length
else:
return [token2idx(token, vocab) for token in tokens] + [0] * pad_length, current_length
print('피자 주문 할께')
# < 피자 주문 할께
print(sent2idx('피자 주문 할께'))
# < ([5, 6, 7, 0, 0, 0, 0, 0, 0, 0], 3)
print('페파로니 주문해줘')
# < 페파로니 주문해줘
print(sent2idx('페파로니 주문해줘', vocab=dec_vocab, max_sentence_length=dec_sentence_length, is_target=True))
# < ([0, 14, 15, 16, 2, 3, 1, 1, 1, 1, 1], 5)
def idx2token(idx, reverse_vocab):
return reverse_vocab[idx]
def idx2sent(indices, reverse_vocab=dec_reverse_vocab):
return " ".join([idx2token(idx, reverse_vocab) for idx in indices])
class Seq2SeqModel():
# This configures the Seq2SeqModel hyperparameters and the cell/optimizer choices
def __init__(self, mode='training'):
self.mode = mode
self.hidden_size = 30
self.enc_emb_size = 30
self.dec_emb_size = 30
self.attn_size = 30
self.cell = tf.contrib.rnn.BasicLSTMCell
self.optimizer = tf.train.AdamOptimizer
self.n_epoch = 101
self.learning_rate = 0.001
self.start_token = 0
self.end_token = 1
self.ckpt_dir = './ckpt_dir/'
def add_decoder(self):
with tf.variable_scope('Decoder'):
with tf.device('/cpu:0'):
self.dec_Wemb = tf.get_variable('embedding',
initializer=tf.random_uniform([dec_vocab_size + 2, self.dec_emb_size]),
dtype=tf.float32)
batch_size = tf.shape(self.enc_inputs)[0]
dec_cell = self.cell(self.hidden_size)
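# Luong (multiplicative) attention scores every encoder output against the current decoder state;
# memory_sequence_length makes sure the padded encoder positions are ignored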
attn_mech = tf.contrib.seq2seq.LuongAttention(
num_units=self.attn_size,
memory=self.enc_outputs,
memory_sequence_length=self.enc_sequence_length,
name='LuongAttention')
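# AttentionWrapper couples the LSTM cell with the attention mechanism, so each decoding step
# mixes a context vector (projected to attn_size) into the cell output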
dec_cell = tf.contrib.seq2seq.AttentionWrapper(
cell=dec_cell,
attention_mechanism=attn_mech,
attention_layer_size=self.attn_size,
name='Attention_Wrapper')
initial_state = dec_cell.zero_state(dtype=tf.float32, batch_size=batch_size)
output_layer = Dense(dec_vocab_size + 2, name='output_projection')
if self.mode == 'training':
max_dec_len = tf.reduce_max(self.dec_sequence_length + 1, name='max_dec_len')
dec_emb_inputs = tf.nn.embedding_lookup(
self.dec_Wemb, self.dec_inputs, name='emb_inputs')
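# TrainingHelper feeds the ground-truth target tokens to the decoder at every step (teacher forcing)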
training_helper = tf.contrib.seq2seq.TrainingHelper(
inputs=dec_emb_inputs,
sequence_length=self.dec_sequence_length + 1,
time_major=False,
name='training_helper')
training_decoder = tf.contrib.seq2seq.BasicDecoder(
cell=dec_cell,
helper=training_helper,
initial_state=initial_state,
output_layer=output_layer)
train_dec_outputs, train_dec_last_state, _ = tf.contrib.seq2seq.dynamic_decode(
training_decoder,
output_time_major=False,
impute_finished=True,
maximum_iterations=max_dec_len)
logits = tf.identity(train_dec_outputs.rnn_output, name='logits')
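# The loss targets are the decoder inputs themselves (starting with _GO), cut to max_dec_len;
# the sequence mask zeroes out padded positions so they do not contribute to the loss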
targets = tf.slice(self.dec_inputs, [0, 0], [-1, max_dec_len], 'targets')
masks = tf.sequence_mask(self.dec_sequence_length + 1, max_dec_len, dtype=tf.float32, name='masks')
self.batch_loss = tf.contrib.seq2seq.sequence_loss(
logits=logits,
targets=targets,
weights=masks,
name='batch_loss')
self.valid_predictions = tf.identity(train_dec_outputs.sample_id, name='valid_preds')
elif self.mode == 'inference':
start_tokens = tf.tile(tf.constant([self.start_token], dtype=tf.int32), [batch_size], name='start_tokens')
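# GreedyEmbeddingHelper starts from the _GO token and feeds the embedding of the previous
# prediction back in at each step, stopping when the end token (_PAD, index 1) is produced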
inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
embedding=self.dec_Wemb,
start_tokens=start_tokens,
end_token=self.end_token)
inference_decoder = tf.contrib.seq2seq.BasicDecoder(
cell=dec_cell,
helper=inference_helper,
initial_state=initial_state,
output_layer=output_layer)
infer_dec_outputs, infer_dec_last_state, _ = tf.contrib.seq2seq.dynamic_decode(
inference_decoder,
output_time_major=False,
impute_finished=True,
maximum_iterations=dec_sentence_length)
self.predictions = tf.identity(infer_dec_outputs.sample_id, name='predictions')
def save(self, sess, var_list=None, save_path=None):
print('Saving model at {}'.format(save_path))
if hasattr(self, 'training_variables'):
var_list = self.training_variables
saver = tf.train.Saver(var_list)
saver.save(sess, save_path, write_meta_graph=False)
def build(self):
self.enc_inputs = tf.placeholder(tf.int32, shape=[None, enc_sentence_length], name='input_sentences')
self.enc_sequence_length = tf.placeholder(tf.int32, shape=[None, ], name='input_sequence_length')
if self.mode == 'training':
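# The decoder placeholder is one step longer than dec_sentence_length because sent2idx prepends _GO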
self.dec_inputs = tf.placeholder(tf.int32, shape=[None, dec_sentence_length + 1], name='target_sentences')
self.dec_sequence_length = tf.placeholder(tf.int32, shape=[None, ], name='target_sequence_length')
with tf.variable_scope('Encoder'):
with tf.device('/cpu:0'):
self.enc_Wemb = tf.get_variable('embedding',
initializer=tf.random_uniform([enc_vocab_size + 1, self.enc_emb_size]),
dtype=tf.float32)
enc_emb_inputs = tf.nn.embedding_lookup(self.enc_Wemb, self.enc_inputs, name='emb_inputs')
enc_cell = self.cell(self.hidden_size)
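# dynamic_rnn runs the encoder LSTM over the embedded inputs; enc_outputs (one vector per time step)
# later serves as the attention memory, while the decoder itself starts from a zero state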
self.enc_outputs, self.enc_last_state = tf.nn.dynamic_rnn(
cell=enc_cell,
inputs=enc_emb_inputs,
sequence_length=self.enc_sequence_length,
time_major=False,
dtype=tf.float32)
self.add_decoder()
def train(self, sess, data, save_path=None):
print(data)
self.training_op = self.optimizer(self.learning_rate, name='training_op').minimize(self.batch_loss)
sess.run(tf.global_variables_initializer())
loss_history = []
for epoch in tqdm(range(self.n_epoch)):
all_preds = []
epoch_loss = 0
for row_data in data:
input_batch_tokens = []
target_batch_tokens = []
enc_sentence_lengths = []
dec_sentence_lengths = []
tokens, sent_len = sent2idx(row_data[0])
input_batch_tokens.append(tokens)
enc_sentence_lengths.append(sent_len)
tokens, sent_len = sent2idx(row_data[1],
vocab=dec_vocab,
max_sentence_length=dec_sentence_length,
is_target=True)
target_batch_tokens.append(tokens)
dec_sentence_lengths.append(sent_len)
batch_preds, batch_loss, _ = sess.run(
[self.valid_predictions, self.batch_loss, self.training_op],
feed_dict={
self.enc_inputs: input_batch_tokens,
self.enc_sequence_length: enc_sentence_lengths,
self.dec_inputs: target_batch_tokens,
self.dec_sequence_length: dec_sentence_lengths,
})
epoch_loss += batch_loss
all_preds.append(batch_preds)
loss_history.append(epoch_loss)
if epoch % 100 == 0:
print('Epoch', epoch)
for row_data, batch_preds in zip(data, all_preds):
input_sent, target_sent = row_data
for pred in batch_preds:
print('\tInput:', input_sent)
print('\tPrediction:', idx2sent(pred, reverse_vocab=dec_reverse_vocab))
print('\tTarget:', target_sent)
print('\tepoch loss: {:.2f}\n'.format(epoch_loss))
if save_path:
self.save(sess, save_path=save_path)
return loss_history
def inference(self, sess, data, load_ckpt):
self.restorer = tf.train.Saver().restore(sess, load_ckpt)
batch_tokens = []
batch_sent_lens = []
tokens, sent_len = sent2idx(data)
batch_tokens.append(tokens)
batch_sent_lens.append(sent_len)
batch_preds = sess.run(
self.predictions,
feed_dict={
self.enc_inputs: batch_tokens,
self.enc_sequence_length: batch_sent_lens,
})
print('Input:', data)
print('Prediction:', idx2sent(batch_preds[0], reverse_vocab=dec_reverse_vocab))
# I display the result of training (loss curve)
tf.reset_default_graph()
with tf.Session() as sess:
model = Seq2SeqModel(mode='training')
model.build()
loss_history = model.train(sess, train_data, save_path=model.ckpt_dir)
plt.figure(figsize=(20, 10))
plt.plot(range(model.n_epoch), loss_history, label='cost')
plt.legend()
plt.show()
# [['안녕', '만나서 반가워'], ['넌누구니', '나는 AI 봇이란다.'], ['피자 주문 할께', '페파로니 주문해줘'], ['음료는 멀로', '콜라로 해줘']]
# 5%|▍ | 5/101 [00:00<00:04, 21.81it/s]
# Epoch 0
# Input: 안녕
# Prediction: 줘 란다 란다 는
# Target: 만나서 반가워
# Input: 넌누구니
# Prediction: 줘 줘 줘 란다 란다 란다 줘 줘
# Target: 나는 AI 봇이란다.
# Input: 피자 주문 할께
# Prediction: 줘 줘 줘 줘 _GO 란다
# Target: 페파로니 주문해줘
# Input: 음료는 멀로
# Prediction: 봇 _GO _GO _GO _GO
# Target: 콜라로 해줘
# epoch loss: ...
# 100%|██████████| 101/101 [00:03<00:00, 33.05it/s]
# Epoch 100
# Input: 안녕
# Prediction: _GO 만나 서 반가워
# Target: 만나서 반가워
# Input: 넌누구니
# Prediction: _GO 나 는 AI 봇 이 란다 .
# Target: 나는 AI 봇이란다.
# Input: 피자 주문 할께
# Prediction: _GO 페파 로니 주문 해 줘
# Target: 페파로니 주문해줘
# Input: 음료는 멀로
# Prediction: _GO 콜라 로 해 줘
# Target: 콜라로 해줘
# epoch loss: ...
# Saving model at ./ckpt_dir/
# img (training loss curve)
# I try prediction
tf.reset_default_graph()
with tf.Session() as sess:
model = Seq2SeqModel(mode='inference')
model.build()
for row_data in train_data:
model.inference(sess, row_data[0], load_ckpt=model.ckpt_dir)
import shutil
shutil.rmtree(model.ckpt_dir)
print("Model Deleted")
# INFO:tensorflow:Restoring parameters from ./ckpt_dir/
# Input: 안녕
# Prediction: _GO 만나 서 반가워 반가워 반가워 반가워 반가워 반가워 반가워
# INFO:tensorflow:Restoring parameters from ./ckpt_dir/
# Input: 넌누구니
# Prediction: _GO 나 는 AI 봇 이 이 란다 . .
# INFO:tensorflow:Restoring parameters from ./ckpt_dir/
# Input: 피자 주문 할께
# Prediction: _GO 페파 로니 주문 해 줘 줘 줘 줘 줘
# INFO:tensorflow:Restoring parameters from ./ckpt_dir/
# Input: 음료는 멀로
# Prediction: _GO 콜라 로 해 해 줘 줘 줘 줘 줘
# Model Deleted