029. Classify text data (mini_size_text_data_file) into classes with a "multi layer perceptron", and convert the text data into vectors
# @
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn import model_selection, metrics
import json
# The number of words in the text is 56681
# You can check it in the word-dic.json file
max_words_number=56681
# There are 6 categories (like genres)
number_of_classes=6
batch_size=64
number_of_epoch=20
# This function creates the "multi layer perceptron" model,
# which will be passed to the KerasClassifier constructor as the build_fn argument
def build_model():
    Sequential_object=Sequential()
    Sequential_object.add(Dense(512,input_shape=(max_words_number,)))
    Sequential_object.add(Activation('relu'))
    Sequential_object.add(Dropout(0.5))
    Sequential_object.add(Dense(number_of_classes))
    Sequential_object.add(Activation('softmax'))
    Sequential_object.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    return Sequential_object
mini_size_text_data_file=json.load(open("./newstext/data-mini.json"))
# You can load the full-size data file instead
# full_size_text_data_file=json.load(open("./newstext/data.json"))
X_text_data=mini_size_text_data_file["X"]
Y_category_data=mini_size_text_data_file["Y"]
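# The data file is assumed to contain roughly this structure
# (an illustrative sketch, not the actual file contents):
# {
#   "X": [[2,2,2,1,1,1,1,0,...], ...],  # one word-frequency vector per document
#   "Y": [0, 3, 5, ...]                 # one category index (0-5) per document
# }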
# Split the data into train and test sets
x_train_data,x_test_data,y_train_data,y_test_data=train_test_split(X_text_data,Y_category_data)
categorized_y_train_data=np_utils.to_categorical(y_train_data,number_of_classes)
print(len(x_train_data),len(categorized_y_train_data))
# Create the classifier
keras_classifier_object=KerasClassifier(
    build_fn=build_model,
    epochs=number_of_epoch,
    batch_size=batch_size)
# You train keras_classifier_object
# by passing the features (text) and labels (categories) of the training data
# x_train_data holds word frequency vectors like [2,2,2,1,1,1,1]
# y_train_data holds the categories (something like genres)
keras_classifier_object.fit(x_train_data,y_train_data)
# For predicting
prediction_value=keras_classifier_object.predict(x_test_data)
# For evaluating
accuracy_score=metrics.accuracy_score(y_test_data,prediction_value)
classification_report=metrics.classification_report(y_test_data,prediction_value)
print("accuracy: ",accuracy_score)
print("report: \n",classification_report)
# @
# Let's talk about how
# you can convert text data into a fixed-length vector
# Suppose you have following sentence
# "몇 번을 쓰러지더라도 몇 번을 무너지더라도 다시 일어나라"
# You run morphological analysis with the Twitter morphological analyzer
# and get the following output
# 몇|번|을|쓰러지다|몇|번|을|무너지다|다시|일어나다
# To convert the text data into a vector,
# you need a "word_dictionary" and the "data"
# If you can't find "몇" in word_dictionary,
# you add "몇" to word_dictionary
# with a unique ID number representing "몇"
# If you can already find "몇" in word_dictionary,
# you just skip it and do nothing
# word_dictionary=
# {
# "몇":1,
# "번":2,
# "을":3,
# "쓰러지다":4,
# "무너지다":5,
# "다시":6,
# "일어나다":7
# }
# You build this word dictionary over the entire text,
# which results in a vector with many elements
# Now you can create the vector for the above sentence
# by counting how often each word appears in the text, as in the sketch below
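# Below is a minimal sketch of this vectorization step.
# The variable names (sample_morphemes, frequency_vector) are illustrative
# and are not part of the project files above.
sample_morphemes=["몇","번","을","쓰러지다","몇","번","을","무너지다","다시","일어나다"]
word_dictionary={}
for word in sample_morphemes:
    # If the word is not in word_dictionary yet, assign the next unique ID
    if word not in word_dictionary:
        word_dictionary[word]=len(word_dictionary)+1
# Count how often each word appears; index i holds the count of the word with ID i+1
frequency_vector=[0]*len(word_dictionary)
for word in sample_morphemes:
    frequency_vector[word_dictionary[word]-1]+=1
print(word_dictionary)
# {'몇': 1, '번': 2, '을': 3, '쓰러지다': 4, '무너지다': 5, '다시': 6, '일어나다': 7}
print(frequency_vector)
# [2, 2, 2, 1, 1, 1, 1]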