029. Classify the text in mini_size_text_data_file into classes with a "multi layer perceptron", and convert the text in mini_size_text_data_file into vectors

# @
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn import model_selection, metrics
import json

# The number of words in the text is 56681
# You can check it in the word-dic.json file
max_words_number=56681
# There are 6 categories (like genres)
number_of_classes=6
batch_size=64
number_of_epoch=20

# This function creates the "multi layer perceptron" model,
# which will be passed into the KerasClassifier constructor as an argument
def build_model():
    Sequential_object=Sequential()
    Sequential_object.add(Dense(512,input_shape=(max_words_number,)))
    Sequential_object.add(Activation('relu'))
    Sequential_object.add(Dropout(0.5))
    Sequential_object.add(Dense(number_of_classes))
    Sequential_object.add(Activation('softmax'))
    Sequential_object.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    return Sequential_object

mini_size_text_data_file=json.load(open("./newstext/data-mini.json"))
# You can load the full size data file instead
# full_size_text_data_file=json.load(open("./newstext/data.json"))

X_text_data=mini_size_text_data_file["X"]
Y_category_data=mini_size_text_data_file["Y"]

# You split the data into train and test sets
x_train_data,x_test_data,y_train_data,y_test_data=train_test_split(X_text_data,Y_category_data)
# One-hot encode the train labels
# (KerasClassifier also does this internally when the loss is categorical_crossentropy)
categorized_y_train_data=np_utils.to_categorical(y_train_data,number_of_classes)
print(len(x_train_data),len(categorized_y_train_data))

# You create the classifier
keras_classifier_object=KerasClassifier(
    build_fn=build_model,
    epochs=number_of_epoch,
    batch_size=batch_size)

# You let the created keras_classifier_object train
# by passing the features (text) and labels (categories) of the train data
# x_train_data holds word frequencies like [2,2,2,1,1,1,1]
# y_train_data holds the categories (something like genres)
keras_classifier_object.fit(x_train_data,y_train_data)

# For predicting
prediction_value=keras_classifier_object.predict(x_test_data)

# For evaluating
accuracy_score=metrics.accuracy_score(y_test_data,prediction_value)
classification_report=metrics.classification_report(y_test_data,prediction_value)
print("accuracy: ",accuracy_score)
print("report: \n",classification_report)

# @
# Let's talk about the way you can convert text data into a fixed-length vector.
# Suppose you have the following sentence:
# "몇 번을 쓰러지더라도 몇 번을 무너지더라도 다시 일어나라"
# ("However many times you fall, however many times you collapse, rise again")
# You run morphological analysis with the Twitter morphological analyzer
# and you get the following processed output:
# 몇|번|을|쓰러지다|몇|번|을|무너지다|다시|일어나다
# To convert text data into a vector, you need a "word_dictionary" and the "data".
# If you can't find "몇" in word_dictionary,
# you insert "몇" into word_dictionary with a unique ID number representing "몇".
# If you can already find "몇" in word_dictionary, you just pass, doing nothing.
# word_dictionary=
# {
#     "몇":1,
#     "번":2,
#     "을":3,
#     "쓰러지다":4,
#     "무너지다":5,
#     "다시":6,
#     "일어나다":7
# }
# You can build this word dictionary over the entire text,
# which results in a vector with many elements.
# Now you can create the vector of the above sentence
# by counting how often each word appears in the text.
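
# Below is a minimal sketch (not part of the original file) showing how the
# morphemes above could be turned into a word_dictionary and a fixed-length
# frequency vector; the helper name morphemes_to_vector is hypothetical.

def morphemes_to_vector(morphemes, word_dictionary, vector_length):
    # Assign a new unique ID to any morpheme that is not in the dictionary yet
    for morpheme in morphemes:
        if morpheme not in word_dictionary:
            word_dictionary[morpheme] = len(word_dictionary) + 1
    # Count how many times each morpheme appears, at the index given by its ID
    frequency_vector = [0] * vector_length
    for morpheme in morphemes:
        frequency_vector[word_dictionary[morpheme]] += 1
    return frequency_vector

example_morphemes = ["몇", "번", "을", "쓰러지다", "몇", "번", "을", "무너지다", "다시", "일어나다"]
example_word_dictionary = {}
print(morphemes_to_vector(example_morphemes, example_word_dictionary, vector_length=10))
# prints [0, 2, 2, 2, 1, 1, 1, 1, 0, 0]
# example_word_dictionary is now
# {"몇":1, "번":2, "을":3, "쓰러지다":4, "무너지다":5, "다시":6, "일어나다":7}
# For the news dataset above, vector_length would be max_words_number (56681)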