# Bag of Words Meets Bags of Popcorn
# sampleSubmission.csv: a sample file showing the submission format
# labeledTrainData.tsv: labeled train data including sentiment,
# either negative or positive (id-review-sentiment)
# testData.tsv: test data without sentiments (only id-review)
# unlabeledTrainData.tsv: additional train data without sentiment labels
import pandas as pd
"""
header=0 indicates first sentence which represents names of column
delimiter=\t means that elements are delimited by tab space
quoting=3 means that double quatation is neglected
# QUOTE_MINIMAL (0),
# QUOTE_ALL (1),
# QUOTE_NONNUMERIC (2),
# QUOTE_NONE (3).
"""
train_data_with_sentimentlabel_dataframe\
=pd.read_csv('D://chromedown//labeledTrainData.tsv',header=0,delimiter='\t',quoting=3)
test_data_dataframe\
=pd.read_csv('D://chromedown//testData.tsv',header=0,delimiter='\t',quoting=3)
train_data_with_sentimentlabel_dataframe.shape
# (rows, columns) is (25000, 3); the columns are id, sentiment, review
print(train_data_with_sentimentlabel_dataframe.tail(3))
# id sentiment review
# 24997 "10905_3" 0 "Guy is a loser. Can't get girls,needs to bui...
# 24998 "10194_3" 0 "This 30 minute documentary Buñuel made in the...
# 24999 "8478_8" 1 "I saw this movie as a child and it broke my h...
test_data_dataframe.shape
# (25000,2)
test_data_dataframe.tail()
# id review
# 24995 "2155_10" "Sony Pictures Classics,I'm looking at you! S...
# 24996 "59_10" "I always felt that Ms. Merkerson had never go...
# 24997 "2531_1" "I was so disappointed in this movie. I am ver...
# 24998 "7772_8" "From the opening sequence,filled with black ...
# 24999 "11465_10" "This is a great horror film for people who do...
train_data_with_sentimentlabel_dataframe.columns.values
# array(['id','sentiment','review'],dtype=object)
print(test_data_dataframe.columns.values)
# array(['id','review'],dtype=object)
print(train_data_with_sentimentlabel_dataframe.info())
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 25000 entries,0 to 24999
# Data columns (total 3 columns):
# id 25000 non-null object
# sentiment 25000 non-null int64
# review 25000 non-null object
# dtypes: int64(1),object(2)
# memory usage: 586.0+ KB
train_data_with_sentimentlabel_dataframe.describe()
# sentiment
# count 25000.00000
# mean 0.50000
# std 0.50001
# min 0.00000
# 25% 0.00000
# 50% 0.50000
# 75% 1.00000
# max 1.00000
train_data_with_sentimentlabel_dataframe['sentiment'].value_counts()
# 1 12500
# 0 12500
# Name: sentiment,dtype: int64
# You will remove the HTML tags from the review data
train_data_with_sentimentlabel_dataframe['review'][0][:700]
# '"With all this stuff going down at the moment with MJ i\'ve started listening to his music,watching the odd documentary here and there,watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography,part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.
Visually impressive but of course this is all about Michael Jackson so unless you remotely lik'
from bs4 import BeautifulSoup
# You parse one review as HTML with the html5lib parser
review_parsed_to_html5=BeautifulSoup(train_data_with_sentimentlabel_dataframe['review'][0],"html5lib")
# The raw review above still contains HTML tags such as <br />,
# and you can remove them with get_text() as follows
review_parsed_to_html5.get_text()[:700]
# '"With all this stuff going down at the moment with MJ i\'ve started listening to his music,watching the odd documentary here and there,watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography,part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'
import re
# You will remove special characters with a regular expression
# You will replace every character that is not a letter ([^a-zA-Z]) with a whitespace (' ')
letters_only_removed_special_characters=re.sub('[^a-zA-Z]',' ',review_parsed_to_html5.get_text())
letters_only_removed_special_characters[:700]
# ' With all this stuff going down at the moment with MJ i ve started listening to his music watching the odd documentary here and there watched The Wiz and watched Moonwalker again Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent Moonwalker is part biography part feature film which i remember going to see at the cinema when it was originally released Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'
# You will convert all characters to lowercase
letters_containing_only_lowercase_alphabet=letters_only_removed_special_characters.lower()
# You will split the text into words,
# which is called tokenization
splited_text_into_words=letters_containing_only_lowercase_alphabet.split()
len(splited_text_into_words)
# 437
print(splited_text_into_words[:10])
# ['with',
# 'all',
# 'this',
# 'stuff',
# 'going',
# 'down',
# 'at',
# 'the',
# 'moment',
# 'with']
import nltk
from nltk.corpus import stopwords
# You bring the first 10 stopwords from the nltk stopwords data
# for a quick look
stopwords.words('english')[:10]
# ['i','me','my','myself','we','our','ours','ourselves','you','your']
# You take the tokenized text,
# and remove the stopwords from it
words_which_is_not_stopword\
=[one_word for one_word in splited_text_into_words\
if not one_word in stopwords.words('english')]
len(words_which_is_not_stopword)
# 219
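# Note: the list comprehension above rebuilds the stopword list once per word.
# A faster equivalent converts the stopwords into a set first
# (english_stopwords_set is just a local name for this sketch;
# the same set-based lookup is used later in reviewtext_to_words()):
english_stopwords_set=set(stopwords.words('english'))
words_which_is_not_stopword\
=[one_word for one_word in splited_text_into_words\
if not one_word in english_stopwords_set]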
words_which_is_not_stopword[:10]
# You no longer see words like "with" and "all"
# ['stuff',
# 'going',
# 'moment',
# 'mj',
# 'started',
# 'listening',
# 'music',
# 'watching',
# 'odd',
# 'documentary']
# This is an example of using PorterStemmer
porterstemmer_object=nltk.stem.PorterStemmer()
# You stem the word "maximum"
porterstemmer_object.stem('maximum')
# maximum
# Let's see other examples
"Stemmed form of running is: {}".format(porterstemmer_object.stem("running"))
"Stemmed form of runs is: {}".format(porterstemmer_object.stem("runs"))
"Stemmed form of run is: {}".format(porterstemmer_object.stem("run"))
# Stemmed form of running is: run
# Stemmed form of runs is: run
# Stemmed form of run is: run
# This is an example of using LancasterStemmer
from nltk.stem.lancaster import LancasterStemmer
lancasterstemmer_object=LancasterStemmer()
lancasterstemmer_object.stem('maximum')
# maxim
print("Stemmed form of running is: {}".format(lancasterstemmer_object.stem("running")))
print("Stemmed form of runs is: {}".format(lancasterstemmer_object.stem("runs")))
print("Stemmed form of run is: {}".format(lancasterstemmer_object.stem("run")))
# Stemmed form of running is: run
# Stemmed form of runs is: run
# Stemmed form of run is: run
# The following words are before stemming
words_which_is_not_stopword[:10]
# ['stuff',
# 'going',
# 'moment',
# 'mj',
# 'started',
# 'listening',
# 'music',
# 'watching',
# 'odd',
# 'documentary']
from nltk.stem.snowball import SnowballStemmer
snowballstemmer_object=SnowballStemmer('english')
words_processed_by_snowballstemmer\
=[snowballstemmer_object.stem(one_word) for one_word in words_which_is_not_stopword]
# Let's see words after processing
words_processed_by_snowballstemmer[:10]
# ['stuff',
# 'go',
# 'moment',
# 'mj',
# 'start',
# 'listen',
# 'music',
# 'watch',
# 'odd',
# 'documentari']
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer_object=WordNetLemmatizer()
wordnet_lemmatizer_object.lemmatize('fly')
# fly
wordnet_lemmatizer_object.lemmatize('flies')
# fly
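# WordNetLemmatizer treats every word as a noun by default;
# passing a part-of-speech tag changes the result, e.g. for verbs:
wordnet_lemmatizer_object.lemmatize('running')
# running
wordnet_lemmatizer_object.lemmatize('running',pos='v')
# run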
words_processed_by_wordnet_lemmatizer\
=[wordnet_lemmatizer_object.lemmatize(one_word) for one_word in words_processed_by_snowballstemmer]
words_processed_by_wordnet_lemmatizer[:10]
# ['stuff',
# 'go',
# 'moment',
# 'mj',
# 'start',
# 'listen',
# 'music',
# 'watch',
# 'odd',
# 'documentari']
# You will implement reviewtext_to_words() to run the whole string processing at once
def reviewtext_to_words(raw_reviewtext):
    # 1. You remove the HTML tags in raw_reviewtext with BeautifulSoup
    raw_reviewtext_removed_html_tags=BeautifulSoup(raw_reviewtext,'html.parser').get_text()
    # 2. You replace every character that is not an English letter with a whitespace
    raw_reviewtext_removed_not_alphabet=re.sub('[^a-zA-Z]',' ',raw_reviewtext_removed_html_tags)
    # 3. You convert the string to lowercase and split it into words
    raw_reviewtext_splited_text_into_words=raw_reviewtext_removed_not_alphabet.lower().split()
    # 4. In Python, membership tests are faster on a set than on a list,
    # so you convert the stopwords into a set
    stopwords_converted_into_set=set(stopwords.words('english'))
    # 5. You remove the stopwords
    words_which_is_not_stopword_list=\
    [one_word for one_word in raw_reviewtext_splited_text_into_words\
    if not one_word in stopwords_converted_into_set]
    # 6. You stem the remaining words
    stemmized_words_list=[snowballstemmer_object.stem(one_word) for one_word in words_which_is_not_stopword_list]
    # 7. You join the words with whitespace into one string and return it
    return( ' '.join(stemmized_words_list) )
# You use the reviewtext_to_words() defined just above
reviewtext_processed_by_reviewtext_to_words\
=reviewtext_to_words(train_data_with_sentimentlabel_dataframe['review'][0])
reviewtext_processed_by_reviewtext_to_words
# 'stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obvious messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl stay away tri give wholesom messag iron mj bestest buddi movi girl michael jackson truli one talent peopl ever grace planet guilti well attent gave subject hmmm well know peopl differ behind close door know fact either extrem nice stupid guy one sickest liar hope latter'
# You will process the entire review text of the train data
# You first count the number of reviews
size_of_entire_reviewtext=train_data_with_sentimentlabel_dataframe['review'].size
size_of_entire_reviewtext
# 25000
"""
clean_train_reviews=[]
In kaggle tutorial, range's written by xrange
But, for this case, since you use python3, you use range as following
"""
# for i in range(0,num_reviews):
# clean_train_reviews.append( review_to_words(train['review'][i]))
"""
But,above code doesn't give information how far position the code is executing
So, you fix them to give information current state one time per 5000 unit
"""
# clean_train_reviews=[]
# for i in range(0,num_reviews):
# if (i + 1)%5000 == 0:
# print('Review {} of {} '.format(i+1,num_reviews))
# clean_train_reviews.append(review_to_words(train['review'][i]))
"""
To make the code brevity,I use apply instead of for loop
"""
# %time train['review_clean']=train['review'].apply(review_to_words)
"""
The code becomes simple but takes much of time
"""
# CPU times: user 1min 15s,sys: 2.3 s,total: 1min 18s
# Wall time: 1min 20s
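"""
Another option for progress reporting, assuming the optional tqdm package
is installed: tqdm.pandas() patches progress_apply onto pandas objects
(same hypothetical names as in the commented tutorial code above)
"""
# from tqdm import tqdm
# tqdm.pandas()
# train['review_clean']=train['review'].progress_apply(review_to_words)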
from multiprocessing import Pool
import numpy as np
def _apply_df(args):
df,func,kwargs=args
return df.apply(func,**kwargs)
def apply_by_multiprocessing(dataframe,func,**kwargs):
    # You extract the "workers" parameter from the keyword arguments
    number_of_workers=kwargs.pop('workers')
    pool_of_workers=Pool(processes=number_of_workers)
    # You split the dataframe into one chunk per worker,
    # and apply func to every chunk in parallel
    result_d_func_kwargs_tuple=pool_of_workers.map(\
        _apply_df,[(d,func,kwargs) for d in np.array_split(dataframe,number_of_workers)])
    pool_of_workers.close()
    return pd.concat(list(result_d_func_kwargs_tuple))
def square(x):
    return x**2
if __name__ == '__main__':
dataframe=pd.DataFrame({'a':range(10),'b':range(10)})
apply_by_multiprocessing(dataframe,square,axis=1,workers=4)
# You run the real apply with 4 worker processes
# (on Windows, multiprocessing code like this should also run under the
# if __name__ == '__main__': guard)
cleaned_entire_reviewtext_of_train_data\
=apply_by_multiprocessing(train_data_with_sentimentlabel_dataframe['review'],reviewtext_to_words,workers=4)
# CPU times: user 106 ms,sys: 119 ms,total: 226 ms
# Wall time: 43.1 s
cleaned_entire_reviewtext_of_test_data\
=apply_by_multiprocessing(test_data_dataframe['review'],reviewtext_to_words,workers=4)
# CPU times: user 116 ms,sys: 139 ms,total: 255 ms
# Wall time: 51.6 s
# You will use "word cloud" to show visualization based on frequency of word
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
def displayWordCloud(data_to_be_shown_in_wordcloud=None,backgroundcolor='white',width=800,height=600):
wordcloud_object=WordCloud(\
stopwords=STOPWORDS\
,background_color=backgroundcolor\
,width=width\
,height=height).generate(data_to_be_shown_in_wordcloud)
plt.figure(figsize=(15 ,10))
plt.imshow(wordcloud_object)
plt.axis("off")
plt.show()
# You can draw "wordcloud" with words of train data
displayWordCloud(' '.join(cleaned_entire_reviewtext_of_train_data))
# You can visualizz data by seaborn
import seaborn as sns
# You will create 2 separate graphs:
# 1. distribution of the number of words per review
# 2. distribution of the number of unique words per review (duplicates removed)
# These columns are not in the raw data, so you derive them from the cleaned reviews first
train_data_with_sentimentlabel_dataframe['num_words']\
=cleaned_entire_reviewtext_of_train_data.apply(lambda x: len(str(x).split()))
train_data_with_sentimentlabel_dataframe['num_uniq_words']\
=cleaned_entire_reviewtext_of_train_data.apply(lambda x: len(set(str(x).split())))
figure_object,subplot_object=plt.subplots(ncols=2)
figure_object.set_size_inches(18,6)
print('Mean value of number of words from each review:'\
,train_data_with_sentimentlabel_dataframe['num_words'].mean())
print('Median value of number of words from each review:'\
,train_data_with_sentimentlabel_dataframe['num_words'].median())
sns.distplot(\
train_data_with_sentimentlabel_dataframe['num_words']\
,bins=100\
,ax=subplot_object[0])
# Vertical dashed line marking the median
subplot_object[0].axvline(\
train_data_with_sentimentlabel_dataframe['num_words'].median()\
,linestyle='dashed')
subplot_object[0].set_title(\
'distribution of the number of words per review')
# Mean value of number of words from each review: 119.52356
# Median value of number of words from each review: 89.0
print('Mean value of number of unique words from each review:'\
,train_data_with_sentimentlabel_dataframe['num_uniq_words'].mean())
print('Median value of number of unique words from each review:'\
,train_data_with_sentimentlabel_dataframe['num_uniq_words'].median())
sns.distplot(\
train_data_with_sentimentlabel_dataframe['num_uniq_words']\
,bins=100\
,color='g'\
,ax=subplot_object[1])
subplot_object[1].axvline(\
train_data_with_sentimentlabel_dataframe['num_uniq_words'].median()\
,linestyle='dashed')
subplot_object[1].set_title('distribution of the number of unique words per review')
# Mean value of number of unique words from each review: 94.05756
# Median value of number of unique words from each review: 74.0
# Bag-of-words model - Wikipedia
# Let's suppose there are 2 sentences below
# (1) John likes to watch movies. Mary likes movies too.
# (2) John also likes to watch football games.
# You can tokenize the 2 sentences above,
# and put the tokens into a bag,
# which looks like this
# [
# "John",
# "likes",
# "to",
# "watch",
# "movies",
# "Mary",
# "too",
# "also",
# "football",
# "games"
# ]
# And you can count how many times each token shows up,
# following the bag's top-to-bottom order
# (1) John:1,likes:2,... => [1,2,1,1,2,1,1,0,0,0]
# (2) [1,1,1,1,0,0,0,1,1,1]
# This conversion reshapes the text into a form
# that a machine learning algorithm can understand
# You can also fill the "bag of words" with bigrams,
# using the n-gram approach; the result is shown below,
# followed by a short code sketch after the list
# [
# "John likes",
# "likes to",
# "to watch",
# "watch movies",
# "Mary likes",
# "likes movies",
# "movies too",
# ]
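# A minimal, self-contained sketch of the counting above, using
# scikit-learn's CountVectorizer on the two example sentences
# (the example_* names are illustrative; note the vocabulary is
# sorted alphabetically, unlike the hand-built bag above):
from sklearn.feature_extraction.text import CountVectorizer
example_sentences=['John likes to watch movies. Mary likes movies too.'\
,'John also likes to watch football games.']
example_vectorizer=CountVectorizer()
example_term_matrix=example_vectorizer.fit_transform(example_sentences)
print(example_vectorizer.get_feature_names())
# ['also','football','games','john','likes','mary','movies','to','too','watch']
print(example_term_matrix.toarray())
# [[0 0 0 1 2 1 2 1 1 1]
#  [1 1 1 1 1 0 0 1 0 1]]
# Passing ngram_range=(2,2) would instead produce bigram features
# similar to the list above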
# You can perform the process above with CountVectorizer
# You generate features with scikit-learn's CountVectorizer
# It extracts tokens with a regular expression
# Since it converts all letters to lowercase,
# variants like "good,Good,gOod" all become the identical token "good"
# Rare tokens mostly generate meaningless features,
# so you only keep tokens which appear in at least 2 reviews
# min_df defines this minimal number of reviews for a meaningful token
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
# You can change some parameter values compared to the tutorial
# Changing parameter values changes the resulting score
countvectorizer_object=CountVectorizer(\
analyzer='word',
tokenizer=None,
preprocessor=None,
stop_words=None,
# This is the minimal number of documents a meaningful token must appear in
min_df=2,
ngram_range=(1,3),
max_features=20000)
# CountVectorizer(analyzer='word',binary=False,decode_error='strict',
#         dtype=<class 'numpy.int64'>,encoding='utf-8',input='content',
#         lowercase=True,max_df=1.0,max_features=20000,min_df=2,
#         ngram_range=(1,3),preprocessor=None,stop_words=None,
#         strip_accents=None,token_pattern='(?u)\\b\\w\\w+\\b',
#         tokenizer=None,vocabulary=None)
# You can use "pipeline" to enhance processing speed
# Reference:
# https://stackoverflow.com/questions/28160335/plot-a-document-tfidf-2d-graph
pipeline_object=Pipeline(\
[('vect',countvectorizer_object),])
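# A sketch of why the Pipeline is convenient: further steps such as TF-IDF
# weighting could be chained after the vectorizer without changing the calling
# code (TfidfTransformer is shown for illustration only and not used below):
# from sklearn.feature_extraction.text import TfidfTransformer
# pipeline_object=Pipeline([\
# ('vect',countvectorizer_object)\
# ,('tfidf',TfidfTransformer())])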
# You will process cleaned_entire_reviewtext_of_train_data,
# by pipeline_object.fit_transform()
cleaned_entire_reviewtext_of_train_data_processed_by_pipeline\
=pipeline_object.fit_transform(cleaned_entire_reviewtext_of_train_data)
# <25000x20000 sparse matrix of type '<class 'numpy.int64'>'
# with 2762268 stored elements in Compressed Sparse Row format>
cleaned_entire_reviewtext_of_train_data_processed_by_pipeline.shape
# (25000,20000)
feature_names_via_countvectorizer=countvectorizer_object.get_feature_names()
len(feature_names_via_countvectorizer)
# 20000
feature_names_via_countvectorizer[:10]
# ['aag',
# 'aaron',
# 'ab',
# 'abandon',
# 'abbey',
# 'abbi',
# 'abbot',
# 'abbott',
# 'abc',
# 'abduct']
# You check out the vectorized features
# You sum each column to get the total count of every token across all reviews
sum_of_column_data=np.asarray(\
cleaned_entire_reviewtext_of_train_data_processed_by_pipeline.sum(axis=0)).ravel()
for tag,count in zip(feature_names_via_countvectorizer,sum_of_column_data):
    print(count,tag)
pd.DataFrame([sum_of_column_data],columns=feature_names_via_countvectorizer)
# 26 aag
# 48 aaron
# 22 ab
# ...
#    aag  aaron  ab  abandon  abbey  abbi  abbot  abbott  abc  abduct ... zucker
# 0   26     48  22      288     24    30     29      30  125      55 ...     23
# 1 rows × 20000 columns
# You bring the first 10 rows of cleaned_entire_reviewtext_of_train_data_processed_by_pipeline
# and show their head
pd.DataFrame(\
cleaned_entire_reviewtext_of_train_data_processed_by_pipeline[:10].toarray()\
,columns=feature_names_via_countvectorizer).head()
# aag aaron ab abandon abbey abbi abbot abbott abc abduct ... zucker
# 0 0 0 0 0 0 0 0 0 0 0 ... 0
# 1 0 0 0 0 0 0 0 0 0 0 ... 0
# 2 0 0 0 0 0 0 0 0 0 0 ... 0
# 3 0 0 0 0 0 0 0 0 0 0 ... 0
# 4 0 0 0 0 0 0 0 0 0 0 ... 0
# 5 rows × 20000 columns
# You use random forest classifier
from sklearn.ensemble import RandomForestClassifier
randomforest_classifier_object=RandomForestClassifier(\
n_estimators=100\
,n_jobs=-1\
,random_state=2018)
randomforest_classifier_object
# You can set up random forest classifier in detail
# RandomForestClassifier(\
# bootstrap=True,class_weight=None,criterion='gini',
# max_depth=None,max_features='auto',max_leaf_nodes=None,
# min_impurity_decrease=0.0,min_impurity_split=None,
# min_samples_leaf=1,min_samples_split=2,
# min_weight_fraction_leaf=0.0,n_estimators=100,n_jobs=-1,
# oob_score=False,random_state=2018,verbose=0,
# warm_start=False)
randomforest_classifier_object\
=randomforest_classifier_object.fit(\
cleaned_entire_reviewtext_of_train_data_processed_by_pipeline\
,train_data_with_sentimentlabel_dataframe['sentiment'])
# CPU times: user 1min 16s,sys: 324 ms,total: 1min 17s
# Wall time: 20.9 s
# You can use cross validation
from sklearn.model_selection import cross_val_score
# You run 10-fold cross validation scored with ROC AUC,
# and take the mean of the fold scores
score_via_cross_validation=np.mean(cross_val_score(\
randomforest_classifier_object\
,cleaned_entire_reviewtext_of_train_data_processed_by_pipeline\
,train_data_with_sentimentlabel_dataframe['sentiment']\
,cv=10\
,scoring='roc_auc'))
# CPU times: user 10min 52s,sys: 3.19 s,total: 10min 55s
# Wall time: 2min 57s
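# You print the mean ROC AUC; the same score is reused in the submission
# filename below
print('Mean ROC AUC over 10 folds: {0:.5f}'.format(score_via_cross_validation))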
# You can check out the first test review refined above
cleaned_entire_reviewtext_of_test_data[0]
# 'natur film main theme mortal nostalgia loss innoc perhap surpris rate high older viewer younger one howev craftsmanship complet film anyon enjoy pace steadi constant charact full engag relationship interact natur show need flood tear show emot scream show fear shout show disput violenc show anger natur joyc short stori lend film readi made structur perfect polish diamond small chang huston make inclus poem fit neat truli masterpiec tact subtleti overwhelm beauti'
# You will vectorize test data
cleaned_entire_reviewtext_of_test_data_processed_by_pipeline\
=pipeline_object.transform(cleaned_entire_reviewtext_of_test_data)
cleaned_entire_reviewtext_of_test_data_processed_by_pipeline_array\
=cleaned_entire_reviewtext_of_test_data_processed_by_pipeline.toarray()
# CPU times: user 8.18 s,sys: 46.6 ms,total: 8.23 s
# Wall time: 8.24 s
cleaned_entire_reviewtext_of_test_data_processed_by_pipeline_array
# You can see the sparse matrix generated from the vectorized words
# array([[0,0,0,...,0,0,0],
# [0,0,0,...,0,0,0],
# [0,0,0,...,0,0,0],
# ...,
# [0,0,0,...,0,0,0],
# [0,0,0,...,0,0,0],
# [0,0,0,...,0,0,0]])
# Each count says how many times the corresponding token shows up in a review
# You bring the 6th review (index 5)
cleaned_entire_reviewtext_of_test_data_processed_by_pipeline_array[5][:100]
# array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
# 0,0,0,0,0,0,0,0,0,0,0,0])
# You can look up which word a given index corresponds to
# feature_names_via_countvectorizer=countvectorizer_object.get_feature_names()
feature_names_via_countvectorizer[8]\
,feature_names_via_countvectorizer[2558]\
,feature_names_via_countvectorizer[2559]\
,feature_names_via_countvectorizer[2560]
# ('abc','charact person','charact play','charact plot')
# You feed the test data into the random forest,
# and let it predict the sentiment labels
prediction_via_randomforest_classifier\
=randomforest_classifier_object\
.predict(cleaned_entire_reviewtext_of_test_data_processed_by_pipeline_array)
prediction_via_randomforest_classifier[:10]
# array([1,0,0,1,1,1,0,1,0,0])
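# This Kaggle competition is scored with ROC AUC, so a common alternative
# (shown for illustration only, not used below) is to submit the class-1
# probability instead of the hard label:
# prediction_via_randomforest_classifier\
# =randomforest_classifier_object.predict_proba(\
# cleaned_entire_reviewtext_of_test_data_processed_by_pipeline_array)[:,1]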
# You can store prediction_via_randomforest_classifier into dataframe
prediction_via_randomforest_classifier_dataframe=pd.DataFrame(\
data={'id':test_data_dataframe['id'],'sentiment':prediction_via_randomforest_classifier})
prediction_via_randomforest_classifier_dataframe.head()
# id sentiment
# 0 "12311_10" 1
# 1 "8348_2" 0
# 2 "5828_4" 0
# 3 "7186_2" 1
# 4 "12128_7" 1
prediction_via_randomforest_classifier_dataframe.to_csv(\
'D://chromedown//tutorial_1_BOW_{0:.5f}.csv'.format(score_via_cross_validation)\
,index=False\
,quoting=3)
number_of_sentiments_from_prediction_via_randomforest_classifier_dataframe\
=prediction_via_randomforest_classifier_dataframe['sentiment'].value_counts()
print(\
number_of_sentiments_from_prediction_via_randomforest_classifier_dataframe[0]\
-number_of_sentiments_from_prediction_via_randomforest_classifier_dataframe[1])
# 108
number_of_sentiments_from_prediction_via_randomforest_classifier_dataframe
# 0 12554
# 1 12446
# Name: sentiment,dtype: int64
# You can compare the label distributions of the train data and the prediction
figure_object,subplot_object=plt.subplots(ncols=2)
figure_object.set_size_inches(12,5)
sns.countplot(train_data_with_sentimentlabel_dataframe['sentiment'],ax=subplot_object[0])
sns.countplot(prediction_via_randomforest_classifier_dataframe['sentiment'],ax=subplot_object[1])
plt.show()