028. Classify text with a Bayes classifier
# @
from bayes import BayesianFilter
# Create the filter that is trained and then queried below.
bf = BayesianFilter()
# Text which will be trained
# (each call must go through `bf`; the name `br` was never defined)
bf.fit("파격 세일 - 오늘까지만 30% 할인", "광고")
bf.fit("쿠폰 선물 & 무료 배송", "광고")
bf.fit("백화점 세일", "광고")
bf.fit("봄과 함께 찾아온 따뜻한 신제품 소식", "광고")
bf.fit("인기 제품 기간 한정 세일", "광고")
bf.fit("오늘 일정 확인", "중요")
bf.fit("프로젝트 진행 상황 보고", "중요")
bf.fit("계약 잘 부탁드립니다", "중요")
bf.fit("회의 일정이 등록되었습니다", "중요")
bf.fit("오늘 일정이 없습니다", "중요")
# For predicting
# NOTE(review): "제고" may be a typo for "재고" (the spelling used in the
# predict walkthrough later in these notes) — confirm before changing.
pre, scorelist = bf.predict("제고 정리 할인, 무료 배송")
print("결과: ", pre)
# < 결과: 광고
print(scorelist)
# < [('광고', -19.384), ('중요', -20.3948)]
# @
# NOTE(review): this statement runs before the class statement below, so it
# only works if BayesianFilter is already bound (e.g. by the earlier
# `from bayes import BayesianFilter`) — confirm the intended order.
bf=BayesianFilter()
import math, sys
from konlpy.tag import Twitter
# Let's inspect BayesianFilter class
class BayesianFilter:
    """Bayesian text filter: holds the vocabulary and the training counts."""

    def __init__(self):
        """Create an untrained filter with empty vocabulary and counters."""
        # Every distinct word recorded during training.
        self.words = set()
        # Word frequencies per category: {category: {word: count}}.
        self.word_dict = {}
        # How many times each category was recorded: {category: count}.
        self.category_dict = {}
# This method splits text into words using morphological analysis
def split(self, text):
    """Split *text* into words using the morphological analyzer.

    The text is tagged with ``pos(text, norm=True, stem=True)``; entries
    tagged "Josa", "Eomi" or "Punctuation" are dropped.

    Args:
        text: The sentence to tokenize.

    Returns:
        A list with the remaining words, in their original order.
    """
    # Reuse one tagger per instance instead of constructing a new one on
    # every call.
    tagger = getattr(self, "_tagger", None)
    if tagger is None:
        tagger = self._tagger = Twitter()
    skipped_tags = ("Josa", "Eomi", "Punctuation")
    return [
        morpheme
        for morpheme, tag in tagger.pos(text, norm=True, stem=True)
        if tag not in skipped_tags
    ]
# This method counts how often each word appears in each category (stored in word_dict)
def inc_word(self, word, category):
    """Record one occurrence of *word* under *category*.

    Creates the category entry and the word counter on first use, then
    increments the counter and adds the word to the overall vocabulary.

    Args:
        word: The word that was seen.
        category: The category of the text the word came from.
    """
    # e.g. word_dict == {"광고": {}} right after the first word of "광고"
    per_category = self.word_dict.setdefault(category, {})
    # e.g. word_dict == {"광고": {"할인": 1}} after the first "할인"
    per_category[word] = per_category.get(word, 0) + 1
    self.words.add(word)
# words={"할인"}
# word_dict=
# {
# "광고":
# {
# "할인":1
# }
# }
# category_dict={}
# This method counts how many times each category has been seen (stored in category_dict)
def inc_category(self, category):
    """Record one more occurrence of *category*.

    A category seen for the first time starts at 0 and is then
    incremented, so its counter becomes 1.

    Args:
        category: The category label to count.
    """
    # e.g. category_dict == {"광고": 1} after the first call with "광고"
    self.category_dict[category] = self.category_dict.get(category, 0) + 1
# This is method for learning
def fit(self, text, category):
    """Learn from one training text labelled with *category*.

    The text is tokenized with ``self.split``; every resulting word is
    counted for the category, and the category itself is counted once
    per text.

    Args:
        text: The training sentence.
        category: The label assigned to the sentence.
    """
    for word in self.split(text):
        self.inc_word(word, category)
    # Once per text, not once per word.
    self.inc_category(category)
# This method computes the score of a list of words for a category
def score(self, words, category):
    """Return the log score of *words* for *category*.

    The score is ``log(category_prob(category))`` plus
    ``log(word_prob(word, category))`` for every word in *words*.

    Args:
        words: Iterable of words (as produced by ``split``).
        category: The category to score against.

    Returns:
        The accumulated log score as a float.
    """
    total = math.log(self.category_prob(category))
    for word in words:
        total += math.log(self.word_prob(word, category))
    return total
# This method predicts the category of a text
def predict(self, text):
    """Predict the category of *text*.

    The text is tokenized with ``self.split`` and scored against every
    category in ``category_dict``.

    Args:
        text: The sentence to classify.

    Returns:
        A tuple ``(best_category, score_list)`` where ``score_list`` holds
        one ``(category, score)`` pair per known category.
        ``best_category`` is the first category with the highest score, or
        None when no category has been recorded.
    """
    best_category = None
    # Negative infinity, so any finite score replaces the initial value.
    max_score = float("-inf")
    words = self.split(text)
    score_list = []
    for category in self.category_dict:
        category_score = self.score(words, category)
        score_list.append((category, category_score))
        if category_score > max_score:
            max_score = category_score
            best_category = category
    return best_category, score_list
bf.predict("재고 정리 할인, 무료 배송")
best_category=None
# Start from a very small value so that any real score replaces it when searching for the maximum
max_score=-92222222234444
words=self.split(text)
score_list=[]
# This method calculates how many times words appear in specific category
def get_word_count(self, word, category):
    """Return how many times *word* was recorded under *category*.

    Args:
        word: The word to look up.
        category: The category whose counts are consulted.

    Returns:
        The recorded count, or 0 when the word (or the whole category) has
        no entry in ``word_dict``.
    """
    # A category may have no entry in word_dict if none of its texts
    # produced a word; treat that as a count of 0 instead of raising.
    return self.word_dict.get(category, {}).get(word, 0)
# This method calculates the probability that a specific category occurs
def category_prob(self, category):
    """Return the share of recorded categories that equal *category*.

    Example: with ``category_dict == {"광고": 5, "중요": 5}`` the total is
    10 and the result for "광고" is 5 / 10.

    Args:
        category: A category present in ``category_dict``.

    Returns:
        ``category_dict[category]`` divided by the sum of all counts.

    Raises:
        KeyError: If *category* is not in ``category_dict``.
    """
    total = sum(self.category_dict.values())
    return self.category_dict[category] / total
word_dic=
{
'광고':
{
'파격':1, '세일':2, '한정':2, ... '기간':1
},
'중요':
{
'오늘':1, '일정':2, '등록':2, ... '회의':1
}
}
# This method calculates the probability that a specific word occurs in a specific category
def word_prob(self, word, category):
    """Return the smoothed probability of *word* within *category*.

    Computed as ``(count(word, category) + 1) / (total words recorded in
    category + vocabulary size)``.

    Args:
        word: The word to evaluate.
        category: The category to evaluate it in.

    Returns:
        The add-one smoothed relative frequency as a float.
    """
    # The +1 keeps the result above zero for unseen words, so the caller
    # can safely pass it to math.log.
    numerator = self.get_word_count(word, category) + 1
    # A category with no recorded words has no word_dict entry; count it
    # as zero words instead of raising KeyError.
    category_total = sum(self.word_dict.get(category, {}).values())
    return numerator / (category_total + len(self.words))