https://www.youtube.com/watch?v=VXAUZag1kIU&list=PLaRYNlxIGoESkuvIdRJNLjdUC4lpF3fM7&index=6
================================================================================
import requests
import operator
import pandas
import glob2
from bs4 import BeautifulSoup
from datetime import datetime
import nltk
from konlpy.tag import Twitter
from matplotlib import font_manager,rc

# Register a Korean font so matplotlib can render Hangul labels
font_name=font_manager.FontProperties(fname="/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/font/NanumFont/NanumGothic.ttf").get_name()
rc('font',family=font_name)

from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
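
# The font path above is machine-specific. A minimal sketch for locating
# NanumGothic portably, assuming the font is installed somewhere matplotlib
# can see (findSystemFonts/FontProperties are standard matplotlib APIs;
# "nanum_candidates" is just an illustrative name):
nanum_candidates=[f for f in font_manager.findSystemFonts() if "NanumGothic" in f]
if nanum_candidates:
    rc('font',family=font_manager.FontProperties(fname=nanum_candidates[0]).get_name())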
================================================================================
t=Twitter()
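# Note: in KoNLPy 0.4.5+ the Twitter class was renamed to Okt, and Twitter
# remains only as a deprecated alias. The equivalent setup on newer versions:
#     from konlpy.tag import Okt
#     t=Okt()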
================================================================================
def analyze(content):
    # Extract nouns from the article text
    nouns=t.nouns(str(content))

    # Drop useless words (press names, "기자"/reporter, etc.); a list
    # comprehension avoids mutating the list while iterating over it
    trash=["조선","연합뉴스","일보","중앙","기자","뉴스","헤럴드경제"]
    nouns=[noun for noun in nouns if noun not in trash]

    # Wrap the noun list in an nltk.Text object named "분석" (analysis)
    ko=nltk.Text(nouns,name="분석")
    # Count the 100 most common nouns
    ranking=ko.vocab().most_common(100)
    tmpData=dict(ranking)
    # e.g. {"국회":7,"자유":6,...}

    # Create the wordcloud instance and configure it
    wordcloud=WordCloud(
        font_path="/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/font/NanumFont/NanumGothic.ttf",
        relative_scaling=0.2,
        background_color="white")
    wordcloud=wordcloud.generate_from_frequencies(tmpData)

    plt.figure(figsize=(16,8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
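
# Hypothetical quick check of analyze() on a small in-memory sample; the real
# pipeline feeds it the "Content" column of a crawled CSV via loadFile():
sample=pandas.Series(["국회 본회의 자유 토론","국회 예산 심사 자유 발언"])
analyze(sample)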
================================================================================
def crawlingData(date,pageCount):
    now=datetime.now()
    l=[]
    # Fetch pages 1..pageCount of Naver's politics section (sid1=100) list
    # page for the given date; the loop variable must go into the URL,
    # otherwise the same page is fetched repeatedly
    for page in range(1,int(pageCount)+1):
        r=requests.get("http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=100&date="+str(date)+"&page="+str(page))
        c=r.content
        soup=BeautifulSoup(c,"html.parser")
        items=soup.find_all("li")

        for item in items:
            for item2 in item.find_all("dl"):
                d={}
                try:
                    linkTag=item2.find("dt",{"class":""}).find("a")
                    d["LinkSrc"]=linkTag["href"]
                    d["Title"]=linkTag.text.replace("\t","").replace("\n","").replace(",","").replace('"',"").replace("\r","").strip()
                except (AttributeError,TypeError):
                    d["LinkSrc"]="None"
                    d["Title"]="None"
                try:
                    contentTag=item2.find("dd")
                    d["Content"]=contentTag.text.replace("\t","").replace("\n","").replace("\r","").replace(",","").replace('"',"").split("…")[0]
                    d["Company"]=contentTag.find("span",{"class":"writing"}).text
                    d["Date"]=contentTag.find("span",{"class":"date"}).text
                except (AttributeError,TypeError):
                    d["Content"]="None"
                    d["Company"]="None"
                    d["Date"]="None"
                try:
                    imgTag=item2.find("dt",{"class":"photo"}).find("img")
                    d["imgSrc"]=imgTag["src"]
                except (AttributeError,TypeError):
                    d["imgSrc"]="No image"
                l.append(d)
    df=pandas.DataFrame(l)
    df.to_csv('%s-%s-%s-%s-%s-%s.csv'%(now.year,now.month,now.day,now.hour,now.minute,now.second),encoding='utf-8-sig',index=False)
    print(df)
    print("Crawled data saved to CSV successfully")
def loadFile(fileName,analyzeValue):
    outputFileName=checkFileName(fileName)
    if outputFileName!=-1:
        df=pandas.read_csv(outputFileName)
        content=df["Content"]
        title=df["Title"]
        company=df["Company"]
        print("CSV file loaded successfully")
        if analyzeValue==1:
            analyze(content)
    else:
        print("error during loading csv file")
def checkFileName(fileName):
    now=datetime.now()
    if len(glob2.glob("*.csv"))==0:
        print("No file found in this directory")
        return -1
    else:
        if fileName=="all":
            # Merge every CSV in the directory into one timestamped file
            result=[]
            for i in glob2.glob("*.csv"):
                result.append(pandas.read_csv(i))
            outputFileName='%s-%s-%s-%s-%s-%s-merging.csv'%(now.year,now.month,now.day,now.hour,now.minute,now.second)
            resultDf=pandas.concat(result,ignore_index=True)
            resultDf.to_csv(outputFileName,encoding='utf-8-sig',index=False)
            return outputFileName
        else:
            return fileName
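
# Design note: checkFileName() returns -1 when no CSVs exist, the merged
# file's name for "all", and the given name otherwise, so loadFile() can treat
# every case uniformly. A minimal sketch of the merge path on its own:
merged=checkFileName("all")
if merged!=-1:
    print(pandas.read_csv(merged).shape)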
def mainSetting():
    # Menu loop: keep prompting until the user types "exit"
    while True:
        kb=input("input exit or crawling or loadAll or load or analyze: ")
        if kb=="exit":
            break
        elif kb=="crawling":
            date=input("input news date(format:20170101): ")
            page=input("input news page(format:4): ")
            crawlingData(date,page)
        elif kb=="loadAll":
            loadFile("all",0)
        elif kb=="load":
            fileName=input("input your csv file name: ")
            loadFile(fileName,0)
        elif kb=="analyze":
            fileName=input("input your csv file name: ")
            loadFile(fileName,1)
        else:
            print("command is not defined")
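
# Start the interactive menu:
mainSetting()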