https://www.youtube.com/watch?v=VXAUZag1kIU&list=PLaRYNlxIGoESkuvIdRJNLjdUC4lpF3fM7&index=6
================================================================================
import requests
import pandas
import glob2
import nltk
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from datetime import datetime
from konlpy.tag import Twitter
from matplotlib import font_manager, rc
from wordcloud import WordCloud

# Register a Korean font so matplotlib and WordCloud can render Hangul
font_name = font_manager.FontProperties(
    fname="/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/font/NanumFont/NanumGothic.ttf"
).get_name()
rc('font', family=font_name)
%matplotlib inline
================================================================================
# KoNLPy's Twitter morphological analyzer for Korean text
t = Twitter()
================================================================================
def analyze(content):
    # content is a pandas Series; str() flattens it into one string
    # before the tagger extracts the nouns
    nouns = t.nouns(str(content))

    # Filter out useless words (news-agency names, "기자"/reporter, etc.).
    # A comprehension removes every occurrence; the original called
    # nouns.remove() while iterating over the same list, which skips items.
    trash = ["조선", "연합뉴스", "일보", "중앙", "기자", "뉴스", "헤럴드경제"]
    nouns = [noun for noun in nouns if noun not in trash]

    # Wrap the nouns in an nltk.Text named "분석" ("analysis")
    ko = nltk.Text(nouns, name="분석")
    # Rank the 100 most common nouns, e.g. {"국회": 7, "자유": 6, ...}
    ranking = ko.vocab().most_common(100)
    tmpData = dict(ranking)

    # Build a word cloud from the frequency dict
    wordcloud = WordCloud(
        font_path="/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/font/NanumFont/NanumGothic.ttf",
        relative_scaling=0.2,
        background_color="white",
    )
    wordcloud = wordcloud.generate_from_frequencies(tmpData)

    plt.figure(figsize=(16, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
================================================================================
def crawlingData(date, pageCount):
    now = datetime.now()
    l = []
    # range(1, n + 1) so an input of "4" really crawls pages 1-4; the
    # original also built the URL with str(pageCount) (the total) instead
    # of str(pagecount) (the loop variable), fetching the same page each time
    for pagecount in range(1, int(pageCount) + 1):
        r = requests.get("http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=100"
                         "&date=" + str(date) + "&page=" + str(pagecount))
        c = r.content
        soup = BeautifulSoup(c, "html.parser")
        # named "items" instead of "all" to avoid shadowing the builtin
        items = soup.find_all("li")

        for item in items:
            for item2 in item.find_all("dl"):
                d = {}
                try:
                    linkTag = item2.find("dt", {"class": ""}).find("a")
                    d["LinkSrc"] = linkTag["href"]
                    # strip control and csv-breaking characters;
                    # [1:] drops the leading blank
                    d["Title"] = (linkTag.text.replace("\t", "").replace("\n", "")
                                  .replace(",", "").replace('"', "")
                                  .replace("\r", ""))[1:]
                except:
                    d["LinkSrc"] = "None"
                    d["Title"] = "None"
                try:
                    contentTag = item2.find("dd")
                    d["Content"] = (contentTag.text.replace("\t", "").replace("\n", "")
                                    .replace("\r", "").replace(",", "")
                                    .replace('"', "")).split("…")[0]
                    d["Company"] = contentTag.find("span", {"class": "writing"}).text
                    d["Date"] = contentTag.find("span", {"class": "date"}).text
                except:
                    d["Content"] = "None"
                    d["Company"] = "None"
                    d["Date"] = "None"
                try:
                    imgTag = item2.find("dt", {"class": "photo"}).find("img")
                    d["imgSrc"] = imgTag["src"]
                except:
                    d["imgSrc"] = "No image"
                l.append(d)

    df = pandas.DataFrame(l)
    df.to_csv('%s-%s-%s-%s-%s-%s.csv' % (now.year, now.month, now.day,
                                         now.hour, now.minute, now.second),
              encoding='utf-8-sig', index=False)
    print(df)
    print("saved data file successfully")

def loadFile(fileName, analyzeValue):
    outputFileName = checkFileName(fileName)
    # "!=" rather than "is not": identity comparison against an int literal
    # is unreliable and raises a SyntaxWarning on recent Pythons
    if outputFileName != -1:
        df = pandas.read_csv(outputFileName)
        content = df["Content"]
        title = df["Title"]
        company = df["Company"]
        print("csv file loaded successfully")
        if analyzeValue == 1:
            analyze(content)
    else:
        print("error during loading csv file")

def checkFileName(fileName):
    now = datetime.now()
    if len(glob2.glob("*.csv")) == 0:
        print("No file found in this directory")
        return -1
    else:
        if fileName == "all":
            # merge every csv in the working directory into one
            # timestamped "-merging" file and return its name
            result = []
            for i in glob2.glob("*.csv"):
                result.append(pandas.read_csv(i))
            outputFileName = '%s-%s-%s-%s-%s-%s-merging.csv' % (now.year, now.month, now.day,
                                                                now.hour, now.minute, now.second)
            resultDf = pandas.concat(result, ignore_index=True)
            resultDf.to_csv(outputFileName, encoding='utf-8-sig')
            return outputFileName
        else:
            return fileName

def mainSetting():
    # one-shot command prompt: every branch breaks out of the loop
    while 1:
        kb = input("input exit or crawling or loadAll or load or analyze: ")
        if kb == "exit":
            break
        elif kb == "crawling":
            date = input("input news date(format:20170101): ")
            page = input("input news page(format:4): ")
            crawlingData(date, page)
            break
        elif kb == "loadAll":
            # loadFile() takes two arguments; the original loadFile("all")
            # raised a TypeError
            loadFile("all", 0)
            break
        elif kb == "load":
            fileName = input("input your csv file name: ")
            loadFile(fileName, 0)
            break
        elif kb == "analyze":
            fileName = input("input your csv file name: ")
            loadFile(fileName, 1)
            break
        else:
            print("command is not defined")
            break
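================================================================================
# A minimal usage sketch, not from the video: the date and page count below
# are illustrative assumptions. Crawl four pages of politics news for one
# day, then merge every csv in the directory and draw the word cloud.
crawlingData("20170101", "4")   # writes <timestamp>.csv
loadFile("all", 1)              # merges csv files, loads the result, runs analyze()

# or drive everything interactively from the prompt:
mainSetting()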