https://www.youtube.com/watch?v=VXAUZag1kIU&list=PLaRYNlxIGoESkuvIdRJNLjdUC4lpF3fM7&index=6
================================================================================
import requests,pandas,glob2
from bs4 import BeautifulSoup
from datetime import datetime
import nltk
from konlpy.tag import Twitter  # konlpy renamed Twitter to Okt in newer releases
from matplotlib import font_manager,rc
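# NOTE: the font path below is machine-specific; point it at any Korean .ttf
# on your system so matplotlib can render Hangul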
font_name=font_manager.FontProperties(fname="/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/font/NanumFont/NanumGothic.ttf").get_name()
rc('font',family=font_name)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
================================================================================
t=Twitter()
================================================================================
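# Quick sanity check of the tagger: t.nouns() takes a raw string and returns
# the extracted nouns as a list. A minimal sketch; the sample sentence is made
# up and the exact tokens vary with the konlpy version.
print(t.nouns("국회에서 자유한국당 의원들이 회의를 열었다"))
# e.g. ['국회', '자유', '한국', '의원', '회의']
================================================================================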
def analyze(content):
  nouns=t.nouns(str(content))
  ================================================================================
  # Filter out words that add no meaning to the cloud (publisher names etc.);
  # calling remove() while iterating would skip elements, so rebuild the list
  trash=["조선","연합뉴스","일보","중앙","기자","뉴스","헤럴드경제"]
  nouns=[noun for noun in nouns if noun not in trash]
  ================================================================================
  # Wrap the nouns in an nltk.Text object named "분석" ("analysis")
  ko=nltk.Text(nouns,name="분석")
  # Count the 100 most frequent nouns and turn them into a {word: count} dict,
  # e.g. {"국회":7,"자유":6,...}
  ranking=ko.vocab().most_common(100)
  tmpData=dict(ranking)
  # Create wordcloud instance with setting it
  wordcloud=WordCloud(
    font_path="/media/young/5e7be152-8ed5-483d-a8e8-b3fecfa221dc/font/NanumFont/NanumGothic.ttf",
    relative_scaling=0.2,background_color="white",)
  
  wordcloud=wordcloud.generate_from_frequencies(tmpData)  
  ================================================================================
  plt.figure(figsize=(16,8))
  plt.imshow(wordcloud)
  plt.axis("off")
  plt.show()
================================================================================
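# analyze() accepts anything str() can flatten into Korean text, typically the
# "Content" column of the crawled DataFrame. A minimal usage sketch; the sample
# headlines are made up for illustration.
sampleContent=pandas.Series([
  "국회 본회의에서 예산안 처리",
  "자유한국당 의원 국회 발언",
])
analyze(sampleContent)
================================================================================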
def crawlingData(date,pageCount):
  now=datetime.now()
  l=[]
  # interpolate the loop variable (not pageCount, which would fetch the same
  # page on every pass) and include the last page in the range
  for page in range(1,int(pageCount)+1):
    r=requests.get("http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=100&date="+str(date)+"&page="+str(page))
    c=r.content
    soup=BeautifulSoup(c,"html.parser")
    items=soup.find_all("li")  # "items" avoids shadowing the built-in all()
    ================================================================================
    for item in items:
      for item2 in item.find_all("dl"):
        d={}
        try:
          linkTag=item2.find("dt",{"class":""}).find("a")
          d["LinkSrc"]=linkTag["href"]
          d["Title"]=linkTag.text.replace("\t","").replace("\n","").replace(",","").replace('"',"").replace("\r","")[1:len(linkTag.text)+1]
        except (AttributeError,TypeError,KeyError):
          d["LinkSrc"]="None"
          d["Title"]="None"
            
        try:
          contentTag=item2.find("dd")
          d["Content"]=\
          contentTag.text.replace("\t","").replace("\n","").replace("\r","").replace(",","").replace('"',"").split("…")[0]
      
          d["Company"]=contentTag.find("span",{"class":"writing"}).text
          d["Date"]=contentTag.find("span",{"class":"date"}).text
 
        except AttributeError:
          d["Content"]="None"
          d["Company"]="None"
          d["Date"]="None"
            
        try:
          imgTag=item2.find("dt",{"class":"photo"}).find("img")
          d["imgSrc"]=imgTag["src"]
        except (AttributeError,KeyError):
          d["imgSrc"]="No image"
            
        l.append(d)
            
  df=pandas.DataFrame(l)
  df.to_csv('%s-%s-%s-%s-%s-%s.csv'%(now.year,now.month,now.day,now.hour,now.minute,now.second),encoding='utf-8-sig',index=False)
  print(df)
  print("get datafile and save data successfully")
def loadFile(fileName,analyzeValue=0):
  outputFileName=checkFileName(fileName)
  # "is not -1" tests identity, not equality; use != for the sentinel
  if outputFileName!=-1:
    df=pandas.read_csv(outputFileName)
    content=df["Content"]
    title=df["Title"]
    company=df["Company"]
    print("csv FIle Load Success")
    if analyzeValue==1:
      analyze(content)
  else:
    print("error during loading csv file")
        
def checkFileName(fileName):
  now=datetime.now()
  if len(glob2.glob("*.csv"))==0:
    print("No file found in this directory")
    return -1
  else:
    if fileName=="all":
      result=[]
      for i in glob2.glob("*.csv"):
        result.append(pandas.read_csv(i))
      outputFileName='%s-%s-%s-%s-%s-%s-merging.csv'%(now.year,now.month,now.day,now.hour,now.minute,now.second)
      resultDf=pandas.concat(result,ignore_index=True)
      resultDf.to_csv(outputFileName,encoding='utf-8-sig',index=False)
      return outputFileName
    else:
      return fileName
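================================================================================
# loadFile() resolves the csv via checkFileName() and, when the second argument
# is 1, pipes the "Content" column into analyze(). The file name below is
# hypothetical; substitute one written by crawlingData().
loadFile("2017-1-1-12-0-0.csv",1)
================================================================================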
         
def mainSetting():
  # simple command loop; keep prompting until the user types exit
  while True:
    kb=input("input exit, crawling, loadAll, load or analyze: ")
    if kb=="exit":
      break
    elif kb=="crawling":
      date=input("input news date(format:20170101): ")
      page=input("input news page(format:4): ")
      crawlingData(date,page)
    elif kb=="loadAll":
      loadFile("all")
    elif kb=="load":
      fileName=input("input your csv file name: ")
      loadFile(fileName,0)
    elif kb=="analyze":
      fileName=input("input your csv file name: ")
      loadFile(fileName,1)
    else:
      print("command is not defined")
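================================================================================
# The source never shows the entry point; presumably the menu loop is started
# from a notebook cell like this.
mainSetting()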