https://www.youtube.com/watch?v=VXAUZag1kIU&list=PLaRYNlxIGoESkuvIdRJNLjdUC4lpF3fM7&index=6
================================================================================
# @
# Install libraries
# pip3 install pandas
# pip3 install glob2
# @
import operator
import os
from datetime import datetime

import glob2
import pandas
import requests
from bs4 import BeautifulSoup
# This method crawls based on "date" and "pageCount"
def crawlingData(date,pageCount):
# You store current time into "now"
now=datetime.now()
================================================================================
# Placeholder
l=[]
# If user inputs 4 as pageCount, total pages will be from 1 to 4
for pagecount in range(1,int(pageCount)):
r=requests.get("http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=100&date="+str(date)+"&page="+str(pageCount))
c=r.content
soup=BeautifulSoup(c,"html.parser")
================================================================================
# Find all tags of li
all=soup.find_all("li")
for item in all:
# Find all tags of dl
for item2 in item.find_all("dl"):
d={}
try:
# Find one tag of dt, class name is ""
dt_empty_class=item2.find("dt",{"class":""})
# Find one tag of a
linkTag=dt_empty_class.find("a")
================================================================================
# Get value from attribute href
d["LinkSrc"]=linkTag["href"]
================================================================================
# Get text data from linkTag
d["Title"]=linkTag.text.\
replace("\t","").\
replace("\n","").\
replace(",","").\
replace('"',"").\
replace("\r","")[1:len(linkTag.text)+1]
================================================================================
# If there is no data with exception, assign "None" into value
except:
d["LinkSrc"]="None"
d["Title"]="None"
================================================================================
try:
# Find one tag of dd
contentTag=item2.find("dd")
d["Content"]=contentTag.text.\
replace("\t","").\
replace("\n","").\
replace("\r","").\
replace(",","").\
replace('"',"").\
split("…")[0]
================================================================================
d["Company"]=contentTag.find("span",{"class":"writing"}).text
d["Date"]=contentTag.find("span",{"class":"date"}).text
except:
d["Content"]="None"
d["Company"]="None"
d["Date"]="None"
try:
imgTag=item2.find("dt",{"class":"photo"}).find("img")
d["imgSrc"]=imgTag["src"]
except:
d["imgSrc"]="No image"
# Append "d" into "l" per one for loop
l.append(d)
================================================================================
df=pandas.DataFrame(l)
df.to_csv('%s-%s-%s-%s-%s-%s.csv'%(now.year,now.month,now.day,now.hour,now.minute,now.second),
encoding='utf-8-sig',index=False)
print("get datafile and save data successfully")
================================================================================
def loadFile(fileName):
# This method first invokes checkFileName()
# to check if "fileName" file already exists or not
outputFileName=checkFileName(fileName)
# -1 means no file
if outputFileName is not -1:
# Load csv file into dataframe
df=pandas.read_csv(outputFileName)
# Extract Content from dataframe
content=df["Content"]
# Extract Title from dataframe
title=df["Title"]
# Extract Company from dataframe
company=df["Company"]
print(company)
print("csv fIle loaded successfully")
else:
print("error during csv file load")
================================================================================
# 사용자 입력값이 all이면 같은 경로의 모든 csv파일을 하나로 합치고,csv파일을 새로 만듦
# 그리고 만들어진 csv 파일을 리턴
def checkFileName(fileName):
now=datetime.now()
# If there is no file,it returns -1
if len(glob2.glob("*.csv"))==0:
print("No file found in this directory")
return -1
else:
# If fileName which user input is all,
# merge all csv file located in same directory,
# create new csv file containing merge csv file
if fileName=="all":
result=[]
# Bring all csv files
for i in glob2.glob("*.csv"):
# Read contents of csv file,
# and append all contents into "result"
result.append(pandas.read_csv(i))
# After appending all contents into "result",
# designate file name
outputFileName='%s-%s-%s-%s-%s-%s-merging.csv'\
%(now.year,now.month,now.day,now.hour,now.minute,now.second)
# Concatenate all elements into "result"
# Assign concatenated one into "resultDf"
resultDf=pandas.concat(result,ignore_index=True)
# Create csv file
resultDf.to_csv(outputFileName,encoding='utf-8-sig')
# Return name of csv file
return outputFileName
else:
return fileName
================================================================================
def mainSetting():
# Infinite loop until user inputs "exit"
while (1):
# Show input window
kb=input("input exit or crawling or loadAll or load : ")
# If user inputs "exit", while loop is terminated
if kb=="exit":
break
# If user inputs "crawling",
elif kb=="crawling":
# Take "news date" from user
date=input("input news date(format:20170101) : ")
# Take "number of page" from user
page=input("input news page(format:4) : ")
# Invoke crawlingData() with passing "date" and "page",
crawlingData(date,page)
break
================================================================================
# If user inputs "loadAll",
elif kb=="loadAll":
# invokes loadFile() with passing "all"
loadFile("all")
break
================================================================================
# If user inputs "load",
elif kb=="load":
# Takes "csv file name" from user,
fileName=input("input your csv file name: ")
# invoke loadFile() with passing "filename"
loadFile(fileName)
================================================================================
# If user inputs other commands,
else:
print("command is not defined")
break