# https://www.youtube.com/watch?v=IkZiDDFfJ88&list=PLaRYNlxIGoESkuvIdRJNLjdUC4lpF3fM7&index=3
# ================================================================================
# Data which you want to crawl
# /home/young/Pictures/2019_05_20_12:42:02.png
#
# Top tag holding all articles: ul class=type06_headline
# One article tag for one article: li, dl
# Photo tag: dt class="photo"
# Title tag: dt
# Article contents tag: dd
# ================================================================================
import requests
from bs4 import BeautifulSoup

# A timeout keeps the script from hanging forever if the server never answers.
r = requests.get(
    "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=100",
    timeout=10,
)
# Stop here with a clear HTTP error rather than parsing an error page.
r.raise_for_status()
c = r.content
soup = BeautifulSoup(c, "html.parser")
# ================================================================================
# Find one "ul" tag whose class name is type06_headline.
# NOTE: the name `all` shadows the builtin; it is kept because the code
# further down in this file reads the headline list through this name.
all = soup.find("ul", {"class": "type06_headline"})
if all is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next line.
    raise RuntimeError('No <ul class="type06_headline"> found in the page')
# ================================================================================
# Find all li tags, and store them into all2
all2 = all.find_all("li")
# print(all2)
# ["a","b","c",...]
# ================================================================================
# /home/young/Pictures/2019_05_20_12:50:03.png
# ================================================================================
# Print the title of every article in the headline list.
for item in all2:
    # Find dt tag, class name is empty to get title
    title_HTML = item.find("dt", {"class": ""})
    if title_HTML is None:
        # Skip entries without a title tag instead of crashing the loop.
        continue
    title = title_HTML.text.replace("\t", "").replace("\n", "")
    # Strip surrounding whitespace instead of slicing off a fixed two
    # characters, which would eat real text when the padding differs.
    modifiedTitle = title.strip()
    print(modifiedTitle)
# ================================================================================
# /home/young/Pictures/2019_05_20_12:54:59.png
# getImageSrc() gets image_address
# <dt class="photo">
#   <a href="hyper_link">
#     <img src="image_address"/>
#   </a>
# </dt>
# ================================================================================


def getImageSrc():
    """Print the image address of every article, or "No image" if it has none.

    Reads the module-level headline list ``all`` and, for each ``dl`` inside
    it, looks for ``dt class="photo"`` -> ``img`` -> ``src``.
    """
    # Find all tags of dl
    dl = all.find_all("dl")
    for item2 in dl:
        # You find dt tag, class name is photo
        dt_photo = item2.find("dt", {"class": "photo"})
        # And then, on the found result, you find img tag
        img = dt_photo.find("img") if dt_photo is not None else None
        # Get value of src attribute (None when the attribute is missing)
        src = img.get("src") if img is not None else None
        if src is None:
            # There can be articles without images
            print("No image")
        else:
            print(src)


# ================================================================================
def getLinkAndTitle():
    """Print the address (href) and the title text of every article.

    Reads the module-level headline list ``all``. Articles whose ``dl`` has
    no class-less ``dt`` or no ``a`` tag inside it are skipped.
    """
    # Find all tags of dl
    dl = all.find_all("dl")
    for item2 in dl:
        # On dl, find one tag of dt, class name is empty
        dt_tags = item2.find("dt", {"class": ""})
        if dt_tags is None:
            continue
        # On result, find the a element
        link = dt_tags.find("a")
        if link is None:
            continue
        # Get value of href attribute
        print(link.get("href", ""))
        # The link text is the article title; strip the padding around it
        # rather than dropping a fixed number of leading characters.
        processed_link = link.text.replace("\t", "").replace("\n", "").strip()
        print(processed_link)


# ================================================================================
def getContent():
    """Print the "writing" span text of every article, or "No Content".

    Reads the module-level headline list ``all``. For each ``dl`` it takes
    the first ``dd`` tag and looks up the ``span class="writing"`` and
    ``span class="date"`` tags inside that ``dd``.
    """
    # Find all tags of dl
    dl = all.find_all("dl")
    for item2 in dl:
        # On each dl, find one tag of dd
        content = item2.find("dd")
        if content is None:
            print("No Content")
            continue
        # Text of the dd up to the first "...", with tabs and newlines removed.
        # Extracted as in the original notes; it is not printed.
        processed_content = (
            content.text.replace("\t", "").replace("\n", "").split("...")[0]
        )
        # Find one tag of span, class name is writing.
        # The search must run on the dd tag: the cleaned text above is a plain
        # str, and str.find("span", {...}) raises TypeError, which previously
        # made every article fall through to "No Content".
        span_writing = content.find("span", {"class": "writing"})
        if span_writing is None:
            print("No Content")
            continue
        # Get only text data
        span_writing_txt = span_writing.text
        print(span_writing_txt)
        # Date span, also looked up on the dd tag; extracted but not printed,
        # as in the original notes.
        span_date = content.find("span", {"class": "date"})
        span_date_txt = span_date.text if span_date is not None else ""