# 005. Collect currency data and news data by using BeautifulSoup @ beauty.py
#
# Script overview:
#   1. Fetch the Naver market-index page and print every <span class="value">
#      text, then the first three values with currency labels.
#   2. Fetch the Naver news section page, follow each link found under
#      #section_body, and print the plain-text body of each article.
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# ---------------------------------------------------------------------------
# Part 1: currency data
# ---------------------------------------------------------------------------
url = "http://info.finance.naver.com/marketindex"

# Use the response as a context manager so the connection is always closed.
with urllib.request.urlopen(url) as response:
    soup = BeautifulSoup(response, "html.parser")

# On the web page, right-click the US currency value to inspect the element.
# The value is the <span class="value"> element under
# <div class="head_info point_up">:
#   <div class="head_info point_up">
#       <span class="value">1,142.70</span>
results = soup.select("span.value")
# e.g. <span class="value">1,142.70</span>

for result in results:
    # `.string` (not `.sting`) is the text of a tag with a single text child.
    print(result.string)
# e.g.
# 1,142.70
# 332.31
# 2,182.43
# ...

# Pair the labels with the first three values; zip() stops early instead of
# raising IndexError when the page yields fewer values than expected.
currency_labels = (
    "dollor-won currency",
    "yien-won currency",
    "euro-won currency",
)
for label, value_tag in zip(currency_labels, results):
    print(label, value_tag.string)

# ---------------------------------------------------------------------------
# Part 2: news data
# ---------------------------------------------------------------------------
url = "http://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=105"

# Collect the HTML page and parse it.
with urllib.request.urlopen(url) as response:
    soup = BeautifulSoup(response, "html.parser")

# Several elements on a page can share a class, but an id should be unique,
# so combine an id selector with a tag selector.
results = soup.select("#section_body a")

for result in results:
    href = result.attrs.get("href")
    if not href:
        # Anchor without a target: nothing to download.
        continue

    # Resolve relative links against the section page and skip anything that
    # is not an HTTP(S) URL, since urlopen cannot fetch it.
    url_article = urllib.parse.urljoin(url, href)
    if not url_article.startswith(("http://", "https://")):
        continue

    # Prefer the `title` attribute; fall back to the link text when absent.
    print('title: ', result.attrs.get("title") or result.get_text(strip=True))

    with urllib.request.urlopen(url_article) as response:
        soup_article = BeautifulSoup(response, "html.parser")

    content = soup_article.select_one("#articleBodyContents")
    # print(content.contents)

    # select_one() returns None when the page has no article body element.
    if content is not None:
        # Process the data: keep only the bare text children of the body.
        pieces = []
        for item in content.contents:
            # Children of `content` are bs4 node objects, so each one is
            # converted with str() before string handling.
            stripped = str(item).strip()
            # Skip whitespace-only children.
            if stripped == '':
                continue
            # Children whose text starts with '<' or '/' are markup, not text.
            if stripped.startswith(("<", "/")):
                continue
            pieces.append(stripped)

        output = "".join(pieces)
        print(output.replace('본문 내용TV플레이어', ''))

    # Pause between article requests.
    time.sleep(1)