007-004. Scraping a static web page
# @
# We can scrape movie data from rottentomatoes.com
# @
# Go to https://www.rottentomatoes.com/top/bestofrt/?year=2015
# to see the 'top 100 movies of 2015' list
# Click a movie to open its detail page
# We will scrape the title, score, genres, and critics consensus from each detail page
# @
# Move to the folder where you will work on this task
cd flearning_da
# Create a new Scrapy project
scrapy startproject rt_crawler
# The rt_crawler project is created under the folder you are in,
# and you get the following set of Scrapy components
rt_crawler/
    scrapy.cfg
    rt_crawler/          # Python module for the project
        __init__.py
        items.py         # Item definitions for the project
        pipelines.py     # Item pipeline file for the project
        settings.py      # Settings file for the project
        spiders/         # Directory that stores the Spiders
            __init__.py
            ...
# @
# Let's define the Item class
# Open rt_crawler/items.py and add the following code
import scrapy

# RTItem inherits the scrapy.Item base class provided by Scrapy
# We will store the scraped data in RTItem objects
class RTItem(scrapy.Item):
    # Define the fields that will hold the scraped data
    # name = scrapy.Field()
    title = scrapy.Field()
    score = scrapy.Field()
    genres = scrapy.Field()
    consensus = scrapy.Field()
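# @
# Quick sanity check (optional; a hedged sketch, and the values below are
# placeholders, not scraped data): a scrapy.Item can be filled and read like
# a dictionary. You can try this in a Python shell started from the project folder
from rt_crawler.items import RTItem

item = RTItem()
item["title"] = "Some Movie"
item["genres"] = ["Drama", "Mystery"]
print(item["title"])   # Some Movie
print(dict(item))      # {'title': 'Some Movie', 'genres': ['Drama', 'Mystery']}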
# @
# Let's define the Spider class
# Create rt_crawler/spiders/rt_spider.py and add the following code
import scrapy
# RTItem is the Item class we defined above
from rt_crawler.items import RTItem

# RTSpider inherits the scrapy.Spider class
# and defines the class attributes name, allowed_domains, and start_urls
class RTSpider(scrapy.Spider):
    # Name of this Spider
    # You use this name when you run the crawl
    name = "RottenTomatoes"
    # List of domains the Spider is allowed to crawl
    allowed_domains = ["rottentomatoes.com"]
    # List of URLs the crawl starts from
    # The Spider first requests each of these URLs,
    # the response is stored in a response object,
    # and that response object is processed by parse(), the method we define next
    start_urls = [
        "https://www.rottentomatoes.com/top/bestofrt/?year=2015"
    ]
    # parse() specifies which parts to scrape from the starting page,
    # whose HTML is stored in the response object
    def parse(self, response):
        # If you want the hyperlink of the first title,
        # hover over it in the Chrome developer tools and copy its XPath
        # Then try
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody/tr[1]/td[3]/a')
        # in the scrapy shell (see the shell sketch after this spider code)
        # You won't see any results: the copied XPath and the developer tools
        # are not almighty, because the browser adds a <tbody> tag that is not
        # present in the raw HTML Scrapy receives
        # In this case, remove parts from the end one by one and rerun the command
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody/tr[1]/td[3]/a')
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody/tr[1]/td[3]')
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody/tr[1]')
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody')
        # response.xpath('//*[@id="top_movies_main"]/div/table')
        # Only after removing tbody do you get a Selector back
        # So drop tbody and select the tr[1], td[3], a elements
        # response.xpath('//*[@id="top_movies_main"]/div/table/tr[1]/td[3]/a')
        # This is the hyperlink element, the <a> tag
        # Add @href to get the value of the href attribute
        # response.xpath('//*[@id="top_movies_main"]/div/table/tr[1]/td[3]/a/@href')
        # Then extract the value
        # response.xpath('//*[@id="top_movies_main"]/div/table/tr[1]/td[3]/a/@href')[0].extract()
        # < 'm/mad_max_fury_road/'
        # This is only part of the full URL
        # To build the full URL, use response.urljoin()
        # partial_url = response.xpath('//*[@id="top_movies_main"]/div/table/tr[1]/td[3]/a/@href')[0].extract()
        # response.urljoin(partial_url)
        # < 'https://www.rottentomatoes.com/m/mad_max_fury_road'
        # Apply the steps above in a for loop to collect all the URLs:
        # iterate over every row of the top movies table
        for tr in response.xpath('//*[@id="top_movies_main"]/div/table/tr'):
            # Get the value of the href attribute
            href = tr.xpath('./td[3]/a/@href')
            # Build the full URL
            url = response.urljoin(href[0].extract())
            # Request each full URL,
            # passing callback=self.parse_page_contents,
            # a method we will define below
            # For example, when 'https://www.rottentomatoes.com/m/mad_max_fury_road' is requested,
            # the HTML comes back as a response,
            # which is then processed by the callback function parse_page_contents()
            yield scrapy.Request(url, callback=self.parse_page_contents)
    # parse_page_contents() specifies how to scrape data
    # from the HTML of each detailed movie page
    # We can use the same method for every detail page,
    # because the pages share the same structure
    # It receives the response for one movie page
    def parse_page_contents(self, response):
        # Create an RTItem instance
        item = RTItem()
        # You can find the XPath of the title, score, genres, and consensus
        # with the Chrome developer tools,
        # then assign each extracted value to the RTItem object
        item["title"] = response.xpath('//*[@id="movie-title"]/text()')[0].extract().strip()
        item["score"] = response.xpath('//*[@id="tomato_meter_link"]/span[2]/span/text()')[0].extract()
        item["genres"] = response.xpath('//*[@id="mainColumn"]/section[3]/div/div/div[2]/div[4]//span/text()').extract()  # list of genres
        consensus_list = response.xpath('//*[@id="all-critics-numbers"]/div/div[2]/p//text()').extract()[2:]
        item["consensus"] = ' '.join(consensus_list).strip()
        # Return the item object with the yield keyword
        yield item
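# @
# A hedged sketch of checking these XPaths interactively, as mentioned in the
# comments above (the output depends on the live page, so treat it as illustrative):
#   scrapy shell "https://www.rottentomatoes.com/top/bestofrt/?year=2015"
response.xpath('//*[@id="top_movies_main"]/div/table/tbody/tr[1]/td[3]/a')   # no match: tbody is not in the raw HTML
href = response.xpath('//*[@id="top_movies_main"]/div/table/tr[1]/td[3]/a/@href')[0].extract()
url = response.urljoin(href)   # absolute URL of the first movie page
# fetch() loads another page into the shell, so you can also test the detail-page XPaths
fetch(url)
response.xpath('//*[@id="movie-title"]/text()')[0].extract().strip()   # the movie title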
# @
# Move to the rt_crawler project folder
cd rt_crawler
# Now you can run the spider named RottenTomatoes
scrapy crawl RottenTomatoes
# You will see a long log output,
# including dictionaries such as
# {'consensus': 'Spotlight gracefully...', 'genres': ['Drama', 'Mystery'], 'score': '96', 'title': 'Spotlight'}
# If you want to save the crawled and scraped data into a CSV file,
# use the -o option when you run the RottenTomatoes spider
scrapy crawl RottenTomatoes -o rt.csv
# @
# But the layout of the contents is not that good with the default CSV settings
# To fix this, you can use an Item pipeline class:
# by writing one, you define how the collected data is processed
# and how it is saved to a file
# Open rt_crawler/pipelines.py and add the following code
import csv

# Define the RTPipeline class
class RTPipeline(object):
    # Constructor
    def __init__(self):
        # Use the csv module to create a writer
        # that writes one row at a time into the rt_movies_new.csv file
        self.csvwriter = csv.writer(open("rt_movies_new.csv", "w"))
        # Write a header row; the data will be stored in this order
        self.csvwriter.writerow(["title", "score", "genres", "consensus"])

    # process_item() specifies how to process each Item collected by the Spider
    def process_item(self, item, spider):
        # Build one list holding the value of each Item field
        row = []
        row.append(item["title"])
        row.append(item["score"])
        # The genres field is a list,
        # so first join its elements with "|" to make a single string,
        # then append that string
        row.append('|'.join(item["genres"]))
        row.append(item["consensus"])
        # Write the assembled row:
        # one row of rt_movies_new.csv per detailed movie page
        self.csvwriter.writerow(row)
        return item
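# @
# A hedged variant (not required for this tutorial): instead of opening the file
# in __init__ and never closing it, a pipeline can use Scrapy's open_spider() and
# close_spider() hooks so the file handle is closed cleanly when the crawl ends
import csv

class RTPipeline(object):
    def open_spider(self, spider):
        # Called when the spider starts: open the file and write the header row
        self.file = open("rt_movies_new.csv", "w")
        self.csvwriter = csv.writer(self.file)
        self.csvwriter.writerow(["title", "score", "genres", "consensus"])

    def close_spider(self, spider):
        # Called when the spider finishes: close the file
        self.file.close()

    def process_item(self, item, spider):
        self.csvwriter.writerow([item["title"], item["score"],
                                 '|'.join(item["genres"]), item["consensus"]])
        return item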
# @
# After defining the Item pipeline,
# you have to enable it in the project settings
# so that it is actually used when you run the crawl
# Open rt_crawler/settings.py,
# find ITEM_PIPELINES, uncomment it, and edit it as shown below
# (the number is an order priority between 0 and 1000; pipelines with lower values run first)
ITEM_PIPELINES = {
    # 'rt_crawler.pipelines.SomePipeline': 300,
    # RTPipeline is the class we just created
    'rt_crawler.pipelines.RTPipeline': 300,
}
# @
# Run the RottenTomatoes spider again
scrapy crawl RottenTomatoes
# This time the collected items are written into the CSV file,
# because we configured the Item pipeline to do so
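# @
# Optional quick check (a hedged sketch; it assumes the crawl produced
# rt_movies_new.csv in the current folder): read the file back with Python's csv module
import csv

with open("rt_movies_new.csv") as f:
    for row in csv.reader(f):
        print(row)   # e.g. ['title', 'score', 'genres', 'consensus'] for the header row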
# @
# For more details on using Scrapy, see
# http://scrapy.readthedocs.io