007-004. scraping static web page

# @
# We can scrape movie data from rottentomatoes.com

# @
# Go to https://www.rottentomatoes.com/top/bestofrt/?year=2015
# to see the 'top 100 popular movies released in 2015' list
# Click a movie and you can see detailed information about that movie
# We will scrape the title, score, genres, and critics consensus from each detail page

# @
# Move to the folder where you will work on this task
cd flearning_da
# Create a new scrapy project
scrapy startproject rt_crawler
# The rt_crawler project is created under the folder you are located in,
# and you get the standard set of scrapy components
rt_crawler/
    scrapy.cfg
    rt_crawler/            # Python module for the project
        spiders/           # Directory where Spiders are stored
            __init__.py
        items.py           # Item file for the project
        pipelines.py       # Item pipeline file for the project
        settings.py        # Settings file for the project
        __init__.py
        ...

# @
# Let's define the Item class
# Open rt_crawler/items.py and add the following code
# RTItem inherits from the scrapy.Item base class provided by scrapy
# We will store the scraped data in RTItem objects
import scrapy

class RTItem(scrapy.Item):
    # Define member variables (fields) to store your scraped data
    # name = scrapy.Field()
    title = scrapy.Field()
    score = scrapy.Field()
    genres = scrapy.Field()
    consensus = scrapy.Field()

# @
# Let's define the Spider class
# Create rt_crawler/spiders/rt_spider.py and add the following code
import scrapy
# RTItem is the class we created above
from rt_crawler.items import RTItem

# I define the RTSpider class, which inherits from scrapy.Spider
# I will define the member variables name, allowed_domains, and start_urls
class RTSpider(scrapy.Spider):
    # This is the name of this Spider
    # When you crawl and scrape the web with this Spider,
    # you will refer to it by this name
    name = "RottenTomatoes"

    # This is the list of domains the Spider is allowed to crawl
    allowed_domains = ["rottentomatoes.com"]

    # This is the list of pages crawling starts from
    # The Spider first sends a request for each start URL to the server,
    # gets the response back, and stores it in a response object
    # The response object is then processed by parse(), which we define next
    start_urls = [
        "https://www.rottentomatoes.com/top/bestofrt/?year=2015"
    ]

    # In parse() you specify which parts you want to scrape
    # from the starting page stored in the response object
    def parse(self, response):
        # If you want to find the hyperlink of the first title,
        # hover the mouse over it in the Chrome developer tools and copy its XPath
        # Then try
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody/tr[1]/td[3]/a')
        # in the scrapy shell
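        # (Side note, assuming scrapy is installed: you can open the interactive shell
        # against the listing page with the command below, run from the project folder,
        # and then test each response.xpath() expression there.
        # It is written as a comment here so that it does not break the spider code.)
        # scrapy shell "https://www.rottentomatoes.com/top/bestofrt/?year=2015"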
        # But you won't see any results, which shows that the copied XPath
        # and the Chrome developer tools are not almighty
        # In this case, remove one part at a time from the end and rerun the command
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody/tr[1]/td[3]/a')
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody/tr[1]/td[3]')
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody/tr[1]')
        # response.xpath('//*[@id="top_movies_main"]/div/table/tbody')
        # response.xpath('//*[@id="top_movies_main"]/div/table')
        #
        # Only after removing tbody do you see a Selector
        # Then try this to select the tr[1], td[3], a elements without tbody
        # response.xpath('//*[@id="top_movies_main"]/div/table/tr[1]/td[3]/a')
        # This is the hyperlink element, the "a" tag
        # You need to specify @href to get the value of the href attribute
        # response.xpath('//*[@id="top_movies_main"]/div/table/tr[1]/td[3]/a/@href')
        # Then extract its text
        # response.xpath('//*[@id="top_movies_main"]/div/table/tr[1]/td[3]/a/@href')[0].extract()
        # < '/m/mad_max_fury_road/'
        # But the value above is only the path part of the full URL
        # To obtain the full URL, you can use response.urljoin()
        # partial_url = response.xpath('//*[@id="top_movies_main"]/div/table/tr[1]/td[3]/a/@href')[0].extract()
        # response.urljoin(partial_url)
        # < 'https://www.rottentomatoes.com/m/mad_max_fury_road/'

        # Apply the steps above in a for loop to collect all the URLs
        # I select every table row under the id="top_movies_main" element
        # and iterate over the rows one by one
        for tr in response.xpath('//*[@id="top_movies_main"]/div/table/tr'):
            # I get the value of the href attribute
            href = tr.xpath('./td[3]/a/@href')
            # I build the full URL
            url = response.urljoin(href[0].extract())
            # I request each full URL
            # and specify callback=self.parse_page_contents,
            # a method which will be defined below
            # For example, when I request 'https://www.rottentomatoes.com/m/mad_max_fury_road',
            # I get the page's HTML as the response,
            # and that HTML is processed by the callback function parse_page_contents()
            yield scrapy.Request(url, callback=self.parse_page_contents)

    # parse_page_contents() specifies how to scrape data
    # from the HTML of each detailed movie information page
    # We can use this one method for every detail page,
    # because each page has the same structure
    # It takes the response of each detail-page request
    def parse_page_contents(self, response):
        # I create an RTItem instance
        # class RTItem(scrapy.Item):
        #     # name = scrapy.Field()
        #     title = scrapy.Field()
        #     score = scrapy.Field()
        #     genres = scrapy.Field()
        #     consensus = scrapy.Field()
        item = RTItem()
        # You can find the XPath of the title, score, genres, and consensus
        # with the Chrome developer tools,
        # and then assign each element to the RTItem object
        item["title"] = response.xpath('//*[@id="movie-title"]/text()')[0].extract().strip()
        item["score"] = response.xpath('//*[@id="tomato_meter_link"]/span[2]/span/text()')[0].extract()
        # list of genres
        item["genres"] = response.xpath('//*[@id="mainColumn"]/section[3]/div/div/div[2]/div[4]//span/text()').extract()
        consensus_list = response.xpath('//*[@id="all-critics-numbers"]/div/div[2]/p//text()').extract()[2:]
        item["consensus"] = ' '.join(consensus_list).strip()
        # I return the item object with the yield keyword
        yield item

# @
# Move to the rt_crawler project folder
# and run the spider named RottenTomatoes
scrapy crawl RottenTomatoes
# You will see a long run of log output,
# including dictionaries like
# {'consensus': 'Spotlight gracefully...', 'genres': ['Drama', 'Mystery'], 'score': '96', 'title': 'Spotlight'}
# If you want to save the crawled and scraped data into a csv file,
# you can use the -o option when you run the RottenTomatoes spider
scrapy crawl RottenTomatoes -o rt.csv
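# As a small extra example: the -o feed export infers the output format
# from the file extension, so the same run could save JSON instead of csv
scrapy crawl RottenTomatoes -o rt.json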
row.append(item["score"]) # In case of genre member variable, it is originally list data type # So, I first concatenate each element of list by "|" to make one string # And then I append each one string row.append('|'.join(item["genres"])) row.append(item["consensus"]) # I pass row which I processed from above # What I did is writing one row of rt_movies_new.csv file for one detailed movie data self.csvwriter.writerow(row) return item # @ # After defining Item pipeline, # you should configure defined Item pipeline able to be used # in Settings when you run crawling # Open rt_crawler/settings.py file, # then, find ITEM_PIPELINES, uncomment ITEM_PIPELINES, edit ITEM_PIPELINES ITEM_PIPELINES = { # 'rt_crawler.pipelines.SomePipeline': 300, # RTPipeline is what we created 'rt_crawler.pipelines.RTPipeline': 300, } # @ # You run RottenTomatoes spider scrapy crawl RottenTomatoes # Collected items are written into csv file # because we configured by using Item pipeline # @ # For more detail of using scrapy, go to this site # http://scrapy.readthedocs.io