# 007-005. Collecting data from a dynamic web page
# @
# We will collect data from premierleague.com.
# The page below shows the final ranking table of the 15/16 season:
# https://www.premierleague.com/tables?co=1&se=42&mw=-1&ha=-1
# From the ranking table on that page we will collect, for each club:
# position, club, played, won, drawn, lost, gf, ga, gd, points, etc.
# @
# Create scrapy project named epl_crawler
# We will create Item class
class EPLItem(scrapy.Item):
    """One row of the Premier League ranking table.

    Each attribute below declares a field that the spider fills in for a
    single club.
    """

    # Club identity and its rank in the table.
    club_name = scrapy.Field()
    position = scrapy.Field()

    # Match record: games played, won, drawn and lost.
    played = scrapy.Field()
    won = scrapy.Field()
    drawn = scrapy.Field()
    lost = scrapy.Field()

    # Goal columns as labelled in the table: GF, GA and GD.
    gf = scrapy.Field()
    ga = scrapy.Field()
    gd = scrapy.Field()

    # Points total.
    points = scrapy.Field()
# @
# You will scrape the rendered ("visible") web page by using the selenium webdriver
# @
# I will define Spider class
# This time, instead of scraping the web page through the response object,
# we will use the selenium webdriver to request the url,
# and we will load the rendered html into a Selector object
# You should create epl_crawler/spiders/epl_spider.py
import time

import scrapy
from scrapy.selector import Selector
from selenium import webdriver

from epl_crawler.items import EPLItem
class EPLSpider(scrapy.Spider):
    """Spider that scrapes the Premier League ranking table through Selenium.

    The page is loaded in a Selenium-driven Chrome browser and the HTML that
    the browser holds is parsed with a Scrapy ``Selector``; the body of the
    Scrapy ``response`` itself is not used, only its URL.
    """

    # Name of this spider, used as ``scrapy crawl PremierLeague``.
    name = "PremierLeague"
    # Sites this spider is allowed to crawl.
    allowed_domains = ["premierleague.com"]
    # Sites the crawl starts from.
    start_urls = [
        "https://www.premierleague.com/tables?co=1&se=42&mw=-1&ha=-1"
    ]

    # Seconds to wait after opening the page so its contents can load.
    _RENDER_WAIT_SECONDS = 5

    # Rows of the ranking table, skipping rows whose class is "expandable".
    _ROW_XPATH = (
        '//*[@id="mainContent"]/div/div[1]/div[3]/div/div/table/tbody'
        '/tr[not(@class="expandable")]'
    )

    # Item field name -> XPath of its text, relative to one table row.
    _FIELD_XPATHS = {
        "club_name": './td[3]/a/span[2]/text()',
        "position": './td[2]/span[1]/text()',
        "played": './td[4]/text()',
        "won": './td[5]/text()',
        "drawn": './td[6]/text()',
        "lost": './td[7]/text()',
        "gf": './td[8]/text()',
        "ga": './td[9]/text()',
        "gd": './td[10]/text()',
        "points": './td[11]/text()',
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and open a Chrome browser.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments (``scrapy crawl ... -a key=value``) keep
        working.
        """
        super(EPLSpider, self).__init__(*args, **kwargs)
        # NOTE(review): the chromedriver path is machine-specific, and newer
        # Selenium releases may not accept it as a positional argument --
        # confirm against the installed Selenium version.
        self.browser = webdriver.Chrome("/Users/kilho/chromedriver")

    def closed(self, reason):
        """Quit the browser when the spider closes.

        Without this the Chrome window and its driver process would be left
        running after the crawl ends.
        """
        self.browser.quit()

    def parse(self, response):
        """Yield one ``EPLItem`` per row of the ranking table.

        Args:
            response: the Scrapy response; only ``response.url`` is used.

        Yields:
            EPLItem: an item whose fields hold the text of the matching
            cells, or ``None`` for a cell that is not found in the row.
        """
        # Open the page in the Selenium-driven browser.
        self.browser.get(response.url)
        # Give the page time to load all of its contents before reading it.
        time.sleep(self._RENDER_WAIT_SECONDS)

        # ``page_source`` is used instead of ``find_element_by_xpath`` because
        # the latter is not available in every Selenium version.
        html = self.browser.page_source
        selector = Selector(text=html)

        for row in selector.xpath(self._ROW_XPATH):
            item = EPLItem()
            for field, xpath in self._FIELD_XPATHS.items():
                # extract_first() returns None rather than raising when a
                # cell is missing, so one odd row cannot abort the crawl.
                item[field] = row.xpath(xpath).extract_first()
            yield item
# @
# Run the spider from the project directory (shell command, not Python);
# the collected items are written to pl.csv:
#   scrapy crawl PremierLeague -o pl.csv
# @
# For more details on crawling and scraping web pages,
# including pages that require a login process, see:
# http://selenium-python.readthedocs.io/