My first HTML document

# 007-005. collecting data in dynamic web page

# @
# We will collect data from premierleague.com
# We can see 'final ranking in 15/16 season'
# https://www.premierleague.com/tables?co=1&se=42&mw=-1&ha=-1

# We will collect data of position, club, played, won, drawn, lost,
# gf, ga, gd, points, etc
# from 'ranking table' on web page

# @
# Create scrapy project named epl_crawler

# We will create Item class (imported later as epl_crawler.items.EPLItem)
import scrapy


class EPLItem(scrapy.Item):
    """Container for one row of the ranking table."""

    # Define member variables for your items
    # name = scrapy.Field()
    club_name = scrapy.Field()
    position = scrapy.Field()
    played = scrapy.Field()
    won = scrapy.Field()
    drawn = scrapy.Field()
    lost = scrapy.Field()
    gf = scrapy.Field()
    ga = scrapy.Field()
    gd = scrapy.Field()
    points = scrapy.Field()


# @
# You will scrap "visible" web page by using selenium webdriver

# @
# I will define Spider class
# At this time, instead of scraping the web page by using the response object,
# we will use selenium webdriver to request url,
# and we will get response into selector object

# You should create epl_crawler/spiders/epl_spider.py
import time

import scrapy
from scrapy.selector import Selector
from selenium import webdriver

from epl_crawler.items import EPLItem


class EPLSpider(scrapy.Spider):
    """Spider that loads the ranking page in Chrome and yields one EPLItem per row.

    The page is opened with a selenium-driven browser, and the HTML taken
    from that browser is parsed with a scrapy Selector instead of using
    the scrapy response body directly.
    """

    # This is name of this spider
    name = "PremierLeague"

    # This is allowed sites list which spider can crawl
    allowed_domains = ["premierleague.com"]

    # This list of sites represents starting site when you start crawling
    start_urls = [
        "https://www.premierleague.com/tables?co=1&se=42&mw=-1&ha=-1"
    ]

    # This is overridden constructor
    def __init__(self, *args, **kwargs):
        """Initialise the base spider and open a Chrome browser.

        ``*args`` / ``**kwargs`` are forwarded to ``scrapy.Spider`` so that
        spider arguments given on the command line (``-a key=value``) do not
        break construction.
        """
        # This is constructor method of base class
        scrapy.Spider.__init__(self, *args, **kwargs)

        # I let chrome webdriver to open browser
        # I assign opened browser into self.browser
        self.browser = webdriver.Chrome("/Users/kilho/chromedriver")

    def closed(self, reason):
        """Quit the browser when the spider closes.

        Scrapy calls this hook when the spider is closed; without it the
        browser opened in ``__init__`` would stay running after the crawl.
        """
        self.browser.quit()

    def parse(self, response):
        """Yield an EPLItem for every non-expandable row of the ranking table.

        ``response`` is only used for its ``url``; the HTML that is parsed
        comes from the selenium browser.
        """
        # I open web page with corresponding url
        # by using selenium web browser with chrome webdriver
        self.browser.get(response.url)

        # I will have 5 seconds to give enough time to load all contents
        # between self.browser.get() and
        # self.browser.find_element_by_xpath().get_attribute()
        time.sleep(5)

        # self.browser pulled web page from above self.browser.get(response.url)
        # And I need to bring all html codes by using xpath
        # I assign result into html local variable
        html = self.browser.find_element_by_xpath('//*').get_attribute('outerHTML')

        # I create Selector instance with passing text=html
        selector = Selector(text=html)

        # I choose elements what I want to extract by using xpath
        rows = selector.xpath('//*[@id="mainContent"]/div/div[1]/div[3]/div/div/table/tbody/tr[not(@class="expandable")]')

        for row in rows:
            # I create EPLItem instance which has member variables
            item = EPLItem()

            # I will find text of element what I want to extract from each row
            item["club_name"] = row.xpath('./td[3]/a/span[2]/text()')[0].extract()
            item["position"] = row.xpath('./td[2]/span[1]/text()')[0].extract()
            item["played"] = row.xpath('./td[4]/text()')[0].extract()
            item["won"] = row.xpath('./td[5]/text()')[0].extract()
            item["drawn"] = row.xpath('./td[6]/text()')[0].extract()
            item["lost"] = row.xpath('./td[7]/text()')[0].extract()
            item["gf"] = row.xpath('./td[8]/text()')[0].extract()
            item["ga"] = row.xpath('./td[9]/text()')[0].extract()
            item["gd"] = row.xpath('./td[10]/text()')[0].extract()
            item["points"] = row.xpath('./td[11]/text()')[0].extract()

            # I return item instance
            yield item


# @
# Run spider to crawl (shell command, run from the project directory)
# scrapy crawl PremierLeague -o pl.csv

# @
# For more detail for crawling and scraping web page
# which requires login process, you can reference following site
# http://selenium-python.readthedocs.io/