How to use scrapy.Request to load an item from another page into an item loader

Question

How to use scrapy.Request to load an item from another page into an item loader

I've built a web scraper using Scrapy that is able to scrape items from each ticket listing on this site, but it can't scrape the ticket price, since the price isn't available on that page. When I try to request the next page to scrape the price, I get the error: exceptions.TypeError: 'XPathItemLoader' object has no attribute '__getitem__'. So far I have only been able to scrape items using item loaders, which is what I'm currently using, and I'm not entirely sure what the correct procedure is for passing items scraped on another page to the item loader (I've seen one way of doing this using the item data type, but it isn't applicable here). I think I may also have had trouble extracting into an item object because I am pipelining to a database, but I'm not exactly sure. If the code I am posting below can be changed to properly crawl to the next page, scrape the price, and add it to the item loader, I think the problem should be resolved. Any help would be appreciated. Thanks!

 # Spider as posted in the question. It is the version that raises the
 # TypeError described in the text above, so the code is left exactly as asked.
 class MySpider(CrawlSpider):
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    start_urls = [vs_url]
    # Relative XPath selecting one node per event (schema.org Event markup).
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'
    # Callback meant to run on the tickets page. Note that `loader` is neither
    # a parameter nor assigned anywhere inside this method.
    def parse_price(self, response):
        #First attempt at trying to load price into item loader
        loader.add_xpath('ticketPrice' , '//*[@class="eventTickets lastChild"]/div/div/@data-origin-price')
        print 'ticket price'
    # Builds one item loader per event node found on the listing page.
    def parse(self, response):
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):

            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader

            loader.add_xpath('eventName' , './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventLocation' , './/*[@class = "productionsVenue"]/span[@itemprop  = "name"]/text()')
            loader.add_xpath('ticketsLink' , './/*/td[3]/a/@href')
            loader.add_xpath('eventDate' , './/*[@class = "productionsDate"]/text()')
            loader.add_xpath('eventCity' , './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressLocality"]/text()')
            loader.add_xpath('eventState' , './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressRegion"]/text()')
            loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')

            # Subscripting the loader (loader[...]) needs __getitem__, which is
            # the attribute the reported TypeError says the loader lacks.
            # `bandname` is not defined in this snippet -- presumably set elsewhere.
            ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader["ticketsLink"]
            # The request is only assigned here; it is never yielded, so
            # presumably parse_price is never scheduled. Only the item is yielded.
            request = scrapy.Request(ticketsURL , callback = self.parse_price)
            yield loader.load_item()

+3

python html python-2.7 web-scraping scrapy

loremIpsum1771 04 Jul At 12:10 am

source to share

2 answers

I had exactly the same problem and solved it in another post. I am posting my code here (my original post is here):

from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy import Request
import re
from datetime import datetime, timedelta
from CAPjobs.items import CAPjobsItem 
from CAPjobs.items import CAPjobsItemLoader

class CAPjobSpider(Spider):
    """Collect job postings from a listing page and their detail pages.

    ``parse`` starts one item loader per job entry and forwards it, through
    the request ``meta``, to ``parse_subpage``, which adds the job location
    read from the detail page and emits the finished item.
    """

    name = "naturejob3"
    download_delay = 2
    #allowed_domains = ["nature.com/naturejobs/"]
    start_urls = [
        "http://www.nature.com/naturejobs/science/jobs?utf8=%E2%9C%93&q=pathologist&where=&commit=Find+Jobs",
    ]

    def parse_subpage(self, response):
        """Add the location found on the detail page and yield the item."""
        # The loader was created in parse() and travels in the request meta.
        loader = response.meta['il']
        # Read directly from this response and add by value.
        addresses = response.xpath(
            '//div[@id="extranav"]//ul[@class="job-addresses"]/li/text()'
        ).extract()
        loader.add_value('loc_pj', addresses)
        yield loader.load_item()

    def parse(self, response):
        """Yield one detail-page request per job entry on the listing page."""
        page = Selector(response)

        for job in page.xpath('//div[@class="job-details"]'):
            loader = CAPjobsItemLoader(CAPjobsItem(), selector=job)
            loader.add_xpath('title', 'h3/a/text()')
            loader.add_xpath('post_date', 'normalize-space(ul/li[@class="when"]/text())')
            loader.add_xpath('web_url', 'concat("http://www.nature.com", h3/a/@href)')

            # The processed web_url is the address of the job's detail page.
            detail_url = loader.get_output_value('web_url')
            yield Request(detail_url, meta={'il': loader}, callback=self.parse_subpage)

+1

LearnAWK 30 jul. 15 at 5:50 am

source to share

alecxe · Accepted Answer · 2015-07-04T03:15:18+0000

Key points to fix:

to get a value from the item loader, use get_output_value(); replace:

loader["ticketsLink"]

with:

loader.get_output_value("ticketsLink")

you need to pass the loader inside the request's meta, and yield/return the loaded item from the callback that receives it
when building the URL used to get the price, use urljoin() to join the relative part with the current URL

Here's the fixed version:

from urlparse import urljoin
# other imports

class MySpider(CrawlSpider):
    """Scrape event listings and complete each item with its ticket price.

    ``parse`` fills one item loader per event on the listing page and sends
    it, through the request ``meta``, to ``parse_price``, which adds the
    price taken from that event's tickets page and returns the finished item.
    """

    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    start_urls = [vs_url]
    # Relative XPath selecting one node per event on the listing page.
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'
    # XPath of the price attribute on the tickets page.
    ticket_price_xpath = '//*[@class="eventTickets lastChild"]/div/div/@data-origin-price'

    def parse_price(self, response):
        """Add the ticket price from the tickets page and return the item.

        The loader arrives in ``response.meta['loader']``. It is still bound
        to the listing-page selector it was created with in ``parse``, so
        ``loader.add_xpath`` here would query the listing page again rather
        than this response. The price is therefore extracted from
        ``response`` itself and added by value.
        """
        loader = response.meta['loader']
        prices = HtmlXPathSelector(response).select(self.ticket_price_xpath).extract()
        loader.add_value('ticketPrice', prices)
        return loader.load_item()

    def parse(self, response):
        """Yield one tickets-page request per event found on the listing page.

        Each request carries the partly filled loader in ``meta`` so that
        ``parse_price`` can finish and emit the item.
        """
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):

            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # strip every extracted value and join multiple values into one string
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # fields available on the listing page
            loader.add_xpath('eventName' , './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventLocation' , './/*[@class = "productionsVenue"]/span[@itemprop  = "name"]/text()')
            loader.add_xpath('ticketsLink' , './/*/td[3]/a/@href')
            loader.add_xpath('eventDate' , './/*[@class = "productionsDate"]/text()')
            loader.add_xpath('eventCity' , './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressLocality"]/text()')
            loader.add_xpath('eventState' , './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressRegion"]/text()')
            loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')

            # get_output_value() returns the processed field value; the loader
            # itself cannot be subscripted.
            # NOTE(review): `bandname` is not defined in this snippet --
            # presumably a module-level value; confirm.
            ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketsLink")
            # resolve the relative path against the URL of the current page
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price)

How to use scrapy.Request to load an item from another page into an item loader

More articles: