How to use scrapy.Request to load an element from another page into an item

I have created a web scraper using Scrapy that is able to scrape elements from every ticket on this site, but it cannot scrape the ticket price, since the price is not available on the listing page. When I try to request the next page to scrape the price, it fails with the error: exceptions.TypeError: 'XPathItemLoader' object has no attribute '__getitem__'. So far I have only been able to scrape elements using item loaders, so that is what I am currently using, and I am not entirely sure what the correct procedure is for passing an element scraped on another page to the item loader (I have seen one way to do this using the item data type, but it is not applicable here). I think I may have had problems scraping elements into the item data type because I am pipelining into a database, but I am not exactly sure. If the code I am posting below can be changed to properly go to the next page, scrape the price and add it to the item loader, I think the problem should be resolved. Any help would be appreciated. Thanks!

 class MySpider(CrawlSpider):
    # Spider from the question: loads event fields from the listing page into
    # an item loader and tries to fetch the ticket price from a second page.
    # NOTE(review): this is the failing version the question asks about; the
    # review notes below mark where it goes wrong. The code itself is unchanged.
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    # vs_url is not defined in this snippet -- presumably set elsewhere in the module.
    start_urls = [vs_url]
    # Relative XPath selecting every node whose itemtype is schema.org/Event.
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'
    def parse_price(self, response):
        #First attempt at trying to load price into item loader
        # NOTE(review): `loader` is a local variable of parse() and is never
        # handed to this callback, so the name is unbound here.
        loader.add_xpath('ticketPrice' , '//*[@class="eventTickets lastChild"]/div/div/@data-origin-price')
        print 'ticket price'
    def parse(self, response):
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):

            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader
            # Strip whitespace from every extracted value, then join the values
            # of each field into a single string on output.
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader

            loader.add_xpath('eventName' , './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventLocation' , './/*[@class = "productionsVenue"]/span[@itemprop  = "name"]/text()')
            loader.add_xpath('ticketsLink' , './/*/td[3]/a/@href')
            loader.add_xpath('eventDate' , './/*[@class = "productionsDate"]/text()')
            loader.add_xpath('eventCity' , './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressLocality"]/text()')
            loader.add_xpath('eventState' , './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressRegion"]/text()')
            loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')

            # NOTE(review): the loader is indexed like a dict here; this is
            # presumably what raises the "__getitem__" TypeError quoted in the
            # question. `bandname` is not defined in this snippet.
            ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader["ticketsLink"]
            # NOTE(review): the request is created but never yielded, so
            # parse_price is not scheduled; only the item below is emitted.
            request = scrapy.Request(ticketsURL , callback = self.parse_price)
            yield loader.load_item()

      

+3


source to share


2 answers


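Key points to fix:

- To read a value back from the item loader, call get_output_value() (here loader.get_output_value("ticketsLink")) instead of indexing the loader with loader["ticketsLink"].
- Pass the loader to the price callback through the request's meta, yield the request, and return the loaded item from that callback.
- Use urljoin() to turn the relative tickets URL into an absolute one based on the current response URL.
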

Here's the fixed version:

from urlparse import urljoin
# other imports

class MySpider(CrawlSpider):
    """Scrape events from the listing page and attach each ticket price.

    ``parse`` builds one item loader per event on the listing page and
    requests that event's tickets page, carrying the loader along in the
    request's ``meta``. ``parse_price`` adds the price found on the tickets
    page and emits the finished item.

    NOTE(review): CrawlSpider uses ``parse`` internally to drive its rules;
    overriding it only works here because no rules are defined. A plain
    Spider base class would be the safer choice -- confirm before changing.
    """

    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    # vs_url is expected to be defined elsewhere in the module.
    start_urls = [vs_url]
    # Relative XPath selecting every node whose itemtype is schema.org/Event.
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    def parse_price(self, response):
        """Add the ticket price from the tickets page and return the item.

        ``response.meta['loader']`` is the loader created in ``parse`` for
        the event this tickets page belongs to.
        """
        loader = response.meta['loader']
        # The loader is still bound to the listing-page selector it was built
        # with, so loader.add_xpath() would search the listing page again and
        # not this response. Extract from this response explicitly and hand
        # the values over with add_value().
        prices = HtmlXPathSelector(response).select(
            '//*[@class="eventTickets lastChild"]/div/div/@data-origin-price'
        ).extract()
        loader.add_value('ticketPrice', prices)
        return loader.load_item()

    def parse(self, response):
        """Yield one tickets-page request per event found on the listing page."""
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):

            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # Strip whitespace from every extracted value, then join the values
            # of each field into a single string on output.
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # Fields available on the listing page itself.
            loader.add_xpath('eventName' , './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventLocation' , './/*[@class = "productionsVenue"]/span[@itemprop  = "name"]/text()')
            loader.add_xpath('ticketsLink' , './/*/td[3]/a/@href')
            loader.add_xpath('eventDate' , './/*[@class = "productionsDate"]/text()')
            loader.add_xpath('eventCity' , './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressLocality"]/text()')
            loader.add_xpath('eventState' , './/*[@class = "productionsVenue"]/span[@itemprop  = "address"]/span[@itemprop  = "addressRegion"]/text()')
            loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')

            # get_output_value() runs the output processor (Join) without
            # building the item; a loader cannot be indexed like a dict.
            # `bandname` is expected to be defined elsewhere in the module.
            ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketsLink")
            # Requests need an absolute URL, so resolve against the current page.
            ticketsURL = urljoin(response.url, ticketsURL)
            # The item is emitted by parse_price once the price has been added.
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price)

      

+5


source


I had exactly the same problem and solved it in another post. I am posting my code here (my original post is here):



from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy import Request
import re
from datetime import datetime, timedelta
from CAPjobs.items import CAPjobsItem 
from CAPjobs.items import CAPjobsItemLoader

class CAPjobSpider(Spider):
    """Collect job postings from a naturejobs search and add each job's location.

    ``parse`` starts one item loader per job entry on the search results page
    and follows the job's link, passing the loader along in the request's
    ``meta``. ``parse_subpage`` adds the location found on the job page and
    emits the finished item.
    """

    name = "naturejob3"
    download_delay = 2
    #allowed_domains = ["nature.com/naturejobs/"]
    start_urls = [
        "http://www.nature.com/naturejobs/science/jobs?utf8=%E2%9C%93&q=pathologist&where=&commit=Find+Jobs",
    ]

    def parse(self, response):
        """Yield one detail-page request per job entry on the results page."""
        job_nodes = Selector(response).xpath('//div[@class="job-details"]')

        for job_node in job_nodes:
            # One loader per job, bound to that job's node on the results page.
            loader = CAPjobsItemLoader(CAPjobsItem(), selector=job_node)
            loader.add_xpath('title', 'h3/a/text()')
            loader.add_xpath('post_date', 'normalize-space(ul/li[@class="when"]/text())')
            loader.add_xpath('web_url', 'concat("http://www.nature.com", h3/a/@href)')

            # The loader travels with the request so the callback can finish it.
            yield Request(
                loader.get_output_value('web_url'),
                callback=self.parse_subpage,
                meta={'il': loader},
            )

    def parse_subpage(self, response):
        """Add the job location from the detail page and emit the item."""
        loader = response.meta['il']
        # Extract from this response directly and pass the values with add_value().
        loader.add_value(
            'loc_pj',
            response.xpath('//div[@id="extranav"]//ul[@class="job-addresses"]/li/text()').extract(),
        )
        yield loader.load_item()

      

+1


source







All Articles