The correct way to launch multiple scraping spiders
I just tried to run multiple spiders in the same process using the new Scrapy documentation, but I get: AttributeError: 'CrawlerProcess' object has no attribute 'crawl'
I found this post with the same problem, so I tried using code from the 0.24 documentation and got: runspider: error: Unable to load 'price_comparator.py': No module named testspiders.spiders.followall
For 1.0, I imported:
from scrapy.crawler import CrawlerProcess
and for 0.24 I imported:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log
from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
Based on the 0.24 documentation, that code seems to run a single spider across multiple domains, which is not what the 1.0 code does, but I ran both anyway. The code that runs both spiders lives inside the same file where my spiders are defined, so this could be a problem. Is there some internal problem with the new version of the code, or is there some dependency or code missing from my program? I have included a file with the code from both documents below (I did not run both versions at the same time).
Spider class definitions
corresponding code below
import re
import json
import scrapy
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider , Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from concert_comparator.items import ComparatorItem, ComparatorItem2
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess
#from scrapy.crawler import Crawler
from scrapy import log
#from testspiders.spiders.followall import FollowAllSpider
from scrapy.utils.project import get_project_settings
from urlparse import urljoin
# Prompt for a band name at import time; it is interpolated into the
# start URLs that both spider classes below read at class-creation time.
bandname = raw_input("Enter a bandname \n")
#location = raw_input("Enter a city \n")
# Listing-page URLs for the two ticket marketplaces being compared.
vs_url = "http://www.vividseats.com/concerts/" + bandname + "-tickets.html"
sg_url = "http://www.seatgeek.com/" + bandname + "-tickets"
#sh_url = "http://www.stubhub.com/" + bandname + "-tickets/"
#print sh_url
#rules = (Rule(LinkExtractor(allow=("concerts/" + bandname + "-tickets/" + bandname + "-" + item["ticketsLink"]),restrict_xpaths=('.//*/td[3]/a/@href',))callback = "parse_tickets" , follow = True,))
class MySpider(CrawlSpider):
handle_httpstatus_list = [416]
name = 'comparator'
allowed_domains = ["www.vividseats.com"]
start_urls = [vs_url]
tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'
def parse_json(self, response):
loader = response.meta['loader']
jsonresponse = json.loads(response.body_as_unicode())
ticket_info = jsonresponse.get('tickets')
price_list = [i.get('p') for i in ticket_info]
ticketPrice = ''.join(price_list[0])
loader.add_value('ticketPrice', ticketPrice)
return loader.load_item()
def parse_price(self, response):
loader = response.meta['loader']
ticketsLink = loader.get_output_value("ticketsLink")
json_id_list= re.findall(r"(\d+)[^-]*$", ticketsLink)
json_id= "".join(json_id_list)
json_url = "http://www.vividseats.com/javascript/tickets.shtml?productionId=" + json_id
yield scrapy.Request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = True)
def parse(self, response):
"""
# """
selector = HtmlXPathSelector(response)
# iterate over tickets
for ticket in selector.select(self.tickets_list_xpath):
loader = XPathItemLoader(ComparatorItem(), selector=ticket)
# define loader
loader.default_input_processor = MapCompose(unicode.strip)
loader.default_output_processor = Join()
# iterate over fields and add xpaths to the loader
loader.add_xpath('eventName' , './/*[@class="productionsEvent"]/text()')
loader.add_xpath('eventLocation' , './/*[@class = "productionsVenue"]/span[@itemprop = "name"]/text()')
loader.add_xpath('ticketsLink' , './/*/a[@class = "btn btn-primary"]/@href')
loader.add_xpath('eventDate' , './/*[@class = "productionsDate"]/text()')
loader.add_xpath('eventCity' , './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressLocality"]/text()')
loader.add_xpath('eventState' , './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressRegion"]/text()')
loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')
print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
#sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketsLink")
ticketsURL = urljoin(response.url, ticketsURL)
yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
class MySpider2(CrawlSpider):
    """Scrape event listings for the requested band from seatgeek.com.

    Crawl flow mirrors MySpider: ``parse`` walks the listings page,
    ``parse_price2`` builds the JSON listings-feed URL for one event,
    and ``parse_json2`` extracts the ticket price and yields the item.
    """

    handle_httpstatus_list = [416]  # treat 416 responses as normal pages
    name = 'comparator2'
    # FIX: trailing slash removed — offsite filtering matches on the bare
    # hostname, so "www.seatgeek.com/" could never match any request host.
    allowed_domains = ["www.seatgeek.com"]
    start_urls = [sg_url]
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    def parse_json2(self, response):
        """Extract the first listing price from the JSON feed and finish the item."""
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        listings_info = jsonresponse.get('listings')
        # FIX: the original iterated over `ticket_info`, a name that only
        # exists in MySpider.parse_json (copy/paste leftover) -> NameError.
        price_list = [i.get('pf') for i in listings_info]
        ticketPrice = price_list[0]
        loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()

    def parse_price2(self, response):
        """Build the seatgeek listings-feed URL for one event and request it."""
        loader = response.meta['loader']
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id = ticketsLink.split('/')[6]
        json_url = "https://seatgeek.com/listings?client_id=MTY2MnwxMzgzMzIwMTU4&id=" + json_id + "&_wt=1&&_=1436364489501"
        # FIX: the callback pointed at self.parse_json, which does not exist
        # on this class; route the response to this class's parse_json2.
        yield scrapy.Request(json_url, meta={'loader': loader}, callback=self.parse_json2, dont_filter=True)

    # FIX: renamed from parse2 -> parse.  Scrapy delivers start_url
    # responses to self.parse; a method named parse2 was never invoked, so
    # this spider silently did nothing.  (MySpider overrides parse the
    # same way, so this is also the consistent choice.)
    def parse(self, response):
        """Walk the listings page and spawn one price request per event."""
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # NOTE(review): the '//'-rooted XPaths below select from the whole
            # document rather than the current event node — confirm intended.
            loader.add_xpath('eventName' , './/a[@class = "event-listing-title"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('eventLocation' , './/a[@class = "event-listing-venue-link"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('ticketsLink' , '//a[@class = "event-listing-button"]/@href')
            loader.add_xpath('eventDate' , '//div[@class = "event-listing-date"]/text()')
            loader.add_xpath('eventCity' , './/span[@itemprop = "addressLocality"]/text()')
            loader.add_xpath('eventState' , './/span[@itemprop = "addressRegion"]/text()')
            loader.add_xpath('eventCountry' , './/span[@itemprop = "addressCountry"]/text()')
            loader.add_xpath('eventTime' , '//div[@class = "event-listing-time"]/text()')
            # FIX: scrapy.Request requires an absolute URL with a scheme;
            # "www.seatgeek.com/..." raises ValueError("Missing scheme ...").
            tickets_url = "http://www.seatgeek.com/" + loader.get_output_value("ticketsLink")
            yield scrapy.Request(tickets_url, meta={'loader': loader}, callback=self.parse_price2, dont_filter=True)

    # Backward-compatible alias for the old (misnamed) entry point.
    parse2 = parse
# Scrapy 1.0 API: run both spiders in a single CrawlerProcess.
# (Dead commented-out 0.24-era Crawler/reactor code removed.)
# FIX: the original created a CrawlerProcess from the project settings and
# then immediately rebound `process` to a second, dict-only process, so the
# project settings were silently discarded.  Build one process whose
# settings are the project settings plus the custom user agent.
settings = get_project_settings()
settings.set('USER_AGENT', 'Mozilla/4.0 (compartible; MSIE 7.0; Windows NT 5.1)')
process = CrawlerProcess(settings)
process.crawl(MySpider)
process.crawl(MySpider2)
process.start()  # blocks here until both crawls have finished
source to share
No one has answered this question yet
See similar questions:
or similar: