scrapy_redis: stop my spiders after X minutes of idle time
I have a pool of scrapy_redis spiders listening on a Redis queue (the number of spiders is not always the same). The queue is filled by another script. I would like my spiders to stop after X minutes of inactivity, once there is nothing left in the Redis queue.
I have set SCHEDULER_IDLE_BEFORE_CLOSE in my settings.py, but it doesn't work.
Here is my settings.py:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_IDLE_BEFORE_CLOSE = 10
REDIS_HOST = 'localhost'
DOWNLOADER_MIDDLEWARES = {
    'serp_crawl.middlewares.RandomUserAgentMiddleware': 200,
    'scrapy_crawlera.CrawleraMiddleware': 300
}
CRAWLERA_ENABLED = True
CRAWLERA_USER = ''
CRAWLERA_PASS = ''
# Activate Crawlera User-Agent
DEFAULT_REQUEST_HEADERS = {
    "X-Crawlera-UA": "pass",
}
UPDATE
Here is my spider code:
from scrapy_redis.spiders import RedisSpider
from elasticsearch import Elasticsearch
from serp_crawl.settings import *
from datetime import datetime
from redis import Redis
import scrapy
import json
class SerpSpider(RedisSpider):
    name = "serpcrawler"
    redis_key = 'serp_crawler:request'

    def __init__(self, redis_host='localhost', redis_port='6379',
                 elasticsearch_host='localhost', elasticsearch_port='9200',
                 mysql_host='localhost', dev=False):
        super(SerpSpider, self).__init__()
        self.platform = None
        self.dev = bool(dev)
        self.q = Redis(redis_host, redis_port)
        self.es = Elasticsearch([{'host': elasticsearch_host, 'port': elasticsearch_port}])

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        crawler.settings.attributes['REDIS_HOST'].value = kwargs['redis_host']
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj

    def make_requests_from_url(self, url):
        data = json.loads(url)
        self.logger.info('Got new url to parse: %s', data['url'])
        self.settings.attributes['DEFAULT_REQUEST_HEADERS'].value.attributes['X-Crawlera-UA'].value = data['platform']
        self.platform = data['platform']
        return scrapy.Request(url=data['url'], callback=self.parse,
                              meta={'keyword': data['keyword'],
                                    'id': data['id_keyword'],
                                    'country': data['country'],
                                    'platform': data['platform']}, dont_filter=True)

    def parse(self, response):
        doc = dict()
        try:
            doc['content'] = response.body.decode('cp1252')
        except UnicodeDecodeError:
            doc['content'] = response.body
        doc['date'] = datetime.now().strftime('%Y-%m-%d')
        doc['keyword'] = str(response.meta['keyword'])
        doc['type_platform'] = str(response.meta['platform'])
        doc['country'] = str(response.meta['country'])
        if not self.dev:
            id_index = self.es.index(index='serp_html', doc_type='page', body=doc)
            self.q.lpush('batching_serp',
                         {'id_index': str(id_index['_id']),
                          'type_batching': 'default',
                          'country': doc['country'],
                          'type_platform': doc['type_platform'],
                          'keyword': doc['keyword'],
                          'id_keyword': int(response.meta['id'])})
            self.logger.info('Indexed new page. id_es : [' + str(id_index['_id']) + ']')
Thank you for your help.
1 answer
The scrapy-redis docs say:
# Max idle time to prevent the spider from being closed when distributed crawling.
# This only works if queue class is SpiderQueue or SpiderStack,
# and may also block the same time when your spider start at the first time (because the queue is empty).
SCHEDULER_IDLE_BEFORE_CLOSE = 10
So, you need to set one of these settings:
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# or
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
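For reference, a minimal sketch of how the idle-close settings could look together in settings.py. The FifoQueue choice is just one of the two options above, and the value 10 is carried over from the question; as far as I know this setting is interpreted in seconds (it is used as the blocking-pop timeout on the Redis queue), not minutes:
# Schedule requests through Redis (same as in the question's settings).
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# A FIFO/LIFO queue class is required for the idle timeout to take effect.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# Close the spider after this many seconds with an empty queue.
SCHEDULER_IDLE_BEFORE_CLOSE = 10
REDIS_HOST = 'localhost'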