scrapy_redis: stop my spiders after X minutes of idle time
I have a pool of scrapy_redis spiders listening on a Redis queue (the number of spiders is not always the same). The queue is filled by another script. I would like my spiders to stop after X minutes of inactivity, once there is nothing left in the Redis queue.
I have set SCHEDULER_IDLE_BEFORE_CLOSE in my settings.py, but it doesn't work.
Here is my settings.py:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_IDLE_BEFORE_CLOSE = 10
REDIS_HOST = 'localhost'
DOWNLOADER_MIDDLEWARES = {
    'serp_crawl.middlewares.RandomUserAgentMiddleware': 200,
    'scrapy_crawlera.CrawleraMiddleware': 300
}
CRAWLERA_ENABLED = True
CRAWLERA_USER = ''
CRAWLERA_PASS = ''
# Activate Crawlera User-Agent
DEFAULT_REQUEST_HEADERS = {
    "X-Crawlera-UA": "pass",
}
UPDATE
Here is my spider code:
from scrapy_redis.spiders import RedisSpider
from elasticsearch import Elasticsearch
from serp_crawl.settings import *
from datetime import datetime
from redis import Redis
import scrapy
import json
class SerpSpider(RedisSpider):
    name = "serpcrawler"
    redis_key = 'serp_crawler:request'

    def __init__(self, redis_host='localhost', redis_port='6379',
                 elasticsearch_host='localhost', elasticsearch_port='9200',
                 mysql_host='localhost', dev=False):
        super(SerpSpider, self).__init__()
        self.platform = None
        self.dev = bool(dev)
        self.q = Redis(redis_host, redis_port)
        self.es = Elasticsearch([{'host': elasticsearch_host, 'port': elasticsearch_port}])

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        crawler.settings.attributes['REDIS_HOST'].value = kwargs['redis_host']
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj

    def make_requests_from_url(self, url):
        data = json.loads(url)
        self.logger.info('Got new url to parse: %s', data['url'])
        self.settings.attributes['DEFAULT_REQUEST_HEADERS'].value.attributes['X-Crawlera-UA'].value = data['platform']
        self.platform = data['platform']
        return scrapy.Request(url=data['url'], callback=self.parse,
                              meta={'keyword': data['keyword'],
                                    'id': data['id_keyword'],
                                    'country': data['country'],
                                    'platform': data['platform']}, dont_filter=True)

    def parse(self, response):
        doc = dict()
        try:
            doc['content'] = response.body.decode('cp1252')
        except UnicodeDecodeError:
            doc['content'] = response.body
        doc['date'] = datetime.now().strftime('%Y-%m-%d')
        doc['keyword'] = str(response.meta['keyword'])
        doc['type_platform'] = str(response.meta['platform'])
        doc['country'] = str(response.meta['country'])
        if not self.dev:
            id_index = self.es.index(index='serp_html', doc_type='page', body=doc)
            self.q.lpush('batching_serp',
                         {'id_index': str(id_index['_id']),
                          'type_batching': 'default',
                          'country': doc['country'],
                          'type_platform': doc['type_platform'],
                          'keyword': doc['keyword'],
                          'id_keyword': int(response.meta['id'])})
            self.logger.info('Indexed new page. id_es : [' + str(id_index['_id']) + ']')
Thank you for your help.
1 answer
The scrapy-redis docs say:
# Max idle time to prevent the spider from being closed when distributed crawling.
# This only works if queue class is SpiderQueue or SpiderStack,
# and may also block the same time when your spider start at the first time (because the queue is empty).
SCHEDULER_IDLE_BEFORE_CLOSE = 10
So, you need to set one of these settings:
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# or
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
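For reference, a minimal sketch of how the idle-close settings could look together in settings.py. The FifoQueue choice is just one of the two options above, and the value 10 is carried over from the question; as far as I know this setting is interpreted in seconds (it is used as the blocking-pop timeout on the Redis queue), not minutes:
# Schedule requests through Redis (same as in the question's settings).
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# A FIFO/LIFO queue class is required for the idle timeout to take effect.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# Close the spider after this many seconds with an empty queue.
SCHEDULER_IDLE_BEFORE_CLOSE = 10
REDIS_HOST = 'localhost'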