Memory overflow when running Scrapy from a script

I wrote a Scrapy spider to crawl hundreds of thousands of pages from several news websites. It works well when I run it from the command line: memory usage stays stable at around 20% on my 4 GB PC. (I use priorities on requests to keep too many requests from being spawned at once.) But when I start it from a Python script, memory usage keeps growing until the spider eats up all the memory. This is my startup script:

import os
import sys
from datetime import datetime, timedelta
from threading import RLock

from twisted.internet import reactor
from scrapy import log, signals
from scrapy.crawler import Crawler
from scrapy.log import ScrapyFileLogObserver
from scrapy.utils.project import get_project_settings

# MySpider1 and MySpider2 are imported from the project's spider modules


class CrawlersInitiator(object):

    def __init__(self, spiders, start=datetime.now()-timedelta(minutes=30), end=datetime.now()):
        self.setting = get_project_settings()
        self.crawlers = []
        self.spiders = spiders
        self.start_time = start
        self.end_time = end
        # log file
        self.info_log = None
        log_dir = self.setting.get("LOG_DIR")
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        # counter used to stop reactor
        self.stopped_crawler = 0
        self.lock = RLock()

    def __del__(self):
        self.close_log_file()

    def create_log_file(self):
        """create log file with crawl date in file name
        """
        self.close_log_file()
        dir_path = self.setting.get("LOG_DIR")+"/{0}".format(self.end_time.strftime("%Y-%m"))
        file_suffix = self.end_time.strftime("%Y-%m-%d")
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        self.info_log = open("{0}/log-{1}.log".format(dir_path, file_suffix), "a")  # info

    def close_log_file(self):
        if self.info_log and not self.info_log.closed:
            self.info_log.close()
            self.info_log = None

    def get_crawler(self, spider):
        crawler = Crawler(self.setting)
        crawler.signals.connect(self.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider(start_time=self.start_time, end_time=self.end_time))
        return crawler

    def stop(self):
        """callback to stop reactor
        """
        self.lock.acquire()
        self.stopped_crawler += 1
        if self.stopped_crawler >= len(self.crawlers):
            reactor.stop()
        self.lock.release()

    def run_spiders(self):
        """run spiders
        """
        self.crawlers = []
        self.stopped_crawler = 0
        # get crawlers
        for Spider in self.spiders:
            self.crawlers.append(self.get_crawler(Spider))
        # log
        self.create_log_file()
        ScrapyFileLogObserver(self.info_log, level=log.INFO).start() 
        self.info_log.write("\nCrawlers starting...\n")
        self.info_log.write("Crawl from {0} to {1}".format(str(self.start_time), str(self.end_time)))

        # run
        for crawler in self.crawlers:
            crawler.start()
        reactor.run()
        end = datetime.now()

        # release crawlers
        self.crawlers = []

        # log
        self.info_log.write("Crawlers finished in {0} !\n".format(str(end-self.end_time)))
        self.close_log_file()


def crawl(spiders, start, end):
    CrawlersInitiator(spiders, start=start, end=end).run_spiders()

SPIDERS = [MySpider1, MySpider2]    

if __name__ == "__main__":
    start_time = datetime.strptime(sys.argv[1], "%Y-%m-%d_%H:%M:%S")
    end_time = datetime.strptime(sys.argv[2], "%Y-%m-%d_%H:%M:%S")          
    crawl(SPIDERS, start_time, end_time)            
    sys.exit(0)


I tried to use Scrapy's trackref live-reference tracking to find the problem.
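For reference, prefs() is the telnet-console shortcut for this live-reference tracking; the same table can also be printed from code. A minimal sketch using scrapy.utils.trackref, which is the module behind prefs():

# Minimal sketch: print the same live-reference table as the telnet
# console's prefs() shortcut, from code or an interactive session.
from scrapy.utils.trackref import print_live_refs

print_live_refs()  # one line per tracked class: live count and oldest age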

When run from the command line, prefs() shows (only one spider running):

MySpider1                           1   oldest: 942s ago
HtmlResponse                       13   oldest: 52s ago
Request                          6329   oldest: 932s ago
Item                             5915   oldest: 932s ago
Selector                           13   oldest: 52s ago


When run from the script, prefs() shows:

Response                           51   oldest: 657s ago 
Request                          6966   oldest: 661s ago 
Item                             5732   oldest: 661s ago 
HtmlResponse                      377   oldest: 661s ago 
Selector                          377   oldest: 661s ago 
MySpider1                           1   oldest: 661s ago 


It looks like Scrapy never releases any objects when run from my script. Why is this happening, and how can I solve it?
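To check which objects are being kept alive, I can also pull out the oldest live instance with trackref (a quick sketch; the class name string must match what prefs() reports, e.g. 'HtmlResponse'):

# Sketch: inspect the oldest live object of a tracked class to see which
# URL is being held in memory; get_oldest() is part of scrapy.utils.trackref.
from scrapy.utils.trackref import get_oldest

oldest_response = get_oldest('HtmlResponse')  # returns None if nothing is alive
if oldest_response is not None:
    print(oldest_response.url)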

Here is the superclass of all my spiders; all requests are handled in this class:

import urllib
from abc import ABCMeta, abstractmethod
from datetime import datetime

from scrapy import log
from scrapy.http import Request
from scrapy.spider import Spider


class IndexSpider(Spider):

    __metaclass__ = ABCMeta  

    # splice _queries onto _search_url to get start_requests (index pages of news)
    _search_url = ""
    _queries = []

    _char_set = "utf8"

    def __init__(self, queries=None, start_time=datetime.min, end_time=datetime.now()):
        self.queries = queries if queries is not None else self._queries
        self.start_time = start_time
        self.end_time = end_time

    def start_requests(self):
        query_count = 0
        query = None
        try:
            for query in self.queries:
                yield Request(self._search_url.format(urllib.quote(query.encode(self._char_set))), self.parse_index)
                query_count += 1
        except Exception as e:
            self.log("Query No.{0} can't be encoded in {1}, because of {2}!"
                     .format(str(query_count), self.name, e), level=log.WARNING)
            yield Request(self._search_url.format(query.encode("gbk")), self.parse_index)

    def parse_index(self, response):
        """parse index page
        """
        requests = []
        page_list = self._get_result(response)

        if not page_list:
            return requests
        next_page = True  

        for item in page_list:
            if isinstance(item, Request):  
                requests.append(item)
                next_page = False
                break
            if item['publish_time'] <= self.start_time:
                next_page = False
                break
            elif item['publish_time'] > self.end_time:
                continue
            else:
                req = Request(item['url'], self.parse_news, priority=1)
                req.meta["item"] = item
                requests.append(req)
        if next_page:
            next_page_url = self._next_index_page(response)
            if next_page_url:
                requests.append(Request(next_page_url, self.parse_index))
        return requests

    def parse_news(self, response): 
        """parse news page
        """
        item = response.meta["item"]
        del response.meta['item']
        return self._finish_item(item, response)

    @abstractmethod
    def _get_result(self, response):
        """get news list from index page
        :param response: index page
        :return: a list of objects of crawlers.items.Base or its subclass, each object represents a news
        """
        pass

    @abstractmethod
    def _next_index_page(self, response):
        """
        :param response: current index page
        :return: URL of the next index page
        """
        pass

    @abstractmethod
    def _finish_item(self, item, response):
        """parse news page
        :param item: news item get from the index page
        :param response: news page
        :return: news item or new request
        """
        pass

1 answer


It may be that Scrapy is not running with the same settings when started from a script as it does from the command line, so nothing constrains its memory usage. You could pass settings explicitly to control the spider's behavior:



from scrapy.crawler import CrawlerProcess

my_settings = {
    'MEMUSAGE_ENABLED': True,
    'MEMUSAGE_LIMIT_MB': 1024,
    'CONCURRENT_REQUESTS': 100,  # lower this if you are still hitting the memory limit
}

process = CrawlerProcess(my_settings)
process.crawl(MySpider)  # MySpider: your spider class
process.start()
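If you still need your project's own settings (pipelines, LOG_DIR and so on), a possible variation, sketched below with placeholder limits, is to start from get_project_settings() and override only the memory-related values; CrawlerProcess can also run both of your spiders in one reactor, replacing the manual Crawler/reactor handling in your script:

# Sketch, not a drop-in replacement: reuse the project settings, override
# only the memory-related values (placeholders to tune), and let
# CrawlerProcess drive the reactor for both spiders.
from datetime import datetime, timedelta

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('MEMUSAGE_ENABLED', True)
settings.set('MEMUSAGE_LIMIT_MB', 1024)
settings.set('CONCURRENT_REQUESTS', 16)

start_time = datetime.now() - timedelta(minutes=30)
end_time = datetime.now()

process = CrawlerProcess(settings)
# MySpider1 / MySpider2 are your project's spiders; keyword arguments
# are passed through to the spider's __init__
process.crawl(MySpider1, start_time=start_time, end_time=end_time)
process.crawl(MySpider2, start_time=start_time, end_time=end_time)
process.start()  # blocks until all crawls are finished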
