How to get the failed URLs with Scrapy

I have two custom middlewares to catch the failed URLs:




CustomRecordMiddleware, which is placed very close to the downloader so that it can catch all the exceptions that are counted by the default middleware DownloaderStats,



and CustomFailresponseMiddleware, which catches the URLs and exceptions that still fail after retries.



from agents import AGENTS
from usefulproxy350 import PROXIES
from scrapy import log
import random

class CustomUserAgentMiddleware(object):
    """Middleware that gives every outgoing request a random User-Agent."""

    def process_request(self, request, spider):
        """Set the ``User-Agent`` header to a random entry of ``AGENTS``.

        ``spider`` is accepted but not used. Nothing is returned.
        """
        request.headers['User-Agent'] = random.choice(AGENTS)

class CustomHttpProxyMiddleware(object):
    """Middleware that assigns a random User-Agent and proxy to each request
    and records the requests whose processing raised an exception.
    """

    def process_request(self, request, spider):
        """Pick a random User-Agent and a random proxy for ``request``.

        The User-Agent comes from ``AGENTS`` and the proxy from ``PROXIES``;
        the proxy is stored in ``request.meta['proxy']`` as ``http://<proxy>``.
        ``spider`` is not used. Nothing is returned.
        """
        agent = random.choice(AGENTS)
        request.headers['User-Agent'] = agent
        p = random.choice(PROXIES)
        try:
            request.meta['proxy'] = "http://%s" % p
        except Exception as e:
            # Best effort: a proxy entry that cannot be formatted is logged,
            # and the request goes on without a proxy.
            log.msg("Exception %s" % e, _level=log.CRITICAL)

    def process_exception(self, request, exception, spider):
        """Append the failed request's URL and proxy to ``outurl_excep.txt``.

        ``exception`` and ``spider`` are not used. Nothing is returned.
        """
        url = request.url
        # .get() so that a request that never received a proxy is still
        # recorded instead of raising KeyError here.
        proxy = request.meta.get('proxy')
        # NOTE(review): the original body ended right after open(); the
        # "url<TAB>proxy" record format is a reconstruction - confirm.
        # The context manager guarantees the handle is closed.
        with open('outurl_excep.txt', 'a') as myfile:
            myfile.write('%s\t%s\n' % (url, proxy))

class CustomFailresponseMiddleware(object):
    """Middleware that records the URL of every unsuccessful response and
    hands the request back to be tried again.
    """

    def process_response(self, request, response, spider):
        """Pass good responses through; record and re-issue bad ones.

        A response is considered good when its status is 200 and it has at
        least one header. For any other response the URL is appended to
        ``outurl_respo.txt`` and ``request`` is returned in place of the
        response. ``spider`` is not used.

        Returns ``response`` for a good response, otherwise ``request``.
        """
        if response.status == 200 and len(response.headers) != 0:
            return response

        try:
            # The context manager closes the file even if the write fails.
            with open('outurl_respo.txt', 'a') as myfile:
                myfile.write(response.url + '\n')
        except Exception as e:
            # Recording is best effort: a failure to write must not make
            # this method fall through and return None.
            log.msg("Response Exception %s" % e)

        # NOTE(review): a request returned from here is rescheduled, and is
        # presumably subject to the scheduler's duplicate filter unless
        # dont_filter is set - in which case it would be dropped silently.
        # Verify against the Scrapy version in use.
        return request

class CustomRecordMiddleware(object):
    """Middleware that records every request whose processing raised an
    exception, both in ``outurl_record.txt`` and in the log.
    """

    def process_exception(self, request, exception, spider):
        """Append the failed URL and proxy to ``outurl_record.txt`` and log it.

        ``spider`` is not used. Nothing is returned.
        """
        url = request.url
        # .get() so that a request without a proxy is still recorded instead
        # of raising KeyError here.
        proxy = request.meta.get('proxy')
        # NOTE(review): the original opened the file but the lines that wrote
        # to it are missing; the "url<TAB>proxy" record format is a
        # reconstruction - confirm. One line is written per exception.
        with open('outurl_record.txt', 'a') as myfile:
            myfile.write('%s\t%s\n' % (url, proxy))
        log.msg('Fail to request url %s with exception %s' % (url, str(exception)))


There still seem to be some failed URLs that I haven't caught. When I tried to crawl 51 pages, the crawler seemed to stop after page 24.

Below are the statistics:

2015-05-27 13:04:15+0800 [soufang_redis] INFO: Dumping Scrapy stats:
    {'downloader/exception_count': 55,
     'downloader/exception_type_count/twisted.internet.error.ConnectError': 6,
     'downloader/exception_type_count/twisted.internet.error.ConnectionRefusedError': 1,
     'downloader/exception_type_count/twisted.internet.error.TCPTimedOutError': 18,
     'downloader/exception_type_count/twisted.internet.error.TimeoutError': 9,
     'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 21,
     'downloader/request_bytes': 230985,
     'downloader/request_count': 582,
     'downloader/request_method_count/GET': 582,
     'downloader/response_bytes': 8174486,
     'downloader/response_count': 527,
     'downloader/response_status_count/200': 505,
     'downloader/response_status_count/400': 1,
     'downloader/response_status_count/404': 4,
     'downloader/response_status_count/502': 10,
     'downloader/response_status_count/503': 7,
     'finish_reason': 'shutdown',
     'finish_time': datetime.datetime(2015, 5, 27, 5, 4, 15, 945815),
     'item_dropped_count': 5,
     'item_dropped_reasons_count/DropItem': 5,
     'item_scraped_count': 475,
     'log_count/INFO': 82,
     'log_count/WARNING': 5,
     'request_depth_max': 24,
     'response_received_count': 505,
     'scheduler/dequeued/redis': 582,
     'scheduler/enqueued/redis': 582,
     'start_time': datetime.datetime(2015, 5, 27, 4, 47, 13, 889437)}


I checked my outurl_record.txt, and the number of exceptions recorded is 55, which is exactly the same as downloader/exception_count. However, request_depth_max is only 24 (it should have been 51), and I haven't found any failure information about page 25 in outurl_record.txt. I also did not find any in outurl_excep.txt or outurl_respo.txt.


I've tried it several times; sometimes it crawls all the pages and sometimes it doesn't.

What did I miss?


source to share

All Articles