Parsing URLs without a scheme in a sitemap using SitemapSpider in Scrapy (Python)

I am using the sitemap spider in Scrapy (Python). The sitemap seems to have an unusual format, with "//" in front of the URLs:

<url>
    <loc>//www.example.com/10/20-baby-names</loc>
</url>
<url>
    <loc>//www.example.com/elizabeth/christmas</loc>
</url>


myspider.py

from scrapy.contrib.spiders import SitemapSpider
from myspider.items import *

class MySpider(SitemapSpider):
    name = "myspider"
    sitemap_urls = ["http://www.example.com/robots.txt"]

    def parse(self, response):
        item = PostItem()           
        item['url'] = response.url
        item['title'] = response.xpath('//title/text()').extract()

        return item


I am getting this error:

raise ValueError('Missing scheme in request url: %s' % self._url)
    exceptions.ValueError: Missing scheme in request url: //www.example.com/10/20-baby-names
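
As far as I can tell, the error is raised by Scrapy's Request constructor, which validates the URL on construction and rejects anything without a scheme:

from scrapy.http import Request

# minimal reproduction of the error above: the URL has no scheme,
# so Request refuses to build it
Request('//www.example.com/10/20-baby-names')
# ValueError: Missing scheme in request url: //www.example.com/10/20-baby-names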


How can I manually parse the URLs using the sitemap spider?



3 answers


I think the nicest and cleanest solution would be to add a downloader middleware that rewrites the malformed URLs without the spider noticing.



import re
import urlparse
from scrapy.http import XmlResponse
from scrapy.utils.gz import gunzip, is_gzipped
from scrapy.contrib.spiders import SitemapSpider

# downloader middleware
class SitemapWithoutSchemeMiddleware(object):
    def process_response(self, request, response, spider):
        if isinstance(spider, SitemapSpider):
            body = self._get_sitemap_body(response)

            if body:
                scheme = urlparse.urlsplit(response.url).scheme
                body = re.sub(r'<loc>\/\/(.+)<\/loc>', r'<loc>%s://\1</loc>' % scheme, body)    
                return response.replace(body=body)

        return response

    # this is copied from scrapy's Sitemap class, but that class is
    # only for internal use and its API can change without notice
    def _get_sitemap_body(self, response):
        """Return the sitemap body contained in the given response, or None if the
        response is not a sitemap.
        """
        if isinstance(response, XmlResponse):
            return response.body
        elif is_gzipped(response):
            return gunzip(response.body)
        elif response.url.endswith('.xml'):
            return response.body
        elif response.url.endswith('.xml.gz'):
            return gunzip(response.body)
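
To enable it, register the middleware in the project settings. A minimal sketch; the module path myspider.middlewares and the priority 543 are placeholders to adapt to your own project layout:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # module path and priority are assumptions -- point this at
    # wherever you placed SitemapWithoutSchemeMiddleware
    'myspider.middlewares.SitemapWithoutSchemeMiddleware': 543,
}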




If I see it correctly, you can (as a quick fix) override the default _parse_sitemap implementation in SitemapSpider. It's not nice because you have to copy a lot of code, but it should work. You will need to add a helper that turns the scheme-less URLs into absolute ones.

"""if the URL starts with // take the current website scheme and make an absolute
URL with the same scheme"""
def _fix_url_bug(url, current_url):
    if url.startswith('//'):
           ':'.join((urlparse.urlsplit(current_url).scheme, url))
       else:
           yield url

def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body)
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            log.msg(format="Ignoring invalid sitemap: %(response)s",
                    level=log.WARNING, spider=self, response=response)
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s):
                # added it before follow-test, to allow test to return true
                # if it includes the scheme (yet do not know if this is the better solution)
                loc = _fix_url_bug(loc, response.url)
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                loc = _fix_url_bug(loc, response.url) # same here
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break
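
Outside of Scrapy, the helper's join logic can be checked in isolation; a quick standalone sketch of the same function:

import urlparse

def _fix_url_bug(url, current_url):
    # scheme + ':' + '//host/path' yields a normal absolute URL
    if url.startswith('//'):
        return ':'.join((urlparse.urlsplit(current_url).scheme, url))
    return url

print _fix_url_bug('//www.example.com/10/20-baby-names',
                   'http://www.example.com/sitemap.xml')
# prints: http://www.example.com/10/20-baby-names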




This is just a general idea and untested, so it may not work completely or may contain syntax errors. Please respond via the comments so I can improve my answer.

The sitemap you are trying to parse also seems to be invalid. Leaving out the scheme is fine per the URL RFC (these are protocol-relative URLs), but the sitemap protocol requires every URL to begin with a scheme.



I used @alecxe's trick to parse the URLs inside the spider. It works, but I'm not sure it's the best way to do it.

import re
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.utils.response import body_or_str
from example.items import *

class ExampleSpider(BaseSpider):
    name = "example"
    start_urls = ["http://www.example.com/sitemap.xml"]

    def parse(self,response):
        nodename = 'loc'
        text = body_or_str(response)
        r = re.compile(r"(<%s[\s>])(.*?)(</%s>)" % (nodename, nodename), re.DOTALL)
        for match in r.finditer(text):
            url = match.group(2)
            # add a scheme only when it is missing
            if url.startswith('//'):
                url = 'http:' + url
            yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        item = PostItem()
        item['url'] = response.url
        item['title'] = response.xpath('//title/text()').extract()
        return item
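
For completeness, the same extraction works without the regex by letting Scrapy's Selector parse the XML. This is a variation of mine, only a sketch: remove_namespaces() is needed because sitemap files declare a default namespace that would otherwise make a plain //loc XPath match nothing.

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import Selector
from example.items import *

class ExampleSelectorSpider(BaseSpider):
    name = "example_selector"
    start_urls = ["http://www.example.com/sitemap.xml"]

    def parse(self, response):
        # strip the sitemap's default namespace so //loc matches
        sel = Selector(response)
        sel.remove_namespaces()
        for url in sel.xpath('//loc/text()').extract():
            # add a scheme only when it is missing
            if url.startswith('//'):
                url = 'http:' + url
            yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        item = PostItem()
        item['url'] = response.url
        item['title'] = response.xpath('//title/text()').extract()
        return item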







