Use Scrapy to get a list of URLs and then scrape the content inside those URLs

I need a Scrapy spider to scrape the following page ( https://www.phidgets.com/?tier=1&catid=64&pcid=57 ) for the URL of each product (30 products, so 30 URLs), and then follow each of those URLs and scrape the data inside.

The second part (extracting the data from a product page) works exactly as I want:

import scrapy

class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }

        # next_page = response.css('div.ph-summary-entry-ctn a::attr("href")').extract_first()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)

      

But I don't know how to do the first part. As you can see, I have the listing page ( https://www.phidgets.com/?tier=1&catid=64&pcid=57 ) set as the start URL. How do I get the spider to collect the 30 product URLs from that page and crawl each of them?



1 answer


I can't test this at the moment, so please let me know whether it works for you and I'll edit the answer if there are any errors.

The idea is to find every product link on the listing page and yield a new Request for each one, passing your product-parsing method as the callback:



import scrapy
from urllib.parse import urljoin

class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        # Collect the href of every product link on the listing page.
        products = response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract()
        for p in products:
            # The hrefs may be relative, so resolve them against the current page URL.
            url = urljoin(response.url, p)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        # Your original extraction logic, unchanged.
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }
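
A side note: since your commented-out pagination code already uses response.follow, you could also let Scrapy resolve the URLs for you. On Scrapy 1.4 and later, response.follow accepts relative URLs directly, so the first parse method could be shortened to the sketch below (same behavior, just without the manual urljoin):

    def parse(self, response):
        # response.follow resolves relative hrefs against response.url itself.
        for href in response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract():
            yield response.follow(href, callback=self.parse_product)

Once it runs, you can export all 30 items at once with scrapy runspider products_spider.py -o products.json (assuming the spider lives in a standalone file with that name).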

      
