How to remove '\ n' from scrapy output in python

Question

How to remove '\ n' from scrapy output in python

I'm trying to output to CSV, but I realized that when clearing tripadvisor, I get a lot of carriage returns, so the array goes over 30 while there are only 10 views, so I get a lot of fields. Is there any way to remove carriage return.

spider.

from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem
from collections import OrderedDict
import json
from scrapy.selector.lxmlsel import HtmlXPathSelector
import csv
import html2text
import unicodedata


class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = ["tripadvisor.in"]
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d736080-Reviews-Ooty_Elk_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]



    def parse(self, response):
        item = ScrapingTestingItem()
        sel = HtmlXPathSelector(response)
        converter = html2text.HTML2Text()
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
##        dummy_test = [ "" for k in range(10)]

        item['reviews'] = sel.xpath('//div[@class="col2of2"]//p[@class="partial_entry"]/text()').extract()
        item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
        item['stars'] = sel.xpath('//*[@class="rating reviewItemInline"]//img/@alt').extract()
        item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
        item['location'] = sel.xpath('//*[@class="location"]/text()').extract()
        item['date'] = sel.xpath('//*[@class="ratingDate relativeDate"]/@title').extract()
        item['date'] += sel.xpath('//div[@class="col2of2"]//span[@class="ratingDate"]/text()').extract()


        startingrange = len(sel.xpath('//*[@class="ratingDate relativeDate"]/@title').extract())

        for j in range(startingrange,len(item['date'])):
            item['date'][j] = item['date'][j][9:].strip()

        for i in range(len(item['stars'])):
            item['stars'][i] = item['stars'][i][:1].strip()

        for o in range(len(item['reviews'])):
            print unicodedata.normalize('NFKD', unicode(item['reviews'][o])).encode('ascii', 'ignore')

        for y in range(len(item['subjects'])):
            item['subjects'][y] = unicodedata.normalize('NFKD', unicode(item['subjects'][y])).encode('ascii', 'ignore')

        yield item

#        print item['reviews']

        if(sites and len(sites) > 0):
            for site in sites:
                yield Request(url="http://tripadvisor.in" + site, callback=self.parse)

Can I use a regular expression that I could use to go through the for loop and replace it. I tried to replace but didn't do anything. And also why scrapy does it.

+3

python regex web-scraping scrapy

Smashed 22 jul. 15 at 5:50 am

source to share

2 answers

Easy solution after reading the list of documents.

while "\n" in some_list: some_list.remove("\n")

+1

Smashed 22 jul. '15 at 6:23

source to share

alecxe · Accepted Answer · 2015-07-22T13:58:47+0000

What I usually do to trim and clean up the output is use "Input and / or Output Processors" with Product Loaders - this makes things more modular and cleaner:

class ScrapingTestingLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()

Then if you use this product loader to load your items, you will get the highlighted values as strings (instead of lists). For example, if a field ["my value \n"]

is selected , you will receive it my value

as an output.

How to remove '\ n' from scrapy output in python

More articles: