Trying to download .pdf, .png and .jpg files from nytimes.com

I wrote a simple Python scraper to grab some documents from a specific page on nytimes.com. It works in the sense that it finds and formats the full URLs correctly, and it also attempts the downloads and formats the filenames correctly.

But all I get are 1 KB files, and I can't figure out why. Here is my code:

import urllib2 
import urllib 
from cookielib import CookieJar

files = 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0' 
slashpos = 0

def getLinks(url):
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    p = opener.open(url)
    result = []
    for line in p:
        for element in line.split():
            if element.startswith('href="http://gr'):
                if element.endswith('pdf"') or element.endswith('png"') or element.endswith('jpg"'):
                    result.append(element[6:])
                else:
                    continue
    for char in result:
        slashpos = char.rfind('/') + 1
        urllib.urlretrieve(char, char[slashpos:-1])

getLinks(files)


Any help is greatly appreciated. Thanks!



2 answers


1) Use result.append(element[6:-1]) instead of result.append(element[6:]). The [6:] slice leaves the closing double quote on the end of each URL, which is why the downloads fail and you only get 1 KB files.

2) To save the file, use urllib.urlretrieve(char, char[slashpos:]) instead of urllib.urlretrieve(char, char[slashpos:-1]), so the last character of the filename is no longer cut off.
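Applied to the question's code, the function with both fixes would look like this (a sketch in the same Python 2 style; only the two slices change):

import urllib2
import urllib
from cookielib import CookieJar

def getLinks(url):
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    p = opener.open(url)
    result = []
    for line in p:
        for element in line.split():
            if element.startswith('href="http://gr'):
                if element.endswith('pdf"') or element.endswith('png"') or element.endswith('jpg"'):
                    # [6:-1] strips the leading href=" and the trailing quote
                    result.append(element[6:-1])
    for char in result:
        slashpos = char.rfind('/') + 1
        # [slashpos:] keeps the complete filename
        urllib.urlretrieve(char, char[slashpos:])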





Solved! :D

#!/usr/bin/env python
from bs4 import BeautifulSoup
import urllib2
import urlparse
from sys import argv
from cookielib import CookieJar

if len(argv) != 2:
    print "Usage:\n\tpython %s 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'" % argv[0]
    exit()
url = argv[1]
urls = []
try:
    # Same cookie-handling opener as in the question
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    html = opener.open(url)
except:
    print "[-] No such website"
    exit()
soup = BeautifulSoup(html)

for tag in soup.find_all('a'):
    try:
        # Resolve relative links against the page URL
        tag['href'] = urlparse.urljoin(url, tag['href'])
        # The parentheses matter: without them, "and" binds tighter than "or",
        # so already-downloaded .jpg links would slip past the duplicate check
        if tag['href'] not in urls and ('.png' in tag['href'] or '.jpg' in tag['href']):
            newpdf = tag['href'].split("/")
            name = newpdf[-1]
            resp = urllib2.urlopen(tag['href'])
            meta_data = resp.info()
            fsize = int(meta_data.getheaders("Content-Length")[0])
            print "Downloading --> %s \t size: %s " % (name, fsize)
            f = open(name, "wb")
            f.write(resp.read())
            f.close()  # was f.close, which never actually called close()
            urls.append(tag['href'])
        else:
            print tag['href']
    except KeyboardInterrupt:
        print " User hit CTRL+C"
        exit()
    except:
        pass
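For reference, you would run it exactly as the usage message suggests (assuming you saved the script as scraper.py; the name is just an example):

python scraper.py 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'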




Hope you find it helpful







