Tries to download .pdf, .png and .jpg files from nytimes.com
I wrote a simple Python scraper to grab some documents from a specific page at nytimes.com. It works in the sense that it grabs and formats the entire URL correctly and also tries to download files and formats the name correctly.
But all I get is 1kb files. I can't figure out why. Here is my code:
import urllib2
import urllib
from cookielib import CookieJar

files = 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'


def getLinks(url):
    """Scrape *url* for pdf/png/jpg links and download each file.

    Opens the page with a cookie-aware opener (NYT sets cookies), scans every
    whitespace-separated token for href="http://gr..." attributes ending in a
    document extension, and saves each file under its basename.
    """
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    p = opener.open(url)
    result = []
    for line in p:
        for element in line.split():
            if element.startswith('href="http://gr'):
                # The token still carries the attribute's closing quote, e.g.
                # href="http://.../doc.pdf" — so strip BOTH the leading
                # href=" (6 chars) and the trailing quote with [6:-1].
                # The original kept the quote in the URL ([6:]), so every
                # download was a ~1 KB error page instead of the real file.
                if element.endswith('pdf"') or element.endswith('png"') or element.endswith('jpg"'):
                    result.append(element[6:-1])
    for link in result:
        slashpos = link.rfind('/') + 1
        # Save under the basename. Use [slashpos:], not [slashpos:-1] —
        # the -1 chopped the last character off every filename ("doc.pd").
        urllib.urlretrieve(link, link[slashpos:])


getLinks(files)
Any help is greatly appreciated. Thanks!
+3
source to share
2 answers
Solved! :D
#!/usr/bin/env python
from bs4 import BeautifulSoup
import urllib2
import urlparse
from sys import argv
from cookielib import CookieJar
if len(argv) != 2:
print "Usage:\n\tpython %s 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'"%argv[0]
exit()
url = argv[1]
urls =[]
try:
cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
html = opener.open(url)
except:
print "[-] No such website"
exit()
soup = BeautifulSoup(html)
for tag in soup.find_all('a'):
try:
tag["href"] = urlparse.urljoin(url, tag['href'])
if tag['href'] not in urls and '.png' in tag['href'] or '.jpg' in tag['href']:
newpdf = tag['href'].split("/")
name = newpdf[-1]
resp = urllib2.urlopen(tag['href'])
meta_data = resp.info()
fsize = int(meta_data.getheaders("Content-Length")[0])
print "Downloading --> %s \t size: %s "%(name, fsize)
f = open(name, "wb")
f.write(resp.read())
f.close
urls.append(tag["href"])
else:
print tag['href']
except KeyboardInterrupt:
print " User hit CTRL+C"
exit()
except:
pass
Hope you find it helpful
0
source to share