Files
scrape-virtual-seismic-atlas/scrape_seismic_atlas.py
T
2014-04-20 20:24:42 +12:00

201 lines
7.4 KiB
Python

# -*- coding: utf-8 -*-
"""
Created on Sat Apr 19 20:37:13 2014
This script will download files from http://see-atlas.leeds.ac.uk:8080, then append
a url name and caption to them. This is designed to run on linux and requires
a few unique modules such as gdshortener and also ImageMagick installed and in the path.
A few options are set in the file: base url, limb url, high res?, and draft?.
Images and info are scraped from the main site and images are downloaded to the
current directory, only if they do not already exist.
Captions are appended to images using calls to ImageMagick on the command line. This is because PIL did not work as desired.
@author: wassname [located_at] wassname (dot) org
"""
#==============================================================================
# options
#==============================================================================
# Root of the atlas site. The page-relative ("limb") URL is appended to it to
# build each request.
base_url = r'http://see-atlas.leeds.ac.uk:8080'
# Results page the scrape starts from (advanced search, 50 results per page).
limb_url = r'/search/advancedSearch.jsp?rpp=50&Ns=P_LastModified%7c1&N=6394'
# True selects the high resolution image, False the low resolution one.
high_res = True
# True makes a short trial run instead of processing everything.
draft = False
#==============================================================================
# Main program
#==============================================================================
# Amount added to the end of a scraped object id to reach the image itself:
# 1 gives the high resolution image, 2 the low resolution one.
inc = 1 if high_res else 2
#target=r'http://see-atlas.leeds.ac.uk:8080/search/advancedSearch.jsp?rpp=50&Ns=P_LastModified%7c1&N=6394' # adv search with 50 results per page
#exampleimage=r'http://see-atlas.leeds.ac.uk:8080/docbaseContent?objectId=09000064800112fe'
#make the required import
import bs4
import requests
import re
import gdshortener
from PIL import Image
import os
# Create a session object so cookies are tracked across requests and the
# scrape does not raise too much suspicion.
s = requests.session()
# One attribute dict per thumbnail link found ('href', 'title', plus the 'id'
# and 'abstract' keys added below).
image_links = []
# Page URLs already requested; used to notice a 'Next' link that leads in circles.
visited = set()
done = False
print("Scraping site: %s" % base_url)
# Keep looping until we're 'done': either there is no 'Next' link any more or
# it points at a page that was already scraped.
while not done:
    # combine base and limb URLs to create a full URL we can use
    url = base_url + limb_url
    visited.add(limb_url)
    r = s.get(url)
    soup = bs4.BeautifulSoup(r.content)
    # every search result sits in an element of class 'entity'
    for entity_div in soup.findAll(attrs={'class': 'entity'}):
        thumbs = entity_div.findAll(attrs={'class': 'result-thumbnail'})
        if not thumbs or thumbs[0].a is None:
            # a result without a linked thumbnail has nothing to download
            continue
        img_dic = thumbs[0].a.attrs
        # the object id is whatever follows the first '=' of the link target
        img_dic['id'] = img_dic['href'].split('=')[1]
        # collapse runs of whitespace into line breaks; this text becomes the
        # description overlaid on the image later
        img_dic['abstract'] = re.sub(r'\s\s+', r'\n', entity_div.text)
        image_links.append(img_dic)
        print("found: %s %s" % (img_dic['id'], img_dic['title'][:50]))  # max 50 chars in title
    # now see if we can find the next button
    nxt_elems = soup.findAll(text='Next', attrs={'class': 'paginate'})
    if nxt_elems:
        limb_url = nxt_elems[-1].attrs['href']
        print("Searching next page: %s" % limb_url)
        if draft:
            # a draft run only looks at the first page
            break
        if limb_url in visited:
            # the 'Next' button is leading us in circles
            done = True
    else:
        done = True
print("Finished scaping, qeueing {} image downloads".format(len(image_links)))
def wget(url, file_name=None):
    """Download *url* into the current directory and return the file name used.

    See http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python

    Parameters:
        url: address of the file to fetch.
        file_name: base name (without extension) to save under. The extension
            of the server-supplied name is appended, or '.jpg' when the server
            gives none. When None, the server-supplied name is used as is,
            falling back to 'im.jpg'.

    Returns:
        The name of the file that was written.

    Raises:
        requests.exceptions.RequestException: if the request fails again after
            the session has been recreated.
        IOError / OSError: if the file cannot be written.
    """
    # Without this declaration the assignment below would make `s` local and
    # the shared module-level session would never be used.
    global s
    try:
        r = s.get(url)
    except requests.exceptions.RequestException:
        # in case of "Max retries exceeded with url": start a fresh session
        s = requests.session()
        r = s.get(url)

    # Name suggested by the server, e.g. 'attachment; filename="picture.jpg"'.
    disposition = r.headers.get('content-disposition', '')
    image_name = ''
    if '=' in disposition:
        image_name = disposition.split('=', 1)[1].split(';')[0].strip().strip('"\'')
        # the name comes from the server: never let it point outside the cwd
        image_name = os.path.basename(image_name)
    if not image_name:
        image_name = 'im.jpg'

    if file_name is None:
        file_name = image_name
    else:
        # splitext copes with names holding several dots or none at all
        file_name = file_name + (os.path.splitext(image_name)[1] or '.jpg')

    try:
        file_size = int(r.headers['content-length'])
    except (KeyError, ValueError):
        print("Downloading: %s" % (file_name))
    else:
        # 9.53674e-7 is 1 / 2**20, i.e. bytes -> MB
        print("Downloading: %s (%s MB)" % (file_name, file_size * 9.53674e-7))

    # Write the body that was already fetched instead of requesting it again.
    with open(file_name, 'wb') as f:
        f.write(r.content)
    return file_name
def increase_last_char(c, inc=1):
    '''Return *c* advanced by *inc*, e.g. 09000064800112fc -> 09000064800112fd or 54 -> 55.

    A string made only of hexadecimal digits is treated as a hex number, so
    the increment carries correctly (...d9 -> ...da, ...0f -> ...10) instead
    of producing characters such as ':' or 'g'. The original width (leading
    zeros) and letter case are kept. Any other string falls back to raising
    the character code of its last character by *inc*.

    Parameters:
        c: the string to advance; must not be empty.
        inc: how far to advance it (default 1).

    Raises:
        IndexError: if *c* is empty.
    '''
    hex_digits = '0123456789abcdefABCDEF'
    if c and all(ch in hex_digits for ch in c):
        value = int(c, 16) + inc
        if value >= 0:
            template = '%0*X' if c.isupper() else '%0*x'
            return template % (len(c), value)
    # not a hex id (or it would go negative): bump the last character code
    return c[:-1] + chr(ord(c[-1]) + inc)
def safe_filename(s):
    """Return *s* as a tidy, lower-case filename.

    Characters that are invalid or awkward in filenames are dropped, and
    spaces and parentheses become underscores (friendlier on linux).
    """
    dropped = """\\/:;*?"<>|%,#$!+{}[]'@`=&^"""
    underscored = ' ()'
    cleaned = []
    for ch in s:
        if ch in dropped:
            continue
        cleaned.append('_' if ch in underscored else ch)
    return ''.join(cleaned).lower()
# now download images, then write a caption under them
# The id scraped from a thumbnail link has to be advanced by 1 to get the
# high res image and by 2 to get the low res one:
# high res http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d3&version=CURRENT&vs=1
# lowres http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d4&version=CURRENT&vs=1
import subprocess
import textwrap

# a draft run only processes the first five images
if draft:
    image_links = image_links[:5]
sg = gdshortener.ISGDShortener()
lwidth = 140  # caption lines are wrapped to this many characters
for image_link in image_links:
    image_link2 = increase_last_char(image_link['id'], inc=inc)
    url = base_url + r'/docbaseContent?asAttachment=false&objectId={}&version=CURRENT&vs=1'.format(image_link2)
    file_name = safe_filename(image_link['title'])
    # check if its already there
    if os.path.isfile(file_name + '.jpg'):
        print("File already exists %s" % (file_name + '.jpg'))
        continue
    print("Download %s as %s" % (url, file_name))
    file_name = wget(url, file_name=file_name)  # download image

    # write caption under image
    # first register a short url for the page the image came from
    tnyurl = sg.shorten(url=base_url + image_link['href'])[0].replace('http://', '')
    print("Drawing on image")
    # Format the caption: non-breaking spaces become plain spaces, a line
    # break after ':' is dropped, the remaining line breaks become ',<tab>',
    # then the text is wrapped to lwidth.
    abstract = (image_link['abstract'].strip()
                .replace(u'\xa0', u' ')
                .replace(':\n', ':')
                .replace('\n', ',\t'))
    b = textwrap.wrap(abstract, width=lwidth)
    caption = 'link:' + tnyurl + ',\n' + '\n'.join(b)  # add it all together
    caption = caption.replace('"', '').expandtabs()
    # Drop every non-ASCII character. Encoding first (rather than decoding)
    # avoids a UnicodeEncodeError on Python 2 when the abstract holds
    # non-ASCII text.
    caption = caption.encode('ascii', 'ignore').decode('ascii')

    # now write it using imagemagick
    # a 3rd way? http://stackoverflow.com/questions/4106200/overlaying-an-images-filename-using-imagemagick-or-similar
    try:
        img = Image.open(file_name)
    except Exception:
        print("Could not open image %s" % file_name)
        continue
    width = img.size[0]
    # The caption is scraped text, so pass the arguments as a list and never
    # through a shell: '$' or backticks in it must not be executed.
    command = ['montage', '-label', caption, file_name,
               '-pointsize', str(width // lwidth * 2),
               '-geometry', '+0+0', '-background', 'White', file_name]
    try:
        status = subprocess.call(command)
    except OSError:
        # montage could not be started (e.g. not installed / not in the path)
        status = -1
    if status != 0:
        print("Could not add caption to image %s %s" % (file_name, ' '.join(command)))
        continue