mirror of
https://github.com/wassname/scrape-virtual-seismic-atlas.git
synced 2026-06-27 15:16:46 +08:00
201 lines
7.4 KiB
Python
201 lines
7.4 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Sat Apr 19 20:37:13 2014
|
|
|
|
This script will download files from http://see-atlas.leeds.ac.uk:808, then append
|
|
a url name and caption to them. This is designed to run on linux and requires
|
|
a few unique modules such as gdshortener and also imagemagik installed and in the path.
|
|
|
|
A few option are set in the file, base url, lim url, high res?, and draft?.
|
|
Images and info is scraped from the main site and images are downloaded to the
|
|
current directory, only if they do not alread exist.
|
|
|
|
Captions are appended to image using calls to imagemagik on the command line. This is because PIL wasn't worked as desired.
|
|
|
|
@author: wassname [located_at] wassname (dot) org
|
|
"""
|
|
|
|
|
|
#==============================================================================
|
|
# options
|
|
#==============================================================================
|
|
#the base URL we'll append the the 'limb' URL we'll find in the 'next' buttons
|
|
base_url = r'http://see-atlas.leeds.ac.uk:8080'
|
|
limb_url = r'/search/advancedSearch.jsp?rpp=50&Ns=P_LastModified%7c1&N=6394'
|
|
high_res=True
|
|
draft=False
|
|
|
|
|
|
#==============================================================================
|
|
# Main program
|
|
#==============================================================================
|
|
if high_res:
|
|
inc=1
|
|
else:
|
|
inc=2
|
|
|
|
|
|
#target=r'http://see-atlas.leeds.ac.uk:8080/search/advancedSearch.jsp?rpp=50&Ns=P_LastModified%7c1&N=6394' # adv search with 50 results per page
|
|
#exampleimage=r'http://see-atlas.leeds.ac.uk:8080/docbaseContent?objectId=09000064800112fe'
|
|
|
|
#make the required import
|
|
import bs4
|
|
import requests
|
|
import re
|
|
import gdshortener
|
|
from PIL import Image
|
|
import os
|
|
#create a session object that'll allow us to track the cookies across the session and raise
|
|
# too much suspicion
|
|
s = requests.session()
|
|
|
|
|
|
|
|
#a list to hold all the image URLs we'll find
|
|
image_links = []
|
|
|
|
|
|
# a few booleans to help our loop out
|
|
done = False
|
|
|
|
print "Scraping site:",base_url
|
|
#keep looping until we're 'done' which means that our 'next' button is leading us in circles
|
|
while not done:
|
|
#combine base and limb URLs to create a full URL we can use
|
|
url = base_url + limb_url
|
|
#make the GET requests to the full URL we've just built
|
|
r = s.get(url)
|
|
#create a BeautifulSoup Object that we can parse for data
|
|
soup = bs4.BeautifulSoup(r.content)
|
|
|
|
entity_divs=soup.findAll(attrs={'class': 'entity'})
|
|
# first grab all the images on the page
|
|
img_divs=soup.findAll(attrs={'class': 'result-thumbnail'})
|
|
for entity_div in entity_divs:
|
|
img_div=entity_div.findAll(attrs={'class': 'result-thumbnail'})[0]
|
|
img_dic=img_div.a.attrs
|
|
img_dic['id']=img_dic['href'].split('=')[1]
|
|
img_dic['abstract']=re.sub(r'\s\s+',r'\n',entity_div.text) # here is the description to overlay on
|
|
image_links.append(img_dic)
|
|
print 'found:', img_dic['id'], img_dic['title'][:50] # max 50 chars in title
|
|
|
|
#now see if we can find the next button
|
|
next_found = False
|
|
nxt_elems=soup.findAll(text='Next',attrs={'class':'paginate'})
|
|
if nxt_elems:
|
|
limb_url=nxt_elems[-1].attrs['href']
|
|
print "Searching next page:", limb_url
|
|
if draft: break
|
|
else:
|
|
done=True
|
|
|
|
print "Finished scaping, qeueing {} image downloads".format(len(image_links))
|
|
|
|
|
|
def wget(url,file_name=None):
|
|
# not working
|
|
'''http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python'''
|
|
|
|
try:
|
|
r = s.get(url)
|
|
except: # in case of "Max retries exceeded with url"
|
|
s = requests.session()
|
|
r = s.get(url)
|
|
try:
|
|
image_name= r.raw.getheaders()['content-disposition'].split('=')[1]
|
|
except:
|
|
image_name='im.jpg'
|
|
if file_name==None:
|
|
file_name=image_name
|
|
else:
|
|
file_name=file_name+'.'+image_name.split('.')[1]
|
|
|
|
f = open(file_name, 'wb')
|
|
try:
|
|
file_size = int(r.raw.getheaders()['content-length'])
|
|
print "Downloading: %s (%s MB)" % (file_name,file_size*9.53674e-7)
|
|
except:
|
|
print "Downloading: %s" % (file_name)
|
|
f.write(requests.get(url).content)
|
|
f.close()
|
|
return file_name
|
|
|
|
def increase_last_char(c,inc=1):
|
|
'''This increases the last charector on a string by 1, e.g. 09000064800112fc-> 09000064800112fd or 54 -> 55'''
|
|
c=c[:-1]+chr(ord(c[-1])+inc)
|
|
return c
|
|
|
|
|
|
|
|
def safe_filename(s):
|
|
""""This will make a filename safe and nice. So remove invalid chars, as well as spaced for linux"""
|
|
remove="""\\/:;*?"<>|%,#$!+{}[]'@`=&^"""
|
|
replace=' ()'
|
|
for c in remove:
|
|
s=s.replace(c,'')
|
|
for c in replace:
|
|
s=s.replace(c,'_')
|
|
s=s.lower()
|
|
return s
|
|
|
|
|
|
# now download images,then write a caption under them
|
|
# i have to iterate the last number or digit by 1! to get high res and 2 for low res!
|
|
# high res http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d3&version=CURRENT&vs=1
|
|
# lowres http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d4&version=CURRENT&vs=1
|
|
# let us queue the downloads
|
|
|
|
if draft: image_links=image_links[:5]
|
|
sg = gdshortener.ISGDShortener()
|
|
|
|
for image_link in image_links:
|
|
image_link2=increase_last_char(image_link['id'],inc=inc) # this increases the last charector by 1
|
|
url=base_url+r'/docbaseContent?asAttachment=false&objectId={}&version=CURRENT&vs=1'.format(image_link2)
|
|
file_name = safe_filename(image_link['title'])
|
|
|
|
# check if its already there
|
|
if os.path.isfile(file_name+'.jpg'):
|
|
print "File already exists", file_name+'.jpg'
|
|
continue
|
|
|
|
print "Download %s as %s" % (url, file_name)
|
|
file_name=wget(url,file_name=file_name) # download image
|
|
|
|
|
|
# write caption under image
|
|
# first register tinyurl
|
|
tnyurl=sg.shorten(url = base_url+image_link['href'])[0].replace('http://','')
|
|
|
|
print "Drawing on image"
|
|
|
|
|
|
# lets format the caption, we have to worry about escaping, and line wrapping
|
|
import textwrap
|
|
lwidth=140
|
|
b=[]
|
|
for line in image_link['abstract'].split('\n'): # split it by existing line breaks to preserve them
|
|
b=b+textwrap.wrap(line,width=lwidth) # wrap lines that are longer than width
|
|
caption='link:'+tnyurl+'\n'+'\n'.join(b) # add it all together
|
|
caption=caption.replace(':\n',':').replace('"','') # some formating and escaping
|
|
|
|
b=textwrap.wrap(image_link['abstract'].strip().replace(u'\xa0', u' ').replace(':\n',':').replace('\n',',\t'),width=lwidth) # wrap lines that are longer than width
|
|
caption='link:'+tnyurl+',\n'+'\n'.join(b) # add it all together
|
|
caption=caption.replace('"','').expandtabs().decode('ascii',"ignore").encode('ascii','ignore') # some formating and escaping
|
|
|
|
# now write it using imagemagic
|
|
# a 3rd way? http://stackoverflow.com/questions/4106200/overlaying-an-images-filename-using-imagemagick-or-similar
|
|
try:
|
|
img = Image.open(file_name)
|
|
except:
|
|
print "Could not open image", file_name
|
|
continue
|
|
width, height = img.size
|
|
|
|
try:
|
|
os.system(' '.join(['montage','-label', '"{}"'.format(caption),'"{}"'.format(file_name),'-pointsize', '{}'.format(int(width/lwidth*2)) ,'-geometry','+0+0','-background','White','"{}"'.format(file_name)]))
|
|
except:
|
|
print "Could not add caption to image", file_name, ' '.join(['montage','-label', '"{}"'.format(caption),'"{}"'.format(file_name),'-pointsize', '{}'.format(int(width/lwidth*2)) ,'-geometry','+0+0','-background','White','"{}"'.format(file_name)])
|
|
continue
|
|
|
|
|