mirror of
https://github.com/wassname/scrape-virtual-seismic-atlas.git
synced 2026-06-27 16:46:47 +08:00
initial2
This commit is contained in:
+66
-80
@@ -3,7 +3,17 @@
|
|||||||
"""
|
"""
|
||||||
Created on Sat Apr 19 20:37:13 2014
|
Created on Sat Apr 19 20:37:13 2014
|
||||||
|
|
||||||
@author: wassname
|
This script will download files from http://see-atlas.leeds.ac.uk:8080, then append
|
||||||
|
a url name and caption to them. This is designed to run on linux and requires
|
||||||
|
a few third-party modules such as gdshortener, plus ImageMagick installed and on the PATH.
|
||||||
|
|
||||||
|
A few options are set in the file: base url, lim url, high res?, and draft?.
|
||||||
|
Images and info are scraped from the main site and images are downloaded to the
|
||||||
|
current directory, only if they do not already exist.
|
||||||
|
|
||||||
|
Captions are appended to each image using calls to ImageMagick on the command line, because PIL did not work as desired.
|
||||||
|
|
||||||
|
@author: wassname [located_at] wassname (dot) org
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@@ -33,6 +43,9 @@ else:
|
|||||||
import bs4
|
import bs4
|
||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
|
import gdshortener
|
||||||
|
from PIL import Image
|
||||||
|
import os
|
||||||
#create a session object that'll allow us to track the cookies across the session and not raise
|
#create a session object that'll allow us to track the cookies across the session and not raise
|
||||||
# too much suspicion
|
# too much suspicion
|
||||||
s = requests.session()
|
s = requests.session()
|
||||||
@@ -76,37 +89,43 @@ while not done:
|
|||||||
if draft: break
|
if draft: break
|
||||||
else:
|
else:
|
||||||
done=True
|
done=True
|
||||||
|
|
||||||
#print the list of image links
|
print "Finished scaping, qeueing {} image downloads".format(len(image_links))
|
||||||
#print image_links
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def wget(url,file_name=None):
|
def wget(url,file_name=None):
|
||||||
# not working
|
# not working
|
||||||
'''http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python'''
|
'''http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python'''
|
||||||
|
|
||||||
r = s.get(url)
|
try:
|
||||||
if not file_name: file_name = r.raw.getheaders()['content-disposition'].split('=')[1]
|
r = s.get(url)
|
||||||
|
except: # in case of "Max retries exceeded with url"
|
||||||
|
s = requests.session()
|
||||||
|
r = s.get(url)
|
||||||
|
try:
|
||||||
|
image_name= r.raw.getheaders()['content-disposition'].split('=')[1]
|
||||||
|
except:
|
||||||
|
image_name='im.jpg'
|
||||||
|
if file_name==None:
|
||||||
|
file_name=image_name
|
||||||
|
else:
|
||||||
|
file_name=file_name+'.'+image_name.split('.')[1]
|
||||||
|
|
||||||
f = open(file_name, 'wb')
|
f = open(file_name, 'wb')
|
||||||
#file_size = int(r.raw.getheaders()['content-length'])
|
try:
|
||||||
print "Downloading: %s" % (file_name)
|
file_size = int(r.raw.getheaders()['content-length'])
|
||||||
|
print "Downloading: %s (%s MB)" % (file_name,file_size*9.53674e-7)
|
||||||
|
except:
|
||||||
|
print "Downloading: %s" % (file_name)
|
||||||
f.write(requests.get(url).content)
|
f.write(requests.get(url).content)
|
||||||
f.close()
|
f.close()
|
||||||
|
return file_name
|
||||||
|
|
||||||
def increase_last_char(c,inc=1):
|
def increase_last_char(c,inc=1):
|
||||||
'''This increases the last character of a string by inc (default 1), e.g. 09000064800112fc-> 09000064800112fd or 54 -> 55'''
|
'''This increases the last character of a string by inc (default 1), e.g. 09000064800112fc-> 09000064800112fd or 54 -> 55'''
|
||||||
c=c[:-1]+chr(ord(c[-1])+inc)
|
c=c[:-1]+chr(ord(c[-1])+inc)
|
||||||
return c
|
return c
|
||||||
|
|
||||||
#for image_link in image_links:
|
|
||||||
# image_link2=increase_last_char(image_link['id'],inc=2) # this increases the last charector by 1
|
|
||||||
# url=base_url+r'/docbaseContent?asAttachment=false&objectId={}&version=CURRENT&vs=1'.format(image_link2)
|
|
||||||
# wget_with_progress_bar(url)
|
|
||||||
|
|
||||||
print "Finished scaping, qeueing {} image downloads".format(len(image_links))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def safe_filename(s):
|
def safe_filename(s):
|
||||||
@@ -121,49 +140,35 @@ def safe_filename(s):
|
|||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
# now download images, i have to iterate the last number or digit by 1! to get high res and 2 for low res!
|
# now download images, then write a caption under them
|
||||||
|
# I have to increment the last number or digit by 1 to get high res and by 2 for low res!
|
||||||
# high res http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d3&version=CURRENT&vs=1
|
# high res http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d3&version=CURRENT&vs=1
|
||||||
# lowres http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d4&version=CURRENT&vs=1
|
# lowres http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d4&version=CURRENT&vs=1
|
||||||
# let us queue the downloads
|
# let us queue the downloads
|
||||||
|
|
||||||
if draft: image_links=image_links[:5]
|
if draft: image_links=image_links[:5]
|
||||||
#import tinyurl
|
|
||||||
import gdshortener
|
|
||||||
sg = gdshortener.ISGDShortener()
|
sg = gdshortener.ISGDShortener()
|
||||||
import Image
|
|
||||||
import ImageFont, ImageDraw
|
for image_link in image_links:
|
||||||
for image_link in image_links[46:]:
|
|
||||||
image_link2=increase_last_char(image_link['id'],inc=inc) # this increases the last charector by 1
|
image_link2=increase_last_char(image_link['id'],inc=inc) # this increases the last charector by 1
|
||||||
url=base_url+r'/docbaseContent?asAttachment=false&objectId={}&version=CURRENT&vs=1'.format(image_link2)
|
url=base_url+r'/docbaseContent?asAttachment=false&objectId={}&version=CURRENT&vs=1'.format(image_link2)
|
||||||
file_name = safe_filename(image_link['title'])+'.jpg'
|
file_name = safe_filename(image_link['title'])
|
||||||
print "Download %s as %s" % (url, file_name)
|
|
||||||
wget(url,file_name=file_name)
|
|
||||||
|
|
||||||
# now do I also want to write on them?
|
|
||||||
# the bastract is to long so just a tinyurl
|
|
||||||
# first register tinyurl
|
|
||||||
|
|
||||||
|
# check if its already there
|
||||||
|
if os.path.isfile(file_name+'.jpg'):
|
||||||
|
print "File already exists", file_name+'.jpg'
|
||||||
|
continue
|
||||||
|
|
||||||
|
print "Download %s as %s" % (url, file_name)
|
||||||
|
file_name=wget(url,file_name=file_name) # download image
|
||||||
|
|
||||||
|
|
||||||
|
# write caption under image
|
||||||
|
# first register tinyurl
|
||||||
tnyurl=sg.shorten(url = base_url+image_link['href'])[0].replace('http://','')
|
tnyurl=sg.shorten(url = base_url+image_link['href'])[0].replace('http://','')
|
||||||
# http://stackoverflow.com/questions/245447/how-do-i-draw-text-at-an-angle-using-pythons-pil
|
|
||||||
# now draw it on the image
|
|
||||||
|
|
||||||
print "Drawing on image"
|
print "Drawing on image"
|
||||||
|
|
||||||
# use python to draw an image, but it is overlayed and I cannot get the size right
|
|
||||||
# im=Image.open(file_name)
|
|
||||||
#
|
|
||||||
# try:
|
|
||||||
# f = ImageFont.truetype("usr/share/fonts/truetype/droid/DroidSansMono.ttf",15)
|
|
||||||
# #"/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf"
|
|
||||||
# except:
|
|
||||||
# f = ImageFont.load_default()
|
|
||||||
#
|
|
||||||
# txt=Image.new('L', (130,int(15*1.3)),color=255)
|
|
||||||
# d = ImageDraw.Draw(txt)
|
|
||||||
# d.text( (0, 0), tnyurl, font=f, fill=0)
|
|
||||||
# #txt=txt.rotate(17.5, expand=1)
|
|
||||||
# im.paste( txt, (0,0))
|
|
||||||
# im.save(file_name)
|
|
||||||
|
|
||||||
# lets format the caption, we have to worry about escaping, and line wrapping
|
# lets format the caption, we have to worry about escaping, and line wrapping
|
||||||
import textwrap
|
import textwrap
|
||||||
@@ -175,41 +180,22 @@ for image_link in image_links[46:]:
|
|||||||
caption=caption.replace(':\n',':').replace('"','') # some formating and escaping
|
caption=caption.replace(':\n',':').replace('"','') # some formating and escaping
|
||||||
|
|
||||||
b=textwrap.wrap(image_link['abstract'].strip().replace(u'\xa0', u' ').replace(':\n',':').replace('\n',',\t'),width=lwidth) # wrap lines that are longer than width
|
b=textwrap.wrap(image_link['abstract'].strip().replace(u'\xa0', u' ').replace(':\n',':').replace('\n',',\t'),width=lwidth) # wrap lines that are longer than width
|
||||||
caption='link:'+tnyurl+',\t'+'\n'.join(b) # add it all together
|
caption='link:'+tnyurl+',\n'+'\n'.join(b) # add it all together
|
||||||
caption=caption.replace('"','').expandtabs().decode('ascii',"ignore").encode('ascii','ignore') # some formating and escaping
|
caption=caption.replace('"','').expandtabs().decode('ascii',"ignore").encode('ascii','ignore') # some formating and escaping
|
||||||
|
|
||||||
# now add a caption using imagemagik
|
# now write it using ImageMagick
|
||||||
from subprocess import call
|
|
||||||
import subprocess
|
|
||||||
#call(['convert', file_name ,'-background','white',"label:{}".format(caption),'-gravity','Center','-append',file_name])
|
|
||||||
|
|
||||||
# a 3rd way? http://stackoverflow.com/questions/4106200/overlaying-an-images-filename-using-imagemagick-or-similar
|
# a 3rd way? http://stackoverflow.com/questions/4106200/overlaying-an-images-filename-using-imagemagick-or-similar
|
||||||
#width=subprocess.check_output('img={}'.format(file_name)).strip() # set gilename in bash
|
try:
|
||||||
width=int(subprocess.check_output(['identify','-format','%W','"{}"'.format(file_name)]).strip())
|
img = Image.open(file_name)
|
||||||
#call(['convert',
|
except:
|
||||||
#'-background white',
|
print "Could not open image", file_name
|
||||||
#'-gravity center',
|
continue
|
||||||
#'-fill black',
|
width, height = img.size
|
||||||
#'-size ${width}x100',
|
|
||||||
#'caption:"{}"'.format(caption),
|
try:
|
||||||
#'"{}"'.format(file_name),
|
os.system(' '.join(['montage','-label', '"{}"'.format(caption),'"{}"'.format(file_name),'-pointsize', '{}'.format(int(width/lwidth*2)) ,'-geometry','+0+0','-background','White','"{}"'.format(file_name)]))
|
||||||
#'+swap',
|
except:
|
||||||
#'-gravity south',
|
print "Could not add caption to image", file_name, ' '.join(['montage','-label', '"{}"'.format(caption),'"{}"'.format(file_name),'-pointsize', '{}'.format(int(width/lwidth*2)) ,'-geometry','+0+0','-background','White','"{}"'.format(file_name)])
|
||||||
#'-pointsize 24',
|
continue
|
||||||
#'-composite',
|
|
||||||
#'"with-caption-{}" '.format(file_name)])
|
|
||||||
#
|
|
||||||
#call(['convert',
|
|
||||||
#'"${img}"',
|
|
||||||
#'-fill black',
|
|
||||||
#'-undercolor',
|
|
||||||
#'"#0008"',
|
|
||||||
#'-pointsize 24',
|
|
||||||
#'-gravity south',
|
|
||||||
#'-annotate +0+5 "{}" '.format(caption),
|
|
||||||
#'"with-annotate-${img}" '])
|
|
||||||
|
|
||||||
# 3rd try is the charm, this actually uses fontsize yay
|
|
||||||
call(['montage','-label', '"{}"'.format(caption),'-pointsize', '{}'.format(width/lwidth*4) ,'{}'.format(file_name),'-geometry +0+0 -background Gold','"with-montage-{}"'.format(file_name)])
|
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user