mirror of
https://github.com/wassname/scrape-virtual-seismic-atlas.git
synced 2026-06-27 17:02:08 +08:00
initial2
This commit is contained in:
+60
-74
@@ -3,7 +3,17 @@
|
||||
"""
|
||||
Created on Sat Apr 19 20:37:13 2014
|
||||
|
||||
@author: wassname
|
||||
This script will download files from http://see-atlas.leeds.ac.uk:808, then append
|
||||
a url name and caption to them. This is designed to run on linux and requires
|
||||
a few unique modules such as gdshortener and also imagemagik installed and in the path.
|
||||
|
||||
A few option are set in the file, base url, lim url, high res?, and draft?.
|
||||
Images and info is scraped from the main site and images are downloaded to the
|
||||
current directory, only if they do not alread exist.
|
||||
|
||||
Captions are appended to image using calls to imagemagik on the command line. This is because PIL wasn't worked as desired.
|
||||
|
||||
@author: wassname [located_at] wassname (dot) org
|
||||
"""
|
||||
|
||||
|
||||
@@ -33,6 +43,9 @@ else:
|
||||
import bs4
|
||||
import requests
|
||||
import re
|
||||
import gdshortener
|
||||
from PIL import Image
|
||||
import os
|
||||
#create a session object that'll allow us to track the cookies across the session and raise
|
||||
# too much suspicion
|
||||
s = requests.session()
|
||||
@@ -77,36 +90,42 @@ while not done:
|
||||
else:
|
||||
done=True
|
||||
|
||||
#print the list of image links
|
||||
#print image_links
|
||||
|
||||
print "Finished scaping, qeueing {} image downloads".format(len(image_links))
|
||||
|
||||
|
||||
def wget(url,file_name=None):
|
||||
# not working
|
||||
'''http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python'''
|
||||
|
||||
try:
|
||||
r = s.get(url)
|
||||
if not file_name: file_name = r.raw.getheaders()['content-disposition'].split('=')[1]
|
||||
except: # in case of "Max retries exceeded with url"
|
||||
s = requests.session()
|
||||
r = s.get(url)
|
||||
try:
|
||||
image_name= r.raw.getheaders()['content-disposition'].split('=')[1]
|
||||
except:
|
||||
image_name='im.jpg'
|
||||
if file_name==None:
|
||||
file_name=image_name
|
||||
else:
|
||||
file_name=file_name+'.'+image_name.split('.')[1]
|
||||
|
||||
f = open(file_name, 'wb')
|
||||
#file_size = int(r.raw.getheaders()['content-length'])
|
||||
try:
|
||||
file_size = int(r.raw.getheaders()['content-length'])
|
||||
print "Downloading: %s (%s MB)" % (file_name,file_size*9.53674e-7)
|
||||
except:
|
||||
print "Downloading: %s" % (file_name)
|
||||
f.write(requests.get(url).content)
|
||||
f.close()
|
||||
return file_name
|
||||
|
||||
def increase_last_char(c,inc=1):
|
||||
'''This increases the last charector on a string by 1, e.g. 09000064800112fc-> 09000064800112fd or 54 -> 55'''
|
||||
c=c[:-1]+chr(ord(c[-1])+inc)
|
||||
return c
|
||||
|
||||
#for image_link in image_links:
|
||||
# image_link2=increase_last_char(image_link['id'],inc=2) # this increases the last charector by 1
|
||||
# url=base_url+r'/docbaseContent?asAttachment=false&objectId={}&version=CURRENT&vs=1'.format(image_link2)
|
||||
# wget_with_progress_bar(url)
|
||||
|
||||
print "Finished scaping, qeueing {} image downloads".format(len(image_links))
|
||||
|
||||
|
||||
|
||||
|
||||
def safe_filename(s):
|
||||
@@ -121,49 +140,35 @@ def safe_filename(s):
|
||||
return s
|
||||
|
||||
|
||||
# now download images, i have to iterate the last number or digit by 1! to get high res and 2 for low res!
|
||||
# now download images,then write a caption under them
|
||||
# i have to iterate the last number or digit by 1! to get high res and 2 for low res!
|
||||
# high res http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d3&version=CURRENT&vs=1
|
||||
# lowres http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d4&version=CURRENT&vs=1
|
||||
# let us queue the downloads
|
||||
|
||||
if draft: image_links=image_links[:5]
|
||||
#import tinyurl
|
||||
import gdshortener
|
||||
sg = gdshortener.ISGDShortener()
|
||||
import Image
|
||||
import ImageFont, ImageDraw
|
||||
for image_link in image_links[46:]:
|
||||
|
||||
for image_link in image_links:
|
||||
image_link2=increase_last_char(image_link['id'],inc=inc) # this increases the last charector by 1
|
||||
url=base_url+r'/docbaseContent?asAttachment=false&objectId={}&version=CURRENT&vs=1'.format(image_link2)
|
||||
file_name = safe_filename(image_link['title'])+'.jpg'
|
||||
file_name = safe_filename(image_link['title'])
|
||||
|
||||
# check if its already there
|
||||
if os.path.isfile(file_name+'.jpg'):
|
||||
print "File already exists", file_name+'.jpg'
|
||||
continue
|
||||
|
||||
print "Download %s as %s" % (url, file_name)
|
||||
wget(url,file_name=file_name)
|
||||
file_name=wget(url,file_name=file_name) # download image
|
||||
|
||||
# now do I also want to write on them?
|
||||
# the bastract is to long so just a tinyurl
|
||||
|
||||
# write caption under image
|
||||
# first register tinyurl
|
||||
|
||||
tnyurl=sg.shorten(url = base_url+image_link['href'])[0].replace('http://','')
|
||||
# http://stackoverflow.com/questions/245447/how-do-i-draw-text-at-an-angle-using-pythons-pil
|
||||
# now draw it on the image
|
||||
|
||||
print "Drawing on image"
|
||||
|
||||
# use python to draw an image, but it is overlayed and I cannot get the size right
|
||||
# im=Image.open(file_name)
|
||||
#
|
||||
# try:
|
||||
# f = ImageFont.truetype("usr/share/fonts/truetype/droid/DroidSansMono.ttf",15)
|
||||
# #"/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf"
|
||||
# except:
|
||||
# f = ImageFont.load_default()
|
||||
#
|
||||
# txt=Image.new('L', (130,int(15*1.3)),color=255)
|
||||
# d = ImageDraw.Draw(txt)
|
||||
# d.text( (0, 0), tnyurl, font=f, fill=0)
|
||||
# #txt=txt.rotate(17.5, expand=1)
|
||||
# im.paste( txt, (0,0))
|
||||
# im.save(file_name)
|
||||
|
||||
# lets format the caption, we have to worry about escaping, and line wrapping
|
||||
import textwrap
|
||||
@@ -175,41 +180,22 @@ for image_link in image_links[46:]:
|
||||
caption=caption.replace(':\n',':').replace('"','') # some formating and escaping
|
||||
|
||||
b=textwrap.wrap(image_link['abstract'].strip().replace(u'\xa0', u' ').replace(':\n',':').replace('\n',',\t'),width=lwidth) # wrap lines that are longer than width
|
||||
caption='link:'+tnyurl+',\t'+'\n'.join(b) # add it all together
|
||||
caption='link:'+tnyurl+',\n'+'\n'.join(b) # add it all together
|
||||
caption=caption.replace('"','').expandtabs().decode('ascii',"ignore").encode('ascii','ignore') # some formating and escaping
|
||||
|
||||
# now add a caption using imagemagik
|
||||
from subprocess import call
|
||||
import subprocess
|
||||
#call(['convert', file_name ,'-background','white',"label:{}".format(caption),'-gravity','Center','-append',file_name])
|
||||
|
||||
# now write it using imagemagic
|
||||
# a 3rd way? http://stackoverflow.com/questions/4106200/overlaying-an-images-filename-using-imagemagick-or-similar
|
||||
#width=subprocess.check_output('img={}'.format(file_name)).strip() # set gilename in bash
|
||||
width=int(subprocess.check_output(['identify','-format','%W','"{}"'.format(file_name)]).strip())
|
||||
#call(['convert',
|
||||
#'-background white',
|
||||
#'-gravity center',
|
||||
#'-fill black',
|
||||
#'-size ${width}x100',
|
||||
#'caption:"{}"'.format(caption),
|
||||
#'"{}"'.format(file_name),
|
||||
#'+swap',
|
||||
#'-gravity south',
|
||||
#'-pointsize 24',
|
||||
#'-composite',
|
||||
#'"with-caption-{}" '.format(file_name)])
|
||||
#
|
||||
#call(['convert',
|
||||
#'"${img}"',
|
||||
#'-fill black',
|
||||
#'-undercolor',
|
||||
#'"#0008"',
|
||||
#'-pointsize 24',
|
||||
#'-gravity south',
|
||||
#'-annotate +0+5 "{}" '.format(caption),
|
||||
#'"with-annotate-${img}" '])
|
||||
try:
|
||||
img = Image.open(file_name)
|
||||
except:
|
||||
print "Could not open image", file_name
|
||||
continue
|
||||
width, height = img.size
|
||||
|
||||
# 3rd try is the charm, this actually uses fontsize yay
|
||||
call(['montage','-label', '"{}"'.format(caption),'-pointsize', '{}'.format(width/lwidth*4) ,'{}'.format(file_name),'-geometry +0+0 -background Gold','"with-montage-{}"'.format(file_name)])
|
||||
try:
|
||||
os.system(' '.join(['montage','-label', '"{}"'.format(caption),'"{}"'.format(file_name),'-pointsize', '{}'.format(int(width/lwidth*2)) ,'-geometry','+0+0','-background','White','"{}"'.format(file_name)]))
|
||||
except:
|
||||
print "Could not add caption to image", file_name, ' '.join(['montage','-label', '"{}"'.format(caption),'"{}"'.format(file_name),'-pointsize', '{}'.format(int(width/lwidth*2)) ,'-geometry','+0+0','-background','White','"{}"'.format(file_name)])
|
||||
continue
|
||||
|
||||
|
||||
Reference in New Issue
Block a user