From 52f14f2dcc58c61b6b43b4a78e77c3863c4bcaa9 Mon Sep 17 00:00:00 2001
From: wassname <wassname@wassname.org>
Date: Sun, 20 Apr 2014 20:24:42 +1200
Subject: [PATCH] initial2

---
 scrape_seismic_atlas.py | 146 ++++++++++++++++++----------------------
 1 file changed, 66 insertions(+), 80 deletions(-)

diff --git a/scrape_seismic_atlas.py b/scrape_seismic_atlas.py
index f71e6ba..46a7501 100644
--- a/scrape_seismic_atlas.py
+++ b/scrape_seismic_atlas.py
@@ -3,7 +3,17 @@
 """
 Created on Sat Apr 19 20:37:13 2014
 
-@author: wassname
+This script will download files from http://see-atlas.leeds.ac.uk:8080, then append
+ a short url and caption to them. This is designed to run on Linux and requires
+ a few extra modules such as gdshortener, and also ImageMagick installed and on the path.
+ 
+A few options are set in the file: base url, lim url, high res?, and draft?.
+Images and info are scraped from the main site and images are downloaded to the
+current directory, only if they do not already exist.
+
+Captions are appended to images using calls to ImageMagick on the command line. This is because PIL did not work as desired.
+
+@author: wassname [located_at] wassname (dot) org
 """
 
 
@@ -33,6 +43,9 @@ else:
 import bs4
 import requests
 import re
+import gdshortener
+from PIL import Image
+import os
 #create a session object that'll allow us to track the cookies across the session and raise
 # too much suspicion
 s = requests.session()
@@ -76,37 +89,43 @@ while not done:
         if draft: break
     else:
          done=True
-
-#print the list of image links
-#print image_links
-
+    
+print "Finished scaping, qeueing {} image downloads".format(len(image_links))
 
     
 def wget(url,file_name=None):
     # not working
     '''http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python'''
 
-    r = s.get(url) 
-    if not file_name: file_name = r.raw.getheaders()['content-disposition'].split('=')[1]
+    try:
+        r = s.get(url) 
+    except: # in case of "Max retries exceeded with url"
+        s = requests.session()
+        r = s.get(url) 
+    try:
+        image_name= r.raw.getheaders()['content-disposition'].split('=')[1]
+    except:
+        image_name='im.jpg'
+    if file_name==None:
+        file_name=image_name
+    else:
+        file_name=file_name+'.'+image_name.split('.')[1]
+        
     f = open(file_name, 'wb')
-    #file_size = int(r.raw.getheaders()['content-length'])
-    print "Downloading: %s" % (file_name)
+    try:
+        file_size = int(r.raw.getheaders()['content-length'])
+        print "Downloading: %s (%s MB)" % (file_name,file_size*9.53674e-7)
+    except:
+        print "Downloading: %s" % (file_name)
     f.write(requests.get(url).content)    
     f.close()
+    return file_name
 
 def increase_last_char(c,inc=1):
     '''This increases the last charector on a string by 1, e.g. 09000064800112fc-> 09000064800112fd or 54 -> 55'''
     c=c[:-1]+chr(ord(c[-1])+inc)
     return c
 
-#for image_link in image_links:
-#    image_link2=increase_last_char(image_link['id'],inc=2) # this increases the last charector by 1
-#    url=base_url+r'/docbaseContent?asAttachment=false&objectId={}&version=CURRENT&vs=1'.format(image_link2)
-#    wget_with_progress_bar(url)
-    
-print "Finished scaping, qeueing {} image downloads".format(len(image_links))
-    
-
    
 
 def safe_filename(s):
@@ -121,49 +140,35 @@ def safe_filename(s):
     return s
     
 
-# now download images, i have to iterate the last number or digit by 1! to get high res and 2 for low res!
+# now download images, then write a caption under them
+#  the last character of the object id has to be increased by 1 to get high res and by 2 for low res!
 # high res http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d3&version=CURRENT&vs=1
 # lowres   http://see-atlas.leeds.ac.uk:8080/docbaseContent?asAttachment=false&objectId=090000648001b0d4&version=CURRENT&vs=1
 # let us queue the downloads
 
 if draft: image_links=image_links[:5]
-#import tinyurl
-import gdshortener
 sg = gdshortener.ISGDShortener()
-import Image
-import ImageFont, ImageDraw
-for image_link in image_links[46:]:
+
+for image_link in image_links:
     image_link2=increase_last_char(image_link['id'],inc=inc) # this increases the last charector by 1
     url=base_url+r'/docbaseContent?asAttachment=false&objectId={}&version=CURRENT&vs=1'.format(image_link2)
-    file_name = safe_filename(image_link['title'])+'.jpg'
-    print "Download %s as %s" % (url, file_name)
-    wget(url,file_name=file_name)
-
-    # now do I also want to write on them?
-    # the bastract is to long so just a tinyurl
-    # first register tinyurl
+    file_name = safe_filename(image_link['title'])
     
+    # skip the download if the file is already there
+    if os.path.isfile(file_name+'.jpg'):
+        print "File already exists", file_name+'.jpg'
+        continue
+
+    print "Download %s as %s" % (url, file_name)
+    file_name=wget(url,file_name=file_name) # download image
+
+
+    # write caption under image
+    # first shorten the page url with is.gd (the short link goes into the caption)
     tnyurl=sg.shorten(url = base_url+image_link['href'])[0].replace('http://','')
-    # http://stackoverflow.com/questions/245447/how-do-i-draw-text-at-an-angle-using-pythons-pil
-    # now draw it on the image
 
     print "Drawing on image"
     
-    # use python to draw an image, but it is overlayed and I cannot get the size right
-#    im=Image.open(file_name)
-#    
-#    try:
-#        f = ImageFont.truetype("usr/share/fonts/truetype/droid/DroidSansMono.ttf",15)
-#        #"/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf"
-#    except:
-#        f = ImageFont.load_default()
-#    
-#    txt=Image.new('L', (130,int(15*1.3)),color=255)
-#    d = ImageDraw.Draw(txt)
-#    d.text( (0, 0), tnyurl,  font=f, fill=0)
-#    #txt=txt.rotate(17.5,  expand=1)
-#    im.paste( txt, (0,0))
-#    im.save(file_name)
 
     # lets format the caption, we have to worry about escaping, and line wrapping
     import textwrap
@@ -175,41 +180,22 @@ for image_link in image_links[46:]:
     caption=caption.replace(':\n',':').replace('"','') # some formating and escaping
     
     b=textwrap.wrap(image_link['abstract'].strip().replace(u'\xa0', u' ').replace(':\n',':').replace('\n',',\t'),width=lwidth) # wrap lines that are longer than width
-    caption='link:'+tnyurl+',\t'+'\n'.join(b) # add it all together
+    caption='link:'+tnyurl+',\n'+'\n'.join(b) # add it all together
     caption=caption.replace('"','').expandtabs().decode('ascii',"ignore").encode('ascii','ignore') # some formating and escaping
     
-    # now add a caption using imagemagik
-    from subprocess import call 
-    import subprocess
-    #call(['convert', file_name   ,'-background','white',"label:{}".format(caption),'-gravity','Center','-append',file_name])
-    
+    # now write it using ImageMagick (montage)
     # a 3rd way? http://stackoverflow.com/questions/4106200/overlaying-an-images-filename-using-imagemagick-or-similar
-    #width=subprocess.check_output('img={}'.format(file_name)).strip() # set gilename in bash
-    width=int(subprocess.check_output(['identify','-format','%W','"{}"'.format(file_name)]).strip())
-    #call(['convert',
-    #'-background white',
-    #'-gravity center',
-    #'-fill black',
-    #'-size ${width}x100',
-    #'caption:"{}"'.format(caption),
-    #'"{}"'.format(file_name),
-    #'+swap',
-    #'-gravity south',
-    #'-pointsize 24',
-    #'-composite',
-    #'"with-caption-{}" '.format(file_name)])
-    #
-    #call(['convert',
-    #'"${img}"',
-    #'-fill black',
-    #'-undercolor',
-    #'"#0008"',
-    #'-pointsize 24',
-    #'-gravity south',
-    #'-annotate +0+5 "{}" '.format(caption),
-    #'"with-annotate-${img}" '])
-    
-    # 3rd try is the charm, this actually uses fontsize yay
-    call(['montage','-label', '"{}"'.format(caption),'-pointsize', '{}'.format(width/lwidth*4) ,'{}'.format(file_name),'-geometry +0+0 -background Gold','"with-montage-{}"'.format(file_name)])
+    try:
+        img = Image.open(file_name)
+    except:
+        print "Could not open image", file_name
+        continue
+    width, height = img.size
+
+    try:
+        os.system(' '.join(['montage','-label', '"{}"'.format(caption),'"{}"'.format(file_name),'-pointsize', '{}'.format(int(width/lwidth*2)) ,'-geometry','+0+0','-background','White','"{}"'.format(file_name)]))
+    except:
+        print "Could not add caption to image", file_name, ' '.join(['montage','-label', '"{}"'.format(caption),'"{}"'.format(file_name),'-pointsize', '{}'.format(int(width/lwidth*2)) ,'-geometry','+0+0','-background','White','"{}"'.format(file_name)])
+        continue
     
     
\ No newline at end of file