From 11884430dc5f53be3a558ad92b49f7561edece95 Mon Sep 17 00:00:00 2001
From: Ben Firshman <ben@firshman.co.uk>
Date: Fri, 26 Apr 2019 13:54:30 -0700
Subject: [PATCH] Make matching more robust with regex

---
 bot.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/bot.py b/bot.py
index 079a5c7..1d34409 100644
--- a/bot.py
+++ b/bot.py
@@ -4,6 +4,11 @@ import bs4
 import html2text
 import time, os
 import bmemcached
+import re
+
+# arXiv ID regex from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py; group 1 of ARXIV_URL_RE is the full ID (version suffix included); the host dot is escaped so only a literal "arxiv.org" matches
+ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
+ARXIV_URL_RE = re.compile(r'arxiv\.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
 
 def get_bot():
     PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID')
@@ -27,7 +32,8 @@ subreddit = r.subreddit('pythonforengineers')
 # alreadydone = set()
 
 
-def scrape_arxiv(url):
+def scrape_arxiv(arxiv_id):
+    url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
     r = requests.get(url)
     soup = bs4.BeautifulSoup(r.text)
     abstract = soup.select('.abstract')[0]
@@ -41,8 +47,7 @@ def scrape_arxiv(url):
     title =  html2text.html2text(title.decode()).replace('\n', ' ')[2:]
 
     abs_link = u'[Landing page]({})'.format(url)
-    pdf_url = url.replace('/abs/', '/pdf/')
-    pdf_link = u'[PDF link]({})'.format(pdf_url)
+    pdf_link = u'[PDF link](https://arxiv.org/pdf/{})'.format(arxiv_id)
     links = u'{}  {}'.format(pdf_link, abs_link)
     response = '\n\n'.join([title, authors, abstract, links]) 
     return response
@@ -53,7 +58,9 @@ def comment(cache):
     try:
         all_posts = subreddit.new(limit=100)
         for post in all_posts:
-            if 'arxiv.org' in post.url:
+            match = ARXIV_URL_RE.search(post.url)
+            if match:
+                arxiv_id = match.group(1)
                 if cache.get(post.id) and cache.get(post.id) is 'T':
                     print "Parsed this post already: %s"%(post.permalink)
                     continue
@@ -61,16 +68,11 @@ def comment(cache):
                     if str(comment.author) == 'arxiv_abstract_bot':
                         break
                 else:
-                    landing_url = post.url
-                    if '.pdf' in landing_url:
-                        landing_url = post.url.replace('.pdf', '')
-                        landing_url = landing_url.replace('/pdf/', '/abs/')
-
-                    response = scrape_arxiv(landing_url)
+                    response = scrape_arxiv(arxiv_id)
                     post.reply(response)
                     cache.set(post.id, 'T')
                     print "Parsed post: %s"%(post.permalink)
-                    print(landing_url, response)
+                    print(arxiv_id, response)
                     time.sleep(10)
     except Exception as error:
         print(error)