From 11884430dc5f53be3a558ad92b49f7561edece95 Mon Sep 17 00:00:00 2001 From: Ben Firshman Date: Fri, 26 Apr 2019 13:54:30 -0700 Subject: [PATCH] Make matching more robust with regex --- bot.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bot.py b/bot.py index 079a5c7..1d34409 100644 --- a/bot.py +++ b/bot.py @@ -4,6 +4,11 @@ import bs4 import html2text import time, os import bmemcached +import re + +# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py +ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?' +ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I) def get_bot(): PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID') @@ -27,7 +32,8 @@ subreddit = r.subreddit('pythonforengineers') # alreadydone = set() -def scrape_arxiv(url): +def scrape_arxiv(arxiv_id): + url = 'https://arxiv.org/abs/{}'.format(arxiv_id) r = requests.get(url) soup = bs4.BeautifulSoup(r.text) abstract = soup.select('.abstract')[0] @@ -41,8 +47,7 @@ def scrape_arxiv(url): title = html2text.html2text(title.decode()).replace('\n', ' ')[2:] abs_link = u'[Landing page]({})'.format(url) - pdf_url = url.replace('/abs/', '/pdf/') - pdf_link = u'[PDF link]({})'.format(pdf_url) + pdf_link = u'[PDF link](https://arxiv.org/pdf/{})'.format(arxiv_id) links = u'{} {}'.format(pdf_link, abs_link) response = '\n\n'.join([title, authors, abstract, links]) return response @@ -53,7 +58,9 @@ def comment(cache): try: all_posts = subreddit.new(limit=100) for post in all_posts: - if 'arxiv.org' in post.url: + match = ARXIV_URL_RE.search(post.url) + if match: + arxiv_id = match.group(1) if cache.get(post.id) and cache.get(post.id) is 'T': print "Parsed this post already: %s"%(post.permalink) continue @@ -61,16 +68,11 @@ def comment(cache): if str(comment.author) == 'arxiv_abstract_bot': break else: - landing_url = post.url - if '.pdf' in landing_url: - landing_url = post.url.replace('.pdf', '') - landing_url = landing_url.replace('/pdf/', '/abs/') - - response = scrape_arxiv(landing_url) + response = scrape_arxiv(arxiv_id) post.reply(response) cache.set(post.id, 'T') print "Parsed post: %s"%(post.permalink) - print(landing_url, response) + print(arxiv_id, response) time.sleep(10) except Exception as error: print(error)