From 11884430dc5f53be3a558ad92b49f7561edece95 Mon Sep 17 00:00:00 2001 From: Ben Firshman Date: Fri, 26 Apr 2019 13:54:30 -0700 Subject: [PATCH 1/2] Make matching more robust with regex --- bot.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bot.py b/bot.py index 079a5c7..1d34409 100644 --- a/bot.py +++ b/bot.py @@ -4,6 +4,11 @@ import bs4 import html2text import time, os import bmemcached +import re + +# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py +ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?' +ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I) def get_bot(): PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID') @@ -27,7 +32,8 @@ subreddit = r.subreddit('pythonforengineers') # alreadydone = set() -def scrape_arxiv(url): +def scrape_arxiv(arxiv_id): + url = 'https://arxiv.org/abs/{}'.format(arxiv_id) r = requests.get(url) soup = bs4.BeautifulSoup(r.text) abstract = soup.select('.abstract')[0] @@ -41,8 +47,7 @@ def scrape_arxiv(url): title = html2text.html2text(title.decode()).replace('\n', ' ')[2:] abs_link = u'[Landing page]({})'.format(url) - pdf_url = url.replace('/abs/', '/pdf/') - pdf_link = u'[PDF link]({})'.format(pdf_url) + pdf_link = u'[PDF link](https://arxiv.org/pdf/{})'.format(arxiv_id) links = u'{} {}'.format(pdf_link, abs_link) response = '\n\n'.join([title, authors, abstract, links]) return response @@ -53,7 +58,9 @@ def comment(cache): try: all_posts = subreddit.new(limit=100) for post in all_posts: - if 'arxiv.org' in post.url: + match = ARXIV_URL_RE.search(post.url) + if match: + arxiv_id = match.group(1) if cache.get(post.id) and cache.get(post.id) is 'T': print "Parsed this post already: %s"%(post.permalink) continue @@ -61,16 +68,11 @@ def comment(cache): if str(comment.author) == 'arxiv_abstract_bot': break else: - landing_url = post.url - if '.pdf' in landing_url: - landing_url = post.url.replace('.pdf', '') - landing_url = landing_url.replace('/pdf/', '/abs/') - - response = scrape_arxiv(landing_url) + response = scrape_arxiv(arxiv_id) post.reply(response) cache.set(post.id, 'T') print "Parsed post: %s"%(post.permalink) - print(landing_url, response) + print(arxiv_id, response) time.sleep(10) except Exception as error: print(error) From c3b81252763de32d7dad67d4c88253cc4eb0954c Mon Sep 17 00:00:00 2001 From: Ben Firshman Date: Fri, 26 Apr 2019 14:02:18 -0700 Subject: [PATCH 2/2] Add arXiv Vanity link --- bot.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bot.py b/bot.py index 1d34409..a3f0257 100644 --- a/bot.py +++ b/bot.py @@ -46,9 +46,10 @@ def scrape_arxiv(arxiv_id): title = soup.select('.title')[0] title = html2text.html2text(title.decode()).replace('\n', ' ')[2:] - abs_link = u'[Landing page]({})'.format(url) - pdf_link = u'[PDF link](https://arxiv.org/pdf/{})'.format(arxiv_id) - links = u'{} {}'.format(pdf_link, abs_link) + abs_link = u'[Abstract page]({})'.format(url) + pdf_link = u'[Read as PDF](https://arxiv.org/pdf/{})'.format(arxiv_id) + web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id) + links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link) response = '\n\n'.join([title, authors, abstract, links]) return response