mirror of
https://github.com/wassname/arXiv_abstract_bot.git
synced 2026-06-27 16:44:23 +08:00
Merge branch 'add-arxiv-vanity-links' of https://github.com/bfirsh/arXiv_abstract_bot into bfirsh-add-arxiv-vanity-links
This commit is contained in:
@@ -4,6 +4,11 @@ import bs4
|
||||
import html2text
|
||||
import time, os
|
||||
import bmemcached
|
||||
import re
|
||||
|
||||
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
|
||||
ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
|
||||
ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
|
||||
|
||||
def get_bot():
|
||||
PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID')
|
||||
@@ -27,7 +32,8 @@ subreddit = r.subreddit('machinelearning')
|
||||
# alreadydone = set()
|
||||
|
||||
|
||||
def scrape_arxiv(url):
|
||||
def scrape_arxiv(arxiv_id):
|
||||
url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
|
||||
r = requests.get(url)
|
||||
soup = bs4.BeautifulSoup(r.text)
|
||||
abstract = soup.select('.abstract')[0]
|
||||
@@ -40,10 +46,10 @@ def scrape_arxiv(url):
|
||||
title = soup.select('.title')[0]
|
||||
title = html2text.html2text(title.decode()).replace('\n', ' ')[2:]
|
||||
|
||||
abs_link = u'[Landing page]({})'.format(url)
|
||||
pdf_url = url.replace('/abs/', '/pdf/')
|
||||
pdf_link = u'[PDF link]({})'.format(pdf_url)
|
||||
links = u'{} {}'.format(pdf_link, abs_link)
|
||||
abs_link = u'[Abstract page]({})'.format(url)
|
||||
pdf_link = u'[Read as PDF](https://arxiv.org/pdf/{})'.format(arxiv_id)
|
||||
web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
|
||||
links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
|
||||
response = '\n\n'.join([title, authors, abstract, links])
|
||||
return response
|
||||
|
||||
@@ -53,7 +59,9 @@ def comment(cache):
|
||||
try:
|
||||
all_posts = subreddit.new(limit=100)
|
||||
for post in all_posts:
|
||||
if 'arxiv.org' in post.url:
|
||||
match = ARXIV_URL_RE.search(post.url)
|
||||
if match:
|
||||
arxiv_id = match.group(1)
|
||||
if cache.get(post.id) and cache.get(post.id) is 'T':
|
||||
print "Parsed this post already: %s"%(post.permalink)
|
||||
continue
|
||||
@@ -61,16 +69,11 @@ def comment(cache):
|
||||
if str(comment.author) == 'arXiv_abstract_bot':
|
||||
break
|
||||
else:
|
||||
landing_url = post.url
|
||||
if '.pdf' in landing_url:
|
||||
landing_url = post.url.replace('.pdf', '')
|
||||
landing_url = landing_url.replace('/pdf/', '/abs/')
|
||||
|
||||
response = scrape_arxiv(landing_url)
|
||||
response = scrape_arxiv(arxiv_id)
|
||||
post.reply(response)
|
||||
cache.set(post.id, 'T')
|
||||
print "Parsed post: %s"%(post.permalink)
|
||||
print(landing_url, response)
|
||||
print(arxiv_id, response)
|
||||
time.sleep(10)
|
||||
except Exception as error:
|
||||
print(error)
|
||||
|
||||
Reference in New Issue
Block a user