mirror of
https://github.com/wassname/arXiv_abstract_bot.git
synced 2026-06-27 19:29:47 +08:00
80 lines
2.4 KiB
Python
80 lines
2.4 KiB
Python
import praw
|
|
import requests
|
|
import bs4
|
|
import html2text
|
|
import time, os
|
|
|
|
def get_bot():
|
|
PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID')
|
|
PRAW_CLIENT_SECRET = os.environ.get('PRAW_CLIENT_SECRET')
|
|
PRAW_PASSWORD = os.environ.get('PRAW_PASSWORD')
|
|
PRAW_USERNAME = os.environ.get('PRAW_USERNAME')
|
|
PRAW_USERAGENT = 'linux:arXiv_abstract_bot:0.2 (by /u/arXiv_abstract_bot)'
|
|
return praw.Reddit(
|
|
username=PRAW_USERNAME,
|
|
password=PRAW_PASSWORD,
|
|
client_id=PRAW_CLIENT_ID,
|
|
client_secret=PRAW_CLIENT_SECRET,
|
|
user_agent=PRAW_USERAGENT
|
|
)
|
|
|
|
r = get_bot()
|
|
|
|
subreddit = r.subreddit('machinelearning')
|
|
|
|
alreadydone = set()
|
|
|
|
def scrape_arxiv(url):
|
|
r = requests.get(url)
|
|
soup = bs4.BeautifulSoup(r.text)
|
|
abstract = soup.select('.abstract')[0]
|
|
abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')
|
|
|
|
authors = soup.select('.authors')[0]
|
|
authors = html2text.html2text(authors.decode()).replace('\n', ' ')
|
|
authors = authors.replace('(/', '(http://arxiv.org/')
|
|
|
|
title = soup.select('.title')[0]
|
|
title = html2text.html2text(title.decode()).replace('\n', ' ')[2:]
|
|
|
|
abs_link = u'[Landing page]({})'.format(url)
|
|
pdf_url = url.replace('/abs/', '/pdf/')
|
|
pdf_link = u'[PDF link]({})'.format(pdf_url)
|
|
links = u'{} {}'.format(pdf_link, abs_link)
|
|
response = '\n\n'.join([title, authors, abstract, links])
|
|
return response
|
|
|
|
|
|
def comment():
|
|
print(time.asctime(), "searching")
|
|
try:
|
|
all_posts = subreddit.new(limit=100)
|
|
for post in all_posts:
|
|
if 'arxiv.org' in post.url:
|
|
if post.id in alreadydone:
|
|
continue
|
|
for comment in post.comments:
|
|
if str(comment.author) == 'arxiv_abstract_bot':
|
|
break
|
|
else:
|
|
landing_url = post.url
|
|
if '.pdf' in landing_url:
|
|
landing_url = post.url.replace('.pdf', '')
|
|
landing_url = landing_url.replace('/pdf/', '/abs/')
|
|
|
|
response = scrape_arxiv(landing_url)
|
|
post.reply(response)
|
|
alreadydone.add(post.id)
|
|
print(landing_url, response)
|
|
time.sleep(10)
|
|
except Exception as error:
|
|
print(error)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
while True:
|
|
pass
|
|
# comment()
|
|
# time.sleep(30)
|