mirror of
https://github.com/wassname/arXiv_abstract_bot.git
synced 2026-06-27 16:44:23 +08:00
shortsciencebot and arxiv xpost working
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
import urllib.request
|
||||
import datetime
|
||||
from urllib.error import URLError
|
||||
import html2text
|
||||
import xml.etree.ElementTree as ET
|
||||
import os
|
||||
|
||||
|
||||
def get_new_articles(source):
    """Download an RSS feed and return its items as plain tuples.

    Args:
        source: URL of the RSS document; handed to ``urllib.request.urlopen``.

    Returns:
        A list of ``(title, link, description, guid, pubDate)`` tuples, one
        per usable ``<item>`` under the feed's ``<channel>``:

        * ``title`` - the item's title text, or ``link`` when there is none.
        * ``link`` - the item's link text.
        * ``description`` - the description converted with ``html2text``,
          or ``None`` when the item has no description text.
        * ``guid`` - the item's guid text, or ``link`` when there is none.
        * ``pubDate`` - a timezone-aware ``datetime`` parsed from the
          item's ``pubDate`` (format ``"%a, %d %b %Y %H:%M:%S %z"``).

        Items without a link or without a parseable ``pubDate`` are skipped.
        The list is empty when the feed cannot be fetched, is not
        well-formed XML, or has no ``<channel>`` element.
    """
    articles = []
    try:
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(source) as response:
            orig_rss = response.read().decode("utf-8")
        rss = ET.fromstring(orig_rss)
    except URLError as e:
        print("Error:", e.reason)
        return articles
    except ET.ParseError as e:
        # A broken feed should not take the caller's polling cycle down.
        print("Error:", e)
        return articles

    channel = rss.find("channel")
    if channel is None:
        print("Error: feed has no channel element")
        return articles

    for item in channel.findall("item"):
        link = _child_text(item, "link")
        if link is None:
            # Title and guid both fall back to the link, so an item without
            # one has nothing to identify or post.
            print("found no link, skipping item")
            continue

        title = _child_text(item, "title")
        if title is None:
            print("found no title, will use link")
            title = link

        description = _child_text(item, "description")
        if description is not None:
            description = html2text.html2text(description)

        raw_date = _child_text(item, "pubDate")
        try:
            pubDate = datetime.datetime.strptime(
                raw_date.strip(), "%a, %d %b %Y %H:%M:%S %z"
            )
        except (AttributeError, ValueError):
            # AttributeError: pubDate missing/empty; ValueError: wrong format.
            print("found no usable pubDate, skipping item:", link)
            continue

        guid = _child_text(item, "guid")
        if guid is None:
            guid = link

        articles.append((title, link, description, guid, pubDate))

    return articles


def _child_text(element, tag):
    """Return the text of ``element``'s first ``tag`` child, or None if absent/empty."""
    child = element.find(tag)
    if child is None:
        return None
    return child.text
|
||||
@@ -8,6 +8,8 @@ import bmemcached
|
||||
import re
|
||||
from prawcore import NotFound
|
||||
|
||||
logger = logging
|
||||
|
||||
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
|
||||
ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
|
||||
ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
|
||||
@@ -29,7 +31,11 @@ def get_bot():
|
||||
|
||||
r = get_bot()
|
||||
|
||||
subreddit = r.subreddit('machinelearning')
|
||||
subreddits = [
|
||||
r.subreddit('machinelearning'),
|
||||
# r.subreddit('reinforcementlearning')
|
||||
# r.subreddit('LanguageTechnology')
|
||||
]
|
||||
target_subreddit = r.subreddit('mlresearch')
|
||||
|
||||
if r.read_only == False:
|
||||
@@ -61,32 +67,35 @@ def scrape_arxiv(arxiv_id):
|
||||
|
||||
def comment(cache):
|
||||
# print(time.asctime(), "searching")
|
||||
try:
|
||||
all_posts = subreddit.new(limit=100)
|
||||
for post in all_posts:
|
||||
match = ARXIV_URL_RE.search(post.url)
|
||||
if match:
|
||||
arxiv_id = match.group(1)
|
||||
for subreddit in subreddits:
|
||||
try:
|
||||
all_posts = subreddit.new(limit=100)
|
||||
for post in all_posts:
|
||||
match = ARXIV_URL_RE.search(post.url)
|
||||
if match:
|
||||
arxiv_id = match.group(1)
|
||||
|
||||
# crosspost
|
||||
print('found', arxiv_id)
|
||||
xpost(['r/researchml'], post)
|
||||
# crosspost
|
||||
print('found', arxiv_id)
|
||||
|
||||
|
||||
# if cache.get(post.id) and cache.get(post.id) is 'T':
|
||||
# print "Parsed this post already: %s"%(post.permalink)
|
||||
# continue
|
||||
# for comment in post.comments:
|
||||
# if str(comment.author) == 'arXiv_abstract_bot':
|
||||
# break
|
||||
# else:
|
||||
# response = scrape_arxiv(arxiv_id)
|
||||
# post.reply(response)
|
||||
# cache.set(post.id, 'T')
|
||||
# print "Parsed post: %s"%(post.permalink)
|
||||
# print(arxiv_id, response)
|
||||
# time.sleep(10)
|
||||
except Exception as error:
|
||||
print(error)
|
||||
if cache.get(post.id) and cache.get(post.id) is 'T':
|
||||
print ("Parsed this post already: %s"%(post.permalink))
|
||||
continue
|
||||
# for comment in post.comments:
|
||||
# if str(comment.author) == 'arXiv_abstract_bot':
|
||||
# break
|
||||
else:
|
||||
xpost(['r/researchml'], post)
|
||||
# response = scrape_arxiv(arxiv_id)
|
||||
# post.reply(response)
|
||||
cache.set(post.id, 'T')
|
||||
# print "Parsed post: %s"%(post.permalink)
|
||||
# print(arxiv_id, response)
|
||||
time.sleep(10)
|
||||
except Exception as error:
|
||||
logger.error("Failed to scrape")
|
||||
print(error)
|
||||
|
||||
def xpost(subs, originalpost):
|
||||
# originalpost = where.submission
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
I would like to auto-crosspost:
|
||||
- arxiv
|
||||
- openreview
|
||||
|
||||
and also auto-post ShortScience summaries from their RSS feed.
|
||||
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
Posts new shortscience.org summaries to researchml
|
||||
"""
|
||||
import praw
|
||||
import os
|
||||
import time
|
||||
import datetime
|
||||
import pytz
|
||||
import RSSReader
|
||||
import logging
|
||||
|
||||
from bot import get_bot, get_memcache_client
|
||||
|
||||
# https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py
|
||||
log = logging
|
||||
|
||||
|
||||
r = get_bot()
|
||||
cache = get_memcache_client()
|
||||
|
||||
SUBREDDIT = 'researchml' # 'testingground4bots'
|
||||
SLEEP = 60
|
||||
POST_DESCRIPTION = True
|
||||
DESCRIPTION_FORMAT = "{}"
|
||||
|
||||
# main procedure
|
||||
def run_bot():
    """Poll the configured RSS sources forever and submit new articles.

    Every ``SLEEP`` seconds the feeds are fetched through
    ``RSSReader.get_new_articles``. Articles published before the bot
    started, or whose id is already marked ``'T'`` in the cache, are
    ignored; the rest are submitted to ``SUBREDDIT`` as link posts titled
    ``'[S] ' + title``. When ``POST_DESCRIPTION`` is set and the article
    has a description, it is added as a reply to the submission.

    Returns:
        0 when interrupted with Ctrl-C (while checking or while sleeping).
        All other exceptions are logged and the loop continues.
    """
    sub = r.subreddit(SUBREDDIT)
    # Only articles published after start-up are posted.
    t0 = datetime.datetime.now(pytz.utc)
    sources = ["https://www.shortscience.org/rss.xml"]

    log.info("Start bot for subreddit %s", SUBREDDIT)
    # The handler wraps the whole loop so ^C also exits cleanly during the
    # sleep, which is where the process spends almost all of its time.
    try:
        while True:
            try:
                _post_new_articles(sub, sources, t0)
            except Exception as e:
                log.error("Exception %s", e, exc_info=True)

            log.info("sleep for %s s", SLEEP)
            time.sleep(SLEEP)
    except KeyboardInterrupt:
        return 0


def _post_new_articles(sub, sources, t0):
    """Fetch every source once and submit the articles not handled yet."""
    log.info("check sources")
    new_articles = []
    for source in sources:
        new_articles.extend(RSSReader.get_new_articles(source))

    for article in new_articles:
        title, url, desc, article_id, dt = article

        if dt < t0:
            # Published before the bot started.
            continue

        # Compare by value: the cached marker is not guaranteed to be the
        # same object as the literal 'T'.
        if cache.get(article_id) == 'T':
            continue

        # NOTE: the article is marked before submitting, so a submission
        # that fails below is not retried on the next cycle.
        cache.set(article_id, 'T')
        print('posting', article_id)
        try:
            submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
            if POST_DESCRIPTION and desc is not None:
                submission.reply(DESCRIPTION_FORMAT.format(desc))
        except praw.exceptions.PRAWException as e:
            log.error("could not submit %s", e)
        else:
            log.info("submit article %s", article)
|
||||
|
||||
# write_config_done(done)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_bot()
|
||||
Reference in New Issue
Block a user