shortsciencebot and arxiv xpost working

This commit is contained in:
wassname
2020-01-24 07:56:43 +08:00
parent f81c100fbf
commit 5ff836465c
4 changed files with 170 additions and 25 deletions
+52
View File
@@ -0,0 +1,52 @@
import urllib.request
import datetime
from urllib.error import URLError
import html2text
import xml.etree.ElementTree as ET
import os
def get_new_articles(source):
    """Fetch an RSS feed and return its items as tuples.

    Args:
        source: URL of the RSS document; anything accepted by
            ``urllib.request.urlopen`` works.

    Returns:
        A list of ``(title, link, description, guid, pubDate)`` tuples, one
        per usable ``<item>`` of the feed's ``<channel>``:

        * ``title`` and ``guid`` fall back to the link when missing or empty.
        * ``description`` is the ``html2text`` rendering of the element text,
          or ``None`` when the element is missing or empty.
        * ``pubDate`` is a timezone-aware ``datetime`` (the expected format
          requires a UTC offset).

        Items without a link or without a parsable ``pubDate`` are skipped.
        The list is empty when the feed cannot be fetched or contains no
        ``<channel>`` element.
    """
    articles = []
    try:
        # The context manager closes the response even if reading fails.
        with urllib.request.urlopen(source) as response:
            orig_rss = response.read().decode("utf-8")
    except URLError as e:
        print("Error:", e.reason)
        return articles

    channel = ET.fromstring(orig_rss).find("channel")
    if channel is None:
        print("found no channel in feed")
        return articles

    for item in channel.findall("item"):
        link = item.findtext("link")
        if not link:
            # Every other field falls back to the link, so it is mandatory.
            print("found no link, will skip item")
            continue

        title = item.findtext("title")
        if not title:
            print("found no title, will use link")
            title = link

        raw_date = (item.findtext("pubDate") or "").strip()
        try:
            pubDate = datetime.datetime.strptime(
                raw_date, "%a, %d %b %Y %H:%M:%S %z"
            )
        except ValueError:
            # One bad item must not abort the rest of the feed.
            print("found no usable pubDate, will skip", link)
            continue

        description = item.findtext("description")
        # findtext returns "" for an empty element; treat it like a missing one.
        description = html2text.html2text(description) if description else None

        guid = item.findtext("guid") or link

        articles.append((title, link, description, guid, pubDate))
    return articles
+34 -25
View File
@@ -8,6 +8,8 @@ import bmemcached
import re
from prawcore import NotFound
logger = logging
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
@@ -29,7 +31,11 @@ def get_bot():
r = get_bot()
subreddit = r.subreddit('machinelearning')
subreddits = [
r.subreddit('machinelearning'),
# r.subreddit('reinforcementlearning')
# r.subreddit('LanguageTechnology')
]
target_subreddit = r.subreddit('mlresearch')
if r.read_only == False:
@@ -61,32 +67,35 @@ def scrape_arxiv(arxiv_id):
# NOTE(review): this span looks like a diff hunk in which the removed and the
# added version of comment() are interleaved, with the +/- markers and the
# indentation lost. Read it as two overlapping variants, not as one function.
def comment(cache):
# print(time.asctime(), "searching")
try:
all_posts = subreddit.new(limit=100)
for post in all_posts:
match = ARXIV_URL_RE.search(post.url)
if match:
arxiv_id = match.group(1)
# Variant iterating over the `subreddits` list: fetch the 100 newest
# submissions of each one and look for an arXiv id in the post URL.
for subreddit in subreddits:
try:
all_posts = subreddit.new(limit=100)
for post in all_posts:
match = ARXIV_URL_RE.search(post.url)
if match:
arxiv_id = match.group(1)
# crosspost
print('found', arxiv_id)
xpost(['r/researchml'], post)
# crosspost
print('found', arxiv_id)
# if cache.get(post.id) and cache.get(post.id) is 'T':
# print "Parsed this post already: %s"%(post.permalink)
# continue
# for comment in post.comments:
# if str(comment.author) == 'arXiv_abstract_bot':
# break
# else:
# response = scrape_arxiv(arxiv_id)
# post.reply(response)
# cache.set(post.id, 'T')
# print "Parsed post: %s"%(post.permalink)
# print(arxiv_id, response)
# time.sleep(10)
except Exception as error:
print(error)
# NOTE(review): `is 'T'` tests object identity, not equality; whether a value
# read back from the cache is the same object as the literal is
# implementation-dependent. `== 'T'` is presumably what is intended.
if cache.get(post.id) and cache.get(post.id) is 'T':
print ("Parsed this post already: %s"%(post.permalink))
continue
# for comment in post.comments:
# if str(comment.author) == 'arXiv_abstract_bot':
# break
else:
xpost(['r/researchml'], post)
# response = scrape_arxiv(arxiv_id)
# post.reply(response)
# Mark the post id as handled; the check above skips ids marked 'T'.
cache.set(post.id, 'T')
# print "Parsed post: %s"%(post.permalink)
# print(arxiv_id, response)
# Pause 10 s after a crosspost (presumably for Reddit rate limits; confirm).
time.sleep(10)
# Failures are logged and printed here; nothing is re-raised.
except Exception as error:
logger.error("Failed to scrape")
print(error)
def xpost(subs, originalpost):
# originalpost = where.submission
+5
View File
@@ -0,0 +1,5 @@
I would like to auto-crosspost:
- arXiv
- OpenReview
and also auto-post ShortScience summaries from their RSS feed.
+79
View File
@@ -0,0 +1,79 @@
"""
Posts new shortscience.org summaries to researchml
"""
import praw
import os
import time
import datetime
import pytz
import RSSReader
import logging
from bot import get_bot, get_memcache_client
# Presumably adapted from the RSSBot at:
# https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py
# The logging module itself serves as the logger (module-level functions).
log = logging
# Client objects obtained from bot.py's helpers; both calls run at import time.
r = get_bot()
cache = get_memcache_client()
# Name of the subreddit that receives the submissions.
SUBREDDIT = 'researchml' # 'testingground4bots'
# Seconds to wait between two checks of the feed.
SLEEP = 60
# When true, the feed description is posted as a reply to each submission.
POST_DESCRIPTION = True
# Template applied to the description before posting; "{}" passes it through unchanged.
DESCRIPTION_FORMAT = "{}"
# main procedure
def run_bot():
    """Poll the ShortScience RSS feed forever and submit new summaries.

    Every ``SLEEP`` seconds each feed in the source list is read through
    ``RSSReader.get_new_articles``. Articles published before the bot was
    started, or whose guid is already marked ``'T'`` in the cache, are
    skipped. The remaining ones are submitted to ``SUBREDDIT`` as link posts
    titled ``'[S] <title>'``; when ``POST_DESCRIPTION`` is set and the feed
    supplied a description, it is added as a reply to the submission.

    Returns:
        0 when the loop is interrupted with Ctrl-C. Any other exception raised
        while checking the feeds is logged and the loop carries on.
    """
    sub = r.subreddit(SUBREDDIT)
    sources = ["https://www.shortscience.org/rss.xml"]
    # Cut-off for "new" articles: the start time of the bot. Raise ``days``
    # to back-fill older articles as well.
    t0 = datetime.datetime.now(tz=pytz.utc) - datetime.timedelta(days=0)
    log.info("Start bot for subreddit %s", SUBREDDIT)

    try:
        while True:
            try:
                log.info("check sources")
                new_articles = []
                for source in sources:
                    new_articles.extend(RSSReader.get_new_articles(source))

                for article in new_articles:
                    title, url, desc, guid, published = article
                    if published < t0:
                        # Older than the cut-off.
                        continue
                    # Equality, not identity: the cached value is not
                    # guaranteed to be the same object as the literal.
                    if cache.get(guid) == 'T':
                        continue

                    # Marked before submitting, so a failed submission is
                    # not retried on the next pass.
                    cache.set(guid, 'T')
                    print('posting', guid)
                    try:
                        submission = sub.submit(
                            '[S] ' + title,
                            url=url,
                            resubmit=True,
                            send_replies=False,
                        )
                        if POST_DESCRIPTION and desc is not None:
                            submission.reply(DESCRIPTION_FORMAT.format(desc))
                    except praw.exceptions.PRAWException as e:
                        log.error("could not submit %s", e)
                    else:
                        log.info("submit article %s", article)
            except Exception as e:
                # Keep the bot alive; KeyboardInterrupt is not an Exception
                # subclass and still reaches the outer handler.
                log.error("Exception %s", e, exc_info=True)

            log.info("sleep for %s s", SLEEP)
            time.sleep(SLEEP)
    except KeyboardInterrupt:
        # Covers ^C during the sleep as well as during a feed check.
        return 0


if __name__ == "__main__":
    run_bot()