From 5ff836465c8b8964edcadf87f85d596f7867619d Mon Sep 17 00:00:00 2001
From: wassname
Date: Fri, 24 Jan 2020 07:56:43 +0800
Subject: [PATCH] shortsciencebot and arxiv xpost working

---
Review notes (free text here, before the diffstat, is ignored by git-am):
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch. Indentation of removed/context lines in bot.py is
   inferred from Python syntax and the hunk counts; confirm against the
   pre-image before applying.
 * Added lines were amended in place (hunk sizes and diffstat unchanged), so
   the post-image blob ids on the "index" lines no longer match:
     - bot.py, shortscienceposts.py: the cache marker was compared with
       "is 'T'" (object identity, implementation-dependent, SyntaxWarning on
       Python 3.8+); it is now compared with "==" using one cache lookup.
     - bot.py: the scrape failure is logged with its traceback.
     - shortscienceposts.py: local "id" (shadowed the builtin) renamed to
       "guid"; unreachable "break" after "return 0" replaced by a log line.
     - readme.md: typo "audocrosspost" fixed.

 RSSReader.py         | 52 +++++++++++++++++++++++++++++
 bot.py               | 59 +++++++++++++++++++--------------
 readme.md            |  5 +++
 shortscienceposts.py | 79 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 170 insertions(+), 25 deletions(-)
 create mode 100644 RSSReader.py
 create mode 100644 readme.md
 create mode 100644 shortscienceposts.py

diff --git a/RSSReader.py b/RSSReader.py
new file mode 100644
index 0000000..d5987ab
--- /dev/null
+++ b/RSSReader.py
@@ -0,0 +1,52 @@
+import urllib.request
+import datetime
+from urllib.error import URLError
+import html2text
+import xml.etree.ElementTree as ET
+import os
+
+
+def get_new_articles(source):
+    articles = []
+    try:
+        response = urllib.request.urlopen(source)
+        orig_rss = response.read().decode("utf-8")
+        rss = ET.fromstring(orig_rss)
+        channel = rss.find("channel")
+
+        for item in channel.findall("item"):
+            # Not used anymore
+            # pubDate = item.find("pubDate").text
+            # pubDateConv = mktime(time.strptime(pubDate, PUBDATEFORMAT)))
+
+            link = item.find("link").text
+
+            title = item.find("title")
+
+            if title is not None:
+                title = title.text
+            if title is None:
+                print("found no title, will use link")
+                title = link
+
+            description = item.find("description")
+
+            if description is not None:
+                description = html2text.html2text(description.text)
+
+            guid = item.find("guid")
+
+            pubDate = item.find("pubDate").text
+            pubDate = datetime.datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %z")
+
+            if guid is not None:
+                guid = guid.text
+            if guid is None:
+                #print("found no guid, will use link")
+                guid = link
+            articles.append((title, link, description, guid, pubDate))
+
+    except URLError as e:
+        print("Error:", e.reason)
+
+    return articles
diff --git a/bot.py b/bot.py
index 5153915..f7a66cb 100644
--- a/bot.py
+++ b/bot.py
@@ -8,6 +8,8 @@ import bmemcached
 import re
 from prawcore import NotFound
 
+logger = logging
+
 # from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
 ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
 ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
@@ -29,7 +31,11 @@ def get_bot():
 
 
 r = get_bot()
-subreddit = r.subreddit('machinelearning')
+subreddits = [
+    r.subreddit('machinelearning'),
+    # r.subreddit('reinforcementlearning')
+    # r.subreddit('LanguageTechnology')
+]
 target_subreddit = r.subreddit('mlresearch')
 
 if r.read_only == False:
@@ -61,32 +67,35 @@ def scrape_arxiv(arxiv_id):
 
 def comment(cache):
     # print(time.asctime(), "searching")
-    try:
-        all_posts = subreddit.new(limit=100)
-        for post in all_posts:
-            match = ARXIV_URL_RE.search(post.url)
-            if match:
-                arxiv_id = match.group(1)
+    for subreddit in subreddits:
+        try:
+            all_posts = subreddit.new(limit=100)
+            for post in all_posts:
+                match = ARXIV_URL_RE.search(post.url)
+                if match:
+                    arxiv_id = match.group(1)
 
-                # crosspost
-                print('found', arxiv_id)
-                xpost(['r/researchml'], post)
+                    # crosspost
+                    print('found', arxiv_id)
+
 
-                # if cache.get(post.id) and cache.get(post.id) is 'T':
-                #     print "Parsed this post already: %s"%(post.permalink)
-                #     continue
-                # for comment in post.comments:
-                #     if str(comment.author) == 'arXiv_abstract_bot':
-                #         break
-                # else:
-                #     response = scrape_arxiv(arxiv_id)
-                #     post.reply(response)
-                #     cache.set(post.id, 'T')
-                #     print "Parsed post: %s"%(post.permalink)
-                #     print(arxiv_id, response)
-                #     time.sleep(10)
-    except Exception as error:
-        print(error)
+                    if cache.get(post.id) == 'T':
+                        print ("Parsed this post already: %s"%(post.permalink))
+                        continue
+                    # for comment in post.comments:
+                    #     if str(comment.author) == 'arXiv_abstract_bot':
+                    #         break
+                    else:
+                        xpost(['r/researchml'], post)
+                        # response = scrape_arxiv(arxiv_id)
+                        # post.reply(response)
+                        cache.set(post.id, 'T')
+                        # print "Parsed post: %s"%(post.permalink)
+                        # print(arxiv_id, response)
+                        time.sleep(10)
+        except Exception as error:
+            logger.exception("Failed to scrape")
+            print(error)
 
 def xpost(subs, originalpost):
     # originalpost = where.submission
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..ede644c
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,5 @@
+I would like to auto-crosspost
+- arxiv
+- openreview
+
+and also auto post shortscience summaries from their rss feed.
diff --git a/shortscienceposts.py b/shortscienceposts.py
new file mode 100644
index 0000000..8d45626
--- /dev/null
+++ b/shortscienceposts.py
@@ -0,0 +1,79 @@
+"""
+Posts new shortscience.org summaries to researchml
+"""
+import praw
+import os
+import time
+import datetime
+import pytz
+import RSSReader
+import logging
+
+from bot import get_bot, get_memcache_client
+
+# https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py
+log = logging
+
+
+r = get_bot()
+cache = get_memcache_client()
+
+SUBREDDIT = 'researchml' # 'testingground4bots'
+SLEEP = 60
+POST_DESCRIPTION = True
+DESCRIPTION_FORMAT = "{}"
+
+# main procedure
+def run_bot():
+    sub = r.subreddit(SUBREDDIT)
+    t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=0)
+
+    log.info("Start bot for subreddit %s", SUBREDDIT)
+    while True:
+        sources = ["https://www.shortscience.org/rss.xml"]
+        try:
+            log.info("check sources")
+            newArticles = []
+            for source in sources:
+                newArticles.extend(RSSReader.get_new_articles(source))
+
+            for article in newArticles:
+                title, url, desc, guid, dt = article
+
+                if dt < t0:
+                    # skip older ones
+                    # print(f"skipping older article {title}, {guid} {dt-t0}")
+                    continue
+
+                if cache.get(guid) == 'T':
+                    # print(f"skipping done article {title}, {guid}")
+                    continue
+                else:
+                    cache.set(guid, 'T')
+                    print('posting', guid)
+                    try:
+                        # submission = sub.submit(title, url=url, resubmit=RESUBMIT_ANYWAYS, send_replies=False)
+                        submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
+                        if POST_DESCRIPTION and desc is not None:
+                            submission.reply(DESCRIPTION_FORMAT.format(desc))
+                    except praw.exceptions.PRAWException as e:
+                        log.error("could not submit %s", e)
+                    else:
+                        log.info("submit article %s", article)
+
+        # Allows the bot to exit on ^C, all other exceptions are ignored
+        except KeyboardInterrupt:
+            log.info("interrupted, exiting")
+            return 0
+        except Exception as e:
+            log.error("Exception %s", e, exc_info=True)
+
+        # write_config_done(done)
+        log.info("sleep for %s s", SLEEP)
+        time.sleep(SLEEP)
+
+    # write_config_done(done)
+
+
+if __name__ == "__main__":
+    run_bot()