From 5d89a12d2a12e288e27249d73ec83fa1aac64405 Mon Sep 17 00:00:00 2001 From: wassname Date: Fri, 24 Jan 2020 09:14:51 +0800 Subject: [PATCH] tidies rss and distill --- RSSReader.py | 52 --------------------- bot.py => arxiv_ml_xpost_bot.py | 71 +++++++++++------------------ botlib.py | 45 ++++++++++++++++++ readme.md | 20 ++++++-- rss_poster_bot.py | 81 +++++++++++++++++++++++++++++++++ shortscienceposts.py | 79 -------------------------------- 6 files changed, 169 insertions(+), 179 deletions(-) delete mode 100644 RSSReader.py rename bot.py => arxiv_ml_xpost_bot.py (66%) create mode 100644 botlib.py create mode 100644 rss_poster_bot.py delete mode 100644 shortscienceposts.py diff --git a/RSSReader.py b/RSSReader.py deleted file mode 100644 index d5987ab..0000000 --- a/RSSReader.py +++ /dev/null @@ -1,52 +0,0 @@ -import urllib.request -import datetime -from urllib.error import URLError -import html2text -import xml.etree.ElementTree as ET -import os - - -def get_new_articles(source): - articles = [] - try: - response = urllib.request.urlopen(source) - orig_rss = response.read().decode("utf-8") - rss = ET.fromstring(orig_rss) - channel = rss.find("channel") - - for item in channel.findall("item"): - # Not used anymore - # pubDate = item.find("pubDate").text - # pubDateConv = mktime(time.strptime(pubDate, PUBDATEFORMAT))) - - link = item.find("link").text - - title = item.find("title") - - if title is not None: - title = title.text - if title is None: - print("found no title, will use link") - title = link - - description = item.find("description") - - if description is not None: - description = html2text.html2text(description.text) - - guid = item.find("guid") - - pubDate = item.find("pubDate").text - pubDate = datetime.datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %z") - - if guid is not None: - guid = guid.text - if guid is None: - #print("found no guid, will use link") - guid = link - articles.append((title, link, description, guid, pubDate)) - - except URLError as e: - print("Error:", e.reason) - - return articles diff --git a/bot.py b/arxiv_ml_xpost_bot.py similarity index 66% rename from bot.py rename to arxiv_ml_xpost_bot.py index f7a66cb..b1a2fad 100644 --- a/bot.py +++ b/arxiv_ml_xpost_bot.py @@ -7,62 +7,56 @@ import time, os import bmemcached import re from prawcore import NotFound +import datetime -logger = logging + +from botlib import get_bot, get_memcache_client, get_logger + +logger = get_logger() # from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?' ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I) - -def get_bot(): - PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID') - PRAW_CLIENT_SECRET = os.environ.get('PRAW_CLIENT_SECRET') - PRAW_PASSWORD = os.environ.get('PRAW_PASSWORD') - PRAW_USERNAME = os.environ.get('PRAW_USERNAME') - PRAW_USERAGENT = os.environ.get('PRAW_USERAGENT') - return praw.Reddit( - username=PRAW_USERNAME, - password=PRAW_PASSWORD, - client_id=PRAW_CLIENT_ID, - client_secret=PRAW_CLIENT_SECRET, - user_agent=PRAW_USERAGENT - ) - +# OPENREVIEW_URL_RE = re.compile(r'openreview.net/./', re.I) +# OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I) r = get_bot() +# source subreddits subreddits = [ r.subreddit('machinelearning'), # r.subreddit('reinforcementlearning') # r.subreddit('LanguageTechnology') ] target_subreddit = r.subreddit('mlresearch') +target_subreddit = r.subreddit('testingground4bots') + if r.read_only == False: print("Connected and running.") # alreadydone = set() -def scrape_arxiv(arxiv_id): - url = 'https://arxiv.org/abs/{}'.format(arxiv_id) - r = requests.get(url) - soup = bs4.BeautifulSoup(r.text) - abstract = soup.select('.abstract')[0] - abstract = html2text.html2text(abstract.decode()).replace('\n', ' ') +# def scrape_arxiv(arxiv_id): +# url = 'https://arxiv.org/abs/{}'.format(arxiv_id) +# r = requests.get(url) +# soup = bs4.BeautifulSoup(r.text) +# abstract = soup.select('.abstract')[0] +# abstract = html2text.html2text(abstract.decode()).replace('\n', ' ') - authors = soup.select('.authors')[0] - authors = html2text.html2text(authors.decode()).replace('\n', ' ') - authors = authors.replace('(/', '(http://arxiv.org/') +# authors = soup.select('.authors')[0] +# authors = html2text.html2text(authors.decode()).replace('\n', ' ') +# authors = authors.replace('(/', '(http://arxiv.org/') - title = soup.select('.title')[0] - title = html2text.html2text(title.decode()).replace('\n', ' ')[2:] +# title = soup.select('.title')[0] +# title = html2text.html2text(title.decode()).replace('\n', ' ')[2:] - abs_link = u'[Landing Page]({})'.format(url) - pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id) - web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id) - links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link) - response = '\n\n'.join([title, authors, abstract, links]) - return response +# abs_link = u'[Landing Page]({})'.format(url) +# pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id) +# web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id) +# links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link) +# response = '\n\n'.join([title, authors, abstract, links]) +# return response def comment(cache): @@ -142,17 +136,6 @@ def xpost(subs, originalpost): # where.reply(str(response) + " Make sure to give me karma to prevent that in the future.") -def get_memcache_client(): - # Store IDs of comments that the bot has already replied to. - # Read local cache by default - - MEMCACHEDCLOUD_SERVERS = os.environ.get('MEMCACHEDCLOUD_SERVERS') - MEMCACHEDCLOUD_USERNAME = os.environ.get('MEMCACHEDCLOUD_USERNAME') - MEMCACHEDCLOUD_PASSWORD = os.environ.get('MEMCACHEDCLOUD_PASSWORD') - - client = bmemcached.Client((MEMCACHEDCLOUD_SERVERS,), MEMCACHEDCLOUD_USERNAME, - MEMCACHEDCLOUD_PASSWORD) - return client if __name__ == "__main__": diff --git a/botlib.py b/botlib.py new file mode 100644 index 0000000..d3efcac --- /dev/null +++ b/botlib.py @@ -0,0 +1,45 @@ +import praw +import logging +import requests +import bs4 +import html2text +import time, os +import bmemcached +import re +from prawcore import NotFound + +import logging +import sys + +logging.basicConfig(stream=sys.stdout, level=logging.INFO) + +def get_logger(): + logger = logging.getLogger(__name__) + return logger + +def get_memcache_client(): + # Store IDs of comments that the bot has already replied to. + # Read local cache by default + + MEMCACHEDCLOUD_SERVERS = os.environ.get('MEMCACHEDCLOUD_SERVERS') + MEMCACHEDCLOUD_USERNAME = os.environ.get('MEMCACHEDCLOUD_USERNAME') + MEMCACHEDCLOUD_PASSWORD = os.environ.get('MEMCACHEDCLOUD_PASSWORD') + + client = bmemcached.Client((MEMCACHEDCLOUD_SERVERS,), MEMCACHEDCLOUD_USERNAME, + MEMCACHEDCLOUD_PASSWORD) + return client + + +def get_bot(): + PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID') + PRAW_CLIENT_SECRET = os.environ.get('PRAW_CLIENT_SECRET') + PRAW_PASSWORD = os.environ.get('PRAW_PASSWORD') + PRAW_USERNAME = os.environ.get('PRAW_USERNAME') + PRAW_USERAGENT = os.environ.get('PRAW_USERAGENT') + return praw.Reddit( + username=PRAW_USERNAME, + password=PRAW_PASSWORD, + client_id=PRAW_CLIENT_ID, + client_secret=PRAW_CLIENT_SECRET, + user_agent=PRAW_USERAGENT + ) diff --git a/readme.md b/readme.md index ede644c..dbe98a5 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,17 @@ -I would like to audocrosspost -- arxiv -- openreview +I would like to autocrosspost +- [x] arxiv from r/machinelearning +- [x] shortscience summaries from shortscience rss feed +- [ ] openreview +- [ ] openai, googlemin, facebookai, deepmind blogposts -and also auto post shortscience summaries from their rss feed. + +# start + +```sh +pyenv activate jup3.7.2 +#memcached? +#https://github.com/jaysonsantos/python-binary-memcached +source ./vars.sh +python ./arxiv_ml_xpost_bot.py & +python ./rss_poster_bot.py & +``` diff --git a/rss_poster_bot.py b/rss_poster_bot.py new file mode 100644 index 0000000..79d3c51 --- /dev/null +++ b/rss_poster_bot.py @@ -0,0 +1,81 @@ +""" +Posts new shortscience.org summaries to researchml +""" +import praw +import os +import time +import datetime +import pytz +import feedparser +import logging +from time import mktime +from botlib import get_bot, get_memcache_client, get_logger + +logger = get_logger() + +r = get_bot() +cache = get_memcache_client() + +SUBREDDIT = 'researchml' # 'testingground4bots' +SLEEP = 60 +POST_DESCRIPTION = True +DESCRIPTION_FORMAT = "{}" + +# main procedure + +sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"] + +def run_bot(sources): + sub = r.subreddit(SUBREDDIT) + t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=10) + + logger.info("Start bot for subreddit %s", SUBREDDIT) + while True: + try: + logger.info("check sources") + newArticles = [] + for source in sources: + d = feedparser.parse(source) + newArticles = d['entries'] + + for article in newArticles: + dt = datetime.datetime.fromtimestamp(mktime(article['published_parsed'])).replace(tzinfo=pytz.utc) + url = id = article['link'] + desc = article['summary'] + title= article['title'] + + if dt < t0: + # skip older ones + logger.debug(f"skipping older article '{title}', id='{id}' td={dt-t0}") + continue + + if cache.get(id) and cache.get(id) is 'T': + logger.info(f"skipping done article '{title}', id='{id}'") + continue + else: + logger.info('posting %s', id) + try: + submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False) + if POST_DESCRIPTION and desc is not None: + submission.reply(DESCRIPTION_FORMAT.format(desc)) + except praw.exceptions.PRAWException as e: + logger.exception("could not submit %s", e) + else: + cache.set(id, 'T') + logger.info("submit article %s", article) + + # Allows the bot to exit on ^C, all other exceptions are ignored + except KeyboardInterrupt: + return 0 + break + except Exception as e: + logger.error("Exception %s", e, exc_info=True) + + logger.info("sleep for %s s", SLEEP) + time.sleep(SLEEP) + + # write_config_done(done) + + +if __name__ == "__main__": + run_bot(sources) diff --git a/shortscienceposts.py b/shortscienceposts.py deleted file mode 100644 index 8d45626..0000000 --- a/shortscienceposts.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -Posts new shortscience.org summaries to researchml -""" -import praw -import os -import time -import datetime -import pytz -import RSSReader -import logging - -from bot import get_bot, get_memcache_client - -# https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py -log = logging - - -r = get_bot() -cache = get_memcache_client() - -SUBREDDIT = 'researchml' # 'testingground4bots' -SLEEP = 60 -POST_DESCRIPTION = True -DESCRIPTION_FORMAT = "{}" - -# main procedure -def run_bot(): - sub = r.subreddit(SUBREDDIT) - t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=0) - - log.info("Start bot for subreddit %s", SUBREDDIT) - while True: - sources = ["https://www.shortscience.org/rss.xml"] - try: - log.info("check sources") - newArticles = [] - for source in sources: - newArticles.extend(RSSReader.get_new_articles(source)) - - for article in newArticles: - title, url, desc, id, dt = article - - if dt < t0: - # skip older ones - # print(f"skipping older article {title}, {id} {dt-t0}") - continue - - if cache.get(id) and cache.get(id) is 'T': - # print(f"skipping done article {title}, {id}") - continue - else: - cache.set(id, 'T') - print('posting', id) - try: - # submission = sub.submit(title, url=url, resubmit=RESUBMIT_ANYWAYS, send_replies=False) - submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False) - if POST_DESCRIPTION and desc is not None: - submission.reply(DESCRIPTION_FORMAT.format(desc)) - except praw.exceptions.PRAWException as e: - log.error("could not submit %s", e) - else: - log.info("submit article %s", article) - - # Allows the bot to exit on ^C, all other exceptions are ignored - except KeyboardInterrupt: - return 0 - break - except Exception as e: - log.error("Exception %s", e, exc_info=True) - - # write_config_done(done) - log.info("sleep for %s s", SLEEP) - time.sleep(SLEEP) - - # write_config_done(done) - - -if __name__ == "__main__": - run_bot()