tidies rss and distill

2026-06-27 16:44:23 +08:00 · 2020-01-24 09:14:51 +08:00
parent 5ff836465c
commit 5d89a12d2a
6 changed files with 169 additions and 179 deletions
@@ -1,52 +0,0 @@
 import urllib.request
 import datetime
 from urllib.error import URLError
 import html2text
 import xml.etree.ElementTree as ET
 import os
 def get_new_articles(source):
 	"""Download an RSS feed and return its items as a list of tuples.
 	Each tuple is ``(title, link, description, guid, pubDate)`` where
 	``title`` and ``guid`` fall back to the link when absent, ``description``
 	is the item description converted with html2text (or None), and
 	``pubDate`` is a timezone-aware datetime parsed from the RFC-822 style
 	``pubDate`` element.
 	Items without a link or without a parseable pubDate are skipped, because
 	callers need both to post and to filter by date. A URLError while
 	fetching is reported on stdout and yields an empty list; XML parse
 	errors still propagate to the caller.
 	"""
 	articles = []
 	try:
 		# The context manager closes the response even if reading/decoding fails.
 		with urllib.request.urlopen(source) as response:
 			orig_rss = response.read().decode("utf-8")
 	except URLError as e:
 		print("Error:", e.reason)
 		return articles
 	channel = ET.fromstring(orig_rss).find("channel")
 	if channel is None:
 		print("found no channel in feed", source)
 		return articles
 	for item in channel.findall("item"):
 		link = item.findtext("link")
 		if not link:
 			print("found no link, skipping item")
 			continue
 		pub_date_text = item.findtext("pubDate")
 		try:
 			pubDate = datetime.datetime.strptime(pub_date_text or "", "%a, %d %b %Y %H:%M:%S %z")
 		except ValueError:
 			print("found no usable pubDate, skipping", link)
 			continue
 		title = item.findtext("title")
 		if not title:
 			print("found no title, will use link")
 			title = link
 		description = item.find("description")
 		if description is not None and description.text is not None:
 			description = html2text.html2text(description.text)
 		else:
 			# An empty <description/> has no text to convert.
 			description = None
 		# Fall back to the link as the unique id when the feed gives no guid.
 		guid = item.findtext("guid") or link
 		articles.append((title, link, description, guid, pubDate))
 	return articles
@@ -7,62 +7,56 @@ import time, os
 import bmemcached
 import re
 from prawcore import NotFound
 import datetime
-logger = logging
+
 from botlib import get_bot, get_memcache_client, get_logger
 logger = get_logger()
 # from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
 ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
 ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
-
+# OPENREVIEW_URL_RE = re.compile(r'openreview.net/./', re.I)
-def get_bot():
+# DISTILL_URL_RE = re.compile(r'distill.pub/./', re.I)
    PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID')
    PRAW_CLIENT_SECRET = os.environ.get('PRAW_CLIENT_SECRET')
    PRAW_PASSWORD = os.environ.get('PRAW_PASSWORD')
    PRAW_USERNAME = os.environ.get('PRAW_USERNAME')
    PRAW_USERAGENT = os.environ.get('PRAW_USERAGENT')
    return praw.Reddit(
        username=PRAW_USERNAME,
        password=PRAW_PASSWORD,
        client_id=PRAW_CLIENT_ID,
        client_secret=PRAW_CLIENT_SECRET,
        user_agent=PRAW_USERAGENT
        )
 # Reddit client built by get_bot(); shared by everything below.
 r = get_bot()
 # source subreddits
 subreddits = [
    r.subreddit('machinelearning'),
    # r.subreddit('reinforcementlearning')
    # r.subreddit('LanguageTechnology')
 ]
 # NOTE(review): the second assignment overrides the first, so the effective
 # target is 'testingground4bots' - confirm this is intended outside testing.
 target_subreddit = r.subreddit('mlresearch')
 target_subreddit = r.subreddit('testingground4bots')
 # Only announce start-up when the client is not in read-only mode.
 if r.read_only == False:
    print("Connected and running.")
 # alreadydone = set()
-def scrape_arxiv(arxiv_id):
+# def scrape_arxiv(arxiv_id):
-    url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
+#     url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
-    r = requests.get(url)
+#     r = requests.get(url)
-    soup = bs4.BeautifulSoup(r.text)
+#     soup = bs4.BeautifulSoup(r.text)
-    abstract = soup.select('.abstract')[0]
+#     abstract = soup.select('.abstract')[0]
-    abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')
+#     abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')
-    authors = soup.select('.authors')[0]
+#     authors = soup.select('.authors')[0]
-    authors = html2text.html2text(authors.decode()).replace('\n', ' ')
+#     authors = html2text.html2text(authors.decode()).replace('\n', ' ')
-    authors = authors.replace('(/', '(http://arxiv.org/')
+#     authors = authors.replace('(/', '(http://arxiv.org/')
-    title = soup.select('.title')[0]
+#     title = soup.select('.title')[0]
-    title =  html2text.html2text(title.decode()).replace('\n', ' ')[2:]
+#     title =  html2text.html2text(title.decode()).replace('\n', ' ')[2:]
-    abs_link = u'[Landing Page]({})'.format(url)
+#     abs_link = u'[Landing Page]({})'.format(url)
-    pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
+#     pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
-    web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
+#     web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
-    links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
+#     links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
-    response = '\n\n'.join([title, authors, abstract, links]) 
+#     response = '\n\n'.join([title, authors, abstract, links]) 
-    return response
+#     return response
 def comment(cache):
@@ -142,17 +136,6 @@ def xpost(subs, originalpost):
        # where.reply(str(response) + " Make sure to give me karma to prevent that in the future.")
 def get_memcache_client():
    """Create a bmemcached client configured from the environment.
    Server, user name and password are read from the MEMCACHEDCLOUD_SERVERS,
    MEMCACHEDCLOUD_USERNAME and MEMCACHEDCLOUD_PASSWORD variables; the cache
    is where the bot stores the IDs it has already handled.
    """
    env = os.environ
    # bmemcached expects a sequence of servers, hence the one-element tuple.
    servers = (env.get('MEMCACHEDCLOUD_SERVERS'),)
    return bmemcached.Client(
        servers,
        env.get('MEMCACHEDCLOUD_USERNAME'),
        env.get('MEMCACHEDCLOUD_PASSWORD'),
    )
 if __name__ == "__main__":
@@ -0,0 +1,45 @@
 import praw
 import logging
 import requests
 import bs4
 import html2text
 import time, os
 import bmemcached
 import re
 from prawcore import NotFound
 import logging
 import sys
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 def get_logger():
    """Return the logger registered under this module's name."""
    return logging.getLogger(__name__)
 def get_memcache_client():
    """Create a bmemcached client configured from the environment.
    Server, user name and password are read from the MEMCACHEDCLOUD_SERVERS,
    MEMCACHEDCLOUD_USERNAME and MEMCACHEDCLOUD_PASSWORD variables; the cache
    is where the bots store the IDs they have already handled.
    """
    env = os.environ
    # bmemcached expects a sequence of servers, hence the one-element tuple.
    servers = (env.get('MEMCACHEDCLOUD_SERVERS'),)
    return bmemcached.Client(
        servers,
        env.get('MEMCACHEDCLOUD_USERNAME'),
        env.get('MEMCACHEDCLOUD_PASSWORD'),
    )
 def get_bot():
    """Build a praw.Reddit client from the PRAW_* environment variables.
    Reads PRAW_USERNAME, PRAW_PASSWORD, PRAW_CLIENT_ID, PRAW_CLIENT_SECRET
    and PRAW_USERAGENT; a variable that is not set is passed on as None.
    """
    env = os.environ
    credentials = {
        'username': env.get('PRAW_USERNAME'),
        'password': env.get('PRAW_PASSWORD'),
        'client_id': env.get('PRAW_CLIENT_ID'),
        'client_secret': env.get('PRAW_CLIENT_SECRET'),
        'user_agent': env.get('PRAW_USERAGENT'),
    }
    return praw.Reddit(**credentials)
@@ -1,5 +1,17 @@
-I would like to audocrosspost
+I would like to autocrosspost
- arxiv
+- [x] arxiv from r/machinelearning
- openreview
+- [x] shortscience summaries from shortscience rss feed
 - [ ] openreview
 - [ ] openai, googlemin, facebookai, deepmind blogposts
-and also auto post shortscience summaries from their rss feed.
+
 # start
 ```sh
 pyenv activate jup3.7.2
 #memcached?
 #https://github.com/jaysonsantos/python-binary-memcached
 source ./vars.sh
 python ./arxiv_ml_xpost_bot.py &
 python ./rss_poster_bot.py &
 ```
@@ -0,0 +1,81 @@
 """
 Posts new distill.pub and shortscience.org RSS entries to researchml
 """
 import praw
 import os
 import time
 import datetime
 import pytz
 import feedparser
 import logging
 from time import mktime
 from botlib import get_bot, get_memcache_client, get_logger
 # Shared helpers from botlib: logger, Reddit client and memcached client.
 logger = get_logger()
 r = get_bot()
 # Holds the ids of entries that were already posted (value 'T').
 cache = get_memcache_client()
 # Subreddit that receives the posts; the trailing name is the test subreddit.
 SUBREDDIT = 'researchml'  # 'testingground4bots'
 # Seconds to wait between two polling rounds.
 SLEEP = 60
 # When true, the entry summary is added as a reply to the new submission.
 POST_DESCRIPTION = True
 # str.format template applied to the summary before it is posted.
 DESCRIPTION_FORMAT = "{}"
 # main procedure
 # RSS feed URLs that run_bot() polls.
 sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"]
 def run_bot(sources):
    """Poll the given RSS feeds forever and post new entries to SUBREDDIT.
    Every SLEEP seconds each feed URL in ``sources`` is parsed with
    feedparser. An entry is submitted as a link post (title prefixed with
    '[S] ') when it was published less than 10 days before the bot started
    and its link is not yet marked 'T' in the memcached cache. The entry is
    marked done only after a successful submission, so failed posts are
    retried on the next round.
    Args:
        sources: iterable of RSS/Atom feed URLs.
    Returns:
        0 when interrupted with Ctrl-C while checking feeds; otherwise the
        function does not return.
    """
    sub = r.subreddit(SUBREDDIT)
    # Entries published more than 10 days before start-up are never posted.
    t0 = datetime.datetime.now(pytz.utc) - datetime.timedelta(days=10)
    logger.info("Start bot for subreddit %s", SUBREDDIT)
    while True:
        try:
            logger.info("check sources")
            for source in sources:
                feed = feedparser.parse(source)
                for article in feed['entries']:
                    # The link doubles as the unique cache key of the entry.
                    url = article_id = article['link']
                    title = article['title']
                    desc = article.get('summary')
                    published = article.get('published_parsed')
                    if published is None:
                        # Without a date the age filter cannot be applied.
                        logger.warning("skipping undated article '%s', id='%s'", title, article_id)
                        continue
                    # published_parsed is a UTC struct_time; build the aware
                    # datetime directly instead of going through mktime(),
                    # which interprets the tuple as local time.
                    dt = datetime.datetime(*published[:6], tzinfo=pytz.utc)
                    if dt < t0:
                        # skip older ones
                        logger.debug("skipping older article '%s', id='%s' td=%s", title, article_id, dt - t0)
                        continue
                    # Compare by value: a string read back from memcached is
                    # not the same object as the literal 'T'.
                    if cache.get(article_id) == 'T':
                        logger.info("skipping done article '%s', id='%s'", title, article_id)
                        continue
                    logger.info('posting %s', article_id)
                    try:
                        submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
                        if POST_DESCRIPTION and desc is not None:
                            submission.reply(DESCRIPTION_FORMAT.format(desc))
                    except praw.exceptions.PRAWException as e:
                        logger.exception("could not submit %s", e)
                    else:
                        cache.set(article_id, 'T')
                        logger.info("submit article %s", article)
        # Allows the bot to exit on ^C, all other exceptions are ignored
        except KeyboardInterrupt:
            return 0
        except Exception as e:
            logger.error("Exception %s", e, exc_info=True)
        logger.info("sleep for %s s", SLEEP)
        time.sleep(SLEEP)
 if __name__ == "__main__":
    run_bot(sources)
@@ -1,79 +0,0 @@
 """
 Posts new shortscience.org summaries to researchml
 """
 import praw
 import os
 import time
 import datetime
 import pytz
 import RSSReader
 import logging
 from bot import get_bot, get_memcache_client
 # https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py
 # The logging module itself is used as the logger (root logger functions).
 log = logging
 # Reddit client and memcached client provided by the bot module.
 r = get_bot()
 # Holds the ids of articles that were already handled (value 'T').
 cache = get_memcache_client()
 # Subreddit that receives the posts; the trailing name is the test subreddit.
 SUBREDDIT = 'researchml'  # 'testingground4bots'
 # Seconds to wait between two polling rounds.
 SLEEP = 60
 # When true, the article description is added as a reply to the submission.
 POST_DESCRIPTION = True
 # str.format template applied to the description before it is posted.
 DESCRIPTION_FORMAT = "{}"
 # main procedure
 def run_bot():
    """Poll the shortscience RSS feed forever and post new summaries.
    Every SLEEP seconds the feed is fetched through
    RSSReader.get_new_articles. An article is submitted as a link post
    (title prefixed with '[S] ') when it was published after the bot started
    and its id is not yet marked 'T' in the memcached cache. The id is
    marked done only after a successful submission, so failed posts are
    retried on the next round.
    Returns:
        0 when interrupted with Ctrl-C while checking feeds; otherwise the
        function does not return.
    """
    sub = r.subreddit(SUBREDDIT)
    # Only articles published after start-up are posted.
    t0 = datetime.datetime.now(pytz.utc)
    log.info("Start bot for subreddit %s", SUBREDDIT)
    # The feed list never changes, so build it once outside the loop.
    sources = ["https://www.shortscience.org/rss.xml"]
    while True:
        try:
            log.info("check sources")
            newArticles = []
            for source in sources:
                newArticles.extend(RSSReader.get_new_articles(source))
            for article in newArticles:
                title, url, desc, article_id, dt = article
                if dt < t0:
                    # skip older ones
                    continue
                # Compare by value: a string read back from memcached is
                # not the same object as the literal 'T'.
                if cache.get(article_id) == 'T':
                    continue
                print('posting', article_id)
                try:
                    submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
                    if POST_DESCRIPTION and desc is not None:
                        submission.reply(DESCRIPTION_FORMAT.format(desc))
                except praw.exceptions.PRAWException as e:
                    log.error("could not submit %s", e)
                else:
                    # Mark as done only once the post actually went through.
                    cache.set(article_id, 'T')
                    log.info("submit article %s", article)
        # Allows the bot to exit on ^C, all other exceptions are ignored
        except KeyboardInterrupt:
            return 0
        except Exception as e:
            log.error("Exception %s", e, exc_info=True)
        log.info("sleep for %s s", SLEEP)
        time.sleep(SLEEP)
 if __name__ == "__main__":
    run_bot()