tidies rss and distill

2026-06-27 15:14:19 +08:00 · 2020-01-24 09:14:51 +08:00
parent 5ff836465c
commit 5d89a12d2a
6 changed files with 169 additions and 179 deletions
@@ -1,52 +0,0 @@
-import urllib.request
-import datetime
-from urllib.error import URLError
-import html2text
-import xml.etree.ElementTree as ET
-import os
-
-
-def get_new_articles(source):
-	articles = []
-	try:
-		response = urllib.request.urlopen(source)
-		orig_rss = response.read().decode("utf-8")
-		rss = ET.fromstring(orig_rss)
-		channel = rss.find("channel")
-		
-		for item in channel.findall("item"):
-			# Not used anymore
-			# pubDate = item.find("pubDate").text
-			# pubDateConv = mktime(time.strptime(pubDate, PUBDATEFORMAT)))
-			
-			link = item.find("link").text
-			
-			title = item.find("title")
-			
-			if title is not None:
-				title = title.text
-			if title is None:
-				print("found no title, will use link")
-				title = link
-				
-			description = item.find("description")
-			
-			if description is not None:
-				description = html2text.html2text(description.text)
-			
-			guid = item.find("guid")
-
-			pubDate = item.find("pubDate").text
-			pubDate = datetime.datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %z")
-
-			if guid is not None:
-				guid = guid.text
-			if guid is None:
-				#print("found no guid, will use link")
-				guid = link
-			articles.append((title, link, description, guid, pubDate))
-		
-	except URLError as e:
-		print("Error:", e.reason)
-	
-	return articles
@@ -7,62 +7,56 @@ import time, os
 import bmemcached
 import re
 from prawcore import NotFound
+import datetime

-logger = logging
+
+from botlib import get_bot, get_memcache_client, get_logger
+
+logger = get_logger()

 # from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
 ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
 ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
-
-def get_bot():
-    PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID')
-    PRAW_CLIENT_SECRET = os.environ.get('PRAW_CLIENT_SECRET')
-    PRAW_PASSWORD = os.environ.get('PRAW_PASSWORD')
-    PRAW_USERNAME = os.environ.get('PRAW_USERNAME')
-    PRAW_USERAGENT = os.environ.get('PRAW_USERAGENT')
-    return praw.Reddit(
-        username=PRAW_USERNAME,
-        password=PRAW_PASSWORD,
-        client_id=PRAW_CLIENT_ID,
-        client_secret=PRAW_CLIENT_SECRET,
-        user_agent=PRAW_USERAGENT
-        )
-
+# OPENREVIEW_URL_RE = re.compile(r'openreview.net/./', re.I)
+# OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)

 r = get_bot()

+# source subreddits
 subreddits = [
    r.subreddit('machinelearning'),
    # r.subreddit('reinforcementlearning')
    # r.subreddit('LanguageTechnology')
 ]
 target_subreddit = r.subreddit('mlresearch')
+# NOTE(review): this rebinding replaces the 'mlresearch' target assigned just
+# above, so every later use of target_subreddit refers to the test subreddit.
+# Presumably a debugging leftover -- confirm before deploying.
+target_subreddit = r.subreddit('testingground4bots')
+

 if r.read_only == False:
    print("Connected and running.")
 # alreadydone = set()


-def scrape_arxiv(arxiv_id):
-    url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
-    r = requests.get(url)
-    soup = bs4.BeautifulSoup(r.text)
-    abstract = soup.select('.abstract')[0]
-    abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')
+# def scrape_arxiv(arxiv_id):
+#     url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
+#     r = requests.get(url)
+#     soup = bs4.BeautifulSoup(r.text)
+#     abstract = soup.select('.abstract')[0]
+#     abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')

-    authors = soup.select('.authors')[0]
-    authors = html2text.html2text(authors.decode()).replace('\n', ' ')
-    authors = authors.replace('(/', '(http://arxiv.org/')
+#     authors = soup.select('.authors')[0]
+#     authors = html2text.html2text(authors.decode()).replace('\n', ' ')
+#     authors = authors.replace('(/', '(http://arxiv.org/')

-    title = soup.select('.title')[0]
-    title =  html2text.html2text(title.decode()).replace('\n', ' ')[2:]
+#     title = soup.select('.title')[0]
+#     title =  html2text.html2text(title.decode()).replace('\n', ' ')[2:]

-    abs_link = u'[Landing Page]({})'.format(url)
-    pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
-    web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
-    links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
-    response = '\n\n'.join([title, authors, abstract, links]) 
-    return response
+#     abs_link = u'[Landing Page]({})'.format(url)
+#     pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
+#     web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
+#     links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
+#     response = '\n\n'.join([title, authors, abstract, links]) 
+#     return response


 def comment(cache):
@@ -142,17 +136,6 @@ def xpost(subs, originalpost):
        # where.reply(str(response) + " Make sure to give me karma to prevent that in the future.")


-def get_memcache_client():
-    # Store IDs of comments that the bot has already replied to.
-    # Read local cache by default
-
-    MEMCACHEDCLOUD_SERVERS = os.environ.get('MEMCACHEDCLOUD_SERVERS')
-    MEMCACHEDCLOUD_USERNAME = os.environ.get('MEMCACHEDCLOUD_USERNAME')
-    MEMCACHEDCLOUD_PASSWORD = os.environ.get('MEMCACHEDCLOUD_PASSWORD')
-
-    client = bmemcached.Client((MEMCACHEDCLOUD_SERVERS,), MEMCACHEDCLOUD_USERNAME,
-                           MEMCACHEDCLOUD_PASSWORD)
-    return client


 if __name__ == "__main__":
@@ -0,0 +1,45 @@
+import praw
+import logging
+import requests
+import bs4
+import html2text
+import time, os
+import bmemcached
+import re
+from prawcore import NotFound
+
+import logging
+import sys
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+def get_logger():
+    """Return the logger named after this module.
+
+    The logging output itself is configured by the module-level
+    ``logging.basicConfig(stream=sys.stdout, level=logging.INFO)`` call,
+    which runs when this module is imported.
+    """
+    logger = logging.getLogger(__name__)
+    return logger
+
+def get_memcache_client():
+    """Build a ``bmemcached.Client`` from environment variables.
+
+    Reads ``MEMCACHEDCLOUD_SERVERS``, ``MEMCACHEDCLOUD_USERNAME`` and
+    ``MEMCACHEDCLOUD_PASSWORD`` from the environment; a variable that is
+    not set is passed on as ``None`` (``os.environ.get`` default).
+
+    Returns:
+        The constructed ``bmemcached.Client``.
+    """
+    # The bots use this cache to remember what they have already handled
+    # (e.g. the RSS poster stores the links of articles it has submitted).
+
+    MEMCACHEDCLOUD_SERVERS = os.environ.get('MEMCACHEDCLOUD_SERVERS')
+    MEMCACHEDCLOUD_USERNAME = os.environ.get('MEMCACHEDCLOUD_USERNAME')
+    MEMCACHEDCLOUD_PASSWORD = os.environ.get('MEMCACHEDCLOUD_PASSWORD')
+
+    # The server setting is wrapped in a one-element tuple as-is.
+    # NOTE(review): presumably a single "host:port" string -- confirm.
+    client = bmemcached.Client((MEMCACHEDCLOUD_SERVERS,), MEMCACHEDCLOUD_USERNAME,
+                           MEMCACHEDCLOUD_PASSWORD)
+    return client
+    
+
+def get_bot():
+    """Create a ``praw.Reddit`` instance from environment variables.
+
+    Reads ``PRAW_CLIENT_ID``, ``PRAW_CLIENT_SECRET``, ``PRAW_PASSWORD``,
+    ``PRAW_USERNAME`` and ``PRAW_USERAGENT`` from the environment and passes
+    them to ``praw.Reddit``; a variable that is not set is passed as ``None``.
+
+    Returns:
+        The constructed ``praw.Reddit`` object.
+    """
+    PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID')
+    PRAW_CLIENT_SECRET = os.environ.get('PRAW_CLIENT_SECRET')
+    PRAW_PASSWORD = os.environ.get('PRAW_PASSWORD')
+    PRAW_USERNAME = os.environ.get('PRAW_USERNAME')
+    PRAW_USERAGENT = os.environ.get('PRAW_USERAGENT')
+    return praw.Reddit(
+        username=PRAW_USERNAME,
+        password=PRAW_PASSWORD,
+        client_id=PRAW_CLIENT_ID,
+        client_secret=PRAW_CLIENT_SECRET,
+        user_agent=PRAW_USERAGENT
+        )
@@ -1,5 +1,17 @@
-I would like to audocrosspost
- arxiv
- openreview
+I would like to autocrosspost
+- [x] arxiv from r/machinelearning
+- [x] shortscience summaries from shortscience rss feed
+- [ ] openreview
+- [ ] openai, googlemin, facebookai, deepmind blogposts

-and also auto post shortscience summaries from their rss feed.
+
+# start
+
+```sh
+pyenv activate jup3.7.2
+#memcached?
+#https://github.com/jaysonsantos/python-binary-memcached
+source ./vars.sh
+python ./arxiv_ml_xpost_bot.py &
+python ./rss_poster_bot.py &
+```
@@ -0,0 +1,81 @@
+"""
+Posts new distill.pub articles and shortscience.org summaries to researchml
+"""
+import praw
+import os
+import time
+import datetime
+import pytz
+import feedparser
+import logging
+from time import mktime
+from botlib import get_bot, get_memcache_client, get_logger
+
+logger = get_logger()
+
+r = get_bot()
+cache = get_memcache_client()
+
+SUBREDDIT = 'researchml'  # 'testingground4bots'
+SLEEP = 60
+POST_DESCRIPTION = True
+DESCRIPTION_FORMAT = "{}"
+
+# main procedure
+
+sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"]
+
+def run_bot(sources):
+    sub = r.subreddit(SUBREDDIT)
+    t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=10) 
+
+    logger.info("Start bot for subreddit %s", SUBREDDIT)
+    while True:
+        try:
+            logger.info("check sources")
+            newArticles = []
+            for source in sources:
+                d = feedparser.parse(source)
+                newArticles = d['entries']
+                    
+                for article in newArticles:
+                    dt = datetime.datetime.fromtimestamp(mktime(article['published_parsed'])).replace(tzinfo=pytz.utc)
+                    url = id = article['link']
+                    desc = article['summary']
+                    title= article['title']
+
+                    if dt < t0:
+                        # skip older ones
+                        logger.debug(f"skipping older article '{title}', id='{id}' td={dt-t0}")
+                        continue
+                    
+                    if cache.get(id) and cache.get(id) is 'T':
+                        logger.info(f"skipping done article '{title}', id='{id}'")
+                        continue
+                    else:
+                        logger.info('posting %s', id)
+                        try:
+                            submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
+                            if POST_DESCRIPTION and desc is not None:
+                                submission.reply(DESCRIPTION_FORMAT.format(desc))
+                        except praw.exceptions.PRAWException as e:
+                            logger.exception("could not submit %s", e)
+                        else:
+                            cache.set(id, 'T')
+                            logger.info("submit article %s", article)
+
+        # Allows the bot to exit on ^C, all other exceptions are ignored
+        except KeyboardInterrupt:
+            return 0
+            break
+        except Exception as e:
+            logger.error("Exception %s", e, exc_info=True)
+
+        logger.info("sleep for %s s", SLEEP)
+        time.sleep(SLEEP)
+
+    # write_config_done(done)
+
+
+if __name__ == "__main__":
+    run_bot(sources)
@@ -1,79 +0,0 @@
-"""
-Posts new shortscience.org summaries to researchml
-"""
-import praw
-import os
-import time
-import datetime
-import pytz
-import RSSReader
-import logging
-
-from bot import get_bot, get_memcache_client
-
-# https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py
-log = logging
-
-
-r = get_bot()
-cache = get_memcache_client()
-
-SUBREDDIT = 'researchml'  # 'testingground4bots'
-SLEEP = 60
-POST_DESCRIPTION = True
-DESCRIPTION_FORMAT = "{}"
-
-# main procedure
-def run_bot():
-    sub = r.subreddit(SUBREDDIT)
-    t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=0) 
-
-    log.info("Start bot for subreddit %s", SUBREDDIT)
-    while True:
-        sources = ["https://www.shortscience.org/rss.xml"]
-        try:
-            log.info("check sources")
-            newArticles = []
-            for source in sources:
-                newArticles.extend(RSSReader.get_new_articles(source))
-
-            for article in newArticles:
-                title, url, desc, id, dt = article
-
-                if dt < t0:
-                    # skip older ones
-                    # print(f"skipping older article {title}, {id} {dt-t0}")
-                    continue
-                
-                if cache.get(id) and cache.get(id) is 'T':
-                    # print(f"skipping done article {title}, {id}")
-                    continue
-                else:
-                    cache.set(id, 'T')
-                    print('posting', id)
-                    try:
-                        # submission = sub.submit(title, url=url, resubmit=RESUBMIT_ANYWAYS, send_replies=False)
-                        submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
-                        if POST_DESCRIPTION and desc is not None:
-                            submission.reply(DESCRIPTION_FORMAT.format(desc))
-                    except praw.exceptions.PRAWException as e:
-                        log.error("could not submit %s", e)
-                    else:
-                        log.info("submit article %s", article)
-
-        # Allows the bot to exit on ^C, all other exceptions are ignored
-        except KeyboardInterrupt:
-            return 0
-            break
-        except Exception as e:
-            log.error("Exception %s", e, exc_info=True)
-
-        # write_config_done(done)
-        log.info("sleep for %s s", SLEEP)
-        time.sleep(SLEEP)
-
-    # write_config_done(done)
-
-
-if __name__ == "__main__":
-    run_bot()