shortsciencebot and arxiv xpost working

2026-06-27 16:44:23 +08:00 · 2020-01-24 07:56:43 +08:00
parent f81c100fbf
commit 5ff836465c
4 changed files with 170 additions and 25 deletions
@@ -0,0 +1,52 @@
+import urllib.request
+import datetime
+from urllib.error import URLError
+import html2text
+import xml.etree.ElementTree as ET
+import os
+
+
+def get_new_articles(source):
+	articles = []
+	try:
+		response = urllib.request.urlopen(source)
+		orig_rss = response.read().decode("utf-8")
+		rss = ET.fromstring(orig_rss)
+		channel = rss.find("channel")
+		
+		for item in channel.findall("item"):
+			# Not used anymore
+			# pubDate = item.find("pubDate").text
+			# pubDateConv = mktime(time.strptime(pubDate, PUBDATEFORMAT)))
+			
+			link = item.find("link").text
+			
+			title = item.find("title")
+			
+			if title is not None:
+				title = title.text
+			if title is None:
+				print("found no title, will use link")
+				title = link
+				
+			description = item.find("description")
+			
+			if description is not None:
+				description = html2text.html2text(description.text)
+			
+			guid = item.find("guid")
+
+			pubDate = item.find("pubDate").text
+			pubDate = datetime.datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %z")
+
+			if guid is not None:
+				guid = guid.text
+			if guid is None:
+				#print("found no guid, will use link")
+				guid = link
+			articles.append((title, link, description, guid, pubDate))
+		
+	except URLError as e:
+		print("Error:", e.reason)
+	
+	return articles
@@ -8,6 +8,8 @@ import bmemcached
 import re
 from prawcore import NotFound

+logger = logging
+
 # from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
 ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
 ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
@@ -29,7 +31,11 @@ def get_bot():

 r = get_bot()

-subreddit = r.subreddit('machinelearning')
+subreddits = [
+    r.subreddit('machinelearning'),
+    # r.subreddit('reinforcementlearning')
+    # r.subreddit('LanguageTechnology')
+]
 target_subreddit = r.subreddit('mlresearch')

 if r.read_only == False:
@@ -61,32 +67,35 @@ def scrape_arxiv(arxiv_id):

 def comment(cache):
    # print(time.asctime(), "searching")
-    try:
-        all_posts = subreddit.new(limit=100)
-        for post in all_posts:
-            match = ARXIV_URL_RE.search(post.url)
-            if match:
-                arxiv_id = match.group(1)
+    for subreddit in subreddits:
+        try:
+            all_posts = subreddit.new(limit=100)
+            for post in all_posts:
+                match = ARXIV_URL_RE.search(post.url)
+                if match:
+                    arxiv_id = match.group(1)

-                # crosspost
-                print('found', arxiv_id)
-                xpost(['r/researchml'], post)
+                    # crosspost
+                    print('found', arxiv_id)
+                    

-                # if cache.get(post.id) and cache.get(post.id) is 'T':
-                #     print "Parsed this post already: %s"%(post.permalink)
-                #     continue
-                # for comment in post.comments:
-                #     if str(comment.author) == 'arXiv_abstract_bot':
-                #         break
-                # else:
-                #     response = scrape_arxiv(arxiv_id)
-                #     post.reply(response)
-                #     cache.set(post.id, 'T')
-                #     print "Parsed post: %s"%(post.permalink)
-                #     print(arxiv_id, response)
-                #     time.sleep(10)
-    except Exception as error:
-        print(error)
+                    if cache.get(post.id) and cache.get(post.id) is 'T':
+                        print ("Parsed this post already: %s"%(post.permalink))
+                        continue
+                    # for comment in post.comments:
+                    #     if str(comment.author) == 'arXiv_abstract_bot':
+                    #         break
+                    else:
+                        xpost(['r/researchml'], post)
+                        # response = scrape_arxiv(arxiv_id)
+                        # post.reply(response)
+                        cache.set(post.id, 'T')
+                    #     print "Parsed post: %s"%(post.permalink)
+                    #     print(arxiv_id, response)
+                        time.sleep(10)
+        except Exception as error:
+            logger.error("Failed to scrape")
+            print(error)

 def xpost(subs, originalpost):
    # originalpost = where.submission
@@ -0,0 +1,5 @@
+I would like to auto-crosspost
+- arxiv
+- openreview
+
+and also auto-post ShortScience summaries from their RSS feed.
@@ -0,0 +1,79 @@
+"""
+Posts new shortscience.org summaries to researchml
+"""
+import praw
+import os
+import time
+import datetime
+import pytz
+import RSSReader
+import logging
+
+from bot import get_bot, get_memcache_client
+
+# https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py
+log = logging
+
+
+r = get_bot()
+cache = get_memcache_client()
+
+SUBREDDIT = 'researchml'  # subreddit run_bot submits to; alternative noted here: 'testingground4bots'
+SLEEP = 60  # seconds run_bot sleeps between feed checks
+POST_DESCRIPTION = True  # when True, a non-empty article description is posted as a reply
+DESCRIPTION_FORMAT = "{}"  # str.format template applied to the description before replying
+
+# main procedure
+def run_bot():
+    sub = r.subreddit(SUBREDDIT)
+    t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=0) 
+
+    log.info("Start bot for subreddit %s", SUBREDDIT)
+    while True:
+        sources = ["https://www.shortscience.org/rss.xml"]
+        try:
+            log.info("check sources")
+            newArticles = []
+            for source in sources:
+                newArticles.extend(RSSReader.get_new_articles(source))
+
+            for article in newArticles:
+                title, url, desc, id, dt = article
+
+                if dt < t0:
+                    # skip older ones
+                    # print(f"skipping older article {title}, {id} {dt-t0}")
+                    continue
+                
+                if cache.get(id) and cache.get(id) is 'T':
+                    # print(f"skipping done article {title}, {id}")
+                    continue
+                else:
+                    cache.set(id, 'T')
+                    print('posting', id)
+                    try:
+                        # submission = sub.submit(title, url=url, resubmit=RESUBMIT_ANYWAYS, send_replies=False)
+                        submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
+                        if POST_DESCRIPTION and desc is not None:
+                            submission.reply(DESCRIPTION_FORMAT.format(desc))
+                    except praw.exceptions.PRAWException as e:
+                        log.error("could not submit %s", e)
+                    else:
+                        log.info("submit article %s", article)
+
+        # Allows the bot to exit on ^C, all other exceptions are ignored
+        except KeyboardInterrupt:
+            return 0
+            break
+        except Exception as e:
+            log.error("Exception %s", e, exc_info=True)
+
+        # write_config_done(done)
+        log.info("sleep for %s s", SLEEP)
+        time.sleep(SLEEP)
+
+    # write_config_done(done)
+
+
+if __name__ == "__main__":
+    run_bot()