From dc2596460975cc97538d0f65c102a66af3654172 Mon Sep 17 00:00:00 2001
From: wassname <wassname@users.noreply.github.com>
Date: Wed, 6 May 2020 11:19:22 +0800
Subject: [PATCH] logging and don't double post url

---
 arxiv_ml_xpost_bot.py | 81 +++++++++++++++++++++++++++++++------------
 botlib.py             |  1 +
 rss_poster_bot.py     |  2 +-
 3 files changed, 61 insertions(+), 23 deletions(-)

diff --git a/arxiv_ml_xpost_bot.py b/arxiv_ml_xpost_bot.py
index c557604..2a2de84 100644
--- a/arxiv_ml_xpost_bot.py
+++ b/arxiv_ml_xpost_bot.py
@@ -15,60 +15,97 @@ from botlib import get_bot, get_logger
 logger = get_logger()
 
 # from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
-ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
-ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
-OPENREVIEW_URL_RE = re.compile(r'openreview.net/', re.I)
+ARXIV_ID_PATTERN = r"([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?"
+ARXIV_URL_RE = re.compile(
+    r"arxiv.org/[^\/]+/({})(\.pdf)?".format(ARXIV_ID_PATTERN), re.I
+)
+OPENREVIEW_URL_RE = re.compile(r"openreview.net/", re.I)
 # OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)
 
 r = get_bot()
 
 # source subreddits
 subreddits = [
-    r.subreddit('machinelearning'),
-    r.subreddit('reinforcementlearning'),
-    r.subreddit('LanguageTechnology')
+    r.subreddit("machinelearning"),
+    r.subreddit("reinforcementlearning"),
+    r.subreddit("LanguageTechnology"),
 ]
 # target_subreddit = r.subreddit('mlresearch')
 # target_subreddit = r.subreddit('testingground4bots')
 
-SLEEP = 60*10
-LIMIT_CHECK = 120
-MIN_SECONDS = 60 * 60
+SLEEP = 60 * 10
+LIMIT_CHECK = 20
+MIN_SECONDS = 3 * 60 * 60
 MIN_SCORE = 5
 
 
 if r.read_only == False:
-    print("Connected and running.")
+    logger.info("Connected and running.")
 
 
 def comment():
-    with shelve.open('.arxiv_bot') as cache:
+    with shelve.open(".arxiv_bot") as cache:
         for j, subreddit in enumerate(subreddits):
             try:
                 all_posts = subreddit.new(limit=LIMIT_CHECK)
                 for i, post in enumerate(all_posts):
-                    match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search(post.url)
-                    
+                    match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search(
+                        post.url
+                    )
+
                     ts = time.time() - post.created_utc
+                    logger.debug(
+                        "%s %s %s, match %s, score %s>%s, age %s>%s, url %s",
+                        j,
+                        i,
+                        subreddit,
+                        bool(match),
+                        post.score,
+                        MIN_SCORE,
+                        int(ts),
+                        MIN_SECONDS,
+                        post.url,
+                    )
                     if match and (post.score > MIN_SCORE) and (ts > MIN_SECONDS):
 
-                        if cache.get(post.id) and cache.get(post.id) is 'T':
-                            print(j, i, "Parsed this post already: %s. %s. %s" % (post.permalink, post.id, post.url))
+                        if cache.get(post.id) is "T":
+                            logger.debug(
+                                "%s %s. Parsed this post already: %s. %s. %s"
+                                % (j, i, post.permalink, post.id, post.url)
+                            )
+                            continue
+                        elif cache.get(post.url) is "T":
+                            # avoid situations where the same url is in multiple subreddits
+                            logger.debug(
+                                "%s %s .Parsed this url already: %s. %s. %s"
+                                % (j, i, post.permalink, post.id, post.url)
+                            )
                             continue
                         else:
-                            print(j, i, "posting", post, post.url, post.permalink, post.id, post.score, ts)
-                            post.crosspost('researchml')
-                            # xpost(['r/researchml'], post)
-                            cache[post.id] = 'T'
+                            logger.info(
+                                "%s %s. Posting %s %s %s %s %s %s",
+                                j,
+                                i,
+                                post,
+                                post.url,
+                                post.permalink,
+                                post.id,
+                                post.score,
+                                ts,
+                            )
+                            post.crosspost("ResearchML")
+                            cache[post.id] = "T"
+                            cache[post.url] = "T"
                             time.sleep(60)
-                    time.sleep(1)
+                    time.sleep(0.1)
             except Exception as error:
-                logger.error("Failed to scrape")
-                print(error)
+                logger.error(f"Failed to scrape {error} {subreddit}")
+                logger.exception(error)
 
 
 if __name__ == "__main__":
 
     while True:
         comment()
+        logger.debug("Sleeping for %s", SLEEP)
         time.sleep(SLEEP)
diff --git a/botlib.py b/botlib.py
index 976b94a..3975b6d 100644
--- a/botlib.py
+++ b/botlib.py
@@ -15,6 +15,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
 def get_logger():
     logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
     return logger
 
 def get_memcache_client(name):
diff --git a/rss_poster_bot.py b/rss_poster_bot.py
index 01434f6..bd3d193 100644
--- a/rss_poster_bot.py
+++ b/rss_poster_bot.py
@@ -25,7 +25,7 @@ DESCRIPTION_FORMAT = "{}"
 
 # main procedure
 
-sources = ["https://www.shortscience.org/rss.xml", "https://distill.pub/rss.xml", ]
+sources = ["https://www.shortscience.org/rss.xml"]#, "https://distill.pub/rss.xml", ]
 # TODO might be nice to check shortscience votes 'shortscience:votes'
 
 # note this is what the entries look like