logging and don't double post url

2026-06-27 16:59:36 +08:00 · 2020-05-06 11:19:22 +08:00
parent 7d4cdbf7e8
commit dc25964609
3 changed files with 61 additions and 23 deletions
@@ -15,60 +15,97 @@ from botlib import get_bot, get_logger
 logger = get_logger()

 # from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
-ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
-ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
-OPENREVIEW_URL_RE = re.compile(r'openreview.net/', re.I)
+ARXIV_ID_PATTERN = r"([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?"
+ARXIV_URL_RE = re.compile(
+    r"arxiv.org/[^\/]+/({})(\.pdf)?".format(ARXIV_ID_PATTERN), re.I
+)
+OPENREVIEW_URL_RE = re.compile(r"openreview.net/", re.I)
 # OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)

 r = get_bot()

 # source subreddits
 subreddits = [
-    r.subreddit('machinelearning'),
-    r.subreddit('reinforcementlearning'),
-    r.subreddit('LanguageTechnology')
+    r.subreddit("machinelearning"),
+    r.subreddit("reinforcementlearning"),
+    r.subreddit("LanguageTechnology"),
 ]
 # target_subreddit = r.subreddit('mlresearch')
 # target_subreddit = r.subreddit('testingground4bots')

-SLEEP = 60*10
-LIMIT_CHECK = 120
-MIN_SECONDS = 60 * 60
+SLEEP = 60 * 10
+LIMIT_CHECK = 20
+MIN_SECONDS = 3 * 60 * 60
 MIN_SCORE = 5


 if r.read_only == False:
-    print("Connected and running.")
+    logger.info("Connected and running.")


 def comment():
-    with shelve.open('.arxiv_bot') as cache:
+    with shelve.open(".arxiv_bot") as cache:
        for j, subreddit in enumerate(subreddits):
            try:
                all_posts = subreddit.new(limit=LIMIT_CHECK)
                for i, post in enumerate(all_posts):
-                    match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search(post.url)
+                    match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search(
+                        post.url
+                    )

                    ts = time.time() - post.created_utc
+                    logger.debug(
+                        "%s %s %s, match %s, score %s>%s, age %s>%s, url %s",
+                        j,
+                        i,
+                        subreddit,
+                        bool(match),
+                        post.score,
+                        MIN_SCORE,
+                        int(ts),
+                        MIN_SECONDS,
+                        post.url,
+                    )
                    if match and (post.score > MIN_SCORE) and (ts > MIN_SECONDS):

-                        if cache.get(post.id) and cache.get(post.id) is 'T':
-                            print(j, i, "Parsed this post already: %s. %s. %s" % (post.permalink, post.id, post.url))
+                        if cache.get(post.id) is "T":
+                            logger.debug(
+                                "%s %s. Parsed this post already: %s. %s. %s"
+                                % (j, i, post.permalink, post.id, post.url)
+                            )
+                            continue
+                        elif cache.get(post.url) is "T":
+                            # avoid situations where the same url is in multiple subreddits
+                            logger.debug(
+                                "%s %s .Parsed this url already: %s. %s. %s"
+                                % (j, i, post.permalink, post.id, post.url)
+                            )
                            continue
                        else:
-                            print(j, i, "posting", post, post.url, post.permalink, post.id, post.score, ts)
-                            post.crosspost('researchml')
-                            # xpost(['r/researchml'], post)
-                            cache[post.id] = 'T'
+                            logger.info(
+                                "%s %s. Posting %s %s %s %s %s %s",
+                                j,
+                                i,
+                                post,
+                                post.url,
+                                post.permalink,
+                                post.id,
+                                post.score,
+                                ts,
+                            )
+                            post.crosspost("ResearchML")
+                            cache[post.id] = "T"
+                            cache[post.url] = "T"
                            time.sleep(60)
-                    time.sleep(1)
+                    time.sleep(0.1)
            except Exception as error:
-                logger.error("Failed to scrape")
-                print(error)
+                logger.error(f"Failed to scrape {error} {subreddit}")
+                logger.exception(error)


 if __name__ == "__main__":

    while True:
        comment()
+        logger.debug("Sleeping for %s", SLEEP)
        time.sleep(SLEEP)
@@ -15,6 +15,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.INFO)

 def get_logger():
    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
    return logger

 def get_memcache_client(name):
@@ -25,7 +25,7 @@ DESCRIPTION_FORMAT = "{}"

 # main procedure

-sources = ["https://www.shortscience.org/rss.xml", "https://distill.pub/rss.xml", ]
+sources = ["https://www.shortscience.org/rss.xml"]#, "https://distill.pub/rss.xml", ]
 # TODO might be nice to check shortscience votes 'shortscience:votes'

 # note this is what the entries look like