From dc2596460975cc97538d0f65c102a66af3654172 Mon Sep 17 00:00:00 2001 From: wassname Date: Wed, 6 May 2020 11:19:22 +0800 Subject: [PATCH] logging and don't double postu rl --- arxiv_ml_xpost_bot.py | 81 +++++++++++++++++++++++++++++++------------ botlib.py | 1 + rss_poster_bot.py | 2 +- 3 files changed, 61 insertions(+), 23 deletions(-) diff --git a/arxiv_ml_xpost_bot.py b/arxiv_ml_xpost_bot.py index c557604..2a2de84 100644 --- a/arxiv_ml_xpost_bot.py +++ b/arxiv_ml_xpost_bot.py @@ -15,60 +15,97 @@ from botlib import get_bot, get_logger logger = get_logger() # from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py -ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?' -ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I) -OPENREVIEW_URL_RE = re.compile(r'openreview.net/', re.I) +ARXIV_ID_PATTERN = r"([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?" +ARXIV_URL_RE = re.compile( + r"arxiv.org/[^\/]+/({})(\.pdf)?".format(ARXIV_ID_PATTERN), re.I +) +OPENREVIEW_URL_RE = re.compile(r"openreview.net/", re.I) # OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I) r = get_bot() # source subreddits subreddits = [ - r.subreddit('machinelearning'), - r.subreddit('reinforcementlearning'), - r.subreddit('LanguageTechnology') + r.subreddit("machinelearning"), + r.subreddit("reinforcementlearning"), + r.subreddit("LanguageTechnology"), ] # target_subreddit = r.subreddit('mlresearch') # target_subreddit = r.subreddit('testingground4bots') -SLEEP = 60*10 -LIMIT_CHECK = 120 -MIN_SECONDS = 60 * 60 +SLEEP = 60 * 10 +LIMIT_CHECK = 20 +MIN_SECONDS = 3 * 60 * 60 MIN_SCORE = 5 if r.read_only == False: - print("Connected and running.") + logger.info("Connected and running.") def comment(): - with shelve.open('.arxiv_bot') as cache: + with shelve.open(".arxiv_bot") as cache: for j, subreddit in enumerate(subreddits): try: all_posts = subreddit.new(limit=LIMIT_CHECK) for i, post in 
enumerate(all_posts): - match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search(post.url) - + match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search( + post.url + ) + ts = time.time() - post.created_utc + logger.debug( + "%s %s %s, match %s, score %s>%s, age %s>%s, url %s", + j, + i, + subreddit, + bool(match), + post.score, + MIN_SCORE, + int(ts), + MIN_SECONDS, + post.url, + ) if match and (post.score > MIN_SCORE) and (ts > MIN_SECONDS): - if cache.get(post.id) and cache.get(post.id) is 'T': - print(j, i, "Parsed this post already: %s. %s. %s" % (post.permalink, post.id, post.url)) + if cache.get(post.id) == "T": + logger.debug( + "%s %s. Parsed this post already: %s. %s. %s" + % (j, i, post.permalink, post.id, post.url) + ) + continue + elif cache.get(post.url) == "T": + # avoid situations where the same url is in multiple subreddits + logger.debug( + "%s %s .Parsed this url already: %s. %s. %s" + % (j, i, post.permalink, post.id, post.url) + ) continue else: - print(j, i, "posting", post, post.url, post.permalink, post.id, post.score, ts) - post.crosspost('researchml') - # xpost(['r/researchml'], post) - cache[post.id] = 'T' + logger.info( + "%s %s. 
Posting %s %s %s %s %s %s", + j, + i, + post, + post.url, + post.permalink, + post.id, + post.score, + ts, + ) + post.crosspost("ResearchML") + cache[post.id] = "T" + cache[post.url] = "T" time.sleep(60) - time.sleep(1) + time.sleep(0.1) except Exception as error: - logger.error("Failed to scrape") - print(error) + logger.error(f"Failed to scrape {error} {subreddit}") + logger.exception(error) if __name__ == "__main__": while True: comment() + logger.debug("Sleeping for %s", SLEEP) time.sleep(SLEEP) diff --git a/botlib.py b/botlib.py index 976b94a..3975b6d 100644 --- a/botlib.py +++ b/botlib.py @@ -15,6 +15,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.INFO) def get_logger(): logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) return logger def get_memcache_client(name): diff --git a/rss_poster_bot.py b/rss_poster_bot.py index 01434f6..bd3d193 100644 --- a/rss_poster_bot.py +++ b/rss_poster_bot.py @@ -25,7 +25,7 @@ DESCRIPTION_FORMAT = "{}" # main procedure -sources = ["https://www.shortscience.org/rss.xml", "https://distill.pub/rss.xml", ] +sources = ["https://www.shortscience.org/rss.xml"]#, "https://distill.pub/rss.xml", ] # TODO might be nice to check shortscience votes 'shortscience:votes' # note this is what the entries look like