From 71908d185a20ab6a2c7b350c5d7204558a9ba75f Mon Sep 17 00:00:00 2001 From: wassname Date: Wed, 12 Feb 2020 07:28:13 +0800 Subject: [PATCH] working well except for reposts due to non-persistent cache --- arxiv_ml_xpost_bot.py | 42 +++++++----------------------------------- rss_poster_bot.py | 5 +++-- 2 files changed, 10 insertions(+), 37 deletions(-) diff --git a/arxiv_ml_xpost_bot.py b/arxiv_ml_xpost_bot.py index b1a2fad..1f73ff8 100644 --- a/arxiv_ml_xpost_bot.py +++ b/arxiv_ml_xpost_bot.py @@ -28,42 +28,21 @@ subreddits = [ # r.subreddit('reinforcementlearning') # r.subreddit('LanguageTechnology') ] -target_subreddit = r.subreddit('mlresearch') -target_subreddit = r.subreddit('testingground4bots') +# target_subreddit = r.subreddit('mlresearch') +# target_subreddit = r.subreddit('testingground4bots') + +SLEEP = 600 +LIMIT_CHECK=20 if r.read_only == False: print("Connected and running.") -# alreadydone = set() - - -# def scrape_arxiv(arxiv_id): -# url = 'https://arxiv.org/abs/{}'.format(arxiv_id) -# r = requests.get(url) -# soup = bs4.BeautifulSoup(r.text) -# abstract = soup.select('.abstract')[0] -# abstract = html2text.html2text(abstract.decode()).replace('\n', ' ') - -# authors = soup.select('.authors')[0] -# authors = html2text.html2text(authors.decode()).replace('\n', ' ') -# authors = authors.replace('(/', '(http://arxiv.org/') - -# title = soup.select('.title')[0] -# title = html2text.html2text(title.decode()).replace('\n', ' ')[2:] - -# abs_link = u'[Landing Page]({})'.format(url) -# pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id) -# web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id) -# links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link) -# response = '\n\n'.join([title, authors, abstract, links]) -# return response def comment(cache): - # print(time.asctime(), "searching") for subreddit in subreddits: try: - all_posts = subreddit.new(limit=100) + all_posts = 
subreddit.new(limit=LIMIT_CHECK) for post in all_posts: match = ARXIV_URL_RE.search(post.url) if match: @@ -76,16 +55,9 @@ def comment(cache): if cache.get(post.id) and cache.get(post.id) is 'T': print ("Parsed this post already: %s"%(post.permalink)) continue - # for comment in post.comments: - # if str(comment.author) == 'arXiv_abstract_bot': - # break else: xpost(['r/researchml'], post) - # response = scrape_arxiv(arxiv_id) - # post.reply(response) cache.set(post.id, 'T') - # print "Parsed post: %s"%(post.permalink) - # print(arxiv_id, response) time.sleep(10) except Exception as error: logger.error("Failed to scrape") @@ -143,4 +115,4 @@ if __name__ == "__main__": while True: comment(cache) - time.sleep(30) + time.sleep(SLEEP) diff --git a/rss_poster_bot.py b/rss_poster_bot.py index 79d3c51..f7d71e5 100644 --- a/rss_poster_bot.py +++ b/rss_poster_bot.py @@ -17,7 +17,8 @@ r = get_bot() cache = get_memcache_client() SUBREDDIT = 'researchml' # 'testingground4bots' -SLEEP = 60 +SLEEP = 600 +MAX_AGE_DAYS = 1 POST_DESCRIPTION = True DESCRIPTION_FORMAT = "{}" @@ -27,7 +28,7 @@ sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml" def run_bot(sources): sub = r.subreddit(SUBREDDIT) - t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=10) + t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MAX_AGE_DAYS) logger.info("Start bot for subreddit %s", SUBREDDIT) while True: