Working well, except for reposts caused by the non-persistent cache

This commit is contained in:
wassname
2020-02-12 07:28:13 +08:00
parent 5d89a12d2a
commit 71908d185a
2 changed files with 10 additions and 37 deletions
+7 -35
View File
@@ -28,42 +28,21 @@ subreddits = [
# r.subreddit('reinforcementlearning')
# r.subreddit('LanguageTechnology')
]
target_subreddit = r.subreddit('mlresearch')
target_subreddit = r.subreddit('testingground4bots')
# target_subreddit = r.subreddit('mlresearch')
# target_subreddit = r.subreddit('testingground4bots')
SLEEP = 600
LIMIT_CHECK=20
if r.read_only == False:
print("Connected and running.")
# alreadydone = set()
# def scrape_arxiv(arxiv_id):
# url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
# r = requests.get(url)
# soup = bs4.BeautifulSoup(r.text)
# abstract = soup.select('.abstract')[0]
# abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')
# authors = soup.select('.authors')[0]
# authors = html2text.html2text(authors.decode()).replace('\n', ' ')
# authors = authors.replace('(/', '(http://arxiv.org/')
# title = soup.select('.title')[0]
# title = html2text.html2text(title.decode()).replace('\n', ' ')[2:]
# abs_link = u'[Landing Page]({})'.format(url)
# pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
# web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
# links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
# response = '\n\n'.join([title, authors, abstract, links])
# return response
def comment(cache):
# print(time.asctime(), "searching")
for subreddit in subreddits:
try:
all_posts = subreddit.new(limit=100)
all_posts = subreddit.new(limit=LIMIT_CHECK)
for post in all_posts:
match = ARXIV_URL_RE.search(post.url)
if match:
@@ -76,16 +55,9 @@ def comment(cache):
if cache.get(post.id) and cache.get(post.id) is 'T':
print ("Parsed this post already: %s"%(post.permalink))
continue
# for comment in post.comments:
# if str(comment.author) == 'arXiv_abstract_bot':
# break
else:
xpost(['r/researchml'], post)
# response = scrape_arxiv(arxiv_id)
# post.reply(response)
cache.set(post.id, 'T')
# print "Parsed post: %s"%(post.permalink)
# print(arxiv_id, response)
time.sleep(10)
except Exception as error:
logger.error("Failed to scrape")
@@ -143,4 +115,4 @@ if __name__ == "__main__":
while True:
comment(cache)
time.sleep(30)
time.sleep(SLEEP)
+3 -2
View File
@@ -17,7 +17,8 @@ r = get_bot()
cache = get_memcache_client()
SUBREDDIT = 'researchml' # 'testingground4bots'
SLEEP = 60
SLEEP = 600
MAX_AGE_DAYS = 1
POST_DESCRIPTION = True
DESCRIPTION_FORMAT = "{}"
@@ -27,7 +28,7 @@ sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"
def run_bot(sources):
sub = r.subreddit(SUBREDDIT)
t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=10)
t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MAX_AGE_DAYS)
logger.info("Start bot for subreddit %s", SUBREDDIT)
while True: