use persistent cache "shelve"

This commit is contained in:
wassname
2020-02-12 07:39:49 +08:00
parent 71908d185a
commit 69ae66fb42
5 changed files with 78 additions and 81 deletions
+5
View File
@@ -1,3 +1,8 @@
.rss_bot.*
.arxiv_bot.*
.vscode/*
vars_copy.sh
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
+25 -25
View File
@@ -4,13 +4,13 @@ import requests
import bs4
import html2text
import time, os
import bmemcached
import shelve
import re
from prawcore import NotFound
import datetime
from botlib import get_bot, get_memcache_client, get_logger
from botlib import get_bot, get_logger
logger = get_logger()
@@ -39,29 +39,30 @@ if r.read_only == False:
print("Connected and running.")
def comment():
    """Crosspost new arXiv submissions found in the watched subreddits.

    Walks the newest ``LIMIT_CHECK`` posts of every entry in ``subreddits``.
    A post whose URL matches ``ARXIV_URL_RE`` and whose id is not yet marked
    in the cache is crossposted to ``r/researchml`` and then marked as done.

    The cache is the ``.arxiv_bot`` shelve file, opened for the duration of
    one call so that handled post ids persist between runs.
    """
    with shelve.open('.arxiv_bot') as cache:
        for subreddit in subreddits:
            try:
                all_posts = subreddit.new(limit=LIMIT_CHECK)
                for post in all_posts:
                    match = ARXIV_URL_RE.search(post.url)
                    if match:
                        arxiv_id = match.group(1)

                        # crosspost
                        print('found', arxiv_id)

                        # Compare by value: `is 'T'` tests object identity,
                        # which is not guaranteed for strings read back
                        # from the shelf.
                        if cache.get(post.id) == 'T':
                            print("Parsed this post already: %s" % (post.permalink))
                            continue
                        else:
                            xpost(['r/researchml'], post)
                            cache[post.id] = 'T'
                            # Pause between crossposts.
                            time.sleep(10)
            except Exception as error:
                # Best effort: a failure in one subreddit must not stop the
                # remaining ones from being scanned.
                logger.error("Failed to scrape")
                print(error)
def xpost(subs, originalpost):
# originalpost = where.submission
@@ -111,8 +112,7 @@ def xpost(subs, originalpost):
if __name__ == "__main__":
    # Poll forever. comment() takes no arguments and manages its own
    # persistent cache, so no cache client is created here.
    while True:
        comment()
        time.sleep(SLEEP)
+3 -11
View File
@@ -4,7 +4,7 @@ import requests
import bs4
import html2text
import time, os
import bmemcached
import shelve
import re
from prawcore import NotFound
@@ -17,16 +17,8 @@ def get_logger():
logger = logging.getLogger(__name__)
return logger
def get_memcache_client(name):
    """Open the persistent cache stored under *name*.

    The cache is used to store IDs of items the bot has already handled.
    Despite the function name, the returned object is a ``shelve`` shelf
    (a dict-like object backed by a file), not a memcached client.

    Args:
        name: base filename handed to ``shelve.open``.

    Returns:
        The opened shelf. The caller is responsible for closing it
        (``close()`` or use as a context manager).
    """
    return shelve.open(name)
-1
View File
@@ -6,7 +6,6 @@ html2text==2018.1.9
idna==2.8
praw==6.0.0
prawcore==1.0.0
python-binary-memcached==0.28.0
requests==2.21.0
six==1.12.0
typing==3.6.6
+45 -44
View File
@@ -8,13 +8,13 @@ import datetime
import pytz
import feedparser
import logging
import shelve
from time import mktime
from botlib import get_bot, get_memcache_client, get_logger
from botlib import get_bot, get_logger
logger = get_logger()
r = get_bot()
cache = get_memcache_client()
SUBREDDIT = 'researchml' # 'testingground4bots'
SLEEP = 600
@@ -27,53 +27,54 @@ DESCRIPTION_FORMAT = "{}"
sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"]
def run_bot(sources):
    """Poll RSS feeds forever and submit unseen articles to the subreddit.

    Each pass parses every feed in *sources*, skips entries published before
    the age cutoff or already marked in the cache, submits the rest as link
    posts titled ``'[S] ' + title`` and, when ``POST_DESCRIPTION`` is set,
    replies with the entry summary. Successfully submitted links are marked
    in the ``.rss_bot`` shelve file so they are not posted again.

    Args:
        sources: iterable of feed URLs passed to ``feedparser.parse``.

    Returns:
        0 when a pass is interrupted with Ctrl-C; otherwise the loop does
        not return.
    """
    with shelve.open('.rss_bot') as cache:
        sub = r.subreddit(SUBREDDIT)
        # The cutoff is computed once, before the polling loop starts.
        t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MAX_AGE_DAYS)

        logger.info("Start bot for subreddit %s", SUBREDDIT)
        while True:
            try:
                logger.info("check sources")
                newArticles = []
                for source in sources:
                    d = feedparser.parse(source)
                    # Accumulate entries from every feed; plain assignment
                    # would discard the list that was just initialised.
                    newArticles.extend(d['entries'])

                for article in newArticles:
                    dt = datetime.datetime.fromtimestamp(mktime(article['published_parsed'])).replace(tzinfo=pytz.utc)
                    # The article link doubles as its cache key.
                    url = article_id = article['link']
                    desc = article['summary']
                    title = article['title']

                    if dt < t0:
                        # skip older ones
                        logger.debug(f"skipping older article '{title}', id='{article_id}' td={dt-t0}")
                        continue

                    # Compare by value, not identity (`is 'T'`).
                    if cache.get(article_id) == 'T':
                        logger.info(f"skipping done article '{title}', id='{article_id}'")
                        continue
                    else:
                        logger.info('posting %s', article_id)
                        try:
                            submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
                            if POST_DESCRIPTION and desc is not None:
                                submission.reply(DESCRIPTION_FORMAT.format(desc))
                        except praw.exceptions.PRAWException as e:
                            logger.exception("could not submit %s", e)
                        else:
                            # Mark as done only after a successful submit.
                            cache[article_id] = 'T'
                            # The shelf stays open for the life of the
                            # process, so flush now rather than at close.
                            cache.sync()
                            logger.info("submit article %s", article)

            # Allows the bot to exit on ^C, all other exceptions are ignored
            except KeyboardInterrupt:
                return 0
            except Exception as e:
                logger.error("Exception %s", e, exc_info=True)

            logger.info("sleep for %s s", SLEEP)
            time.sleep(SLEEP)
# write_config_done(done)