use persistent cache "shelve"

2026-06-27 16:44:23 +08:00 · 2020-02-12 07:39:49 +08:00
parent 71908d185a
commit 69ae66fb42
5 changed files with 78 additions and 81 deletions
@@ -1,3 +1,8 @@
+.rss_bot.*
+.arxiv_bot.*
+.vscode/*
+vars_copy.sh
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -4,13 +4,13 @@ import requests
 import bs4
 import html2text
 import time, os
-import bmemcached
+import shelve
 import re
 from prawcore import NotFound
 import datetime


-from botlib import get_bot, get_memcache_client, get_logger
+from botlib import get_bot, get_logger

 logger = get_logger()

@@ -39,29 +39,30 @@ if r.read_only == False:
    print("Connected and running.")


-def comment(cache):
-    for subreddit in subreddits:
-        try:
-            all_posts = subreddit.new(limit=LIMIT_CHECK)
-            for post in all_posts:
-                match = ARXIV_URL_RE.search(post.url)
-                if match:
-                    arxiv_id = match.group(1)
+def comment():
+    with shelve.open('.arxiv_bot') as cache:
+        for subreddit in subreddits:
+            try:
+                all_posts = subreddit.new(limit=LIMIT_CHECK)
+                for post in all_posts:
+                    match = ARXIV_URL_RE.search(post.url)
+                    if match:
+                        arxiv_id = match.group(1)

-                    # crosspost
-                    print('found', arxiv_id)
-                    
+                        # crosspost
+                        print('found', arxiv_id)
+                        

-                    if cache.get(post.id) and cache.get(post.id) is 'T':
-                        print ("Parsed this post already: %s"%(post.permalink))
-                        continue
-                    else:
-                        xpost(['r/researchml'], post)
-                        cache.set(post.id, 'T')
-                        time.sleep(10)
-        except Exception as error:
-            logger.error("Failed to scrape")
-            print(error)
+                        if cache.get(post.id) and cache.get(post.id) is 'T':
+                            print ("Parsed this post already: %s"%(post.permalink))
+                            continue
+                        else:
+                            xpost(['r/researchml'], post)
+                            cache[post.id]='T'
+                            time.sleep(10)
+            except Exception as error:
+                logger.error("Failed to scrape")
+                print(error)

 def xpost(subs, originalpost):
    # originalpost = where.submission
@@ -111,8 +112,7 @@ def xpost(subs, originalpost):


 if __name__ == "__main__":
-    cache = get_memcache_client()

    while True:
-        comment(cache)
+        comment()
        time.sleep(SLEEP)
@@ -4,7 +4,7 @@ import requests
 import bs4
 import html2text
 import time, os
-import bmemcached
+import shelve
 import re
 from prawcore import NotFound

@@ -17,16 +17,8 @@ def get_logger():
    logger = logging.getLogger(__name__)
    return logger

-def get_memcache_client():
-    # Store IDs of comments that the bot has already replied to.
-    # Read local cache by default
-
-    MEMCACHEDCLOUD_SERVERS = os.environ.get('MEMCACHEDCLOUD_SERVERS')
-    MEMCACHEDCLOUD_USERNAME = os.environ.get('MEMCACHEDCLOUD_USERNAME')
-    MEMCACHEDCLOUD_PASSWORD = os.environ.get('MEMCACHEDCLOUD_PASSWORD')
-
-    client = bmemcached.Client((MEMCACHEDCLOUD_SERVERS,), MEMCACHEDCLOUD_USERNAME,
-                           MEMCACHEDCLOUD_PASSWORD)
+def get_memcache_client(name):
+    client = shelve.open(name)
    return client
    

@@ -6,7 +6,6 @@ html2text==2018.1.9
 idna==2.8
 praw==6.0.0
 prawcore==1.0.0
-python-binary-memcached==0.28.0
 requests==2.21.0
 six==1.12.0
 typing==3.6.6
@@ -8,13 +8,13 @@ import datetime
 import pytz
 import feedparser
 import logging
+import shelve
 from time import mktime
-from botlib import get_bot, get_memcache_client, get_logger
+from botlib import get_bot, get_logger

 logger = get_logger()

 r = get_bot()
-cache = get_memcache_client()

 SUBREDDIT = 'researchml'  # 'testingground4bots'
 SLEEP = 600
@@ -27,53 +27,54 @@ DESCRIPTION_FORMAT = "{}"
 sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"]

 def run_bot(sources):
-    sub = r.subreddit(SUBREDDIT)
-    t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MAX_AGE_DAYS) 
+    with shelve.open('.rss_bot') as cache:
+        sub = r.subreddit(SUBREDDIT)
+        t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MAX_AGE_DAYS) 

-    logger.info("Start bot for subreddit %s", SUBREDDIT)
-    while True:
-        try:
-            logger.info("check sources")
-            newArticles = []
-            for source in sources:
-                d = feedparser.parse(source)
-                newArticles = d['entries']
-                    
-                for article in newArticles:
-                    dt = datetime.datetime.fromtimestamp(mktime(article['published_parsed'])).replace(tzinfo=pytz.utc)
-                    url = id = article['link']
-                    desc = article['summary']
-                    title= article['title']
+        logger.info("Start bot for subreddit %s", SUBREDDIT)
+        while True:
+            try:
+                logger.info("check sources")
+                newArticles = []
+                for source in sources:
+                    d = feedparser.parse(source)
+                    newArticles = d['entries']
+                        
+                    for article in newArticles:
+                        dt = datetime.datetime.fromtimestamp(mktime(article['published_parsed'])).replace(tzinfo=pytz.utc)
+                        url = id = article['link']
+                        desc = article['summary']
+                        title= article['title']

-                    if dt < t0:
-                        # skip older ones
-                        logger.debug(f"skipping older article '{title}', id='{id}' td={dt-t0}")
-                        continue
-                    
-                    if cache.get(id) and cache.get(id) is 'T':
-                        logger.info(f"skipping done article '{title}', id='{id}'")
-                        continue
-                    else:
-                        logger.info('posting %s', id)
-                        try:
-                            submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
-                            if POST_DESCRIPTION and desc is not None:
-                                submission.reply(DESCRIPTION_FORMAT.format(desc))
-                        except praw.exceptions.PRAWException as e:
-                            logger.exception("could not submit %s", e)
+                        if dt < t0:
+                            # skip older ones
+                            logger.debug(f"skipping older article '{title}', id='{id}' td={dt-t0}")
+                            continue
+                        
+                        if cache.get(id) and cache.get(id) is 'T':
+                            logger.info(f"skipping done article '{title}', id='{id}'")
+                            continue
                        else:
-                            cache.set(id, 'T')
-                            logger.info("submit article %s", article)
+                            logger.info('posting %s', id)
+                            try:
+                                submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
+                                if POST_DESCRIPTION and desc is not None:
+                                    submission.reply(DESCRIPTION_FORMAT.format(desc))
+                            except praw.exceptions.PRAWException as e:
+                                logger.exception("could not submit %s", e)
+                            else:
+                                cache[id] ='T'
+                                logger.info("submit article %s", article)

-        # Allows the bot to exit on ^C, all other exceptions are ignored
-        except KeyboardInterrupt:
-            return 0
-            break
-        except Exception as e:
-            logger.error("Exception %s", e, exc_info=True)
+            # Allows the bot to exit on ^C, all other exceptions are ignored
+            except KeyboardInterrupt:
+                return 0
+                break
+            except Exception as e:
+                logger.error("Exception %s", e, exc_info=True)

-        logger.info("sleep for %s s", SLEEP)
-        time.sleep(SLEEP)
+            logger.info("sleep for %s s", SLEEP)
+            time.sleep(SLEEP)

    # write_config_done(done)