mirror of
https://github.com/wassname/arXiv_abstract_bot.git
synced 2026-06-27 16:44:23 +08:00
use persistent cache "shelve"
This commit is contained in:
@@ -1,3 +1,8 @@
|
||||
.rss_bot.*
|
||||
.arxiv_bot.*
|
||||
.vscode/*
|
||||
vars_copy.sh
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
||||
+25
-25
@@ -4,13 +4,13 @@ import requests
|
||||
import bs4
|
||||
import html2text
|
||||
import time, os
|
||||
import bmemcached
|
||||
import shelve
|
||||
import re
|
||||
from prawcore import NotFound
|
||||
import datetime
|
||||
|
||||
|
||||
from botlib import get_bot, get_memcache_client, get_logger
|
||||
from botlib import get_bot, get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
@@ -39,29 +39,30 @@ if r.read_only == False:
|
||||
print("Connected and running.")
|
||||
|
||||
|
||||
def comment(cache):
|
||||
for subreddit in subreddits:
|
||||
try:
|
||||
all_posts = subreddit.new(limit=LIMIT_CHECK)
|
||||
for post in all_posts:
|
||||
match = ARXIV_URL_RE.search(post.url)
|
||||
if match:
|
||||
arxiv_id = match.group(1)
|
||||
def comment():
|
||||
with shelve.open('.arxiv_bot') as cache:
|
||||
for subreddit in subreddits:
|
||||
try:
|
||||
all_posts = subreddit.new(limit=LIMIT_CHECK)
|
||||
for post in all_posts:
|
||||
match = ARXIV_URL_RE.search(post.url)
|
||||
if match:
|
||||
arxiv_id = match.group(1)
|
||||
|
||||
# crosspost
|
||||
print('found', arxiv_id)
|
||||
|
||||
# crosspost
|
||||
print('found', arxiv_id)
|
||||
|
||||
|
||||
if cache.get(post.id) and cache.get(post.id) is 'T':
|
||||
print ("Parsed this post already: %s"%(post.permalink))
|
||||
continue
|
||||
else:
|
||||
xpost(['r/researchml'], post)
|
||||
cache.set(post.id, 'T')
|
||||
time.sleep(10)
|
||||
except Exception as error:
|
||||
logger.error("Failed to scrape")
|
||||
print(error)
|
||||
if cache.get(post.id) and cache.get(post.id) is 'T':
|
||||
print ("Parsed this post already: %s"%(post.permalink))
|
||||
continue
|
||||
else:
|
||||
xpost(['r/researchml'], post)
|
||||
cache[post.id]='T'
|
||||
time.sleep(10)
|
||||
except Exception as error:
|
||||
logger.error("Failed to scrape")
|
||||
print(error)
|
||||
|
||||
def xpost(subs, originalpost):
|
||||
# originalpost = where.submission
|
||||
@@ -111,8 +112,7 @@ def xpost(subs, originalpost):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cache = get_memcache_client()
|
||||
|
||||
while True:
|
||||
comment(cache)
|
||||
comment()
|
||||
time.sleep(SLEEP)
|
||||
|
||||
@@ -4,7 +4,7 @@ import requests
|
||||
import bs4
|
||||
import html2text
|
||||
import time, os
|
||||
import bmemcached
|
||||
import shelve
|
||||
import re
|
||||
from prawcore import NotFound
|
||||
|
||||
@@ -17,16 +17,8 @@ def get_logger():
|
||||
logger = logging.getLogger(__name__)
|
||||
return logger
|
||||
|
||||
def get_memcache_client():
|
||||
# Store IDs of comments that the bot has already replied to.
|
||||
# Read local cache by default
|
||||
|
||||
MEMCACHEDCLOUD_SERVERS = os.environ.get('MEMCACHEDCLOUD_SERVERS')
|
||||
MEMCACHEDCLOUD_USERNAME = os.environ.get('MEMCACHEDCLOUD_USERNAME')
|
||||
MEMCACHEDCLOUD_PASSWORD = os.environ.get('MEMCACHEDCLOUD_PASSWORD')
|
||||
|
||||
client = bmemcached.Client((MEMCACHEDCLOUD_SERVERS,), MEMCACHEDCLOUD_USERNAME,
|
||||
MEMCACHEDCLOUD_PASSWORD)
|
||||
def get_memcache_client(name):
|
||||
client = shelve.open(name)
|
||||
return client
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ html2text==2018.1.9
|
||||
idna==2.8
|
||||
praw==6.0.0
|
||||
prawcore==1.0.0
|
||||
python-binary-memcached==0.28.0
|
||||
requests==2.21.0
|
||||
six==1.12.0
|
||||
typing==3.6.6
|
||||
|
||||
+45
-44
@@ -8,13 +8,13 @@ import datetime
|
||||
import pytz
|
||||
import feedparser
|
||||
import logging
|
||||
import shelve
|
||||
from time import mktime
|
||||
from botlib import get_bot, get_memcache_client, get_logger
|
||||
from botlib import get_bot, get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
r = get_bot()
|
||||
cache = get_memcache_client()
|
||||
|
||||
SUBREDDIT = 'researchml' # 'testingground4bots'
|
||||
SLEEP = 600
|
||||
@@ -27,53 +27,54 @@ DESCRIPTION_FORMAT = "{}"
|
||||
sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"]
|
||||
|
||||
def run_bot(sources):
|
||||
sub = r.subreddit(SUBREDDIT)
|
||||
t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MAX_AGE_DAYS)
|
||||
with shelve.open('.rss_bot') as cache:
|
||||
sub = r.subreddit(SUBREDDIT)
|
||||
t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MAX_AGE_DAYS)
|
||||
|
||||
logger.info("Start bot for subreddit %s", SUBREDDIT)
|
||||
while True:
|
||||
try:
|
||||
logger.info("check sources")
|
||||
newArticles = []
|
||||
for source in sources:
|
||||
d = feedparser.parse(source)
|
||||
newArticles = d['entries']
|
||||
|
||||
for article in newArticles:
|
||||
dt = datetime.datetime.fromtimestamp(mktime(article['published_parsed'])).replace(tzinfo=pytz.utc)
|
||||
url = id = article['link']
|
||||
desc = article['summary']
|
||||
title= article['title']
|
||||
logger.info("Start bot for subreddit %s", SUBREDDIT)
|
||||
while True:
|
||||
try:
|
||||
logger.info("check sources")
|
||||
newArticles = []
|
||||
for source in sources:
|
||||
d = feedparser.parse(source)
|
||||
newArticles = d['entries']
|
||||
|
||||
for article in newArticles:
|
||||
dt = datetime.datetime.fromtimestamp(mktime(article['published_parsed'])).replace(tzinfo=pytz.utc)
|
||||
url = id = article['link']
|
||||
desc = article['summary']
|
||||
title= article['title']
|
||||
|
||||
if dt < t0:
|
||||
# skip older ones
|
||||
logger.debug(f"skipping older article '{title}', id='{id}' td={dt-t0}")
|
||||
continue
|
||||
|
||||
if cache.get(id) and cache.get(id) is 'T':
|
||||
logger.info(f"skipping done article '{title}', id='{id}'")
|
||||
continue
|
||||
else:
|
||||
logger.info('posting %s', id)
|
||||
try:
|
||||
submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
|
||||
if POST_DESCRIPTION and desc is not None:
|
||||
submission.reply(DESCRIPTION_FORMAT.format(desc))
|
||||
except praw.exceptions.PRAWException as e:
|
||||
logger.exception("could not submit %s", e)
|
||||
if dt < t0:
|
||||
# skip older ones
|
||||
logger.debug(f"skipping older article '{title}', id='{id}' td={dt-t0}")
|
||||
continue
|
||||
|
||||
if cache.get(id) and cache.get(id) is 'T':
|
||||
logger.info(f"skipping done article '{title}', id='{id}'")
|
||||
continue
|
||||
else:
|
||||
cache.set(id, 'T')
|
||||
logger.info("submit article %s", article)
|
||||
logger.info('posting %s', id)
|
||||
try:
|
||||
submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
|
||||
if POST_DESCRIPTION and desc is not None:
|
||||
submission.reply(DESCRIPTION_FORMAT.format(desc))
|
||||
except praw.exceptions.PRAWException as e:
|
||||
logger.exception("could not submit %s", e)
|
||||
else:
|
||||
cache[id] ='T'
|
||||
logger.info("submit article %s", article)
|
||||
|
||||
# Allows the bot to exit on ^C, all other exceptions are ignored
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error("Exception %s", e, exc_info=True)
|
||||
# Allows the bot to exit on ^C, all other exceptions are ignored
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error("Exception %s", e, exc_info=True)
|
||||
|
||||
logger.info("sleep for %s s", SLEEP)
|
||||
time.sleep(SLEEP)
|
||||
logger.info("sleep for %s s", SLEEP)
|
||||
time.sleep(SLEEP)
|
||||
|
||||
# write_config_done(done)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user