Add minimum age and vote threshold

This commit is contained in:
wassname
2020-04-10 13:26:16 +08:00
parent 7499212bae
commit 3f614555d2
+23 -8
View File
@@ -18,18 +18,28 @@ r = get_bot()
SUBREDDIT = 'researchml' # 'testingground4bots'
SLEEP = 600
MAX_AGE_DAYS = 1
MIN_AGE_DAYS = 2
MAX_AGE_DAYS = 30
POST_DESCRIPTION = True
DESCRIPTION_FORMAT = "{}"
# main procedure
sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"]
sources = ["https://www.shortscience.org/rss.xml", "https://distill.pub/rss.xml", ]
# TODO: it might be nice to check ShortScience votes (RSS tag 'shortscience:votes'; it appears on parsed entries as 'shortscience_votes')
# note this is what the entries look like
# {'title': 'Visualizing Neural Networks with the Grand Tour', 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'https://distill.pub/rss.xml', 'value': 'Visualizing Neural Networks with the Grand Tour'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://distill.pub/2020/grand-tour'}], 'link': 'https://distill.pub/2020/grand-tour', 'summary': 'By focusing on linear dimensionality reduction, we show how to visualize many dynamic phenomena in neural networks.', 'summary_detail': {'type': 'text/html', 'language': None, 'base': 'https://distill.pub/rss.xml', 'value': 'By focusing on linear dimensionality reduction, we show how to visualize many dynamic phenomena in neural networks.'}, 'id': 'https://distill.pub/2020/grand-tour', 'guidislink': False, 'published': 'Mon, 16 Mar 2020 20:0:0 Z', 'published_parsed': time.struct_time(tm_year=2020, tm_mon=3, tm_mday=16, tm_hour=20, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=76, tm_isdst=0)}
# {'shortscience_arxivid': '1708.09259', 'shortscience_bibtexkey': 'journals/corr/1708.09259', 'shortscience_votes': '2', 'title': 'Efficient Convolutional Network Learning using Parametric Log based Dual-Tree Wavelet ScatterNet', 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'https://www.shortscience.org/rss.xml', 'value': 'Efficient Convolutional Network Learning using Parametric Log based Dual-Tree Wavelet ScatterNet'}, 'authors': [{'name': 'hanoch kremer'}], 'author': 'hanoch kremer', 'author_detail': {'name': 'hanoch kremer'}, 'summary': "ScatterNets incorporates geometric knowledge of images to produce discriminative and invariant (translation and rotation) features i.e. edge information. The same outcome as CNN's first layers hold. So why not replace that first layer/s with an equivalent, fixed, structure and let the optimizer find the best weights for the CNN with its leading-edge removed.\nThe main motivations of the idea of replacing the first convolutional, ReLU and pooling layers of the CNN with a two-layer parametric log-b...", 'summary_detail': {'type': 'text/html', 'language': None, 'base': 'https://www.shortscience.org/rss.xml', 'value': "ScatterNets incorporates geometric knowledge of images to produce discriminative and invariant (translation and rotation) features i.e. edge information. The same outcome as CNN's first layers hold. 
# So why not replace that first layer/s with an equivalent, fixed, structure and let the optimizer find the best weights for the CNN with its leading-edge removed.\nThe main motivations of the idea of replacing the first convolutional, ReLU and pooling layers of the CNN with a two-layer parametric log-b..."}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'http://www.shortscience.org/paper?bibtexKey=journals/corr/1708.09259#hanochkremer'}], 'link': 'http://www.shortscience.org/paper?bibtexKey=journals/corr/1708.09259#hanochkremer', 'id': 'http://www.shortscience.org/paper?bibtexKey=journals/corr/1708.09259#hanochkremer', 'guidislink': False, 'published': 'Thu, 09 Apr 2020 12:05:38 +0000', 'published_parsed': time.struct_time(tm_year=2020, tm_mon=4, tm_mday=9, tm_hour=12, tm_min=5, tm_sec=38, tm_wday=3, tm_yday=100, tm_isdst=0)}
def run_bot(sources):
with shelve.open('.rss_bot') as cache:
sub = r.subreddit(SUBREDDIT)
t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MAX_AGE_DAYS)
t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MAX_AGE_DAYS)
t1 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=MIN_AGE_DAYS)
logger.info("Start bot for subreddit %s", SUBREDDIT)
while True:
@@ -44,10 +54,15 @@ def run_bot(sources):
dt = datetime.datetime.fromtimestamp(mktime(article['published_parsed'])).replace(tzinfo=pytz.utc)
url = id = article['link']
desc = article['summary']
title= article['title']
title = article['title']
votes = int(article.get('shortscience_votes', 99))
if dt < t0:
# skip older ones
if votes < 2:
logger.debug(f"skipping low votes article '{title}', id='{id}' td={dt-t0}, votes={votes}")
continue
if (dt > t0) and (dt < t1):
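# skip articles that are too old or too new (waiting a while lets us avoid posting buggy entries that are removed from the RSS feed within a day)
# NOTE(review): the condition above is true for articles *inside* the MIN_AGE_DAYS..MAX_AGE_DAYS window, i.e. the ones meant to be kept; the intent looks like `dt < t0 or dt > t1` - confirm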
logger.debug(f"skipping older article '{title}', id='{id}' td={dt-t0}")
continue
@@ -70,8 +85,8 @@ def run_bot(sources):
except KeyboardInterrupt:
return 0
break
except Exception as e:
logger.error("Exception %s", e, exc_info=True)
# except Exception as e:
# logger.error("Exception %s", e, exc_info=True)
logger.info("sleep for %s s", SLEEP)
time.sleep(SLEEP)