logging and don't double post url

This commit is contained in:
wassname
2020-05-06 11:19:22 +08:00
parent 7d4cdbf7e8
commit dc25964609
3 changed files with 61 additions and 23 deletions
+58 -21
View File
@@ -15,60 +15,97 @@ from botlib import get_bot, get_logger
# Module-level configuration for the crossposting bot.
# NOTE(review): this span is a rendered diff hunk with no +/- markers and no
# indentation; statements that appear twice with only quoting/value changes are
# presumably the pre- and post-commit versions — confirm against the repository.
logger = get_logger()
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
# NOTE(review): the "." in "arxiv.org" / "openreview.net" below is unescaped, so
# it matches any character, not only a literal dot.
ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
OPENREVIEW_URL_RE = re.compile(r'openreview.net/', re.I)
ARXIV_ID_PATTERN = r"([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?"
ARXIV_URL_RE = re.compile(
r"arxiv.org/[^\/]+/({})(\.pdf)?".format(ARXIV_ID_PATTERN), re.I
)
OPENREVIEW_URL_RE = re.compile(r"openreview.net/", re.I)
# OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)
# Client handle from botlib; used below via .subreddit(...) and .read_only
# (presumably a PRAW Reddit instance — TODO confirm in botlib).
r = get_bot()
# source subreddits
subreddits = [
r.subreddit('machinelearning'),
r.subreddit('reinforcementlearning'),
r.subreddit('LanguageTechnology')
r.subreddit("machinelearning"),
r.subreddit("reinforcementlearning"),
r.subreddit("LanguageTechnology"),
]
# target_subreddit = r.subreddit('mlresearch')
# target_subreddit = r.subreddit('testingground4bots')
# SLEEP: seconds passed to time.sleep() between passes of comment() in the main loop.
# LIMIT_CHECK: `limit` passed to subreddit.new(), i.e. how many newest posts are examined.
# MIN_SECONDS: a post's age (time.time() - post.created_utc) must exceed this to be crossposted.
# MIN_SCORE: post.score must be strictly greater than this to be crossposted.
SLEEP = 60*10
LIMIT_CHECK = 120
MIN_SECONDS = 60 * 60
SLEEP = 60 * 10
LIMIT_CHECK = 20
MIN_SECONDS = 3 * 60 * 60
MIN_SCORE = 5
# Startup message, emitted only when the client reports it is not read-only
# (print and logger.info are presumably the old and new variants of the same line).
if r.read_only == False:
print("Connected and running.")
logger.info("Connected and running.")
# comment(): one crossposting pass over `subreddits`.
#
# For each subreddit it requests the newest LIMIT_CHECK posts, selects those
# whose URL matches ARXIV_URL_RE or OPENREVIEW_URL_RE and that have
# score > MIN_SCORE and age > MIN_SECONDS, and crossposts the ones not already
# recorded in the ".arxiv_bot" shelve cache. Takes no arguments and has no
# visible return statement. Exceptions are caught and logged rather than
# propagated (the handler logs `subreddit`, so the try presumably wraps one
# subreddit's processing — indentation is lost here, confirm in the repository).
#
# NOTE(review): this span is a rendered diff hunk with no +/- markers and no
# indentation; adjacent near-duplicate statements are presumably the pre- and
# post-commit versions of the same line.
def comment():
with shelve.open('.arxiv_bot') as cache:
with shelve.open(".arxiv_bot") as cache:
for j, subreddit in enumerate(subreddits):
try:
all_posts = subreddit.new(limit=LIMIT_CHECK)
for i, post in enumerate(all_posts):
match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search(post.url)
match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search(
post.url
)
# Age of the post in seconds.
ts = time.time() - post.created_utc
logger.debug(
"%s %s %s, match %s, score %s>%s, age %s>%s, url %s",
j,
i,
subreddit,
bool(match),
post.score,
MIN_SCORE,
int(ts),
MIN_SECONDS,
post.url,
)
if match and (post.score > MIN_SCORE) and (ts > MIN_SECONDS):
# NOTE(review): `is` tests object identity, not string equality. Whether a
# value read back from shelve is the same object as the literal "T" is an
# interpreter detail; `== "T"` is the reliable comparison (applies to all
# three `is` checks below).
if cache.get(post.id) and cache.get(post.id) is 'T':
print(j, i, "Parsed this post already: %s. %s. %s" % (post.permalink, post.id, post.url))
if cache.get(post.id) is "T":
logger.debug(
"%s %s. Parsed this post already: %s. %s. %s"
% (j, i, post.permalink, post.id, post.url)
)
continue
elif cache.get(post.url) is "T":
# avoid situations where the same url is in multiple subreddits
logger.debug(
"%s %s .Parsed this url already: %s. %s. %s"
% (j, i, post.permalink, post.id, post.url)
)
continue
else:
print(j, i, "posting", post, post.url, post.permalink, post.id, post.score, ts)
post.crosspost('researchml')
# xpost(['r/researchml'], post)
cache[post.id] = 'T'
logger.info(
"%s %s. Posting %s %s %s %s %s %s",
j,
i,
post,
post.url,
post.permalink,
post.id,
post.score,
ts,
)
post.crosspost("ResearchML")
# Record both the post id and its URL so the id check and the URL check above
# skip this submission on later passes.
cache[post.id] = "T"
cache[post.url] = "T"
# NOTE(review): three sleep calls are listed back to back; which nesting level
# each belongs to (and which are old vs. new lines) cannot be determined from
# this rendering.
time.sleep(60)
time.sleep(1)
time.sleep(0.1)
except Exception as error:
logger.error("Failed to scrape")
print(error)
logger.error(f"Failed to scrape {error} {subreddit}")
# logger.exception logs at ERROR level and appends the current traceback.
logger.exception(error)
# Script entry point: run comment() repeatedly, forever, pausing SLEEP seconds
# after each pass. There is no exit condition in the loop itself.
if __name__ == "__main__":
while True:
comment()
logger.debug("Sleeping for %s", SLEEP)
time.sleep(SLEEP)
+1
View File
@@ -15,6 +15,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# get_logger(): return the logger named after this module (`__name__` is
# evaluated here, so every caller receives the same logger object) with its
# level set to DEBUG.
# NOTE(review): the hunk header above shows logging.basicConfig(level=logging.INFO)
# for the root logger; this logger's own DEBUG level is what lets debug records
# through — confirm this is the intended verbosity.
def get_logger():
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
return logger
def get_memcache_client(name):
+1 -1
View File
@@ -25,7 +25,7 @@ DESCRIPTION_FORMAT = "{}"
# main procedure
# RSS feed URLs assigned to `sources`.
# NOTE(review): the two assignments are presumably the pre- and post-commit
# versions of one line; the second keeps only the shortscience feed and leaves
# the distill.pub feed commented out.
sources = ["https://www.shortscience.org/rss.xml", "https://distill.pub/rss.xml", ]
sources = ["https://www.shortscience.org/rss.xml"]#, "https://distill.pub/rss.xml", ]
# TODO might be nice to check shortscience votes 'shortscience:votes'
# note this is what the entries look like