Files
arXiv_abstract_bot/arxiv_ml_xpost_bot.py
2020-05-06 11:19:22 +08:00

112 lines
3.6 KiB
Python

import praw
import logging
import requests
import bs4
import html2text
import time, os
import shelve
import re
from prawcore import NotFound
import datetime
from botlib import get_bot, get_logger
# Module-wide logger supplied by botlib.get_logger(); its handlers and level
# are presumably configured inside botlib -- not visible from this file.
logger = get_logger()
# Identifier pattern borrowed from arxiv-vanity:
# https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
# It accepts old-style ids ("hep-th/9901001") and new-style ids ("1706.03762"),
# each optionally followed by a version suffix ("v5").
ARXIV_ID_PATTERN = r"([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?"

# Finds an arXiv link anywhere inside a URL: the host, exactly one path segment
# (e.g. "abs" or "pdf"), the identifier and an optional ".pdf" suffix.
# Groups: 1 = id including version, 2 = bare id, 3 = version, 4 = ".pdf".
# The dot in the host name is escaped so that only a literal "arxiv.org"
# matches; an unescaped "." would accept any character in that position.
ARXIV_URL_RE = re.compile(
    r"arxiv\.org/[^\/]+/({})(\.pdf)?".format(ARXIV_ID_PATTERN), re.I
)

# Any link to openreview.net qualifies; no identifier is extracted.
OPENREVIEW_URL_RE = re.compile(r"openreview\.net/", re.I)
# Disabled alternative source, kept for reference:
# OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)
# Reddit client object returned by botlib.get_bot(); this file only relies on
# its ``subreddit()`` method and ``read_only`` attribute.
r = get_bot()

# Source subreddits that are scanned for paper links.
subreddits = [
    r.subreddit("machinelearning"),
    r.subreddit("reinforcementlearning"),
    r.subreddit("LanguageTechnology"),
]
# Earlier crosspost targets, kept for reference:
# target_subreddit = r.subreddit('mlresearch')
# target_subreddit = r.subreddit('testingground4bots')

SLEEP = 60 * 10  # seconds to wait between two scan passes
LIMIT_CHECK = 20  # number of newest posts fetched per subreddit and pass
MIN_SECONDS = 3 * 60 * 60  # a post must be older than this many seconds ...
MIN_SCORE = 5  # ... and score strictly above this to be crossposted

# Only announce a working connection when the client is not read-only.
if not r.read_only:
    logger.info("Connected and running.")
def comment(target_subreddit: str = "ResearchML", cache_path: str = ".arxiv_bot") -> None:
    """Crosspost qualifying paper links from the source subreddits.

    For every subreddit in ``subreddits`` the newest ``LIMIT_CHECK`` posts are
    inspected.  A post is crossposted when its URL matches ``ARXIV_URL_RE`` or
    ``OPENREVIEW_URL_RE``, its score is strictly greater than ``MIN_SCORE`` and
    it is more than ``MIN_SECONDS`` seconds old.  Post ids and URLs that were
    crossposted are recorded in a ``shelve`` database so they are skipped on
    later passes, including when the same URL appears in another subreddit.

    Any exception raised while working on a subreddit is logged; the remaining
    posts of that subreddit are skipped for this pass and processing continues
    with the next subreddit.

    Args:
        target_subreddit: Name of the subreddit to crosspost into.
        cache_path: Path of the ``shelve`` database that remembers handled
            post ids and URLs.
    """
    with shelve.open(cache_path) as cache:
        for j, subreddit in enumerate(subreddits):
            try:
                for i, post in enumerate(subreddit.new(limit=LIMIT_CHECK)):
                    match = ARXIV_URL_RE.search(
                        post.url
                    ) or OPENREVIEW_URL_RE.search(post.url)
                    # Age of the post in seconds.
                    ts = time.time() - post.created_utc
                    logger.debug(
                        "%s %s %s, match %s, score %s>%s, age %s>%s, url %s",
                        j,
                        i,
                        subreddit,
                        bool(match),
                        post.score,
                        MIN_SCORE,
                        int(ts),
                        MIN_SECONDS,
                        post.url,
                    )
                    if match and post.score > MIN_SCORE and ts > MIN_SECONDS:
                        # Compare by value: entries read back from the shelf
                        # are freshly unpickled objects, so an identity test
                        # against the literal "T" is not reliable.
                        if cache.get(post.id) == "T":
                            logger.debug(
                                "%s %s. Parsed this post already: %s. %s. %s",
                                j,
                                i,
                                post.permalink,
                                post.id,
                                post.url,
                            )
                            continue
                        if cache.get(post.url) == "T":
                            # Avoid crossposting the same URL again when it
                            # shows up in several subreddits.
                            logger.debug(
                                "%s %s .Parsed this url already: %s. %s. %s",
                                j,
                                i,
                                post.permalink,
                                post.id,
                                post.url,
                            )
                            continue
                        logger.info(
                            "%s %s. Posting %s %s %s %s %s %s",
                            j,
                            i,
                            post,
                            post.url,
                            post.permalink,
                            post.id,
                            post.score,
                            ts,
                        )
                        post.crosspost(target_subreddit)
                        # Mark as handled only after the crosspost succeeded.
                        cache[post.id] = "T"
                        cache[post.url] = "T"
                        # Long pause after each crosspost.
                        time.sleep(60)
                    # Brief pause between posts.
                    # NOTE(review): the nesting level of this pause could not
                    # be recovered from the flattened original; kept per post.
                    time.sleep(0.1)
            except Exception as error:
                # Best-effort: log and move on to the next subreddit.
                logger.error("Failed to scrape %s %s", error, subreddit)
                logger.exception(error)
def _run_forever():
    """Scan the subreddits in an endless loop, pausing SLEEP seconds per pass."""
    while True:
        comment()
        logger.debug("Sleeping for %s", SLEEP)
        time.sleep(SLEEP)


# Script entry point: start polling only when executed directly.
if __name__ == "__main__":
    _run_forever()