mirror of
https://github.com/wassname/arXiv_abstract_bot.git
synced 2026-06-27 15:14:19 +08:00
112 lines
3.6 KiB
Python
112 lines
3.6 KiB
Python
import praw
|
|
import logging
|
|
import requests
|
|
import bs4
|
|
import html2text
|
|
import time, os
|
|
import shelve
|
|
import re
|
|
from prawcore import NotFound
|
|
import datetime
|
|
|
|
|
|
from botlib import get_bot, get_logger
|
|
|
|
# Module-level logger, obtained from the project's botlib helper.
logger = get_logger()
|
|
|
|
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
# Matches old-style ids ("hep-th/9901001") and new-style ids ("1706.03762");
# an optional version suffix ("v2") is captured in its own group.
ARXIV_ID_PATTERN = r"([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?"
# Groups: 1 = id including version, 2 = bare id, 3 = version, 4 = ".pdf".
# The dot in the host name is escaped so that it only matches a literal ".",
# not an arbitrary character (e.g. "arxivXorg/").
ARXIV_URL_RE = re.compile(
    r"arxiv\.org/[^/]+/({})(\.pdf)?".format(ARXIV_ID_PATTERN), re.I
)
# Any URL on openreview.net counts as a match (host dot escaped, as above).
OPENREVIEW_URL_RE = re.compile(r"openreview\.net/", re.I)
# OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)
|
|
|
|
# Bot client returned by botlib's get_bot() (presumably a praw.Reddit
# instance -- confirm against botlib).
r = get_bot()

# source subreddits
subreddits = [
    r.subreddit("machinelearning"),
    r.subreddit("reinforcementlearning"),
    r.subreddit("LanguageTechnology"),
]
# target_subreddit = r.subreddit('mlresearch')
# target_subreddit = r.subreddit('testingground4bots')

# Seconds the main loop sleeps between two polling passes.
SLEEP = 60 * 10
# Number of newest submissions fetched per subreddit on each pass.
LIMIT_CHECK = 20
# A submission must be older than this many seconds to be crossposted.
MIN_SECONDS = 3 * 60 * 60
# A submission's score must be strictly greater than this to be crossposted.
MIN_SCORE = 5

if not r.read_only:
    logger.info("Connected and running.")
|
|
|
|
|
|
def comment():
    """Crosspost qualifying paper links from the source subreddits.

    For every subreddit in the module-level ``subreddits`` list, the newest
    ``LIMIT_CHECK`` submissions are inspected. A submission is crossposted to
    "ResearchML" when its URL matches ``ARXIV_URL_RE`` or
    ``OPENREVIEW_URL_RE``, its score is strictly greater than ``MIN_SCORE``
    and it is older than ``MIN_SECONDS`` seconds.

    Handled post ids and URLs are stored in the ``.arxiv_bot`` shelve file
    (value ``"T"``) so that each post, and each URL shared across subreddits,
    is crossposted only once.

    Any exception raised while processing a subreddit is logged and the
    remaining posts of that subreddit are skipped; the next subreddit is
    still processed.
    """
    with shelve.open(".arxiv_bot") as cache:
        for j, subreddit in enumerate(subreddits):
            try:
                all_posts = subreddit.new(limit=LIMIT_CHECK)
                for i, post in enumerate(all_posts):
                    match = ARXIV_URL_RE.search(
                        post.url
                    ) or OPENREVIEW_URL_RE.search(post.url)

                    # Age of the post in seconds.
                    ts = time.time() - post.created_utc
                    logger.debug(
                        "%s %s %s, match %s, score %s>%s, age %s>%s, url %s",
                        j,
                        i,
                        subreddit,
                        bool(match),
                        post.score,
                        MIN_SCORE,
                        int(ts),
                        MIN_SECONDS,
                        post.url,
                    )
                    if match and (post.score > MIN_SCORE) and (ts > MIN_SECONDS):
                        # Compare by value: values read back from the shelf
                        # are not guaranteed to be the same object as the
                        # literal "T", so an identity test is unreliable.
                        if cache.get(post.id) == "T":
                            logger.debug(
                                "%s %s. Parsed this post already: %s. %s. %s",
                                j,
                                i,
                                post.permalink,
                                post.id,
                                post.url,
                            )
                            continue
                        if cache.get(post.url) == "T":
                            # avoid situations where the same url is in
                            # multiple subreddits
                            logger.debug(
                                "%s %s .Parsed this url already: %s. %s. %s",
                                j,
                                i,
                                post.permalink,
                                post.id,
                                post.url,
                            )
                            continue

                        logger.info(
                            "%s %s. Posting %s %s %s %s %s %s",
                            j,
                            i,
                            post,
                            post.url,
                            post.permalink,
                            post.id,
                            post.score,
                            ts,
                        )
                        post.crosspost("ResearchML")
                        # Record only after the crosspost call returned, so a
                        # failed crosspost is retried on a later pass.
                        cache[post.id] = "T"
                        cache[post.url] = "T"
                        # Long pause after each crosspost.
                        time.sleep(60)
                    # NOTE(review): the original indentation was lost; this
                    # short pause is assumed to run once per inspected post
                    # -- confirm against the upstream file.
                    time.sleep(0.1)
            except Exception as error:
                logger.error("Failed to scrape %s %s", error, subreddit)
                logger.exception(error)
|
|
|
|
|
|
def _poll_forever():
    """Run ``comment()`` in an endless loop, pausing ``SLEEP`` seconds between passes."""
    while True:
        comment()
        logger.debug("Sleeping for %s", SLEEP)
        time.sleep(SLEEP)


if __name__ == "__main__":
    # Script entry point: keep polling until the process is stopped.
    _poll_forever()
|