mirror of
https://github.com/wassname/arXiv_abstract_bot.git
synced 2026-06-27 16:59:36 +08:00
logging and don't double post url
This commit is contained in:
+58
-21
@@ -15,60 +15,97 @@ from botlib import get_bot, get_logger
|
||||
logger = get_logger()
|
||||
|
||||
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
|
||||
ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
|
||||
ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
|
||||
OPENREVIEW_URL_RE = re.compile(r'openreview.net/', re.I)
|
||||
ARXIV_ID_PATTERN = r"([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?"
|
||||
ARXIV_URL_RE = re.compile(
|
||||
r"arxiv.org/[^\/]+/({})(\.pdf)?".format(ARXIV_ID_PATTERN), re.I
|
||||
)
|
||||
OPENREVIEW_URL_RE = re.compile(r"openreview.net/", re.I)
|
||||
# OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)
|
||||
|
||||
r = get_bot()
|
||||
|
||||
# source subreddits
|
||||
subreddits = [
|
||||
r.subreddit('machinelearning'),
|
||||
r.subreddit('reinforcementlearning'),
|
||||
r.subreddit('LanguageTechnology')
|
||||
r.subreddit("machinelearning"),
|
||||
r.subreddit("reinforcementlearning"),
|
||||
r.subreddit("LanguageTechnology"),
|
||||
]
|
||||
# target_subreddit = r.subreddit('mlresearch')
|
||||
# target_subreddit = r.subreddit('testingground4bots')
|
||||
|
||||
SLEEP = 60*10
|
||||
LIMIT_CHECK = 120
|
||||
MIN_SECONDS = 60 * 60
|
||||
SLEEP = 60 * 10
|
||||
LIMIT_CHECK = 20
|
||||
MIN_SECONDS = 3 * 60 * 60
|
||||
MIN_SCORE = 5
|
||||
|
||||
|
||||
if r.read_only == False:
|
||||
print("Connected and running.")
|
||||
logger.info("Connected and running.")
|
||||
|
||||
|
||||
def comment():
|
||||
with shelve.open('.arxiv_bot') as cache:
|
||||
with shelve.open(".arxiv_bot") as cache:
|
||||
for j, subreddit in enumerate(subreddits):
|
||||
try:
|
||||
all_posts = subreddit.new(limit=LIMIT_CHECK)
|
||||
for i, post in enumerate(all_posts):
|
||||
match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search(post.url)
|
||||
match = ARXIV_URL_RE.search(post.url) or OPENREVIEW_URL_RE.search(
|
||||
post.url
|
||||
)
|
||||
|
||||
ts = time.time() - post.created_utc
|
||||
logger.debug(
|
||||
"%s %s %s, match %s, score %s>%s, age %s>%s, url %s",
|
||||
j,
|
||||
i,
|
||||
subreddit,
|
||||
bool(match),
|
||||
post.score,
|
||||
MIN_SCORE,
|
||||
int(ts),
|
||||
MIN_SECONDS,
|
||||
post.url,
|
||||
)
|
||||
if match and (post.score > MIN_SCORE) and (ts > MIN_SECONDS):
|
||||
|
||||
if cache.get(post.id) and cache.get(post.id) is 'T':
|
||||
print(j, i, "Parsed this post already: %s. %s. %s" % (post.permalink, post.id, post.url))
|
||||
if cache.get(post.id) is "T":
|
||||
logger.debug(
|
||||
"%s %s. Parsed this post already: %s. %s. %s"
|
||||
% (j, i, post.permalink, post.id, post.url)
|
||||
)
|
||||
continue
|
||||
elif cache.get(post.url) is "T":
|
||||
# avoid situations where the same url is in multiple subreddits
|
||||
logger.debug(
|
||||
"%s %s .Parsed this url already: %s. %s. %s"
|
||||
% (j, i, post.permalink, post.id, post.url)
|
||||
)
|
||||
continue
|
||||
else:
|
||||
print(j, i, "posting", post, post.url, post.permalink, post.id, post.score, ts)
|
||||
post.crosspost('researchml')
|
||||
# xpost(['r/researchml'], post)
|
||||
cache[post.id] = 'T'
|
||||
logger.info(
|
||||
"%s %s. Posting %s %s %s %s %s %s",
|
||||
j,
|
||||
i,
|
||||
post,
|
||||
post.url,
|
||||
post.permalink,
|
||||
post.id,
|
||||
post.score,
|
||||
ts,
|
||||
)
|
||||
post.crosspost("ResearchML")
|
||||
cache[post.id] = "T"
|
||||
cache[post.url] = "T"
|
||||
time.sleep(60)
|
||||
time.sleep(1)
|
||||
time.sleep(0.1)
|
||||
except Exception as error:
|
||||
logger.error("Failed to scrape")
|
||||
print(error)
|
||||
logger.error(f"Failed to scrape {error} {subreddit}")
|
||||
logger.exception(error)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
while True:
|
||||
comment()
|
||||
logger.debug("Sleeping for %s", SLEEP)
|
||||
time.sleep(SLEEP)
|
||||
|
||||
@@ -15,6 +15,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
|
||||
def get_logger():
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
return logger
|
||||
|
||||
def get_memcache_client(name):
|
||||
|
||||
+1
-1
@@ -25,7 +25,7 @@ DESCRIPTION_FORMAT = "{}"
|
||||
|
||||
# main procedure
|
||||
|
||||
sources = ["https://www.shortscience.org/rss.xml", "https://distill.pub/rss.xml", ]
|
||||
sources = ["https://www.shortscience.org/rss.xml"]#, "https://distill.pub/rss.xml", ]
|
||||
# TODO might be nice to check shortscience votes 'shortscience:votes'
|
||||
|
||||
# note this is what the entries look like
|
||||
|
||||
Reference in New Issue
Block a user