mirror of
https://github.com/wassname/arXiv_abstract_bot.git
synced 2026-06-27 16:44:23 +08:00
tidies rss and distill
This commit is contained in:
@@ -1,52 +0,0 @@
|
|||||||
import urllib.request
|
|
||||||
import datetime
|
|
||||||
from urllib.error import URLError
|
|
||||||
import html2text
|
|
||||||
import xml.etree.ElementTree as ET
|
|
||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
def _element_text(item, tag):
    """Return the text of child *tag* of *item*, or None if the child is missing or empty."""
    child = item.find(tag)
    return None if child is None else child.text


def get_new_articles(source):
    """Download an RSS feed and return its items as tuples.

    Args:
        source: URL of the feed; passed straight to ``urllib.request.urlopen``.

    Returns:
        A list of ``(title, link, description, guid, pubDate)`` tuples, one per
        ``<item>`` under ``<channel>``:

        * ``title`` falls back to ``link`` when the item has no title text.
        * ``description`` is the html2text rendering of the description, or
          ``None`` when the element is missing or empty.
        * ``guid`` falls back to ``link`` when the item has no guid text.
        * ``pubDate`` is a timezone-aware ``datetime`` parsed from the
          RFC 822 style date string.

        The list is empty when the download fails with ``URLError`` or the
        document has no ``<channel>`` element.
    """
    articles = []
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(source) as response:
            orig_rss = response.read().decode("utf-8")
    except URLError as e:
        print("Error:", e.reason)
        return articles

    channel = ET.fromstring(orig_rss).find("channel")
    if channel is None:
        # Not an RSS 2.0 shaped document: nothing to report.
        return articles

    for item in channel.findall("item"):
        link = item.find("link").text

        title = _element_text(item, "title")
        if title is None:
            print("found no title, will use link")
            title = link

        # An empty <description/> has text None, which html2text cannot take.
        description = _element_text(item, "description")
        if description is not None:
            description = html2text.html2text(description)

        pubDate = item.find("pubDate").text
        pubDate = datetime.datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %z")

        guid = _element_text(item, "guid")
        if guid is None:
            guid = link

        articles.append((title, link, description, guid, pubDate))

    return articles
|
|
||||||
@@ -7,62 +7,56 @@ import time, os
|
|||||||
import bmemcached
|
import bmemcached
|
||||||
import re
|
import re
|
||||||
from prawcore import NotFound
|
from prawcore import NotFound
|
||||||
|
import datetime
|
||||||
|
|
||||||
logger = logging
|
|
||||||
|
from botlib import get_bot, get_memcache_client, get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
|
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
|
||||||
ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
|
ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
|
||||||
ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
|
ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
|
||||||
|
# OPENREVIEW_URL_RE = re.compile(r'openreview.net/./', re.I)
|
||||||
def get_bot():
|
# OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)
|
||||||
PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID')
|
|
||||||
PRAW_CLIENT_SECRET = os.environ.get('PRAW_CLIENT_SECRET')
|
|
||||||
PRAW_PASSWORD = os.environ.get('PRAW_PASSWORD')
|
|
||||||
PRAW_USERNAME = os.environ.get('PRAW_USERNAME')
|
|
||||||
PRAW_USERAGENT = os.environ.get('PRAW_USERAGENT')
|
|
||||||
return praw.Reddit(
|
|
||||||
username=PRAW_USERNAME,
|
|
||||||
password=PRAW_PASSWORD,
|
|
||||||
client_id=PRAW_CLIENT_ID,
|
|
||||||
client_secret=PRAW_CLIENT_SECRET,
|
|
||||||
user_agent=PRAW_USERAGENT
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
r = get_bot()
|
r = get_bot()
|
||||||
|
|
||||||
|
# source subreddits
|
||||||
subreddits = [
|
subreddits = [
|
||||||
r.subreddit('machinelearning'),
|
r.subreddit('machinelearning'),
|
||||||
# r.subreddit('reinforcementlearning')
|
# r.subreddit('reinforcementlearning')
|
||||||
# r.subreddit('LanguageTechnology')
|
# r.subreddit('LanguageTechnology')
|
||||||
]
|
]
|
||||||
target_subreddit = r.subreddit('mlresearch')
|
target_subreddit = r.subreddit('mlresearch')
|
||||||
|
target_subreddit = r.subreddit('testingground4bots')
|
||||||
|
|
||||||
|
|
||||||
if r.read_only == False:
|
if r.read_only == False:
|
||||||
print("Connected and running.")
|
print("Connected and running.")
|
||||||
# alreadydone = set()
|
# alreadydone = set()
|
||||||
|
|
||||||
|
|
||||||
def scrape_arxiv(arxiv_id):
|
# def scrape_arxiv(arxiv_id):
|
||||||
url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
|
# url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
|
||||||
r = requests.get(url)
|
# r = requests.get(url)
|
||||||
soup = bs4.BeautifulSoup(r.text)
|
# soup = bs4.BeautifulSoup(r.text)
|
||||||
abstract = soup.select('.abstract')[0]
|
# abstract = soup.select('.abstract')[0]
|
||||||
abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')
|
# abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')
|
||||||
|
|
||||||
authors = soup.select('.authors')[0]
|
# authors = soup.select('.authors')[0]
|
||||||
authors = html2text.html2text(authors.decode()).replace('\n', ' ')
|
# authors = html2text.html2text(authors.decode()).replace('\n', ' ')
|
||||||
authors = authors.replace('(/', '(http://arxiv.org/')
|
# authors = authors.replace('(/', '(http://arxiv.org/')
|
||||||
|
|
||||||
title = soup.select('.title')[0]
|
# title = soup.select('.title')[0]
|
||||||
title = html2text.html2text(title.decode()).replace('\n', ' ')[2:]
|
# title = html2text.html2text(title.decode()).replace('\n', ' ')[2:]
|
||||||
|
|
||||||
abs_link = u'[Landing Page]({})'.format(url)
|
# abs_link = u'[Landing Page]({})'.format(url)
|
||||||
pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
|
# pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
|
||||||
web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
|
# web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
|
||||||
links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
|
# links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
|
||||||
response = '\n\n'.join([title, authors, abstract, links])
|
# response = '\n\n'.join([title, authors, abstract, links])
|
||||||
return response
|
# return response
|
||||||
|
|
||||||
|
|
||||||
def comment(cache):
|
def comment(cache):
|
||||||
@@ -142,17 +136,6 @@ def xpost(subs, originalpost):
|
|||||||
# where.reply(str(response) + " Make sure to give me karma to prevent that in the future.")
|
# where.reply(str(response) + " Make sure to give me karma to prevent that in the future.")
|
||||||
|
|
||||||
|
|
||||||
def get_memcache_client():
|
|
||||||
# Store IDs of comments that the bot has already replied to.
|
|
||||||
# Read local cache by default
|
|
||||||
|
|
||||||
MEMCACHEDCLOUD_SERVERS = os.environ.get('MEMCACHEDCLOUD_SERVERS')
|
|
||||||
MEMCACHEDCLOUD_USERNAME = os.environ.get('MEMCACHEDCLOUD_USERNAME')
|
|
||||||
MEMCACHEDCLOUD_PASSWORD = os.environ.get('MEMCACHEDCLOUD_PASSWORD')
|
|
||||||
|
|
||||||
client = bmemcached.Client((MEMCACHEDCLOUD_SERVERS,), MEMCACHEDCLOUD_USERNAME,
|
|
||||||
MEMCACHEDCLOUD_PASSWORD)
|
|
||||||
return client
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
import praw
|
||||||
|
import logging
|
||||||
|
import requests
|
||||||
|
import bs4
|
||||||
|
import html2text
|
||||||
|
import time, os
|
||||||
|
import bmemcached
|
||||||
|
import re
|
||||||
|
from prawcore import NotFound
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||||
|
|
||||||
|
def get_logger():
    """Return the ``logging`` logger registered under this module's ``__name__``."""
    return logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def get_memcache_client():
    """Build a ``bmemcached.Client`` from the MEMCACHEDCLOUD_* environment variables.

    Reads ``MEMCACHEDCLOUD_SERVERS``, ``MEMCACHEDCLOUD_USERNAME`` and
    ``MEMCACHEDCLOUD_PASSWORD``; a variable that is not set is passed on as
    ``None``. The servers value is wrapped in a one-element tuple.

    Returns:
        The constructed ``bmemcached.Client``.
    """
    # NOTE(review): per the original comment this cache stores IDs the bot has
    # already handled -- confirm against the callers.
    env = os.environ
    servers = (env.get('MEMCACHEDCLOUD_SERVERS'),)
    username = env.get('MEMCACHEDCLOUD_USERNAME')
    password = env.get('MEMCACHEDCLOUD_PASSWORD')
    return bmemcached.Client(servers, username, password)
|
||||||
|
|
||||||
|
|
||||||
|
def get_bot():
    """Create a ``praw.Reddit`` client from the PRAW_* environment variables.

    Reads ``PRAW_USERNAME``, ``PRAW_PASSWORD``, ``PRAW_CLIENT_ID``,
    ``PRAW_CLIENT_SECRET`` and ``PRAW_USERAGENT``; a variable that is not set
    is passed on as ``None``.

    Returns:
        The constructed ``praw.Reddit`` instance.
    """
    credentials = {
        'username': os.environ.get('PRAW_USERNAME'),
        'password': os.environ.get('PRAW_PASSWORD'),
        'client_id': os.environ.get('PRAW_CLIENT_ID'),
        'client_secret': os.environ.get('PRAW_CLIENT_SECRET'),
        'user_agent': os.environ.get('PRAW_USERAGENT'),
    }
    return praw.Reddit(**credentials)
|
||||||
@@ -1,5 +1,17 @@
|
|||||||
I would like to audocrosspost
|
I would like to autocrosspost
|
||||||
- arxiv
|
- [x] arxiv from r/machinelearning
|
||||||
- openreview
|
- [x] shortscience summaries from shortscience rss feed
|
||||||
|
- [ ] openreview
|
||||||
|
- [ ] openai, googlemin, facebookai, deepmind blogposts
|
||||||
|
|
||||||
and also auto post shortscience summaries from their rss feed.
|
|
||||||
|
# start
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pyenv activate jup3.7.2
|
||||||
|
#memcached?
|
||||||
|
#https://github.com/jaysonsantos/python-binary-memcached
|
||||||
|
source ./vars.sh
|
||||||
|
python ./arxiv_ml_xpost_bot.py &
|
||||||
|
python ./rss_poster_bot.py &
|
||||||
|
```
|
||||||
|
|||||||
@@ -0,0 +1,81 @@
|
|||||||
|
"""
|
||||||
|
Posts new distill.pub articles and shortscience.org summaries to researchml
|
||||||
|
"""
|
||||||
|
import praw
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import datetime
|
||||||
|
import pytz
|
||||||
|
import feedparser
|
||||||
|
import logging
|
||||||
|
from time import mktime
|
||||||
|
from botlib import get_bot, get_memcache_client, get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
r = get_bot()
|
||||||
|
cache = get_memcache_client()
|
||||||
|
|
||||||
|
SUBREDDIT = 'researchml' # 'testingground4bots'
|
||||||
|
SLEEP = 60
|
||||||
|
POST_DESCRIPTION = True
|
||||||
|
DESCRIPTION_FORMAT = "{}"
|
||||||
|
|
||||||
|
# main procedure
|
||||||
|
|
||||||
|
sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"]
|
||||||
|
|
||||||
|
def run_bot(sources):
    """Poll the given feeds forever and post unseen entries to ``SUBREDDIT``.

    Each cycle parses every feed with ``feedparser``, skips entries published
    before the cut-off (10 days before start-up) or already marked in the
    memcache, submits the rest as link posts titled ``'[S] ' + title`` and,
    when ``POST_DESCRIPTION`` is set, replies with the entry summary. The
    loop then sleeps ``SLEEP`` seconds.

    Args:
        sources: iterable of feed URLs.

    Returns:
        0 when interrupted with Ctrl-C; otherwise the loop does not return.
    """
    import calendar  # local import: only needed for the UTC conversion below

    sub = r.subreddit(SUBREDDIT)
    t0 = datetime.datetime.now(pytz.utc) - datetime.timedelta(days=10)

    logger.info("Start bot for subreddit %s", SUBREDDIT)
    while True:
        try:
            logger.info("check sources")
            new_articles = []
            for source in sources:
                # extend, not assign: assigning kept only the last feed's entries.
                new_articles.extend(feedparser.parse(source)['entries'])

            for article in new_articles:
                published = article.get('published_parsed')
                if published is None:
                    # Without a date the age filter cannot be applied; skip
                    # rather than abort the whole pass.
                    logger.warning("skipping undated article %s", article.get('link'))
                    continue
                # published_parsed is a UTC struct_time, so convert with
                # timegm (UTC) instead of mktime (local time).
                dt = datetime.datetime.fromtimestamp(calendar.timegm(published), tz=pytz.utc)
                url = article_id = article['link']
                desc = article.get('summary')
                title = article['title']

                if dt < t0:
                    # skip older ones
                    logger.debug(f"skipping older article '{title}', id='{article_id}' td={dt-t0}")
                    continue

                # Compare by value; identity (`is`) on strings is unreliable.
                if cache.get(article_id) == 'T':
                    logger.info(f"skipping done article '{title}', id='{article_id}'")
                    continue

                logger.info('posting %s', article_id)
                try:
                    submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
                except praw.exceptions.PRAWException as e:
                    logger.exception("could not submit %s", e)
                    continue

                # Mark as done as soon as the submission exists, so a failing
                # reply cannot cause the link to be re-submitted every cycle.
                cache.set(article_id, 'T')
                logger.info("submit article %s", article)

                if POST_DESCRIPTION and desc is not None:
                    try:
                        submission.reply(DESCRIPTION_FORMAT.format(desc))
                    except praw.exceptions.PRAWException as e:
                        logger.exception("could not post description %s", e)

        # Allows the bot to exit on ^C; all other exceptions are logged and
        # the loop carries on with the next cycle.
        except KeyboardInterrupt:
            return 0
        except Exception as e:
            logger.error("Exception %s", e, exc_info=True)

        logger.info("sleep for %s s", SLEEP)
        time.sleep(SLEEP)
|
||||||
|
|
||||||
|
# write_config_done(done)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_bot(sources)
|
||||||
@@ -1,79 +0,0 @@
|
|||||||
"""
|
|
||||||
Posts new shortscience.org summaries to researchml
|
|
||||||
"""
|
|
||||||
import praw
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import datetime
|
|
||||||
import pytz
|
|
||||||
import RSSReader
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from bot import get_bot, get_memcache_client
|
|
||||||
|
|
||||||
# https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py
|
|
||||||
log = logging
|
|
||||||
|
|
||||||
|
|
||||||
r = get_bot()
|
|
||||||
cache = get_memcache_client()
|
|
||||||
|
|
||||||
SUBREDDIT = 'researchml' # 'testingground4bots'
|
|
||||||
SLEEP = 60
|
|
||||||
POST_DESCRIPTION = True
|
|
||||||
DESCRIPTION_FORMAT = "{}"
|
|
||||||
|
|
||||||
# main procedure
|
|
||||||
def run_bot():
    """Poll the shortscience.org feed forever and post unseen items to ``SUBREDDIT``.

    Each cycle fetches the feed through ``RSSReader.get_new_articles``, skips
    items published before the bot started or already marked in the memcache,
    submits the rest as link posts titled ``'[S] ' + title`` and, when
    ``POST_DESCRIPTION`` is set, replies with the item description. The loop
    then sleeps ``SLEEP`` seconds.

    Returns:
        0 when interrupted with Ctrl-C; otherwise the loop does not return.
    """
    sub = r.subreddit(SUBREDDIT)
    # Cut-off is the start-up time: only items published afterwards are posted.
    t0 = datetime.datetime.now(pytz.utc)
    sources = ["https://www.shortscience.org/rss.xml"]

    log.info("Start bot for subreddit %s", SUBREDDIT)
    while True:
        try:
            log.info("check sources")
            new_articles = []
            for source in sources:
                new_articles.extend(RSSReader.get_new_articles(source))

            for article in new_articles:
                title, url, desc, article_id, dt = article

                if dt < t0:
                    # skip older ones
                    continue

                # Compare by value; identity (`is`) on strings is unreliable.
                if cache.get(article_id) == 'T':
                    continue

                print('posting', article_id)
                try:
                    submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
                except praw.exceptions.PRAWException as e:
                    log.error("could not submit %s", e)
                    continue

                # Mark as done only once the submission exists, so a failed
                # submit is retried next cycle and a failed reply cannot
                # trigger a duplicate post.
                cache.set(article_id, 'T')
                log.info("submit article %s", article)

                if POST_DESCRIPTION and desc is not None:
                    try:
                        submission.reply(DESCRIPTION_FORMAT.format(desc))
                    except praw.exceptions.PRAWException as e:
                        log.error("could not post description %s", e)

        # Allows the bot to exit on ^C; all other exceptions are logged and
        # the loop carries on with the next cycle.
        except KeyboardInterrupt:
            return 0
        except Exception as e:
            log.error("Exception %s", e, exc_info=True)

        log.info("sleep for %s s", SLEEP)
        time.sleep(SLEEP)
|
|
||||||
|
|
||||||
# write_config_done(done)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
run_bot()
|
|
||||||
Reference in New Issue
Block a user