mirror of
https://github.com/wassname/arXiv_abstract_bot.git
synced 2026-06-27 15:14:19 +08:00
tidies rss and distill
This commit is contained in:
@@ -1,52 +0,0 @@
|
||||
import urllib.request
|
||||
import datetime
|
||||
from urllib.error import URLError
|
||||
import html2text
|
||||
import xml.etree.ElementTree as ET
|
||||
import os
|
||||
|
||||
|
||||
def get_new_articles(source):
    """Fetch an RSS 2.0 feed and return its items as tuples.

    Args:
        source: URL of the feed; it is opened with ``urllib.request.urlopen``.

    Returns:
        A list of ``(title, link, description, guid, pubDate)`` tuples, one
        per ``<item>`` of the feed's ``<channel>``:

        * ``title`` and ``guid`` fall back to the link when the element is
          missing or empty.
        * ``description`` is the ``html2text`` rendering of the item's
          description, or ``None`` when there is none.
        * ``pubDate`` is a timezone-aware ``datetime`` parsed with the
          ``"%a, %d %b %Y %H:%M:%S %z"`` format.

        Items that lack a link or a pubDate are skipped. The list is empty
        when the feed cannot be fetched or contains no ``<channel>``.

    Raises:
        xml.etree.ElementTree.ParseError: if the payload is not valid XML.
        ValueError: if a pubDate does not match the expected format.
    """
    articles = []

    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(source) as response:
            orig_rss = response.read().decode("utf-8")
    except URLError as e:
        print("Error:", e.reason)
        return articles

    channel = ET.fromstring(orig_rss).find("channel")
    if channel is None:
        return articles

    for item in channel.findall("item"):
        link = item.findtext("link")
        pub_date_text = item.findtext("pubDate")
        if not link or not pub_date_text:
            # Both are required downstream (link is the fallback id, the
            # date drives the age filter), so one bad item must not abort
            # the rest of the feed.
            print("skipping item without link or pubDate")
            continue

        title = item.findtext("title")
        if not title:
            print("found no title, will use link")
            title = link

        description = item.findtext("description")
        # findtext() yields None for a missing element and "" for an empty
        # one; neither can be handed to html2text.
        description = html2text.html2text(description) if description else None

        guid = item.findtext("guid") or link

        pubDate = datetime.datetime.strptime(
            pub_date_text.strip(), "%a, %d %b %Y %H:%M:%S %z"
        )

        articles.append((title, link, description, guid, pubDate))

    return articles
|
||||
@@ -7,62 +7,56 @@ import time, os
|
||||
import bmemcached
|
||||
import re
|
||||
from prawcore import NotFound
|
||||
import datetime
|
||||
|
||||
logger = logging
|
||||
|
||||
from botlib import get_bot, get_memcache_client, get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
|
||||
ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
|
||||
ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
|
||||
|
||||
def get_bot():
    """Create the ``praw.Reddit`` client from ``PRAW_*`` environment variables.

    Reads PRAW_USERNAME, PRAW_PASSWORD, PRAW_CLIENT_ID, PRAW_CLIENT_SECRET
    and PRAW_USERAGENT. A variable that is not set is passed on as ``None``.
    """
    env = os.environ.get
    return praw.Reddit(
        username=env('PRAW_USERNAME'),
        password=env('PRAW_PASSWORD'),
        client_id=env('PRAW_CLIENT_ID'),
        client_secret=env('PRAW_CLIENT_SECRET'),
        user_agent=env('PRAW_USERAGENT'),
    )
|
||||
|
||||
# OPENREVIEW_URL_RE = re.compile(r'openreview.net/./', re.I)
|
||||
# OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)
|
||||
|
||||
r = get_bot()
|
||||
|
||||
# source subreddits
|
||||
subreddits = [
|
||||
r.subreddit('machinelearning'),
|
||||
# r.subreddit('reinforcementlearning')
|
||||
# r.subreddit('LanguageTechnology')
|
||||
]
|
||||
target_subreddit = r.subreddit('mlresearch')
|
||||
target_subreddit = r.subreddit('testingground4bots')
|
||||
|
||||
|
||||
if r.read_only == False:
|
||||
print("Connected and running.")
|
||||
# alreadydone = set()
|
||||
|
||||
|
||||
def _first_match_as_text(soup, selector):
    """Render the first element matching *selector* as single-line markdown."""
    element = soup.select(selector)[0]
    return html2text.html2text(element.decode()).replace('\n', ' ')


def scrape_arxiv(arxiv_id):
    """Build the markdown body of a summary comment for an arXiv paper.

    Downloads ``https://arxiv.org/abs/<arxiv_id>`` and converts the first
    ``.title``, ``.authors`` and ``.abstract`` elements of the page to
    markdown with ``html2text``.

    Args:
        arxiv_id: arXiv identifier; it is substituted verbatim into the
            abstract, PDF and arXiv Vanity URLs.

    Returns:
        A string made of title, authors, abstract and a line of links,
        separated by blank lines.

    Raises:
        IndexError: if the page has no element for one of the selectors.
        requests.exceptions.RequestException: if the download fails or
            takes longer than the timeout.
    """
    url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
    # Named `page` so the module-level Reddit client `r` is not shadowed.
    # The timeout keeps a stalled connection from blocking the bot forever.
    page = requests.get(url, timeout=30)
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, so the same page can parse differently between machines.
    soup = bs4.BeautifulSoup(page.text, 'html.parser')

    abstract = _first_match_as_text(soup, '.abstract')

    authors = _first_match_as_text(soup, '.authors')
    # Author links on the page are site-relative; make them absolute.
    authors = authors.replace('(/', '(http://arxiv.org/')

    # Drop the first two characters of the rendered title (presumably the
    # markdown heading marker emitted by html2text - confirm).
    title = _first_match_as_text(soup, '.title')[2:]

    abs_link = u'[Landing Page]({})'.format(url)
    pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
    web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
    links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)

    return '\n\n'.join([title, authors, abstract, links])
|
||||
|
||||
|
||||
def comment(cache):
|
||||
@@ -142,17 +136,6 @@ def xpost(subs, originalpost):
|
||||
# where.reply(str(response) + " Make sure to give me karma to prevent that in the future.")
|
||||
|
||||
|
||||
def get_memcache_client():
    """Return a ``bmemcached.Client`` configured from the environment.

    The client is the store for IDs the bot has already handled. The server
    address and credentials come from MEMCACHEDCLOUD_SERVERS,
    MEMCACHEDCLOUD_USERNAME and MEMCACHEDCLOUD_PASSWORD.
    """
    env = os.environ.get
    # bmemcached takes a sequence of servers; only one is configured.
    servers = (env('MEMCACHEDCLOUD_SERVERS'),)
    return bmemcached.Client(
        servers,
        env('MEMCACHEDCLOUD_USERNAME'),
        env('MEMCACHEDCLOUD_PASSWORD'),
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -0,0 +1,45 @@
|
||||
import praw
|
||||
import logging
|
||||
import requests
|
||||
import bs4
|
||||
import html2text
|
||||
import time, os
|
||||
import bmemcached
|
||||
import re
|
||||
from prawcore import NotFound
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
|
||||
def get_logger():
    """Return the logger named after this module."""
    return logging.getLogger(__name__)
|
||||
|
||||
def get_memcache_client():
    """Create the memcached client used to remember handled items.

    Connection settings are read from the MEMCACHEDCLOUD_SERVERS,
    MEMCACHEDCLOUD_USERNAME and MEMCACHEDCLOUD_PASSWORD environment
    variables.
    """
    server, username, password = (
        os.environ.get('MEMCACHEDCLOUD_' + suffix)
        for suffix in ('SERVERS', 'USERNAME', 'PASSWORD')
    )
    # A one-element tuple: the client expects a collection of servers.
    return bmemcached.Client((server,), username, password)
|
||||
|
||||
|
||||
def get_bot():
    """Return a ``praw.Reddit`` instance built from ``PRAW_*`` env variables.

    Each keyword argument is filled from the environment variable of the
    matching name; unset variables become ``None``.
    """
    env_names = {
        'username': 'PRAW_USERNAME',
        'password': 'PRAW_PASSWORD',
        'client_id': 'PRAW_CLIENT_ID',
        'client_secret': 'PRAW_CLIENT_SECRET',
        'user_agent': 'PRAW_USERAGENT',
    }
    credentials = {
        kwarg: os.environ.get(var) for kwarg, var in env_names.items()
    }
    return praw.Reddit(**credentials)
|
||||
@@ -1,5 +1,17 @@
|
||||
I would like to audocrosspost
|
||||
- arxiv
|
||||
- openreview
|
||||
I would like to autocrosspost
|
||||
- [x] arxiv from r/machinelearning
|
||||
- [x] shortscience summaries from shortscience rss feed
|
||||
- [ ] openreview
|
||||
- [ ] openai, googlemin, facebookai, deepmind blogposts
|
||||
|
||||
and also auto post shortscience summaries from their rss feed.
|
||||
|
||||
# start
|
||||
|
||||
```sh
|
||||
pyenv activate jup3.7.2
|
||||
#memcached?
|
||||
#https://github.com/jaysonsantos/python-binary-memcached
|
||||
source ./vars.sh
|
||||
python ./arxiv_ml_xpost_bot.py &
|
||||
python ./rss_poster_bot.py &
|
||||
```
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
"""
|
||||
Posts new entries from the distill.pub and shortscience.org RSS feeds to r/researchml
|
||||
"""
|
||||
import praw
|
||||
import os
|
||||
import time
|
||||
import datetime
|
||||
import pytz
|
||||
import feedparser
|
||||
import logging
|
||||
from time import mktime
|
||||
from botlib import get_bot, get_memcache_client, get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
r = get_bot()
|
||||
cache = get_memcache_client()
|
||||
|
||||
SUBREDDIT = 'researchml' # 'testingground4bots'
|
||||
SLEEP = 60
|
||||
POST_DESCRIPTION = True
|
||||
DESCRIPTION_FORMAT = "{}"
|
||||
|
||||
# main procedure
|
||||
|
||||
sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"]
|
||||
|
||||
def _entry_published(entry):
    """Return the entry's publication time as an aware UTC datetime, or None."""
    parsed = entry.get('published_parsed')
    if parsed is None:
        return None
    # feedparser reports parsed dates as UTC struct_time values, so build
    # the datetime from the fields directly; time.mktime() would interpret
    # them as local time.
    return datetime.datetime(*parsed[:6], tzinfo=pytz.utc)


def _handle_article(sub, article, t0):
    """Submit one feed entry to *sub* unless it is undated, old or already posted."""
    url = article_id = article['link']
    title = article['title']
    desc = article.get('summary')

    dt = _entry_published(article)
    if dt is None:
        logger.debug("skipping undated article '%s', id='%s'", title, article_id)
        return
    if dt < t0:
        logger.debug("skipping older article '%s', id='%s' td=%s", title, article_id, dt - t0)
        return

    # Compare by value: the cache hands back a fresh string object, so an
    # identity test (`is 'T'`) cannot be relied on.
    if cache.get(article_id) == 'T':
        logger.info("skipping done article '%s', id='%s'", title, article_id)
        return

    logger.info('posting %s', article_id)
    try:
        submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
    except praw.exceptions.PRAWException as e:
        logger.exception("could not submit %s", e)
        return

    # Mark the entry as done as soon as the link post exists; otherwise a
    # failing description reply would make the bot resubmit it every cycle.
    cache.set(article_id, 'T')
    logger.info("submit article %s", article)

    if POST_DESCRIPTION and desc is not None:
        try:
            submission.reply(DESCRIPTION_FORMAT.format(desc))
        except praw.exceptions.PRAWException as e:
            logger.exception("could not post description %s", e)


def run_bot(sources):
    """Poll RSS feeds forever and post unseen entries to ``SUBREDDIT``.

    Every ``SLEEP`` seconds all feeds in *sources* are parsed with
    ``feedparser``. Entries published less than 10 days before start-up
    that are not yet marked in the cache are submitted as link posts titled
    ``'[S] ' + title``. When ``POST_DESCRIPTION`` is set, the entry summary
    is added as a reply.

    Args:
        sources: iterable of feed URLs.

    Returns:
        0 when interrupted with Ctrl-C; otherwise the function never
        returns. Any other exception raised during a cycle is logged and
        the loop continues.
    """
    sub = r.subreddit(SUBREDDIT)
    t0 = datetime.datetime.now(pytz.utc) - datetime.timedelta(days=10)

    logger.info("Start bot for subreddit %s", SUBREDDIT)
    # The outer try also covers the sleep, where the bot spends most of its
    # time, so Ctrl-C always exits cleanly.
    try:
        while True:
            try:
                logger.info("check sources")
                new_articles = []
                for source in sources:
                    # extend, not assign: every feed must contribute entries.
                    new_articles.extend(feedparser.parse(source)['entries'])

                for article in new_articles:
                    _handle_article(sub, article, t0)
            except Exception as e:
                logger.error("Exception %s", e, exc_info=True)

            logger.info("sleep for %s s", SLEEP)
            time.sleep(SLEEP)
    except KeyboardInterrupt:
        return 0
|
||||
|
||||
# write_config_done(done)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_bot(sources)
|
||||
@@ -1,79 +0,0 @@
|
||||
"""
|
||||
Posts new shortscience.org summaries to researchml
|
||||
"""
|
||||
import praw
|
||||
import os
|
||||
import time
|
||||
import datetime
|
||||
import pytz
|
||||
import RSSReader
|
||||
import logging
|
||||
|
||||
from bot import get_bot, get_memcache_client
|
||||
|
||||
# https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py
|
||||
log = logging
|
||||
|
||||
|
||||
r = get_bot()
|
||||
cache = get_memcache_client()
|
||||
|
||||
SUBREDDIT = 'researchml' # 'testingground4bots'
|
||||
SLEEP = 60
|
||||
POST_DESCRIPTION = True
|
||||
DESCRIPTION_FORMAT = "{}"
|
||||
|
||||
# main procedure
|
||||
def run_bot():
    """Poll the shortscience.org RSS feed forever and post new summaries.

    Every ``SLEEP`` seconds the feed is read through
    ``RSSReader.get_new_articles``. Items published after start-up that are
    not yet marked in the cache are submitted to ``SUBREDDIT`` as link
    posts titled ``'[S] ' + title``. When ``POST_DESCRIPTION`` is set, the
    item description is added as a reply.

    Returns:
        0 when interrupted with Ctrl-C; otherwise the function never
        returns. Any other exception raised during a cycle is logged and
        the loop continues.
    """
    sub = r.subreddit(SUBREDDIT)
    t0 = datetime.datetime.now(pytz.utc) - datetime.timedelta(days=0)
    sources = ["https://www.shortscience.org/rss.xml"]

    log.info("Start bot for subreddit %s", SUBREDDIT)
    # The outer try also covers the sleep so Ctrl-C always exits cleanly.
    try:
        while True:
            try:
                log.info("check sources")
                new_articles = []
                for source in sources:
                    new_articles.extend(RSSReader.get_new_articles(source))

                for article in new_articles:
                    title, url, desc, article_id, dt = article

                    if dt < t0:
                        # skip older ones
                        continue

                    # Compare by value; an identity test against the
                    # literal 'T' is not reliable for cached strings.
                    if cache.get(article_id) == 'T':
                        continue

                    print('posting', article_id)
                    try:
                        submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
                        if POST_DESCRIPTION and desc is not None:
                            submission.reply(DESCRIPTION_FORMAT.format(desc))
                    except praw.exceptions.PRAWException as e:
                        log.error("could not submit %s", e)
                    else:
                        # Only mark the item done once it has actually been
                        # posted, so a failed submission is retried later.
                        cache.set(article_id, 'T')
                        log.info("submit article %s", article)
            except Exception as e:
                log.error("Exception %s", e, exc_info=True)

            log.info("sleep for %s s", SLEEP)
            time.sleep(SLEEP)
    except KeyboardInterrupt:
        return 0
|
||||
|
||||
# write_config_done(done)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_bot()
|
||||
Reference in New Issue
Block a user