tidies rss and distill

This commit is contained in:
wassname
2020-01-24 09:14:51 +08:00
parent 5ff836465c
commit 5d89a12d2a
6 changed files with 169 additions and 179 deletions
-52
View File
@@ -1,52 +0,0 @@
import urllib.request
import datetime
from urllib.error import URLError
import html2text
import xml.etree.ElementTree as ET
import os
def get_new_articles(source):
    """Download the RSS feed at *source* and return its items.

    Args:
        source: URL of an RSS 2.0 feed, handed to ``urllib.request.urlopen``.

    Returns:
        A list of ``(title, link, description, guid, pubDate)`` tuples, one
        per ``<item>`` of the feed's ``<channel>``:

        * ``title`` and ``guid`` fall back to the item's link when missing.
        * ``description`` is the html2text rendering of the item's
          description, or ``None`` when the element is missing or empty.
        * ``pubDate`` is a timezone-aware ``datetime`` parsed from the
          ``"%a, %d %b %Y %H:%M:%S %z"`` format.

        The list is empty when the URL cannot be opened; the ``URLError``
        reason is printed instead of being raised.
    """
    articles = []
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(source) as response:
            orig_rss = response.read().decode("utf-8")
    except URLError as e:
        print("Error:", e.reason)
        return articles

    channel = ET.fromstring(orig_rss).find("channel")
    for item in channel.findall("item"):
        link = item.find("link").text

        title_el = item.find("title")
        title = title_el.text if title_el is not None else None
        if title is None:
            print("found no title, will use link")
            title = link

        # An empty <description/> has text None, which html2text cannot
        # convert; treat it the same as a missing description.
        description = None
        description_el = item.find("description")
        if description_el is not None and description_el.text is not None:
            description = html2text.html2text(description_el.text)

        guid_el = item.find("guid")
        guid = guid_el.text if guid_el is not None else None
        if guid is None:
            guid = link

        pubDate = datetime.datetime.strptime(
            item.find("pubDate").text, "%a, %d %b %Y %H:%M:%S %z"
        )
        articles.append((title, link, description, guid, pubDate))
    return articles
+27 -44
View File
@@ -7,62 +7,56 @@ import time, os
import bmemcached
import re
from prawcore import NotFound
import datetime
logger = logging
from botlib import get_bot, get_memcache_client, get_logger
logger = get_logger()
# from https://github.com/arxiv-vanity/arxiv-vanity/blob/master/arxiv_vanity/scraper/arxiv_ids.py
ARXIV_ID_PATTERN = r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}|\d+\.\d+)(v\d+)?'
ARXIV_URL_RE = re.compile(r'arxiv.org/[^\/]+/({})(\.pdf)?'.format(ARXIV_ID_PATTERN), re.I)
def get_bot():
# Build a praw.Reddit client from the PRAW_* environment variables.
# Each value is read with os.environ.get, so an unset variable is passed
# to praw.Reddit as None rather than raising here.
PRAW_CLIENT_ID = os.environ.get('PRAW_CLIENT_ID')
PRAW_CLIENT_SECRET = os.environ.get('PRAW_CLIENT_SECRET')
PRAW_PASSWORD = os.environ.get('PRAW_PASSWORD')
PRAW_USERNAME = os.environ.get('PRAW_USERNAME')
PRAW_USERAGENT = os.environ.get('PRAW_USERAGENT')
return praw.Reddit(
username=PRAW_USERNAME,
password=PRAW_PASSWORD,
client_id=PRAW_CLIENT_ID,
client_secret=PRAW_CLIENT_SECRET,
user_agent=PRAW_USERAGENT
)
# OPENREVIEW_URL_RE = re.compile(r'openreview.net/./', re.I)
# OPENREVIEW_URL_RE = re.compile(r'distill.pub/./', re.I)
r = get_bot()
# source subreddits
subreddits = [
r.subreddit('machinelearning'),
# r.subreddit('reinforcementlearning')
# r.subreddit('LanguageTechnology')
]
target_subreddit = r.subreddit('mlresearch')
target_subreddit = r.subreddit('testingground4bots')
if r.read_only == False:
print("Connected and running.")
# alreadydone = set()
# Build a Markdown summary for an arXiv paper: fetches
# https://arxiv.org/abs/<arxiv_id>, takes the first .title, .authors and
# .abstract elements, converts each to text with html2text (newlines
# replaced by spaces) and returns them joined by blank lines together with
# PDF / landing-page / arXiv Vanity links.
# NOTE(review): every statement below also appears as a `# `-prefixed copy;
# this looks like a rendered diff in which the function was commented out --
# confirm which version is live before relying on it.
def scrape_arxiv(arxiv_id):
url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
r = requests.get(url)
soup = bs4.BeautifulSoup(r.text)
abstract = soup.select('.abstract')[0]
abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')
# def scrape_arxiv(arxiv_id):
# url = 'https://arxiv.org/abs/{}'.format(arxiv_id)
# r = requests.get(url)
# soup = bs4.BeautifulSoup(r.text)
# abstract = soup.select('.abstract')[0]
# abstract = html2text.html2text(abstract.decode()).replace('\n', ' ')
authors = soup.select('.authors')[0]
authors = html2text.html2text(authors.decode()).replace('\n', ' ')
# Turn site-relative link targets "(/..." into absolute arxiv.org URLs.
authors = authors.replace('(/', '(http://arxiv.org/')
# authors = soup.select('.authors')[0]
# authors = html2text.html2text(authors.decode()).replace('\n', ' ')
# authors = authors.replace('(/', '(http://arxiv.org/')
title = soup.select('.title')[0]
# [2:] drops the first two characters of the converted title
# (presumably heading markup emitted by html2text -- TODO confirm).
title = html2text.html2text(title.decode()).replace('\n', ' ')[2:]
# title = soup.select('.title')[0]
# title = html2text.html2text(title.decode()).replace('\n', ' ')[2:]
abs_link = u'[Landing Page]({})'.format(url)
pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
response = '\n\n'.join([title, authors, abstract, links])
return response
# abs_link = u'[Landing Page]({})'.format(url)
# pdf_link = u'[PDF Link](https://arxiv.org/pdf/{})'.format(arxiv_id)
# web_link = u'[Read as web page on arXiv Vanity](https://www.arxiv-vanity.com/papers/{}/)'.format(arxiv_id)
# links = u'{} | {} | {}'.format(pdf_link, abs_link, web_link)
# response = '\n\n'.join([title, authors, abstract, links])
# return response
def comment(cache):
@@ -142,17 +136,6 @@ def xpost(subs, originalpost):
# where.reply(str(response) + " Make sure to give me karma to prevent that in the future.")
def get_memcache_client():
# Return a bmemcached.Client configured from the MEMCACHEDCLOUD_*
# environment variables (server address, username, password).
# Store IDs of comments that the bot has already replied to.
# Read local cache by default
# NOTE(review): no local default is applied here -- an unset
# MEMCACHEDCLOUD_SERVERS is passed to the client as None.
MEMCACHEDCLOUD_SERVERS = os.environ.get('MEMCACHEDCLOUD_SERVERS')
MEMCACHEDCLOUD_USERNAME = os.environ.get('MEMCACHEDCLOUD_USERNAME')
MEMCACHEDCLOUD_PASSWORD = os.environ.get('MEMCACHEDCLOUD_PASSWORD')
client = bmemcached.Client((MEMCACHEDCLOUD_SERVERS,), MEMCACHEDCLOUD_USERNAME,
MEMCACHEDCLOUD_PASSWORD)
return client
if __name__ == "__main__":
+45
View File
@@ -0,0 +1,45 @@
import praw
import logging
import requests
import bs4
import html2text
import time, os
import bmemcached
import re
from prawcore import NotFound
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
def get_logger(name=None):
    """Return a logger for the bot scripts.

    Args:
        name: Optional logger name, so each script can log under its own
            name. When omitted, the logger is named after this module
            (``__name__``), exactly as the zero-argument call always did.

    Returns:
        The ``logging.Logger`` registered under the chosen name.
    """
    return logging.getLogger(__name__ if name is None else name)
def get_memcache_client():
    """Return a ``bmemcached.Client`` configured from the environment.

    The cache stores the IDs of items the bot has already handled.

    Environment variables read:
        MEMCACHEDCLOUD_SERVERS: one ``host:port`` address or a
            comma-separated list of them. Falls back to a local memcached
            on the standard port when unset or empty.
        MEMCACHEDCLOUD_USERNAME / MEMCACHEDCLOUD_PASSWORD: credentials,
            passed through unchanged (``None`` when unset).

    Returns:
        The configured ``bmemcached.Client``.
    """
    # Read local cache by default
    raw_servers = os.environ.get('MEMCACHEDCLOUD_SERVERS') or '127.0.0.1:11211'
    # The variable may hold several comma-separated servers; the client
    # expects one entry per server, not a single joined string.
    servers = tuple(s.strip() for s in raw_servers.split(',') if s.strip())
    username = os.environ.get('MEMCACHEDCLOUD_USERNAME')
    password = os.environ.get('MEMCACHEDCLOUD_PASSWORD')
    return bmemcached.Client(servers, username, password)
def get_bot():
    """Create a ``praw.Reddit`` client from the ``PRAW_*`` environment variables.

    Each setting is looked up with ``os.environ.get``, so a variable that is
    not set is forwarded to ``praw.Reddit`` as ``None``.
    """
    env = os.environ.get
    return praw.Reddit(
        username=env('PRAW_USERNAME'),
        password=env('PRAW_PASSWORD'),
        client_id=env('PRAW_CLIENT_ID'),
        client_secret=env('PRAW_CLIENT_SECRET'),
        user_agent=env('PRAW_USERAGENT'),
    )
+16 -4
View File
@@ -1,5 +1,17 @@
I would like to autocrosspost
- arxiv
- openreview
I would like to autocrosspost
- [x] arxiv from r/machinelearning
- [x] shortscience summaries from shortscience rss feed
- [ ] openreview
- [ ] openai, googlemin, facebookai, deepmind blogposts
and also auto post shortscience summaries from their rss feed.
# start
```sh
pyenv activate jup3.7.2
#memcached?
#https://github.com/jaysonsantos/python-binary-memcached
source ./vars.sh
python ./arxiv_ml_xpost_bot.py &
python ./rss_poster_bot.py &
```
+81
View File
@@ -0,0 +1,81 @@
"""
Posts new distill.pub and shortscience.org RSS entries to the researchml subreddit.
"""
import praw
import os
import time
import datetime
import pytz
import feedparser
import logging
from time import mktime
from botlib import get_bot, get_memcache_client, get_logger
# Shared helpers from botlib: logger, Reddit client and the memcache client
# used to remember which articles were already posted.
logger = get_logger()
r = get_bot()
cache = get_memcache_client()
# Subreddit that receives the submissions.
SUBREDDIT = 'researchml' # 'testingground4bots'
# Pause between polling rounds, in seconds (passed to time.sleep).
SLEEP = 60
# When true, the entry summary is posted as a reply to the new submission.
POST_DESCRIPTION = True
# str.format template applied to the summary before it is posted.
DESCRIPTION_FORMAT = "{}"
# RSS feeds polled by run_bot
sources = ["https://distill.pub/rss.xml", "https://www.shortscience.org/rss.xml"]
def _entry_published_utc(entry):
    """Return the entry's publication time as an aware UTC datetime, or None."""
    parsed = entry.get('published_parsed')
    if parsed is None:
        return None
    # feedparser's *_parsed values are UTC struct_times; build the datetime
    # directly instead of going through mktime(), which would interpret the
    # fields in the machine's local timezone (and DST rules).
    return datetime.datetime(*parsed[:6], tzinfo=pytz.utc)


def _submit_entry(sub, entry, cutoff):
    """Submit one feed entry to *sub* unless it is too old or already posted."""
    url = entry_id = entry['link']
    title = entry['title']
    desc = entry.get('summary')

    published = _entry_published_utc(entry)
    if published is None:
        logger.warning("skipping article without publication date '%s', id='%s'", title, entry_id)
        return
    if published < cutoff:
        # skip older ones
        logger.debug("skipping older article '%s', id='%s' td=%s", title, entry_id, published - cutoff)
        return
    # Compare by value: identity (`is`) on strings only works by accident.
    if cache.get(entry_id) == 'T':
        logger.info("skipping done article '%s', id='%s'", title, entry_id)
        return

    logger.info('posting %s', entry_id)
    try:
        submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
    except praw.exceptions.PRAWException as e:
        logger.exception("could not submit %s", e)
        return

    # Mark the entry as done as soon as the link post exists, so a failing
    # description reply cannot cause the same link to be submitted again.
    cache.set(entry_id, 'T')
    logger.info("submit article %s", entry)

    if POST_DESCRIPTION and desc is not None:
        try:
            submission.reply(DESCRIPTION_FORMAT.format(desc))
        except praw.exceptions.PRAWException as e:
            logger.exception("could not post description %s", e)


def run_bot(sources):
    """Poll RSS feeds forever and submit unseen entries to ``SUBREDDIT``.

    Every ``SLEEP`` seconds each feed in *sources* is parsed with feedparser.
    Entries published more than 10 days before start-up, or already marked
    in the memcache ``cache``, are skipped. Every other entry is submitted
    as a link post and, when ``POST_DESCRIPTION`` is set, its summary is
    posted as a reply.

    Args:
        sources: Iterable of feed URLs.

    Returns:
        0 when interrupted with Ctrl-C; otherwise the loop never returns.
    """
    sub = r.subreddit(SUBREDDIT)
    t0 = datetime.datetime.now(tz=pytz.utc) - datetime.timedelta(days=10)
    logger.info("Start bot for subreddit %s", SUBREDDIT)

    # Allows the bot to exit on ^C (also while sleeping); all other
    # exceptions are logged and the loop carries on.
    try:
        while True:
            try:
                logger.info("check sources")
                for source in sources:
                    # Handle every feed's entries, not just the last feed's.
                    for article in feedparser.parse(source)['entries']:
                        _submit_entry(sub, article, t0)
            except Exception as e:
                logger.error("Exception %s", e, exc_info=True)

            logger.info("sleep for %s s", SLEEP)
            time.sleep(SLEEP)
    except KeyboardInterrupt:
        return 0


if __name__ == "__main__":
    run_bot(sources)
-79
View File
@@ -1,79 +0,0 @@
"""
Posts new shortscience.org summaries to researchml
"""
import praw
import os
import time
import datetime
import pytz
import RSSReader
import logging
from bot import get_bot, get_memcache_client
# https://github.com/SmBe19/RedditBots/blob/master/RSSBot/RSSBot.py
# The logging module itself is used as the logger (log.info / log.error).
log = logging
# Reddit client and the memcache client used to remember posted articles.
r = get_bot()
cache = get_memcache_client()
# Subreddit that receives the submissions.
SUBREDDIT = 'researchml' # 'testingground4bots'
# Pause between polling rounds, in seconds (passed to time.sleep).
SLEEP = 60
# When true, the article description is posted as a reply to the submission.
POST_DESCRIPTION = True
# str.format template applied to the description before it is posted.
DESCRIPTION_FORMAT = "{}"
# main procedure
def run_bot():
    """Poll the shortscience.org RSS feed forever and post new summaries.

    Every ``SLEEP`` seconds the feed is read with
    ``RSSReader.get_new_articles``. Articles dated before the bot started,
    or already marked in the memcache ``cache``, are skipped. Every other
    article is submitted to ``SUBREDDIT`` as a link post and, when
    ``POST_DESCRIPTION`` is set, its description is posted as a reply.

    Returns:
        0 when interrupted with Ctrl-C; otherwise the loop never returns.
    """
    sub = r.subreddit(SUBREDDIT)
    t0 = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) - datetime.timedelta(days=0)
    log.info("Start bot for subreddit %s", SUBREDDIT)
    sources = ["https://www.shortscience.org/rss.xml"]

    # Allows the bot to exit on ^C (also while sleeping); all other
    # exceptions are logged and the loop carries on.
    try:
        while True:
            try:
                log.info("check sources")
                newArticles = []
                for source in sources:
                    newArticles.extend(RSSReader.get_new_articles(source))

                for article in newArticles:
                    title, url, desc, article_id, dt = article
                    if dt < t0:
                        # skip older ones
                        continue
                    # Compare by value: identity (`is`) on strings only
                    # works by accident.
                    if cache.get(article_id) == 'T':
                        continue

                    print('posting', article_id)
                    try:
                        submission = sub.submit('[S] ' + title, url=url, resubmit=True, send_replies=False)
                    except praw.exceptions.PRAWException as e:
                        log.error("could not submit %s", e)
                        continue

                    # Only mark the article as done once the submission
                    # exists, so a failed submit is retried next round.
                    cache.set(article_id, 'T')
                    log.info("submit article %s", article)

                    if POST_DESCRIPTION and desc is not None:
                        try:
                            submission.reply(DESCRIPTION_FORMAT.format(desc))
                        except praw.exceptions.PRAWException as e:
                            log.error("could not post description %s", e)
            except Exception as e:
                log.error("Exception %s", e, exc_info=True)

            log.info("sleep for %s s", SLEEP)
            time.sleep(SLEEP)
    except KeyboardInterrupt:
        return 0


if __name__ == "__main__":
    run_bot()