Files
talk/services/scraper.js
T
Wyatt Johnson 745c579b82 Adjust redis to not start during webpack build
- Added new WEBPACK env var which is enabled during yarn build scripts
- Deferred redis starting until listen is called
- Moved pubsub to a factory pattern init
- Async/Await'ed the routes
- Moved pubsub handle for routes into middleware
- Adjusted redis cache and job processors to have lazy connection
starting
- Disabled mongo from auto-connecting on require
- Adjusted package redis clients to act as factory singletons instead
2017-07-17 13:34:04 -06:00

128 lines
3.0 KiB
JavaScript

const kue = require('./kue');
const debug = require('debug')('talk:services:scraper');
const AssetModel = require('../models/asset');
const AssetsService = require('./assets');
const metascraper = require('metascraper');
/**
 * Exposes a service object to allow operations to execute against the scraper.
 *
 * Members reference `scraper` directly rather than `this`, so they can be
 * passed around as bare callbacks (e.g. `.then(scraper.scrape)`).
 * @type {Object}
 */
const scraper = {

  /**
   * Cached kue Task instance. Stays `null` until the `task` getter is first
   * read, so nothing is constructed merely by loading this module.
   */
  _task: null,

  /**
   * Lazily creates the kue Task named 'scraper' and reuses it afterwards.
   * @returns {Object} the shared `kue.Task` instance.
   */
  get task() {
    if (!scraper._task) {
      scraper._task = new kue.Task({
        name: 'scraper'
      });
    }

    return scraper._task;
  },

  /**
   * Creates a new scraper job and scrapes the url when it gets processed.
   * @param {Object} asset - asset to scrape; only `asset.id` is read here.
   * @returns {Promise} resolves with the job produced by `task.create`
   *   (assumes `task.create` returns a promise, as the original code did).
   */
  create(asset) {
    debug(`Creating job for Asset[${asset.id}]`);

    return scraper.task.create({
      title: `Scrape for asset ${asset.id}`,
      asset_id: asset.id
    }).then((job) => {
      debug(`Created Job[${job.id}] for Asset[${asset.id}]`);
      return job;
    });
  },

  /**
   * Scrapes the given asset for metadata.
   *
   * The rules passed to metascraper are its own `RULES` plus two extra ones
   * that read the `article:section` and `article:modified` meta tags.
   * @param {Object} asset - asset to scrape; only `asset.url` is read here.
   * @returns {*} whatever `metascraper.scrapeUrl` returns (used as a promise
   *   of the scraped metadata by `process`).
   */
  scrape(asset) {
    const rules = Object.assign({}, metascraper.RULES, {
      section: ($) => $('meta[property="article:section"]').attr('content'),
      modified: ($) => $('meta[property="article:modified"]').attr('content')
    });

    return metascraper.scrapeUrl(asset.url, rules);
  },

  /**
   * Updates an Asset based on scraped asset metadata.
   *
   * Every metadata field that is missing or falsy is stored as an empty
   * string, and `scraped` is stamped with the current time.
   * @param {String} id - id of the asset to update.
   * @param {Object} meta - scraped metadata (title, description, image,
   *   author, date, modified, section).
   * @returns {*} the result of `AssetModel.update`.
   */
  update(id, meta) {
    return AssetModel.update({id}, {
      $set: {
        title: meta.title || '',
        description: meta.description || '',
        image: meta.image || '',
        author: meta.author || '',
        publication_date: meta.date || '',
        modified_date: meta.modified || '',
        section: meta.section || '',
        scraped: new Date()
      }
    });
  },

  /**
   * Start the queue processor for the scraper job.
   *
   * For each job: look up the asset, scrape it, persist the metadata, then
   * signal completion through `done` exactly once.
   */
  process() {
    debug(`Now processing ${scraper.task.name} jobs`);

    scraper.task.process((job, done) => {
      debug(`Starting on Job[${job.id}] for Asset[${job.data.asset_id}]`);

      AssetsService

        // Find the asset, or complain that it doesn't exist.
        .findById(job.data.asset_id)
        .then((asset) => {
          if (!asset) {
            throw new Error('asset not found');
          }

          return asset;
        })

        // Scrape the metadata from the asset.
        .then(scraper.scrape)

        // Assign the metadata retrieved for the asset to the db.
        .then((meta) => {
          debug(`Scraped ${JSON.stringify(meta)} on Job[${job.id}] for Asset[${job.data.asset_id}]`);
          return scraper.update(job.data.asset_id, meta);
        })

        // Report the outcome. Success and failure handlers are registered in
        // the same `then` so that an exception thrown while completing the
        // job (e.g. by `done()` itself) can never fall through to the failure
        // handler and invoke `done` a second time.
        .then(() => {
          debug(`Finished on Job[${job.id}] for Asset[${job.data.asset_id}]`);
          done();
        }, (err) => {
          debug(`Failed to scrape on Job[${job.id}] for Asset[${job.data.asset_id}]:`, err);
          done(err);
        })

        // Only reachable when one of the handlers above threw: log it rather
        // than leave the promise rejected with nobody listening.
        .catch((err) => {
          debug(`Completion callback threw on Job[${job.id}] for Asset[${job.data.asset_id}]:`, err);
        });
    });
  }
};
// Expose the scraper service object as this module's export.
module.exports = scraper;