Files
talk/services/scraper/index.js
T
Wyatt Johnson 7d246bba76 Metadata Scraper Debugger (#2048)
* feat: added asset scrape debugger

* fix: added docs for scraper

* fix: added asset scraping to sidebar
2018-10-30 17:02:33 +00:00

97 lines
2.2 KiB
JavaScript

const fetch = require('node-fetch');
const ProxyAgent = require('proxy-agent');
const { merge } = require('lodash');
const { SCRAPER_HEADERS, SCRAPER_PROXY_URL } = require('../../config');
const kue = require('../kue');
const { version } = require('../../package.json');
// Load the scraper with the rules.
const metascraper = require('metascraper').load([
require('metascraper-title')(),
require('metascraper-description')(),
require('metascraper-image')(),
require('metascraper-author')(),
require('metascraper-date')(),
require('./rules/modified')(),
require('./rules/section')(),
]);
let customHeaders = {};
try {
customHeaders = JSON.parse(SCRAPER_HEADERS);
} catch (err) {
console.error('Cannot parse TALK_SCRAPER_HEADERS');
throw err;
}
// Parse the headers to be added to the scraper.
const headers = merge(
{
'User-Agent': `Coral-Talk/${version}`,
},
customHeaders
);
// Add proxy configuration if exists.
const agent = SCRAPER_PROXY_URL ? new ProxyAgent(SCRAPER_PROXY_URL) : null;
/**
* Exposes a service object to allow operations to execute against the scraper.
* @type {Object}
*/
const scraper = {
/**
* Create the new Task kue singleton.
*/
task: new kue.Task({
name: 'scraper',
}),
/**
* Creates a new scraper job and scrapes the url when it gets processed.
*/
async create(ctx, id) {
ctx.log.info({ assetID: id }, 'Creating job');
const job = await scraper.task.create({
title: `Scrape for asset ${id}`,
id: ctx.id,
asset_id: id,
});
ctx.log.info({ jobID: job.id, assetID: id }, 'Created job');
return job;
},
/**
* Scrapes the given asset for metadata.
*/
async scrape(url) {
const res = await fetch(url, {
headers,
agent,
});
const html = await res.text();
// Get the metadata from the scraped html.
const meta = await metascraper({
html,
url,
});
return {
title: meta.title || '',
description: meta.description || '',
image: meta.image ? meta.image : '',
author: meta.author || '',
publication_date: meta.date || '',
modified_date: meta.modified || '',
section: meta.section || '',
};
},
};
module.exports = scraper;