From 435067f6193fe5e082bcd6828b52903079f7ca94 Mon Sep 17 00:00:00 2001 From: Wyatt Johnson Date: Wed, 7 Mar 2018 15:30:34 -0700 Subject: [PATCH] Scraping Fixes - Fixed bugs with scraping graphql endpoint - Added traceID to logging for jobs --- graph/context.js | 2 +- graph/loaders/assets.js | 2 +- graph/mutators/asset.js | 4 ++-- jobs/index.js | 2 +- jobs/scraper.js | 25 +++++++++++-------------- services/logging.js | 10 +++++----- services/scraper.js | 12 ++++++------ 7 files changed, 27 insertions(+), 30 deletions(-) diff --git a/graph/context.js b/graph/context.js index adf5be06a..52bb7fdfc 100644 --- a/graph/context.js +++ b/graph/context.js @@ -52,7 +52,7 @@ class Context { this.id = ctx.id || uuid.v4(); // Attach a logger or create one. - this.log = ctx.log || createLogger('context', this.id); + this.log = ctx.log || createLogger('graph:context', this.id); // Load the current logged in user to `user`, otherwise this will be null. this.user = get(ctx, 'user'); diff --git a/graph/loaders/assets.js b/graph/loaders/assets.js index 554ee527d..2f0595e81 100644 --- a/graph/loaders/assets.js +++ b/graph/loaders/assets.js @@ -119,7 +119,7 @@ const findOrCreateAssetByURL = async (ctx, url) => { // If this is a new asset, then we need to scrape it! if (!asset.scraped) { // Create the Scraper job. - await Scraper.create(asset); + await Scraper.create(ctx, asset.id); } return asset; diff --git a/graph/mutators/asset.js b/graph/mutators/asset.js index 3d018658e..2997e7cd4 100644 --- a/graph/mutators/asset.js +++ b/graph/mutators/asset.js @@ -63,9 +63,9 @@ const closeNow = async (ctx, id) => * @param {String} id the asset's id to scrape */ const scrapeAsset = async (ctx, id) => { - const { services: { Scraper } } = ctx; + const { connectors: { services: { Scraper } } } = ctx; - return Scraper.create({ id }); + return Scraper.create(ctx, id); }; module.exports = ctx => { diff --git a/jobs/index.js b/jobs/index.js index b64e59362..9093090ad 100644 --- a/jobs/index.js +++ b/jobs/index.js @@ -1,4 +1,4 @@ -const jobs = [require('./mailer')]; +const jobs = [require('./mailer'), require('./scraper')]; const process = () => jobs.forEach(job => job()); diff --git a/jobs/scraper.js b/jobs/scraper.js index 424304ba8..2e3fa0927 100644 --- a/jobs/scraper.js +++ b/jobs/scraper.js @@ -1,7 +1,8 @@ const Asset = require('../models/asset'); const scraper = require('../services/scraper'); const Assets = require('../services/assets'); -const debug = require('debug')('talk:jobs:scraper'); +const { createLogger } = require('../services/logging'); +const logger = createLogger('jobs:scraper'); const metascraper = require('metascraper'); /** @@ -39,38 +40,34 @@ function update(id, meta) { } module.exports = () => { - debug(`Now processing ${scraper.task.name} jobs`); + logger.info({ taskName: scraper.task.name }, 'Now processing jobs'); scraper.task.process(async (job, done) => { - debug(`Starting on Job[${job.id}] for Asset[${job.data.asset_id}]`); + const { id, asset_id } = job.data; + + const log = logger.child({ traceID: id, jobID: job.id, assetID: asset_id }); + log.info('Starting scrape'); try { // Find the asset, or complain that it doesn't exist. const asset = await Assets.findById(job.data.asset_id); if (!asset) { - return done(new Error('asset not found')); + throw new Error('asset not found'); } // Scrape the metadata from the asset. const meta = await scrape(asset); - debug( - `Scraped ${JSON.stringify(meta)} on Job[${job.id}] for Asset[${ - job.data.asset_id - }]` - ); + log.info('Finished scraping'); // Assign the metadata retrieved for the asset to the db. await update(job.data.asset_id, meta); } catch (err) { - debug( - `Failed to scrape on Job[${job.id}] for Asset[${job.data.asset_id}]:`, - err - ); + log.error({ err }, 'Failed to scrape'); return done(err); } - debug(`Finished on Job[${job.id}] for Asset[${job.data.asset_id}]`); + log.info('Finished updating'); done(); }); }; diff --git a/services/logging.js b/services/logging.js index 9aad098ab..00e9c523b 100644 --- a/services/logging.js +++ b/services/logging.js @@ -1,17 +1,17 @@ const { version } = require('../package.json'); const Logger = require('bunyan'); -const uuid = require('uuid/v1'); -const { LOGGING_LEVEL } = require('../config'); +const { LOGGING_LEVEL, REVISION_HASH } = require('../config'); // Create the logging instance that all logger's are branched from. -function createLogger(name, id = uuid()) { +function createLogger(name, traceID) { return new Logger({ src: true, name, - id, + traceID, version, + revision: REVISION_HASH, level: LOGGING_LEVEL, - serializers: { req: Logger.stdSerializers.req }, + serializers: Logger.stdSerializers, }); } diff --git a/services/scraper.js b/services/scraper.js index daba96a99..cfd845424 100644 --- a/services/scraper.js +++ b/services/scraper.js @@ -1,5 +1,4 @@ const kue = require('./kue'); -const debug = require('debug')('talk:services:scraper'); /** * Exposes a service object to allow operations to execute against the scraper. @@ -16,15 +15,16 @@ const scraper = { /** * Creates a new scraper job and scrapes the url when it gets processed. */ - async create(asset) { - debug(`Creating job for Asset[${asset.id}]`); + async create(ctx, id) { + ctx.log.info({ assetID: id }, 'Creating job'); const job = await scraper.task.create({ - title: `Scrape for asset ${asset.id}`, - asset_id: asset.id, + title: `Scrape for asset ${id}`, + id: ctx.id, + asset_id: id, }); - debug(`Created Job[${job.id}] for Asset[${asset.id}]`); + ctx.log.info({ jobID: job.id, assetID: id }, 'Created job'); return job; },