From 435067f6193fe5e082bcd6828b52903079f7ca94 Mon Sep 17 00:00:00 2001
From: Wyatt Johnson <wyattjoh@gmail.com>
Date: Wed, 7 Mar 2018 15:30:34 -0700
Subject: [PATCH] Scraping Fixes

- Fixed bugs with scraping graphql endpoint
- Added traceID to logging for jobs
---
 graph/context.js        |  2 +-
 graph/loaders/assets.js |  2 +-
 graph/mutators/asset.js |  4 ++--
 jobs/index.js           |  2 +-
 jobs/scraper.js         | 25 +++++++++++--------------
 services/logging.js     | 10 +++++-----
 services/scraper.js     | 12 ++++++------
 7 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/graph/context.js b/graph/context.js
index adf5be06a..52bb7fdfc 100644
--- a/graph/context.js
+++ b/graph/context.js
@@ -52,7 +52,7 @@ class Context {
     this.id = ctx.id || uuid.v4();
 
     // Attach a logger or create one.
-    this.log = ctx.log || createLogger('context', this.id);
+    this.log = ctx.log || createLogger('graph:context', this.id);
 
     // Load the current logged in user to `user`, otherwise this will be null.
     this.user = get(ctx, 'user');
diff --git a/graph/loaders/assets.js b/graph/loaders/assets.js
index 554ee527d..2f0595e81 100644
--- a/graph/loaders/assets.js
+++ b/graph/loaders/assets.js
@@ -119,7 +119,7 @@ const findOrCreateAssetByURL = async (ctx, url) => {
   // If this is a new asset, then we need to scrape it!
   if (!asset.scraped) {
     // Create the Scraper job.
-    await Scraper.create(asset);
+    await Scraper.create(ctx, asset.id);
   }
 
   return asset;
diff --git a/graph/mutators/asset.js b/graph/mutators/asset.js
index 3d018658e..2997e7cd4 100644
--- a/graph/mutators/asset.js
+++ b/graph/mutators/asset.js
@@ -63,9 +63,9 @@ const closeNow = async (ctx, id) =>
  * @param {String} id the asset's id to scrape
  */
 const scrapeAsset = async (ctx, id) => {
-  const { services: { Scraper } } = ctx;
+  const { connectors: { services: { Scraper } } } = ctx;
 
-  return Scraper.create({ id });
+  return Scraper.create(ctx, id);
 };
 
 module.exports = ctx => {
diff --git a/jobs/index.js b/jobs/index.js
index b64e59362..9093090ad 100644
--- a/jobs/index.js
+++ b/jobs/index.js
@@ -1,4 +1,4 @@
-const jobs = [require('./mailer')];
+const jobs = [require('./mailer'), require('./scraper')];
 
 const process = () => jobs.forEach(job => job());
 
diff --git a/jobs/scraper.js b/jobs/scraper.js
index 424304ba8..2e3fa0927 100644
--- a/jobs/scraper.js
+++ b/jobs/scraper.js
@@ -1,7 +1,8 @@
 const Asset = require('../models/asset');
 const scraper = require('../services/scraper');
 const Assets = require('../services/assets');
-const debug = require('debug')('talk:jobs:scraper');
+const { createLogger } = require('../services/logging');
+const logger = createLogger('jobs:scraper');
 const metascraper = require('metascraper');
 
 /**
@@ -39,38 +40,34 @@ function update(id, meta) {
 }
 
 module.exports = () => {
-  debug(`Now processing ${scraper.task.name} jobs`);
+  logger.info({ taskName: scraper.task.name }, 'Now processing jobs');
 
   scraper.task.process(async (job, done) => {
-    debug(`Starting on Job[${job.id}] for Asset[${job.data.asset_id}]`);
+    const { id, asset_id } = job.data;
+
+    const log = logger.child({ traceID: id, jobID: job.id, assetID: asset_id });
+    log.info('Starting scrape');
 
     try {
       // Find the asset, or complain that it doesn't exist.
       const asset = await Assets.findById(job.data.asset_id);
       if (!asset) {
-        return done(new Error('asset not found'));
+        throw new Error('asset not found');
       }
 
       // Scrape the metadata from the asset.
       const meta = await scrape(asset);
 
-      debug(
-        `Scraped ${JSON.stringify(meta)} on Job[${job.id}] for Asset[${
-          job.data.asset_id
-        }]`
-      );
+      log.info('Finished scraping');
 
       // Assign the metadata retrieved for the asset to the db.
       await update(job.data.asset_id, meta);
     } catch (err) {
-      debug(
-        `Failed to scrape on Job[${job.id}] for Asset[${job.data.asset_id}]:`,
-        err
-      );
+      log.error({ err }, 'Failed to scrape');
       return done(err);
     }
 
-    debug(`Finished on Job[${job.id}] for Asset[${job.data.asset_id}]`);
+    log.info('Finished updating');
     done();
   });
 };
diff --git a/services/logging.js b/services/logging.js
index 9aad098ab..00e9c523b 100644
--- a/services/logging.js
+++ b/services/logging.js
@@ -1,17 +1,17 @@
 const { version } = require('../package.json');
 const Logger = require('bunyan');
-const uuid = require('uuid/v1');
-const { LOGGING_LEVEL } = require('../config');
+const { LOGGING_LEVEL, REVISION_HASH } = require('../config');
 
 // Create the logging instance that all logger's are branched from.
-function createLogger(name, id = uuid()) {
+function createLogger(name, traceID) {
   return new Logger({
     src: true,
     name,
-    id,
+    traceID,
     version,
+    revision: REVISION_HASH,
     level: LOGGING_LEVEL,
-    serializers: { req: Logger.stdSerializers.req },
+    serializers: Logger.stdSerializers,
   });
 }
 
diff --git a/services/scraper.js b/services/scraper.js
index daba96a99..cfd845424 100644
--- a/services/scraper.js
+++ b/services/scraper.js
@@ -1,5 +1,4 @@
 const kue = require('./kue');
-const debug = require('debug')('talk:services:scraper');
 
 /**
  * Exposes a service object to allow operations to execute against the scraper.
@@ -16,15 +15,16 @@ const scraper = {
   /**
    * Creates a new scraper job and scrapes the url when it gets processed.
    */
-  async create(asset) {
-    debug(`Creating job for Asset[${asset.id}]`);
+  async create(ctx, id) {
+    ctx.log.info({ assetID: id }, 'Creating job');
 
     const job = await scraper.task.create({
-      title: `Scrape for asset ${asset.id}`,
-      asset_id: asset.id,
+      title: `Scrape for asset ${id}`,
+      id: ctx.id,
+      asset_id: id,
     });
 
-    debug(`Created Job[${job.id}] for Asset[${asset.id}]`);
+    ctx.log.info({ jobID: job.id, assetID: id }, 'Created job');
 
     return job;
   },