mirror of
https://github.com/wassname/talk.git
synced 2026-06-27 19:33:06 +08:00
Generated
+31
-99
@@ -2770,9 +2770,9 @@
|
||||
"dev": true
|
||||
},
|
||||
"@metascraper/helpers": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
|
||||
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
|
||||
"version": "5.7.21",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.21.tgz",
|
||||
"integrity": "sha512-wcgoVCqV8fU5YrGEKxWyUQo4nbZK3p//0Wav/hLGZ632Qb6VNpfJtUTxqHndGkG5xNeqfNtXNnX64XZaUc7FDg==",
|
||||
"requires": {
|
||||
"audio-extensions": "0.0.0",
|
||||
"chrono-node": "~1.3.11",
|
||||
@@ -22247,86 +22247,18 @@
|
||||
"dev": true
|
||||
},
|
||||
"metascraper-author": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/metascraper-author/-/metascraper-author-5.7.14.tgz",
|
||||
"integrity": "sha512-P8xpHHoCzlbt1lb8qKbkz9XQ4MWC0c9ElKFORQ1GPmSVh0n+aTO1APKofFYcnl9rq6QIyYU4PLTqQZ54KXMqtA==",
|
||||
"version": "5.8.0",
|
||||
"resolved": "https://registry.npmjs.org/metascraper-author/-/metascraper-author-5.8.0.tgz",
|
||||
"integrity": "sha512-k7yZMMOi2+Vh7RoIIpc4Q6FJ2HueQZ/tVqoQueBWIzlyWpJGDkJmI1Wi7P3XfHLJEKxkg8d15bP24Z6WWIbaXw==",
|
||||
"requires": {
|
||||
"@metascraper/helpers": "^5.7.14",
|
||||
"@metascraper/helpers": "^5.8.0",
|
||||
"lodash": "~4.17.15"
|
||||
},
|
||||
"dependencies": {
|
||||
"@metascraper/helpers": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
|
||||
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
|
||||
"requires": {
|
||||
"audio-extensions": "0.0.0",
|
||||
"chrono-node": "~1.3.11",
|
||||
"condense-whitespace": "~2.0.0",
|
||||
"entities": "~2.0.0",
|
||||
"file-extension": "~4.0.5",
|
||||
"has-values": "~2.0.1",
|
||||
"image-extensions": "~1.1.0",
|
||||
"is-relative-url": "~3.0.0",
|
||||
"is-uri": "~1.2.0",
|
||||
"iso-639-3": "~1.2.0",
|
||||
"isostring": "0.0.1",
|
||||
"lodash": "~4.17.15",
|
||||
"memoize-one": "~5.1.1",
|
||||
"mime-types": "~2.1.24",
|
||||
"normalize-url": "~4.5.0",
|
||||
"smartquotes": "~2.3.1",
|
||||
"title": "~3.4.1",
|
||||
"truncate": "~2.1.0",
|
||||
"url-regex": "~5.0.0",
|
||||
"video-extensions": "~1.1.0"
|
||||
}
|
||||
},
|
||||
"entities": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/entities/-/entities-2.0.0.tgz",
|
||||
"integrity": "sha512-D9f7V0JSRwIxlRI2mjMqufDrRDnx8p+eEOz7aUM9SuvF8gsBzra0/6tbjl1m8eQHrZlYj6PxqE00hZ1SAIKPLw=="
|
||||
},
|
||||
"has-values": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/has-values/-/has-values-2.0.1.tgz",
|
||||
"integrity": "sha512-+QdH3jOmq9P8GfdjFg0eJudqx1FqU62NQJ4P16rOEHeRdl7ckgwn6uqQjzYE0ZoHVV/e5E2esuJ5Gl5+HUW19w==",
|
||||
"requires": {
|
||||
"kind-of": "^6.0.2"
|
||||
}
|
||||
},
|
||||
"mime-db": {
|
||||
"version": "1.40.0",
|
||||
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.40.0.tgz",
|
||||
"integrity": "sha512-jYdeOMPy9vnxEqFRRo6ZvTZ8d9oPb+k18PKoYNYUe2stVEBPPwsln/qWzdbmaIvnhZ9v2P+CuecK+fpUfsV2mA=="
|
||||
},
|
||||
"mime-types": {
|
||||
"version": "2.1.24",
|
||||
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.24.tgz",
|
||||
"integrity": "sha512-WaFHS3MCl5fapm3oLxU4eYDw77IQM2ACcxQ9RIxfaC3ooc6PFuBMGZZsYpvoXS5D5QTWPieo1jjLdAm3TBP3cQ==",
|
||||
"requires": {
|
||||
"mime-db": "1.40.0"
|
||||
}
|
||||
},
|
||||
"normalize-url": {
|
||||
"version": "4.5.0",
|
||||
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-4.5.0.tgz",
|
||||
"integrity": "sha512-2s47yzUxdexf1OhyRi4Em83iQk0aPvwTddtFz4hnSSw9dCEsLEGf6SwIO8ss/19S9iBb5sJaOuTvTGDeZI00BQ=="
|
||||
}
|
||||
}
|
||||
},
|
||||
"metascraper-date": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/metascraper-date/-/metascraper-date-5.7.14.tgz",
|
||||
"integrity": "sha512-eJKMtIFeBrnkAavkNlIT/O2bKmF2gKVgMpPbdg/9yJ+OS0pH3QTdk/I/NeU91fS0dAaC2ztwFKUqw2zjC27vew==",
|
||||
"requires": {
|
||||
"@metascraper/helpers": "^5.7.14"
|
||||
},
|
||||
"dependencies": {
|
||||
"@metascraper/helpers": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
|
||||
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
|
||||
"version": "5.8.0",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz",
|
||||
"integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==",
|
||||
"requires": {
|
||||
"audio-extensions": "0.0.0",
|
||||
"chrono-node": "~1.3.11",
|
||||
@@ -22384,17 +22316,17 @@
|
||||
}
|
||||
},
|
||||
"metascraper-description": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/metascraper-description/-/metascraper-description-5.7.14.tgz",
|
||||
"integrity": "sha512-++qN4Rf0Hx13SbhJgRiLSuVOZHsYwhUkMfHa5sVVihSJkrLVjOSdBTpNBajRC7yHwG6m6/qIesuERbT1jdu5bw==",
|
||||
"version": "5.8.0",
|
||||
"resolved": "https://registry.npmjs.org/metascraper-description/-/metascraper-description-5.8.0.tgz",
|
||||
"integrity": "sha512-JHtHiHBIGMr7bZYoqbT6NnMSnIuMTMItxBAQfdW9RDQCK9l/M1yGi/usMcvXiPYUVlzPfuCcwqeN3xMj3JyLEg==",
|
||||
"requires": {
|
||||
"@metascraper/helpers": "^5.7.14"
|
||||
"@metascraper/helpers": "^5.8.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@metascraper/helpers": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
|
||||
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
|
||||
"version": "5.8.0",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz",
|
||||
"integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==",
|
||||
"requires": {
|
||||
"audio-extensions": "0.0.0",
|
||||
"chrono-node": "~1.3.11",
|
||||
@@ -22452,17 +22384,17 @@
|
||||
}
|
||||
},
|
||||
"metascraper-image": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/metascraper-image/-/metascraper-image-5.7.14.tgz",
|
||||
"integrity": "sha512-TxnUKYU92iWapq2G55E4AF7VjGyiDO2x01Z6AyjbmRxoM4U2IaHkNpE5msyc7TQhxGoYOSvdUtOeUnqdPqr+aA==",
|
||||
"version": "5.8.0",
|
||||
"resolved": "https://registry.npmjs.org/metascraper-image/-/metascraper-image-5.8.0.tgz",
|
||||
"integrity": "sha512-qDwQcjbSlb5NAdFgbCARaGjRUEzWMiYEA/r2AhJzCFsRZxC9gdurk2M0dhN6NCB6FvEv0JtQwQbkWokvuLKkiQ==",
|
||||
"requires": {
|
||||
"@metascraper/helpers": "^5.7.14"
|
||||
"@metascraper/helpers": "^5.8.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@metascraper/helpers": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
|
||||
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
|
||||
"version": "5.8.0",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz",
|
||||
"integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==",
|
||||
"requires": {
|
||||
"audio-extensions": "0.0.0",
|
||||
"chrono-node": "~1.3.11",
|
||||
@@ -22520,18 +22452,18 @@
|
||||
}
|
||||
},
|
||||
"metascraper-title": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/metascraper-title/-/metascraper-title-5.7.14.tgz",
|
||||
"integrity": "sha512-ZiVo4LEfqiNHlCGjht5OSZ3yRKxcZnbaXeRmUReMkCHcFujok5YZBj5ktDpAANmG9T3x2gn3twM3ZbBSyXLYyg==",
|
||||
"version": "5.8.0",
|
||||
"resolved": "https://registry.npmjs.org/metascraper-title/-/metascraper-title-5.8.0.tgz",
|
||||
"integrity": "sha512-ar6zqFGrHPeri8ymoWoHRJ29msmG7f8P5fLd1/A3NvFHXJA3XMTw4w1uLC9tg8MSABAG9t8vISmzB5NdB6MedQ==",
|
||||
"requires": {
|
||||
"@metascraper/helpers": "^5.7.14",
|
||||
"@metascraper/helpers": "^5.8.0",
|
||||
"lodash": "~4.17.15"
|
||||
},
|
||||
"dependencies": {
|
||||
"@metascraper/helpers": {
|
||||
"version": "5.7.14",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
|
||||
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
|
||||
"version": "5.8.0",
|
||||
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz",
|
||||
"integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==",
|
||||
"requires": {
|
||||
"audio-extensions": "0.0.0",
|
||||
"chrono-node": "~1.3.11",
|
||||
|
||||
+5
-6
@@ -58,7 +58,7 @@
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@coralproject/bunyan-prettystream": "^0.1.4",
|
||||
"@metascraper/helpers": "^5.7.14",
|
||||
"@metascraper/helpers": "^5.7.21",
|
||||
"akismet-api": "^4.2.0",
|
||||
"apollo-server-express": "^2.8.1",
|
||||
"archiver": "^3.0.3",
|
||||
@@ -106,11 +106,10 @@
|
||||
"lodash": "^4.17.15",
|
||||
"lru-cache": "^5.1.1",
|
||||
"luxon": "^1.12.0",
|
||||
"metascraper-author": "^5.7.14",
|
||||
"metascraper-date": "^5.7.14",
|
||||
"metascraper-description": "^5.7.14",
|
||||
"metascraper-image": "^5.7.14",
|
||||
"metascraper-title": "^5.7.14",
|
||||
"metascraper-author": "^5.8.0",
|
||||
"metascraper-description": "^5.8.0",
|
||||
"metascraper-image": "^5.8.0",
|
||||
"metascraper-title": "^5.8.0",
|
||||
"mongodb": "^3.2.7",
|
||||
"mongodb-core": "^3.2.7",
|
||||
"ms": "^2.1.1",
|
||||
|
||||
@@ -1,183 +1 @@
|
||||
import Logger from "bunyan";
|
||||
import cheerio from "cheerio";
|
||||
import authorScraper from "metascraper-author";
|
||||
import dateScraper from "metascraper-date";
|
||||
import descriptionScraper from "metascraper-description";
|
||||
import imageScraper from "metascraper-image";
|
||||
import titleScraper from "metascraper-title";
|
||||
import { Db } from "mongodb";
|
||||
import fetch, { RequestInit } from "node-fetch";
|
||||
import ProxyAgent from "proxy-agent";
|
||||
|
||||
import { version } from "coral-common/version";
|
||||
import { GQLStoryMetadata } from "coral-server/graph/tenant/schema/__generated__/types";
|
||||
import logger from "coral-server/logger";
|
||||
import { retrieveStory, updateStory } from "coral-server/models/story";
|
||||
import { retrieveTenant } from "coral-server/models/tenant";
|
||||
|
||||
import { modifiedScraper } from "./rules/modified";
|
||||
import { sectionScraper } from "./rules/section";
|
||||
|
||||
/**
 * Rule maps a metadata property name to an ordered list of getters. Each
 * getter receives the loaded DOM together with the page URL and returns the
 * extracted string, or undefined when it found nothing.
 */
export type Rule = Record<
  string,
  ((options: { htmlDom: CheerioSelector; url: string }) => string | undefined)[]
>;
|
||||
|
||||
/**
 * Scraper downloads a page and evaluates a list of rules against its HTML to
 * build the story metadata.
 */
class Scraper {
  private log: Logger = logger.child({ taskName: "scraper" }, true);

  constructor(private rules: Rule[]) {}

  /**
   * scrape fetches the page at the given URL (through the proxy when one is
   * provided) and extracts the metadata from it.
   *
   * @param url the URL of the page to scrape
   * @param proxyURL optional URL of a proxy to send the request through
   * @returns the extracted metadata, or null when the response status was not
   * 200
   */
  public async scrape(
    url: string,
    proxyURL?: string
  ): Promise<GQLStoryMetadata | null> {
    const storyLog = this.log.child({ storyURL: url }, true);

    // Prepare the request, identifying ourselves via the User-Agent header.
    const requestOptions: RequestInit = {
      headers: {
        "User-Agent": `Talk Scraper/${version}`,
      },
    };
    if (proxyURL) {
      // The agent types don't line up exactly, so the cast is forced here.
      const agent = new ProxyAgent(proxyURL);
      requestOptions.agent = (agent as unknown) as RequestInit["agent"];
      storyLog.debug("using proxy for scrape");
    }

    const startedAt = Date.now();
    storyLog.debug("starting scrape of Story");

    // Grab the page HTML.
    const response = await fetch(url, requestOptions);
    if (response.status !== 200) {
      storyLog.warn(
        { statusCode: response.status },
        "scrape failed with non-200 status code"
      );
      return null;
    }

    const body = await response.text();

    storyLog.debug({ timeElapsed: Date.now() - startedAt }, "scrape complete");

    // Load the DOM.
    const htmlDom = cheerio.load(body);

    storyLog.debug("parsed html");

    // Evaluate every rule; for each property the first getter that yields a
    // non-empty value wins.
    const found: Record<string, string | undefined> = {};
    for (const rule of this.rules) {
      for (const property of Object.keys(rule)) {
        for (const getter of rule[property]) {
          const candidate = getter({ htmlDom, url });
          if (!candidate || candidate.length === 0) {
            continue;
          }

          found[property] = candidate;
          break;
        }
      }
    }

    storyLog.debug("extracted metadata");

    const publishedAt = found.date ? new Date(found.date) : undefined;
    const modifiedAt = found.modified ? new Date(found.modified) : undefined;

    return {
      title: found.title || undefined,
      description: found.description || undefined,
      image: found.image || undefined,
      author: found.author || undefined,
      publishedAt,
      modifiedAt,
      section: found.section || undefined,
    };
  }
}
|
||||
|
||||
/**
 * createScraper builds the Scraper used for stories, configured with the
 * author, date, description, image, title, modified and section rules.
 */
function createScraper() {
  const rules = [
    authorScraper(),
    dateScraper(),
    descriptionScraper(),
    imageScraper(),
    titleScraper(),
    modifiedScraper(),
    sectionScraper(),
  ];

  return new Scraper(rules);
}
|
||||
|
||||
export const scraper = createScraper();
|
||||
|
||||
/**
 * scrape will scrape the metadata for the given Story and persist it on the
 * Story along with the time it was scraped.
 *
 * @param mongo the database handle used to load the Tenant and Story
 * @param tenantID the ID of the Tenant that owns the Story
 * @param storyID the ID of the Story to update
 * @param storyURL optional URL to scrape; when omitted the URL stored on the
 * Story is used
 * @returns the updated Story
 * @throws Error when the Tenant or Story can't be found, or when scraping the
 * URL produced no metadata
 */
export async function scrape(
  mongo: Db,
  tenantID: string,
  storyID: string,
  storyURL?: string
) {
  // Grab the Tenant.
  const tenant = await retrieveTenant(mongo, tenantID);
  if (!tenant) {
    throw new Error("tenant not found");
  }

  // Fall back to the URL stored in the database when none was provided.
  let targetURL = storyURL;
  if (!targetURL) {
    const existing = await retrieveStory(mongo, tenantID, storyID);
    if (!existing) {
      throw new Error("story at specified id not found");
    }

    targetURL = existing.url;
  }

  // Get the metadata from the scraped html.
  const { proxyURL } = tenant.stories.scraping;
  const metadata = await scraper.scrape(targetURL, proxyURL);
  if (!metadata) {
    throw new Error("story at specified url not found");
  }

  // Use a single timestamp for both the scrape time and the update time.
  const now = new Date();
  const update = { metadata, scrapedAt: now };

  // Update the Story with the scraped details.
  const updated = await updateStory(mongo, tenantID, storyID, update, now);
  if (!updated) {
    throw new Error("story at specified id not found");
  }

  return updated;
}
|
||||
export * from "./scraper";
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
import { $jsonld } from "@metascraper/helpers";
|
||||
import { $jsonld, date, toRule } from "@metascraper/helpers";
|
||||
import { Rules } from "metascraper";
|
||||
|
||||
import { wrap } from "./helpers";
|
||||
const toDate = toRule(date);
|
||||
|
||||
/**
 * modifiedScraper returns the rules used to find the date a page was last
 * modified. Every getter goes through `toDate` (i.e. `toRule(date)`), so the
 * extracted value is passed through the `date` helper from
 * `@metascraper/helpers` before it is returned.
 *
 * The raw `wrap(...)` getters for these same two sources were listed ahead of
 * the `toDate(...)` ones. The scraper stops at the first getter that yields a
 * non-empty value, so the `toDate` variants could never take effect; only the
 * `toDate` getters are kept.
 */
export const modifiedScraper = (): Rules => ({
  modified: [
    // From: http://ogp.me/#type_article
    toDate($jsonld("dateModified")),
    toDate($ => $('meta[property="article:modified"]').attr("content")),
  ],
});
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
import { $jsonld, date, toRule } from "@metascraper/helpers";
|
||||
import { Rules } from "metascraper";
|
||||
|
||||
const toDate = toRule(date);
|
||||
|
||||
/**
 * publishedScraper returns the rules used to find the date a page was
 * published. The getters are tried in order and each one goes through `toDate`
 * (i.e. `toRule(date)`), so the extracted value is passed through the `date`
 * helper from `@metascraper/helpers`.
 *
 * Order matters: each getter reads the attribute from the first element its
 * selector matches. A broad selector listed before a narrower one that it
 * fully contains would therefore shadow the narrower rule, so the specific
 * selectors (`time[datetime][pubdate]`, `dc.date.issued`, `dc.date.created`)
 * are listed before their general counterparts (`time[datetime]`, `dc.date`).
 */
export const publishedScraper = (): Rules => ({
  published: [
    // From: http://ogp.me/#type_article
    toDate($jsonld("datePublished")),
    toDate($jsonld("dateCreated")),
    toDate($ => $('meta[property*="published_time" i]').attr("content")),
    toDate($ => $('meta[property*="release_date" i]').attr("content")),
    toDate($ => $('meta[name="date" i]').attr("content")),
    toDate($ => $('[itemprop="datepublished" i]').attr("content")),
    toDate($ => $('[itemprop*="date" i]').attr("content")),
    toDate($ => $('time[itemprop*="date" i]').attr("datetime")),
    // Prefer a <time> explicitly flagged with `pubdate` over any <time>.
    toDate($ => $("time[datetime][pubdate]").attr("datetime")),
    toDate($ => $("time[datetime]").attr("datetime")),
    // `*=` is a substring match, so "dc.date" also matches the two names
    // below; try the specific names first.
    toDate($ => $('meta[name*="dc.date.issued" i]').attr("content")),
    toDate($ => $('meta[name*="dc.date.created" i]').attr("content")),
    toDate($ => $('meta[name*="dc.date" i]').attr("content")),
    toDate($ => $('meta[name*="dcterms.date" i]').attr("content")),
    toDate($ => $('[property*="dc:date" i]').attr("content")),
    toDate($ => $('[property*="dc:created" i]').attr("content")),
  ],
});
|
||||
@@ -0,0 +1,42 @@
|
||||
import { scraper } from "./scraper";
|
||||
|
||||
describe("Scraper", () => {
  it("parses the JSON-LD data correctly", async () => {
    // A page consisting solely of a JSON-LD block describing an article.
    const html = `
      <script type="application/ld+json">
        {
          "@type": "Article",
          "@id": "https://coralproject.net/blog/working-with-user-stories-keeping-commenters-and-moderators-at-the-center-of-what-we-build/",
          "author": {
            "@type": "Person",
            "name": "sam"
          },
          "headline": "Working with User Stories: Keeping commenters and moderators at the center of what we build",
          "description": "We believe that the comments section can be a place where diverse voices come together to share opinions and experiences.",
          "datePublished": "2019-09-04T15:43:35+00:00",
          "dateModified": "2019-09-06T06:14:29+00:00",
          "image": {
            "@type": "ImageObject",
            "url": "https://coralproject.net/wp-content/uploads/2019/09/blog-hero.png",
            "width": 1440,
            "height": 1024
          },
          "articleSection": "Comments,Design,Moderation,Useful"
        }
      </script>
    `;

    // Every metadata field should be filled in from the JSON-LD block.
    const expected = {
      author: "sam",
      description:
        "We believe that the comments section can be a place where diverse voices come together to share opinions and experiences.",
      image:
        "https://coralproject.net/wp-content/uploads/2019/09/blog-hero.png",
      modifiedAt: new Date("2019-09-06T06:14:29+00:00"),
      publishedAt: new Date("2019-09-04T15:43:35+00:00"),
      section: "Comments,Design,Moderation,Useful",
      title:
        "Working with User Stories: Keeping commenters and moderators at the center of what we build",
    };

    const parsed = scraper.parse("", html);

    expect(parsed).toEqual(expected);
  });
});
|
||||
@@ -0,0 +1,199 @@
|
||||
import Logger from "bunyan";
|
||||
import cheerio from "cheerio";
|
||||
import authorScraper from "metascraper-author";
|
||||
import descriptionScraper from "metascraper-description";
|
||||
import imageScraper from "metascraper-image";
|
||||
import titleScraper from "metascraper-title";
|
||||
import { Db } from "mongodb";
|
||||
import fetch, { RequestInit } from "node-fetch";
|
||||
import ProxyAgent from "proxy-agent";
|
||||
|
||||
import { version } from "coral-common/version";
|
||||
import logger from "coral-server/logger";
|
||||
import { retrieveStory, updateStory } from "coral-server/models/story";
|
||||
import { retrieveTenant } from "coral-server/models/tenant";
|
||||
|
||||
import { GQLStoryMetadata } from "coral-server/graph/tenant/schema/__generated__/types";
|
||||
|
||||
import { modifiedScraper } from "./rules/modified";
|
||||
import { publishedScraper } from "./rules/published";
|
||||
import { sectionScraper } from "./rules/section";
|
||||
|
||||
/**
 * Rule maps a metadata property name to an ordered list of getters. Each
 * getter receives the loaded DOM together with the page URL and returns the
 * extracted string, or undefined when it found nothing.
 */
export type Rule = Record<
  string,
  ((options: { htmlDom: CheerioSelector; url: string }) => string | undefined)[]
>;
|
||||
|
||||
/**
 * Scraper downloads a page and evaluates a list of rules against its HTML to
 * build the story metadata. Downloading and parsing are separate steps so the
 * parser can be exercised directly with an HTML string.
 */
class Scraper {
  /**
   * toDate converts a scraped date string into a Date, returning undefined
   * when the value is missing or can't be parsed, so that an `Invalid Date`
   * never ends up in the metadata.
   */
  private static toDate(value: string | undefined): Date | undefined {
    if (!value) {
      return undefined;
    }

    const parsed = new Date(value);
    return isNaN(parsed.getTime()) ? undefined : parsed;
  }

  private rules: Rule[];
  private log: Logger;

  constructor(rules: Rule[]) {
    this.rules = rules;
    this.log = logger.child({ taskName: "scraper" }, true);
  }

  /**
   * parse extracts the metadata from the given HTML by evaluating each rule.
   *
   * @param url the URL the HTML was loaded from, handed to every getter
   * @param html the HTML document to extract the metadata from
   * @returns the extracted metadata; properties that weren't found are
   * undefined
   */
  public parse(url: string, html: string): GQLStoryMetadata {
    const log = this.log.child({ storyURL: url }, true);

    // Load the DOM.
    const htmlDom = cheerio.load(html);

    log.debug("parsed html");

    // Gather the results by evaluating each of the rules.
    const metadata: Record<string, string | undefined> = {};

    for (const rule of this.rules) {
      // Object.keys only yields the rule's own enumerable properties.
      for (const property of Object.keys(rule)) {
        // Proceed through each of the getters; the first one to return a
        // non-empty value wins.
        for (const getter of rule[property]) {
          const value = getter({ htmlDom, url });
          if (value && value.length > 0) {
            metadata[property] = value;

            break;
          }
        }
      }
    }

    log.debug("extracted metadata");

    return {
      title: metadata.title || undefined,
      description: metadata.description || undefined,
      image: metadata.image || undefined,
      author: metadata.author || undefined,
      publishedAt: Scraper.toDate(metadata.published),
      modifiedAt: Scraper.toDate(metadata.modified),
      section: metadata.section || undefined,
    };
  }

  /**
   * download fetches the HTML for the page at the given URL.
   *
   * @param url the URL of the page to download
   * @param proxyURL optional URL of a proxy to send the request through
   * @returns the response body, or null when the response status was not 200
   */
  public async download(
    url: string,
    proxyURL?: string
  ): Promise<string | null> {
    const log = this.log.child({ storyURL: url }, true);

    const options: RequestInit = {
      headers: {
        "User-Agent": `Talk Scraper/${version}`,
      },
    };
    if (proxyURL) {
      // Force the type here because there's a slight mismatch.
      options.agent = (new ProxyAgent(
        proxyURL
      ) as unknown) as RequestInit["agent"];
      log.debug("using proxy for scrape");
    }

    const start = Date.now();
    log.debug("starting scrape of Story");

    const res = await fetch(url, options);
    // Only a 200 is accepted, which makes a separate `res.ok` check (true for
    // any 2xx status) redundant.
    if (res.status !== 200) {
      log.warn(
        { statusCode: res.status, statusText: res.statusText },
        "scrape failed with non-200 status code"
      );
      return null;
    }

    const html = await res.text();

    log.debug({ timeElapsed: Date.now() - start }, "scrape complete");

    return html;
  }

  /**
   * scrape downloads the page at the given URL and parses its metadata.
   *
   * @param url the URL of the page to scrape
   * @param proxyURL optional URL of a proxy to send the request through
   * @returns the extracted metadata, or null when the download failed or
   * produced an empty body
   */
  public async scrape(
    url: string,
    proxyURL?: string
  ): Promise<GQLStoryMetadata | null> {
    const html = await this.download(url, proxyURL);
    if (!html) {
      return null;
    }

    return this.parse(url, html);
  }
}
|
||||
|
||||
/**
 * createScraper builds the Scraper used for stories, configured with the
 * author, published, description, image, title, modified and section rules.
 */
function createScraper() {
  const rules = [
    authorScraper(),
    publishedScraper(),
    descriptionScraper(),
    imageScraper(),
    titleScraper(),
    modifiedScraper(),
    sectionScraper(),
  ];

  return new Scraper(rules);
}
|
||||
|
||||
export const scraper = createScraper();
|
||||
|
||||
/**
 * scrape will scrape the metadata for the given Story and persist it on the
 * Story along with the time it was scraped.
 *
 * @param mongo the database handle used to load the Tenant and Story
 * @param tenantID the ID of the Tenant that owns the Story
 * @param storyID the ID of the Story to update
 * @param storyURL optional URL to scrape; when omitted the URL stored on the
 * Story is used
 * @returns the updated Story
 * @throws Error when the Tenant or Story can't be found, or when scraping the
 * URL produced no metadata
 */
export async function scrape(
  mongo: Db,
  tenantID: string,
  storyID: string,
  storyURL?: string
) {
  // Grab the Tenant.
  const tenant = await retrieveTenant(mongo, tenantID);
  if (!tenant) {
    throw new Error("tenant not found");
  }

  // Fall back to the URL stored in the database when none was provided.
  let targetURL = storyURL;
  if (!targetURL) {
    const existing = await retrieveStory(mongo, tenantID, storyID);
    if (!existing) {
      throw new Error("story at specified id not found");
    }

    targetURL = existing.url;
  }

  // Get the metadata from the scraped html.
  const { proxyURL } = tenant.stories.scraping;
  const metadata = await scraper.scrape(targetURL, proxyURL);
  if (!metadata) {
    throw new Error("story at specified url not found");
  }

  // Use a single timestamp for both the scrape time and the update time.
  const now = new Date();
  const update = { metadata, scrapedAt: now };

  // Update the Story with the scraped details.
  const updated = await updateStory(mongo, tenantID, storyID, update, now);
  if (!updated) {
    throw new Error("story at specified id not found");
  }

  return updated;
}
|
||||
Vendored
+5
-14
@@ -1,10 +1,4 @@
|
||||
declare module "metascraper" {
|
||||
export interface Scraper {
|
||||
(options: { url: string; html: string }): Promise<
|
||||
Record<string, string | undefined>
|
||||
>;
|
||||
}
|
||||
|
||||
export type Ruler = (options: {
|
||||
htmlDom: CheerioSelector;
|
||||
url: string;
|
||||
@@ -16,8 +10,6 @@ declare module "metascraper" {
|
||||
) => string | undefined;
|
||||
|
||||
export type Rules = Record<string, Array<Ruler>>;
|
||||
|
||||
export function load(rules: Rules[]): Scraper;
|
||||
}
|
||||
|
||||
declare module "metascraper-author" {
|
||||
@@ -25,11 +17,6 @@ declare module "metascraper-author" {
|
||||
export default function def(): Rules;
|
||||
}
|
||||
|
||||
declare module "metascraper-date" {
|
||||
import { Rules } from "metascraper";
|
||||
export default function def(): Rules;
|
||||
}
|
||||
|
||||
declare module "metascraper-description" {
|
||||
import { Rules } from "metascraper";
|
||||
export default function def(): Rules;
|
||||
@@ -46,5 +33,9 @@ declare module "metascraper-title" {
|
||||
}
|
||||
|
||||
/**
 * Ambient typings for the parts of `@metascraper/helpers` used by the scraper
 * rules. `$jsonld` is declared once, with its typed signature; a second
 * `any`-typed declaration of the same name would be a redeclaration error.
 */
declare module "@metascraper/helpers" {
  import { Rule } from "metascraper";

  // Creates a Rule for the given JSON-LD key.
  export const $jsonld: (key: string) => Rule;
  export const jsonld: (url: string, htmlDom: CheerioSelector) => Rule;
  // Combines a helper function (and optional options) with a Rule.
  // NOTE(review): `fn`, `opts` and the result are untyped here — tighten once
  // the upstream signatures are confirmed.
  export const toRule: (fn: any, opts?: any) => (rule: Rule) => any;
  export const date: Rule;
}
|
||||
|
||||
Reference in New Issue
Block a user