fix: improvements to scraping (#2702)

- fixes #2690
This commit is contained in:
Wyatt Johnson
2019-11-12 17:25:41 +00:00
committed by GitHub
parent b83d737530
commit 2ef07429c2
8 changed files with 313 additions and 306 deletions
+31 -99
View File
@@ -2770,9 +2770,9 @@
"dev": true
},
"@metascraper/helpers": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
"version": "5.7.21",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.21.tgz",
"integrity": "sha512-wcgoVCqV8fU5YrGEKxWyUQo4nbZK3p//0Wav/hLGZ632Qb6VNpfJtUTxqHndGkG5xNeqfNtXNnX64XZaUc7FDg==",
"requires": {
"audio-extensions": "0.0.0",
"chrono-node": "~1.3.11",
@@ -22247,86 +22247,18 @@
"dev": true
},
"metascraper-author": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/metascraper-author/-/metascraper-author-5.7.14.tgz",
"integrity": "sha512-P8xpHHoCzlbt1lb8qKbkz9XQ4MWC0c9ElKFORQ1GPmSVh0n+aTO1APKofFYcnl9rq6QIyYU4PLTqQZ54KXMqtA==",
"version": "5.8.0",
"resolved": "https://registry.npmjs.org/metascraper-author/-/metascraper-author-5.8.0.tgz",
"integrity": "sha512-k7yZMMOi2+Vh7RoIIpc4Q6FJ2HueQZ/tVqoQueBWIzlyWpJGDkJmI1Wi7P3XfHLJEKxkg8d15bP24Z6WWIbaXw==",
"requires": {
"@metascraper/helpers": "^5.7.14",
"@metascraper/helpers": "^5.8.0",
"lodash": "~4.17.15"
},
"dependencies": {
"@metascraper/helpers": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
"requires": {
"audio-extensions": "0.0.0",
"chrono-node": "~1.3.11",
"condense-whitespace": "~2.0.0",
"entities": "~2.0.0",
"file-extension": "~4.0.5",
"has-values": "~2.0.1",
"image-extensions": "~1.1.0",
"is-relative-url": "~3.0.0",
"is-uri": "~1.2.0",
"iso-639-3": "~1.2.0",
"isostring": "0.0.1",
"lodash": "~4.17.15",
"memoize-one": "~5.1.1",
"mime-types": "~2.1.24",
"normalize-url": "~4.5.0",
"smartquotes": "~2.3.1",
"title": "~3.4.1",
"truncate": "~2.1.0",
"url-regex": "~5.0.0",
"video-extensions": "~1.1.0"
}
},
"entities": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/entities/-/entities-2.0.0.tgz",
"integrity": "sha512-D9f7V0JSRwIxlRI2mjMqufDrRDnx8p+eEOz7aUM9SuvF8gsBzra0/6tbjl1m8eQHrZlYj6PxqE00hZ1SAIKPLw=="
},
"has-values": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/has-values/-/has-values-2.0.1.tgz",
"integrity": "sha512-+QdH3jOmq9P8GfdjFg0eJudqx1FqU62NQJ4P16rOEHeRdl7ckgwn6uqQjzYE0ZoHVV/e5E2esuJ5Gl5+HUW19w==",
"requires": {
"kind-of": "^6.0.2"
}
},
"mime-db": {
"version": "1.40.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.40.0.tgz",
"integrity": "sha512-jYdeOMPy9vnxEqFRRo6ZvTZ8d9oPb+k18PKoYNYUe2stVEBPPwsln/qWzdbmaIvnhZ9v2P+CuecK+fpUfsV2mA=="
},
"mime-types": {
"version": "2.1.24",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.24.tgz",
"integrity": "sha512-WaFHS3MCl5fapm3oLxU4eYDw77IQM2ACcxQ9RIxfaC3ooc6PFuBMGZZsYpvoXS5D5QTWPieo1jjLdAm3TBP3cQ==",
"requires": {
"mime-db": "1.40.0"
}
},
"normalize-url": {
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-4.5.0.tgz",
"integrity": "sha512-2s47yzUxdexf1OhyRi4Em83iQk0aPvwTddtFz4hnSSw9dCEsLEGf6SwIO8ss/19S9iBb5sJaOuTvTGDeZI00BQ=="
}
}
},
"metascraper-date": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/metascraper-date/-/metascraper-date-5.7.14.tgz",
"integrity": "sha512-eJKMtIFeBrnkAavkNlIT/O2bKmF2gKVgMpPbdg/9yJ+OS0pH3QTdk/I/NeU91fS0dAaC2ztwFKUqw2zjC27vew==",
"requires": {
"@metascraper/helpers": "^5.7.14"
},
"dependencies": {
"@metascraper/helpers": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
"version": "5.8.0",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz",
"integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==",
"requires": {
"audio-extensions": "0.0.0",
"chrono-node": "~1.3.11",
@@ -22384,17 +22316,17 @@
}
},
"metascraper-description": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/metascraper-description/-/metascraper-description-5.7.14.tgz",
"integrity": "sha512-++qN4Rf0Hx13SbhJgRiLSuVOZHsYwhUkMfHa5sVVihSJkrLVjOSdBTpNBajRC7yHwG6m6/qIesuERbT1jdu5bw==",
"version": "5.8.0",
"resolved": "https://registry.npmjs.org/metascraper-description/-/metascraper-description-5.8.0.tgz",
"integrity": "sha512-JHtHiHBIGMr7bZYoqbT6NnMSnIuMTMItxBAQfdW9RDQCK9l/M1yGi/usMcvXiPYUVlzPfuCcwqeN3xMj3JyLEg==",
"requires": {
"@metascraper/helpers": "^5.7.14"
"@metascraper/helpers": "^5.8.0"
},
"dependencies": {
"@metascraper/helpers": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
"version": "5.8.0",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz",
"integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==",
"requires": {
"audio-extensions": "0.0.0",
"chrono-node": "~1.3.11",
@@ -22452,17 +22384,17 @@
}
},
"metascraper-image": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/metascraper-image/-/metascraper-image-5.7.14.tgz",
"integrity": "sha512-TxnUKYU92iWapq2G55E4AF7VjGyiDO2x01Z6AyjbmRxoM4U2IaHkNpE5msyc7TQhxGoYOSvdUtOeUnqdPqr+aA==",
"version": "5.8.0",
"resolved": "https://registry.npmjs.org/metascraper-image/-/metascraper-image-5.8.0.tgz",
"integrity": "sha512-qDwQcjbSlb5NAdFgbCARaGjRUEzWMiYEA/r2AhJzCFsRZxC9gdurk2M0dhN6NCB6FvEv0JtQwQbkWokvuLKkiQ==",
"requires": {
"@metascraper/helpers": "^5.7.14"
"@metascraper/helpers": "^5.8.0"
},
"dependencies": {
"@metascraper/helpers": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
"version": "5.8.0",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz",
"integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==",
"requires": {
"audio-extensions": "0.0.0",
"chrono-node": "~1.3.11",
@@ -22520,18 +22452,18 @@
}
},
"metascraper-title": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/metascraper-title/-/metascraper-title-5.7.14.tgz",
"integrity": "sha512-ZiVo4LEfqiNHlCGjht5OSZ3yRKxcZnbaXeRmUReMkCHcFujok5YZBj5ktDpAANmG9T3x2gn3twM3ZbBSyXLYyg==",
"version": "5.8.0",
"resolved": "https://registry.npmjs.org/metascraper-title/-/metascraper-title-5.8.0.tgz",
"integrity": "sha512-ar6zqFGrHPeri8ymoWoHRJ29msmG7f8P5fLd1/A3NvFHXJA3XMTw4w1uLC9tg8MSABAG9t8vISmzB5NdB6MedQ==",
"requires": {
"@metascraper/helpers": "^5.7.14",
"@metascraper/helpers": "^5.8.0",
"lodash": "~4.17.15"
},
"dependencies": {
"@metascraper/helpers": {
"version": "5.7.14",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz",
"integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==",
"version": "5.8.0",
"resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz",
"integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==",
"requires": {
"audio-extensions": "0.0.0",
"chrono-node": "~1.3.11",
+5 -6
View File
@@ -58,7 +58,7 @@
"license": "Apache-2.0",
"dependencies": {
"@coralproject/bunyan-prettystream": "^0.1.4",
"@metascraper/helpers": "^5.7.14",
"@metascraper/helpers": "^5.7.21",
"akismet-api": "^4.2.0",
"apollo-server-express": "^2.8.1",
"archiver": "^3.0.3",
@@ -106,11 +106,10 @@
"lodash": "^4.17.15",
"lru-cache": "^5.1.1",
"luxon": "^1.12.0",
"metascraper-author": "^5.7.14",
"metascraper-date": "^5.7.14",
"metascraper-description": "^5.7.14",
"metascraper-image": "^5.7.14",
"metascraper-title": "^5.7.14",
"metascraper-author": "^5.8.0",
"metascraper-description": "^5.8.0",
"metascraper-image": "^5.8.0",
"metascraper-title": "^5.8.0",
"mongodb": "^3.2.7",
"mongodb-core": "^3.2.7",
"ms": "^2.1.1",
@@ -1,183 +1 @@
import Logger from "bunyan";
import cheerio from "cheerio";
import authorScraper from "metascraper-author";
import dateScraper from "metascraper-date";
import descriptionScraper from "metascraper-description";
import imageScraper from "metascraper-image";
import titleScraper from "metascraper-title";
import { Db } from "mongodb";
import fetch, { RequestInit } from "node-fetch";
import ProxyAgent from "proxy-agent";
import { version } from "coral-common/version";
import { GQLStoryMetadata } from "coral-server/graph/tenant/schema/__generated__/types";
import logger from "coral-server/logger";
import { retrieveStory, updateStory } from "coral-server/models/story";
import { retrieveTenant } from "coral-server/models/tenant";
import { modifiedScraper } from "./rules/modified";
import { sectionScraper } from "./rules/section";
/**
 * Rule maps a metadata property name to a list of getter functions. Each
 * getter receives the parsed DOM and the page URL and returns the extracted
 * string, or undefined when it found nothing.
 */
export type Rule = Record<
string,
Array<
(options: { htmlDom: CheerioSelector; url: string }) => string | undefined
>
>;
/**
 * Scraper downloads a page and extracts Story metadata from it by evaluating
 * a list of rule sets against the parsed HTML.
 */
class Scraper {
  private rules: Rule[];
  private log: Logger;

  constructor(rules: Rule[]) {
    this.rules = rules;
    this.log = logger.child({ taskName: "scraper" }, true);
  }

  /**
   * toValidDate converts a scraped date string into a Date, returning
   * undefined when the string is missing or cannot be parsed. `new Date()`
   * never throws on bad input, it yields an "Invalid Date" object instead, so
   * the result has to be checked explicitly.
   */
  private static toValidDate(value: string | undefined): Date | undefined {
    if (!value) {
      return undefined;
    }

    const parsed = new Date(value);
    return Number.isNaN(parsed.getTime()) ? undefined : parsed;
  }

  /**
   * scrape fetches the page at the given URL and extracts its metadata.
   *
   * @param url the URL of the page to fetch
   * @param proxyURL when provided, the request is sent through this proxy
   * @returns the extracted metadata, or null when the response status was not
   * 200
   */
  public async scrape(
    url: string,
    proxyURL?: string
  ): Promise<GQLStoryMetadata | null> {
    // Grab the page HTML.
    const log = this.log.child({ storyURL: url }, true);

    const options: RequestInit = {
      headers: {
        "User-Agent": `Talk Scraper/${version}`,
      },
    };
    if (proxyURL) {
      // Force the type here because there's a slight mismatch.
      options.agent = (new ProxyAgent(
        proxyURL
      ) as unknown) as RequestInit["agent"];
      log.debug("using proxy for scrape");
    }

    const start = Date.now();
    log.debug("starting scrape of Story");

    const res = await fetch(url, options);
    if (res.status !== 200) {
      log.warn(
        { statusCode: res.status },
        "scrape failed with non-200 status code"
      );
      return null;
    }

    const html = await res.text();
    log.debug({ timeElapsed: Date.now() - start }, "scrape complete");

    // Load the DOM.
    const htmlDom = cheerio.load(html);
    log.debug("parsed html");

    // Gather the results by evaluating each of the rules.
    const metadata: Record<string, string | undefined> = {};
    for (const rule of this.rules) {
      for (const property in rule) {
        // Go through Object.prototype so the check still works if a rule set
        // shadows `hasOwnProperty` or has no prototype.
        if (!Object.prototype.hasOwnProperty.call(rule, property)) {
          continue;
        }

        // Try each getter for the property in order; the first non-empty
        // value wins.
        for (const getter of rule[property]) {
          const value = getter({ htmlDom, url });
          if (value && value.length > 0) {
            metadata[property] = value;
            break;
          }
        }
      }
    }
    log.debug("extracted metadata");

    return {
      title: metadata.title || undefined,
      description: metadata.description || undefined,
      image: metadata.image || undefined,
      author: metadata.author || undefined,
      // Unparseable date strings are dropped rather than surfaced as an
      // Invalid Date.
      publishedAt: Scraper.toValidDate(metadata.date),
      modifiedAt: Scraper.toValidDate(metadata.modified),
      section: metadata.section || undefined,
    };
  }
}
/**
 * createScraper assembles every rule set used to pull metadata out of a
 * target page and hands them to a new Scraper.
 */
function createScraper() {
  const rules: Rule[] = [
    authorScraper(),
    dateScraper(),
    descriptionScraper(),
    imageScraper(),
    titleScraper(),
    modifiedScraper(),
    sectionScraper(),
  ];

  return new Scraper(rules);
}

// Module-level Scraper instance, created once when this module is loaded.
export const scraper = createScraper();
/**
 * scrape extracts the metadata for a Story's page and saves it on the Story.
 *
 * @param mongo database handle used to load the Tenant/Story and to save the
 * result
 * @param tenantID ID of the Tenant the Story belongs to
 * @param storyID ID of the Story to update
 * @param storyURL URL to scrape; when omitted the URL stored on the Story is
 * used
 * @returns the updated Story
 * @throws Error when the Tenant or Story cannot be found, or when scraping
 * the URL produced no metadata
 */
export async function scrape(
  mongo: Db,
  tenantID: string,
  storyID: string,
  storyURL?: string
) {
  // The Tenant carries the scraping configuration (proxy URL).
  const tenant = await retrieveTenant(mongo, tenantID);
  if (!tenant) {
    throw new Error("tenant not found");
  }

  // Fall back to the URL stored on the Story when none was passed in.
  let targetURL = storyURL;
  if (!targetURL) {
    const existing = await retrieveStory(mongo, tenantID, storyID);
    if (!existing) {
      throw new Error("story at specified id not found");
    }

    targetURL = existing.url;
  }

  // Pull the metadata out of the page.
  const proxyURL = tenant.stories.scraping.proxyURL;
  const metadata = await scraper.scrape(targetURL, proxyURL);
  if (!metadata) {
    throw new Error("story at specified url not found");
  }

  // Persist the scraped details, using one timestamp for both the scrape
  // time and the update time.
  const now = new Date();
  const update = { metadata, scrapedAt: now };
  const updated = await updateStory(mongo, tenantID, storyID, update, now);
  if (!updated) {
    throw new Error("story at specified id not found");
  }

  return updated;
}
export * from "./scraper";
@@ -1,12 +1,12 @@
import { $jsonld } from "@metascraper/helpers";
import { $jsonld, date, toRule } from "@metascraper/helpers";
import { Rules } from "metascraper";
import { wrap } from "./helpers";
const toDate = toRule(date);
/**
 * modifiedScraper returns the rule set for the `modified` property. The
 * getters read the JSON-LD `dateModified` value and the `content` attribute of
 * `meta[property="article:modified"]`.
 *
 * NOTE(review): the list below contains both `wrap(...)` and `toDate(...)`
 * variants of the same two sources; this looks like the before and after
 * lines of a change shown together. Confirm which pair is intended.
 */
export const modifiedScraper = (): Rules => ({
modified: [
// From: http://ogp.me/#type_article
wrap($jsonld("dateModified")),
wrap($ => $('meta[property="article:modified"]').attr("content")),
toDate($jsonld("dateModified")),
toDate($ => $('meta[property="article:modified"]').attr("content")),
],
});
@@ -0,0 +1,26 @@
import { $jsonld, date, toRule } from "@metascraper/helpers";
import { Rules } from "metascraper";
const toDate = toRule(date);
/**
 * publishedScraper returns the rule set for the `published` property. The
 * getters are listed in priority order: JSON-LD values first, then a series
 * of meta/microdata/time element selectors.
 */
export const publishedScraper = (): Rules => {
  // Builds a date rule that reads `attribute` from the match of `selector`.
  const fromAttribute = (selector: string, attribute: string) =>
    toDate($ => $(selector).attr(attribute));
  const fromContent = (selector: string) => fromAttribute(selector, "content");
  const fromDatetime = (selector: string) =>
    fromAttribute(selector, "datetime");

  return {
    published: [
      // JSON-LD.
      toDate($jsonld("datePublished")),
      toDate($jsonld("dateCreated")),

      // Meta tags with published/release dates.
      fromContent('meta[property*="published_time" i]'),
      fromContent('meta[property*="release_date" i]'),
      fromContent('meta[name="date" i]'),

      // Microdata and time elements.
      fromContent('[itemprop="datepublished" i]'),
      fromContent('[itemprop*="date" i]'),
      fromDatetime('time[itemprop*="date" i]'),
      fromDatetime("time[datetime]"),
      fromDatetime("time[datetime][pubdate]"),

      // Dublin Core style names/properties.
      fromContent('meta[name*="dc.date" i]'),
      fromContent('meta[name*="dc.date.issued" i]'),
      fromContent('meta[name*="dc.date.created" i]'),
      fromContent('meta[name*="dcterms.date" i]'),
      fromContent('[property*="dc:date" i]'),
      fromContent('[property*="dc:created" i]'),
    ],
  };
};
@@ -0,0 +1,42 @@
import { scraper } from "./scraper";
// Tests for the exported scraper's parse method, which is handed an HTML
// string directly.
describe("Scraper", () => {
it("parses the JSON-LD data correctly", async () => {
// Fixture: a page consisting only of a JSON-LD Article block.
const html = `
<script type="application/ld+json">
{
"@type": "Article",
"@id": "https://coralproject.net/blog/working-with-user-stories-keeping-commenters-and-moderators-at-the-center-of-what-we-build/",
"author": {
"@type": "Person",
"name": "sam"
},
"headline": "Working with User Stories: Keeping commenters and moderators at the center of what we build",
"description": "We believe that the comments section can be a place where diverse voices come together to share opinions and experiences.",
"datePublished": "2019-09-04T15:43:35+00:00",
"dateModified": "2019-09-06T06:14:29+00:00",
"image": {
"@type": "ImageObject",
"url": "https://coralproject.net/wp-content/uploads/2019/09/blog-hero.png",
"width": 1440,
"height": 1024
},
"articleSection": "Comments,Design,Moderation,Useful"
}
</script>
`;
// Every metadata field is expected to be filled from the JSON-LD values
// above; the two dates are expected back as Date objects.
expect(scraper.parse("", html)).toEqual({
author: "sam",
description:
"We believe that the comments section can be a place where diverse voices come together to share opinions and experiences.",
image:
"https://coralproject.net/wp-content/uploads/2019/09/blog-hero.png",
modifiedAt: new Date("2019-09-06T06:14:29+00:00"),
publishedAt: new Date("2019-09-04T15:43:35+00:00"),
section: "Comments,Design,Moderation,Useful",
title:
"Working with User Stories: Keeping commenters and moderators at the center of what we build",
});
});
});
@@ -0,0 +1,199 @@
import Logger from "bunyan";
import cheerio from "cheerio";
import authorScraper from "metascraper-author";
import descriptionScraper from "metascraper-description";
import imageScraper from "metascraper-image";
import titleScraper from "metascraper-title";
import { Db } from "mongodb";
import fetch, { RequestInit } from "node-fetch";
import ProxyAgent from "proxy-agent";
import { version } from "coral-common/version";
import logger from "coral-server/logger";
import { retrieveStory, updateStory } from "coral-server/models/story";
import { retrieveTenant } from "coral-server/models/tenant";
import { GQLStoryMetadata } from "coral-server/graph/tenant/schema/__generated__/types";
import { modifiedScraper } from "./rules/modified";
import { publishedScraper } from "./rules/published";
import { sectionScraper } from "./rules/section";
/**
 * Rule maps a metadata property name to a list of getter functions. Each
 * getter receives the parsed DOM and the page URL and returns the extracted
 * string, or undefined when it found nothing.
 */
export type Rule = Record<
string,
Array<
(options: { htmlDom: CheerioSelector; url: string }) => string | undefined
>
>;
/**
 * Scraper extracts Story metadata from a page by evaluating a list of rule
 * sets against the page's parsed HTML. Downloading (`download`) and parsing
 * (`parse`) are separate steps; `scrape` runs both.
 */
class Scraper {
  private readonly rules: Rule[];
  private readonly log: Logger;

  constructor(rules: Rule[]) {
    this.rules = rules;
    this.log = logger.child({ taskName: "scraper" }, true);
  }

  /**
   * toValidDate converts a scraped date string into a Date, returning
   * undefined when the string is missing or cannot be parsed. `new Date()`
   * never throws on bad input, it yields an "Invalid Date" object instead, so
   * the result has to be checked explicitly.
   */
  private static toValidDate(value: string | undefined): Date | undefined {
    if (!value) {
      return undefined;
    }

    const parsed = new Date(value);
    return Number.isNaN(parsed.getTime()) ? undefined : parsed;
  }

  /**
   * parse extracts the metadata from an already downloaded page.
   *
   * @param url the URL the HTML came from; passed to every getter and
   * attached to log entries
   * @param html the page's HTML source
   * @returns the extracted metadata; fields with no value are undefined
   */
  public parse(url: string, html: string): GQLStoryMetadata {
    const log = this.log.child({ storyURL: url }, true);

    // Load the DOM.
    const htmlDom = cheerio.load(html);
    log.debug("parsed html");

    // Gather the results by evaluating each of the rules.
    const metadata: Record<string, string | undefined> = {};
    for (const rule of this.rules) {
      for (const property in rule) {
        // Go through Object.prototype so the check still works if a rule set
        // shadows `hasOwnProperty` or has no prototype.
        if (!Object.prototype.hasOwnProperty.call(rule, property)) {
          continue;
        }

        // Try each getter for the property in order; the first non-empty
        // value wins.
        for (const getter of rule[property]) {
          const value = getter({ htmlDom, url });
          if (value && value.length > 0) {
            metadata[property] = value;
            break;
          }
        }
      }
    }
    log.debug("extracted metadata");

    return {
      title: metadata.title || undefined,
      description: metadata.description || undefined,
      image: metadata.image || undefined,
      author: metadata.author || undefined,
      // Unparseable date strings are dropped rather than surfaced as an
      // Invalid Date.
      publishedAt: Scraper.toValidDate(metadata.published),
      modifiedAt: Scraper.toValidDate(metadata.modified),
      section: metadata.section || undefined,
    };
  }

  /**
   * download fetches the HTML of the page at the given URL.
   *
   * @param url the URL of the page to fetch
   * @param proxyURL when provided, the request is sent through this proxy
   * @returns the response body, or null when the response status was not 200
   */
  public async download(
    url: string,
    proxyURL?: string
  ): Promise<string | null> {
    const log = this.log.child({ storyURL: url }, true);

    const options: RequestInit = {
      headers: {
        "User-Agent": `Talk Scraper/${version}`,
      },
    };
    if (proxyURL) {
      // Force the type here because there's a slight mismatch.
      options.agent = (new ProxyAgent(
        proxyURL
      ) as unknown) as RequestInit["agent"];
      log.debug("using proxy for scrape");
    }

    const start = Date.now();
    log.debug("starting scrape of Story");

    const res = await fetch(url, options);
    // `res.ok` only says the status is in the 2xx range, so requiring exactly
    // 200 already covers it.
    if (res.status !== 200) {
      log.warn(
        { statusCode: res.status, statusText: res.statusText },
        "scrape failed with non-200 status code"
      );
      return null;
    }

    const html = await res.text();
    log.debug({ timeElapsed: Date.now() - start }, "scrape complete");

    return html;
  }

  /**
   * scrape downloads the page at the given URL and parses its metadata.
   *
   * @param url the URL of the page to scrape
   * @param proxyURL when provided, the request is sent through this proxy
   * @returns the extracted metadata, or null when the download failed or the
   * page body was empty
   */
  public async scrape(
    url: string,
    proxyURL?: string
  ): Promise<GQLStoryMetadata | null> {
    const html = await this.download(url, proxyURL);
    // Covers both a failed download (null) and an empty body ("").
    if (!html) {
      return null;
    }

    return this.parse(url, html);
  }
}
/**
 * createScraper assembles every rule set used to pull metadata out of a
 * target page and hands them to a new Scraper.
 */
function createScraper() {
  const rules: Rule[] = [
    authorScraper(),
    publishedScraper(),
    descriptionScraper(),
    imageScraper(),
    titleScraper(),
    modifiedScraper(),
    sectionScraper(),
  ];

  return new Scraper(rules);
}

// Module-level Scraper instance, created once when this module is loaded.
export const scraper = createScraper();
/**
 * scrape extracts the metadata for a Story's page and saves it on the Story.
 *
 * @param mongo database handle used to load the Tenant/Story and to save the
 * result
 * @param tenantID ID of the Tenant the Story belongs to
 * @param storyID ID of the Story to update
 * @param storyURL URL to scrape; when omitted the URL stored on the Story is
 * used
 * @returns the updated Story
 * @throws Error when the Tenant or Story cannot be found, or when scraping
 * the URL produced no metadata
 */
export async function scrape(
  mongo: Db,
  tenantID: string,
  storyID: string,
  storyURL?: string
) {
  // The Tenant carries the scraping configuration (proxy URL).
  const tenant = await retrieveTenant(mongo, tenantID);
  if (!tenant) {
    throw new Error("tenant not found");
  }

  // Fall back to the URL stored on the Story when none was passed in.
  let targetURL = storyURL;
  if (!targetURL) {
    const existing = await retrieveStory(mongo, tenantID, storyID);
    if (!existing) {
      throw new Error("story at specified id not found");
    }

    targetURL = existing.url;
  }

  // Pull the metadata out of the page.
  const proxyURL = tenant.stories.scraping.proxyURL;
  const metadata = await scraper.scrape(targetURL, proxyURL);
  if (!metadata) {
    throw new Error("story at specified url not found");
  }

  // Persist the scraped details, using one timestamp for both the scrape
  // time and the update time.
  const now = new Date();
  const update = { metadata, scrapedAt: now };
  const updated = await updateStory(mongo, tenantID, storyID, update, now);
  if (!updated) {
    throw new Error("story at specified id not found");
  }

  return updated;
}
+5 -14
View File
@@ -1,10 +1,4 @@
declare module "metascraper" {
export interface Scraper {
(options: { url: string; html: string }): Promise<
Record<string, string | undefined>
>;
}
export type Ruler = (options: {
htmlDom: CheerioSelector;
url: string;
@@ -16,8 +10,6 @@ declare module "metascraper" {
) => string | undefined;
export type Rules = Record<string, Array<Ruler>>;
export function load(rules: Rules[]): Scraper;
}
declare module "metascraper-author" {
@@ -25,11 +17,6 @@ declare module "metascraper-author" {
export default function def(): Rules;
}
declare module "metascraper-date" {
import { Rules } from "metascraper";
export default function def(): Rules;
}
declare module "metascraper-description" {
import { Rules } from "metascraper";
export default function def(): Rules;
@@ -46,5 +33,9 @@ declare module "metascraper-title" {
}
// Ambient type declarations for the "@metascraper/helpers" package.
// NOTE(review): `$jsonld` is declared twice below (once as `any`, once with a
// signature); this looks like the before and after lines of a change shown
// together. Confirm only the typed declaration is meant to remain.
declare module "@metascraper/helpers" {
export const $jsonld: any;
import { Rule } from "metascraper";
// Takes a key and returns a Rule.
export const $jsonld: (key: string) => Rule;
export const jsonld: (url: string, htmlDom: CheerioSelector) => Rule;
// Takes a function (plus optional options) and returns a wrapper that is
// applied to a Rule; arguments and result are loosely typed as `any`.
export const toRule: (fn: any, opts?: any) => (rule: Rule) => any;
export const date: Rule;
}