diff --git a/package-lock.json b/package-lock.json index 33b6949e1..53c22757f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2770,9 +2770,9 @@ "dev": true }, "@metascraper/helpers": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz", - "integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==", + "version": "5.7.21", + "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.21.tgz", + "integrity": "sha512-wcgoVCqV8fU5YrGEKxWyUQo4nbZK3p//0Wav/hLGZ632Qb6VNpfJtUTxqHndGkG5xNeqfNtXNnX64XZaUc7FDg==", "requires": { "audio-extensions": "0.0.0", "chrono-node": "~1.3.11", @@ -22247,86 +22247,18 @@ "dev": true }, "metascraper-author": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/metascraper-author/-/metascraper-author-5.7.14.tgz", - "integrity": "sha512-P8xpHHoCzlbt1lb8qKbkz9XQ4MWC0c9ElKFORQ1GPmSVh0n+aTO1APKofFYcnl9rq6QIyYU4PLTqQZ54KXMqtA==", + "version": "5.8.0", + "resolved": "https://registry.npmjs.org/metascraper-author/-/metascraper-author-5.8.0.tgz", + "integrity": "sha512-k7yZMMOi2+Vh7RoIIpc4Q6FJ2HueQZ/tVqoQueBWIzlyWpJGDkJmI1Wi7P3XfHLJEKxkg8d15bP24Z6WWIbaXw==", "requires": { - "@metascraper/helpers": "^5.7.14", + "@metascraper/helpers": "^5.8.0", "lodash": "~4.17.15" }, "dependencies": { "@metascraper/helpers": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz", - "integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==", - "requires": { - "audio-extensions": "0.0.0", - "chrono-node": "~1.3.11", - "condense-whitespace": "~2.0.0", - "entities": "~2.0.0", - "file-extension": "~4.0.5", - "has-values": "~2.0.1", - "image-extensions": "~1.1.0", - "is-relative-url": "~3.0.0", - "is-uri": "~1.2.0", - "iso-639-3": "~1.2.0", - "isostring": "0.0.1", - "lodash": "~4.17.15", - "memoize-one": "~5.1.1", - "mime-types": "~2.1.24", - "normalize-url": "~4.5.0", - "smartquotes": "~2.3.1", - "title": "~3.4.1", - "truncate": "~2.1.0", - "url-regex": "~5.0.0", - "video-extensions": "~1.1.0" - } - }, - "entities": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/entities/-/entities-2.0.0.tgz", - "integrity": "sha512-D9f7V0JSRwIxlRI2mjMqufDrRDnx8p+eEOz7aUM9SuvF8gsBzra0/6tbjl1m8eQHrZlYj6PxqE00hZ1SAIKPLw==" - }, - "has-values": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/has-values/-/has-values-2.0.1.tgz", - "integrity": "sha512-+QdH3jOmq9P8GfdjFg0eJudqx1FqU62NQJ4P16rOEHeRdl7ckgwn6uqQjzYE0ZoHVV/e5E2esuJ5Gl5+HUW19w==", - "requires": { - "kind-of": "^6.0.2" - } - }, - "mime-db": { - "version": "1.40.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.40.0.tgz", - "integrity": "sha512-jYdeOMPy9vnxEqFRRo6ZvTZ8d9oPb+k18PKoYNYUe2stVEBPPwsln/qWzdbmaIvnhZ9v2P+CuecK+fpUfsV2mA==" - }, - "mime-types": { - "version": "2.1.24", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.24.tgz", - "integrity": "sha512-WaFHS3MCl5fapm3oLxU4eYDw77IQM2ACcxQ9RIxfaC3ooc6PFuBMGZZsYpvoXS5D5QTWPieo1jjLdAm3TBP3cQ==", - "requires": { - "mime-db": "1.40.0" - } - }, - "normalize-url": { - "version": "4.5.0", - "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-4.5.0.tgz", - "integrity": "sha512-2s47yzUxdexf1OhyRi4Em83iQk0aPvwTddtFz4hnSSw9dCEsLEGf6SwIO8ss/19S9iBb5sJaOuTvTGDeZI00BQ==" - } - } - }, - "metascraper-date": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/metascraper-date/-/metascraper-date-5.7.14.tgz", - "integrity": "sha512-eJKMtIFeBrnkAavkNlIT/O2bKmF2gKVgMpPbdg/9yJ+OS0pH3QTdk/I/NeU91fS0dAaC2ztwFKUqw2zjC27vew==", - "requires": { - "@metascraper/helpers": "^5.7.14" - }, - "dependencies": { - "@metascraper/helpers": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz", - "integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==", + "version": "5.8.0", + "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz", + "integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==", "requires": { "audio-extensions": "0.0.0", "chrono-node": "~1.3.11", @@ -22384,17 +22316,17 @@ } }, "metascraper-description": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/metascraper-description/-/metascraper-description-5.7.14.tgz", - "integrity": "sha512-++qN4Rf0Hx13SbhJgRiLSuVOZHsYwhUkMfHa5sVVihSJkrLVjOSdBTpNBajRC7yHwG6m6/qIesuERbT1jdu5bw==", + "version": "5.8.0", + "resolved": "https://registry.npmjs.org/metascraper-description/-/metascraper-description-5.8.0.tgz", + "integrity": "sha512-JHtHiHBIGMr7bZYoqbT6NnMSnIuMTMItxBAQfdW9RDQCK9l/M1yGi/usMcvXiPYUVlzPfuCcwqeN3xMj3JyLEg==", "requires": { - "@metascraper/helpers": "^5.7.14" + "@metascraper/helpers": "^5.8.0" }, "dependencies": { "@metascraper/helpers": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz", - "integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==", + "version": "5.8.0", + "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz", + "integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==", "requires": { "audio-extensions": "0.0.0", "chrono-node": "~1.3.11", @@ -22452,17 +22384,17 @@ } }, "metascraper-image": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/metascraper-image/-/metascraper-image-5.7.14.tgz", - "integrity": "sha512-TxnUKYU92iWapq2G55E4AF7VjGyiDO2x01Z6AyjbmRxoM4U2IaHkNpE5msyc7TQhxGoYOSvdUtOeUnqdPqr+aA==", + "version": "5.8.0", + "resolved": "https://registry.npmjs.org/metascraper-image/-/metascraper-image-5.8.0.tgz", + "integrity": "sha512-qDwQcjbSlb5NAdFgbCARaGjRUEzWMiYEA/r2AhJzCFsRZxC9gdurk2M0dhN6NCB6FvEv0JtQwQbkWokvuLKkiQ==", "requires": { - "@metascraper/helpers": "^5.7.14" + "@metascraper/helpers": "^5.8.0" }, "dependencies": { "@metascraper/helpers": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz", - "integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==", + "version": "5.8.0", + "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz", + "integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==", "requires": { "audio-extensions": "0.0.0", "chrono-node": "~1.3.11", @@ -22520,18 +22452,18 @@ } }, "metascraper-title": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/metascraper-title/-/metascraper-title-5.7.14.tgz", - "integrity": "sha512-ZiVo4LEfqiNHlCGjht5OSZ3yRKxcZnbaXeRmUReMkCHcFujok5YZBj5ktDpAANmG9T3x2gn3twM3ZbBSyXLYyg==", + "version": "5.8.0", + "resolved": "https://registry.npmjs.org/metascraper-title/-/metascraper-title-5.8.0.tgz", + "integrity": "sha512-ar6zqFGrHPeri8ymoWoHRJ29msmG7f8P5fLd1/A3NvFHXJA3XMTw4w1uLC9tg8MSABAG9t8vISmzB5NdB6MedQ==", "requires": { - "@metascraper/helpers": "^5.7.14", + "@metascraper/helpers": "^5.8.0", "lodash": "~4.17.15" }, "dependencies": { "@metascraper/helpers": { - "version": "5.7.14", - "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.7.14.tgz", - "integrity": "sha512-xQa24LVinzyT/5H4LKNP4YDdR/kcz+j5GIIB123DyxvhQZTRuDu8BRmEUB/Yj+jY7U5qhwabIgwCT/eN1Y9Uag==", + "version": "5.8.0", + "resolved": "https://registry.npmjs.org/@metascraper/helpers/-/helpers-5.8.0.tgz", + "integrity": "sha512-12UG36W2X8oirM3M88Z13PVyLiOwKCvshydDNAK4/naO95Xi7dzMOcf8VXw375DYKnllfi8YdWzYJU8ie0BejA==", "requires": { "audio-extensions": "0.0.0", "chrono-node": "~1.3.11", diff --git a/package.json b/package.json index c34130a8b..ae0a51c39 100644 --- a/package.json +++ b/package.json @@ -58,7 +58,7 @@ "license": "Apache-2.0", "dependencies": { "@coralproject/bunyan-prettystream": "^0.1.4", - "@metascraper/helpers": "^5.7.14", + "@metascraper/helpers": "^5.7.21", "akismet-api": "^4.2.0", "apollo-server-express": "^2.8.1", "archiver": "^3.0.3", @@ -106,11 +106,10 @@ "lodash": "^4.17.15", "lru-cache": "^5.1.1", "luxon": "^1.12.0", - "metascraper-author": "^5.7.14", - "metascraper-date": "^5.7.14", - "metascraper-description": "^5.7.14", - "metascraper-image": "^5.7.14", - "metascraper-title": "^5.7.14", + "metascraper-author": "^5.8.0", + "metascraper-description": "^5.8.0", + "metascraper-image": "^5.8.0", + "metascraper-title": "^5.8.0", "mongodb": "^3.2.7", "mongodb-core": "^3.2.7", "ms": "^2.1.1", diff --git a/src/core/server/services/stories/scraper/index.ts b/src/core/server/services/stories/scraper/index.ts index b3d87b1d4..6b8e144f0 100644 --- a/src/core/server/services/stories/scraper/index.ts +++ b/src/core/server/services/stories/scraper/index.ts @@ -1,183 +1 @@ -import Logger from "bunyan"; -import cheerio from "cheerio"; -import authorScraper from "metascraper-author"; -import dateScraper from "metascraper-date"; -import descriptionScraper from "metascraper-description"; -import imageScraper from "metascraper-image"; -import titleScraper from "metascraper-title"; -import { Db } from "mongodb"; -import fetch, { RequestInit } from "node-fetch"; -import ProxyAgent from "proxy-agent"; - -import { version } from "coral-common/version"; -import { GQLStoryMetadata } from "coral-server/graph/tenant/schema/__generated__/types"; -import logger from "coral-server/logger"; -import { retrieveStory, updateStory } from "coral-server/models/story"; -import { retrieveTenant } from "coral-server/models/tenant"; - -import { modifiedScraper } from "./rules/modified"; -import { sectionScraper } from "./rules/section"; - -export type Rule = Record< - string, - Array< - (options: { htmlDom: CheerioSelector; url: string }) => string | undefined - > ->; - -class Scraper { - private rules: Rule[]; - private log: Logger; - - constructor(rules: Rule[]) { - this.rules = rules; - this.log = logger.child({ taskName: "scraper" }, true); - } - - public async scrape( - url: string, - proxyURL?: string - ): Promise { - // Grab the page HTML. - - const log = this.log.child({ storyURL: url }, true); - - const options: RequestInit = { - headers: { - "User-Agent": `Talk Scraper/${version}`, - }, - }; - if (proxyURL) { - // Force the type here because there's a slight mismatch. - options.agent = (new ProxyAgent( - proxyURL - ) as unknown) as RequestInit["agent"]; - log.debug("using proxy for scrape"); - } - - const start = Date.now(); - log.debug("starting scrape of Story"); - - const res = await fetch(url, options); - if (res.status !== 200) { - log.warn( - { statusCode: res.status }, - "scrape failed with non-200 status code" - ); - return null; - } - - const html = await res.text(); - - log.debug({ timeElapsed: Date.now() - start }, "scrape complete"); - - // Load the DOM. - const htmlDom = cheerio.load(html); - - log.debug("parsed html"); - - // Gather the results by evaluating each of the rules. - const metadata: Record = {}; - - for (const rule of this.rules) { - for (const property in rule) { - if (!rule.hasOwnProperty(property)) { - continue; - } - - // Proceed through each of the properties and try to find the mapped - // properties. - for (const getter of rule[property]) { - const value = getter({ htmlDom, url }); - if (value && value.length > 0) { - metadata[property] = value; - - break; - } - } - } - } - - log.debug("extracted metadata"); - - return { - title: metadata.title || undefined, - description: metadata.description || undefined, - image: metadata.image ? metadata.image : undefined, - author: metadata.author || undefined, - publishedAt: metadata.date ? new Date(metadata.date) : undefined, - modifiedAt: metadata.modified ? new Date(metadata.modified) : undefined, - section: metadata.section || undefined, - }; - } -} - -/** - * createScraper will create a scraper that will utilize the rules defined to - * scrape metadata from the target page. - */ -function createScraper() { - return new Scraper([ - authorScraper(), - dateScraper(), - descriptionScraper(), - imageScraper(), - titleScraper(), - modifiedScraper(), - sectionScraper(), - ]); -} - -export const scraper = createScraper(); - -export async function scrape( - mongo: Db, - tenantID: string, - storyID: string, - storyURL?: string -) { - // Grab the Tenant. - const tenant = await retrieveTenant(mongo, tenantID); - if (!tenant) { - throw new Error("tenant not found"); - } - - // If the URL wasn't provided, grab it from the database. - if (!storyURL) { - const retrievedStory = await retrieveStory(mongo, tenantID, storyID); - if (!retrievedStory) { - throw new Error("story at specified id not found"); - } - - // Update the story URL. - storyURL = retrievedStory.url; - } - - // Get the metadata from the scraped html. - const metadata = await scraper.scrape( - storyURL, - tenant.stories.scraping.proxyURL - ); - if (!metadata) { - throw new Error("story at specified url not found"); - } - - const now = new Date(); - - // Update the Story with the scraped details. - const story = await updateStory( - mongo, - tenantID, - storyID, - { - metadata, - scrapedAt: now, - }, - now - ); - if (!story) { - throw new Error("story at specified id not found"); - } - - return story; -} +export * from "./scraper"; diff --git a/src/core/server/services/stories/scraper/rules/modified.ts b/src/core/server/services/stories/scraper/rules/modified.ts index 46729a6d4..663492b4b 100644 --- a/src/core/server/services/stories/scraper/rules/modified.ts +++ b/src/core/server/services/stories/scraper/rules/modified.ts @@ -1,12 +1,12 @@ -import { $jsonld } from "@metascraper/helpers"; +import { $jsonld, date, toRule } from "@metascraper/helpers"; import { Rules } from "metascraper"; -import { wrap } from "./helpers"; +const toDate = toRule(date); export const modifiedScraper = (): Rules => ({ modified: [ // From: http://ogp.me/#type_article - wrap($jsonld("dateModified")), - wrap($ => $('meta[property="article:modified"]').attr("content")), + toDate($jsonld("dateModified")), + toDate($ => $('meta[property="article:modified"]').attr("content")), ], }); diff --git a/src/core/server/services/stories/scraper/rules/published.ts b/src/core/server/services/stories/scraper/rules/published.ts new file mode 100644 index 000000000..31a683626 --- /dev/null +++ b/src/core/server/services/stories/scraper/rules/published.ts @@ -0,0 +1,26 @@ +import { $jsonld, date, toRule } from "@metascraper/helpers"; +import { Rules } from "metascraper"; + +const toDate = toRule(date); + +export const publishedScraper = (): Rules => ({ + published: [ + // From: http://ogp.me/#type_article + toDate($jsonld("datePublished")), + toDate($jsonld("dateCreated")), + toDate($ => $('meta[property*="published_time" i]').attr("content")), + toDate($ => $('meta[property*="release_date" i]').attr("content")), + toDate($ => $('meta[name="date" i]').attr("content")), + toDate($ => $('[itemprop="datepublished" i]').attr("content")), + toDate($ => $('[itemprop*="date" i]').attr("content")), + toDate($ => $('time[itemprop*="date" i]').attr("datetime")), + toDate($ => $("time[datetime]").attr("datetime")), + toDate($ => $("time[datetime][pubdate]").attr("datetime")), + toDate($ => $('meta[name*="dc.date" i]').attr("content")), + toDate($ => $('meta[name*="dc.date.issued" i]').attr("content")), + toDate($ => $('meta[name*="dc.date.created" i]').attr("content")), + toDate($ => $('meta[name*="dcterms.date" i]').attr("content")), + toDate($ => $('[property*="dc:date" i]').attr("content")), + toDate($ => $('[property*="dc:created" i]').attr("content")), + ], +}); diff --git a/src/core/server/services/stories/scraper/scraper.spec.ts b/src/core/server/services/stories/scraper/scraper.spec.ts new file mode 100644 index 000000000..3ee746484 --- /dev/null +++ b/src/core/server/services/stories/scraper/scraper.spec.ts @@ -0,0 +1,42 @@ +import { scraper } from "./scraper"; + +describe("Scraper", () => { + it("parses the JSON-LD data correctly", async () => { + const html = ` + + `; + + expect(scraper.parse("", html)).toEqual({ + author: "sam", + description: + "We believe that the comments section can be a place where diverse voices come together to share opinions and experiences.", + image: + "https://coralproject.net/wp-content/uploads/2019/09/blog-hero.png", + modifiedAt: new Date("2019-09-06T06:14:29+00:00"), + publishedAt: new Date("2019-09-04T15:43:35+00:00"), + section: "Comments,Design,Moderation,Useful", + title: + "Working with User Stories: Keeping commenters and moderators at the center of what we build", + }); + }); +}); diff --git a/src/core/server/services/stories/scraper/scraper.ts b/src/core/server/services/stories/scraper/scraper.ts new file mode 100644 index 000000000..809503054 --- /dev/null +++ b/src/core/server/services/stories/scraper/scraper.ts @@ -0,0 +1,199 @@ +import Logger from "bunyan"; +import cheerio from "cheerio"; +import authorScraper from "metascraper-author"; +import descriptionScraper from "metascraper-description"; +import imageScraper from "metascraper-image"; +import titleScraper from "metascraper-title"; +import { Db } from "mongodb"; +import fetch, { RequestInit } from "node-fetch"; +import ProxyAgent from "proxy-agent"; + +import { version } from "coral-common/version"; +import logger from "coral-server/logger"; +import { retrieveStory, updateStory } from "coral-server/models/story"; +import { retrieveTenant } from "coral-server/models/tenant"; + +import { GQLStoryMetadata } from "coral-server/graph/tenant/schema/__generated__/types"; + +import { modifiedScraper } from "./rules/modified"; +import { publishedScraper } from "./rules/published"; +import { sectionScraper } from "./rules/section"; + +export type Rule = Record< + string, + Array< + (options: { htmlDom: CheerioSelector; url: string }) => string | undefined + > +>; + +class Scraper { + private rules: Rule[]; + private log: Logger; + + constructor(rules: Rule[]) { + this.rules = rules; + this.log = logger.child({ taskName: "scraper" }, true); + } + + public parse(url: string, html: string): GQLStoryMetadata { + const log = this.log.child({ storyURL: url }, true); + + // Load the DOM. + const htmlDom = cheerio.load(html); + + log.debug("parsed html"); + + // Gather the results by evaluating each of the rules. + const metadata: Record = {}; + + for (const rule of this.rules) { + for (const property in rule) { + if (!rule.hasOwnProperty(property)) { + continue; + } + + // Proceed through each of the properties and try to find the mapped + // properties. + for (const getter of rule[property]) { + const value = getter({ htmlDom, url }); + if (value && value.length > 0) { + metadata[property] = value; + + break; + } + } + } + } + + log.debug("extracted metadata"); + + return { + title: metadata.title || undefined, + description: metadata.description || undefined, + image: metadata.image ? metadata.image : undefined, + author: metadata.author || undefined, + publishedAt: metadata.published + ? new Date(metadata.published) + : undefined, + modifiedAt: metadata.modified ? new Date(metadata.modified) : undefined, + section: metadata.section || undefined, + }; + } + + public async download(url: string, proxyURL?: string) { + const log = this.log.child({ storyURL: url }, true); + + const options: RequestInit = { + headers: { + "User-Agent": `Talk Scraper/${version}`, + }, + }; + if (proxyURL) { + // Force the type here because there's a slight mismatch. + options.agent = (new ProxyAgent( + proxyURL + ) as unknown) as RequestInit["agent"]; + log.debug("using proxy for scrape"); + } + + const start = Date.now(); + log.debug("starting scrape of Story"); + + const res = await fetch(url, options); + if (!res.ok || res.status !== 200) { + log.warn( + { statusCode: res.status, statusText: res.statusText }, + "scrape failed with non-200 status code" + ); + return null; + } + + const html = await res.text(); + + log.debug({ timeElapsed: Date.now() - start }, "scrape complete"); + + return html; + } + + public async scrape( + url: string, + proxyURL?: string + ): Promise { + const html = await this.download(url, proxyURL); + if (!html) { + return null; + } + + return this.parse(url, html); + } +} + +/** + * createScraper will create a scraper that will utilize the rules defined to + * scrape metadata from the target page. + */ +function createScraper() { + return new Scraper([ + authorScraper(), + publishedScraper(), + descriptionScraper(), + imageScraper(), + titleScraper(), + modifiedScraper(), + sectionScraper(), + ]); +} + +export const scraper = createScraper(); + +export async function scrape( + mongo: Db, + tenantID: string, + storyID: string, + storyURL?: string +) { + // Grab the Tenant. + const tenant = await retrieveTenant(mongo, tenantID); + if (!tenant) { + throw new Error("tenant not found"); + } + + // If the URL wasn't provided, grab it from the database. + if (!storyURL) { + const retrievedStory = await retrieveStory(mongo, tenantID, storyID); + if (!retrievedStory) { + throw new Error("story at specified id not found"); + } + + // Update the story URL. + storyURL = retrievedStory.url; + } + + // Get the metadata from the scraped html. + const metadata = await scraper.scrape( + storyURL, + tenant.stories.scraping.proxyURL + ); + if (!metadata) { + throw new Error("story at specified url not found"); + } + + const now = new Date(); + + // Update the Story with the scraped details. + const story = await updateStory( + mongo, + tenantID, + storyID, + { + metadata, + scrapedAt: now, + }, + now + ); + if (!story) { + throw new Error("story at specified id not found"); + } + + return story; +} diff --git a/src/types/metascraper.d.ts b/src/types/metascraper.d.ts index 429d73a30..4dff1c429 100644 --- a/src/types/metascraper.d.ts +++ b/src/types/metascraper.d.ts @@ -1,10 +1,4 @@ declare module "metascraper" { - export interface Scraper { - (options: { url: string; html: string }): Promise< - Record - >; - } - export type Ruler = (options: { htmlDom: CheerioSelector; url: string; @@ -16,8 +10,6 @@ declare module "metascraper" { ) => string | undefined; export type Rules = Record>; - - export function load(rules: Rules[]): Scraper; } declare module "metascraper-author" { @@ -25,11 +17,6 @@ declare module "metascraper-author" { export default function def(): Rules; } -declare module "metascraper-date" { - import { Rules } from "metascraper"; - export default function def(): Rules; -} - declare module "metascraper-description" { import { Rules } from "metascraper"; export default function def(): Rules; @@ -46,5 +33,9 @@ declare module "metascraper-title" { } declare module "@metascraper/helpers" { - export const $jsonld: any; + import { Rule } from "metascraper"; + export const $jsonld: (key: string) => Rule; + export const jsonld: (url: string, htmlDom: CheerioSelector) => Rule; + export const toRule: (fn: any, opts?: any) => (rule: Rule) => any; + export const date: Rule; }