getCoreContentText: extract text from any website using mozilla/readability (#641)

* getCoreContentText: extract text from any website using https://github.com/mozilla/readability

* improve use of @mozilla/readability

---------

Co-authored-by: josc146 <josStorer@outlook.com>
This commit is contained in:
xxcdd
2024-03-22 16:06:57 +08:00
committed by GitHub
parent a6fa0ed704
commit eb88fc2053
3 changed files with 40 additions and 12 deletions
+9
View File
@@ -6,6 +6,7 @@
"": {
"name": "chatgptbox",
"dependencies": {
"@mozilla/readability": "^0.5.0",
"@nem035/gpt-3-encoder": "^1.1.7",
"@picocss/pico": "^1.5.9",
"@primer/octicons-react": "^18.3.0",
@@ -2077,6 +2078,14 @@
"@jridgewell/sourcemap-codec": "^1.4.14"
}
},
"node_modules/@mozilla/readability": {
"version": "0.5.0",
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/@nem035/gpt-3-encoder": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@nem035/gpt-3-encoder/-/gpt-3-encoder-1.1.7.tgz",
+1
View File
@@ -19,6 +19,7 @@
"lint"
],
"dependencies": {
"@mozilla/readability": "^0.5.0",
"@nem035/gpt-3-encoder": "^1.1.7",
"@picocss/pico": "^1.5.9",
"@primer/octicons-react": "^18.3.0",
+30 -12
View File
@@ -1,9 +1,5 @@
import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs'
/**
 * Computes the area of an element's bounding rectangle.
 *
 * @param {{ getBoundingClientRect: Function }} e - object exposing getBoundingClientRect()
 * @returns {number} width multiplied by height of the rectangle reported by getBoundingClientRect()
 */
function getArea(e) {
  const { width, height } = e.getBoundingClientRect()
  return width * height
}
import { Readability, isProbablyReaderable } from '@mozilla/readability'
const adapters = {
'scholar.google': ['#gs_res_ccl_mid'],
@@ -17,6 +13,11 @@ const adapters = {
'new.qq.com': ['.content-article'],
}
/**
 * Computes the area of an element's bounding rectangle.
 *
 * @param {{ getBoundingClientRect: Function }} e - object exposing getBoundingClientRect()
 * @returns {number} width multiplied by height of the rectangle reported by getBoundingClientRect()
 */
function getArea(e) {
  const { width, height } = e.getBoundingClientRect()
  return width * height
}
function findLargestElement(e) {
if (!e) {
return null
@@ -42,22 +43,39 @@ function findLargestElement(e) {
return largestElement
}
export function getCoreContentText() {
/**
 * Reads the text of an element-like object.
 *
 * @param {{ innerText?: string, textContent?: string }} e - object to read text from
 * @returns {string} e.innerText when it is truthy, otherwise e.textContent
 */
function getTextFrom(e) {
  const renderedText = e.innerText
  if (renderedText) {
    return renderedText
  }
  // innerText was empty or missing, so fall back to textContent.
  return e.textContent
}
/**
 * Reads the text of an element-like object.
 *
 * @param {{ innerText?: string, textContent?: string }} e - object to read text from
 * @returns {string} e.innerText when it is truthy, otherwise e.textContent
 */
function getTextFrom(e) {
  const renderedText = e.innerText
  if (renderedText) {
    return renderedText
  }
  // innerText was empty or missing, so fall back to textContent.
  return e.textContent
}
/**
 * Compacts extracted page text while keeping word and line boundaries intact.
 *
 * Steps, in order:
 *  1. trim leading/trailing whitespace;
 *  2. collapse each run of spaces/tabs into a single space;
 *  3. collapse each run of two or more newlines into a single newline;
 *  4. remove every ",," pair.
 *
 * @param {string|null|undefined} text - raw text taken from the page
 * @returns {string} the compacted text; '' when text is null or undefined
 */
function postProcessText(text) {
  // A null/undefined input yields '' instead of throwing on .trim().
  return (text ?? '')
    .trim()
    // Deleting spaces/tabs outright would fuse neighbouring words
    // ("hello world" -> "helloworld"), so runs are squeezed to one space.
    .replace(/[ \t]+/g, ' ')
    // Likewise keep one newline so paragraphs are not glued together.
    .replace(/\n{2,}/g, '\n')
    .replaceAll(',,', '')
}
export function getCoreContentText() {
for (const [siteName, selectors] of Object.entries(adapters)) {
if (location.hostname.includes(siteName)) {
const element = getPossibleElementByQuerySelector(selectors)
if (element) return getTextFrom(element)
if (element) return postProcessText(getTextFrom(element))
break
}
}
const element = document.querySelector('article')
if (element) {
return getTextFrom(element)
return postProcessText(getTextFrom(element))
}
if (isProbablyReaderable(document)) {
let article = new Readability(document.cloneNode(true), {
keepClasses: true,
}).parse()
console.log('readerable')
return postProcessText(article.textContent)
}
const largestElement = findLargestElement(document.body)
@@ -79,5 +97,5 @@ export function getCoreContentText() {
ret = getTextFrom(largestElement)
console.log('use first')
}
return ret.trim().replaceAll(' ', '').replaceAll('\n\n', '').replaceAll(',,', '')
return postProcessText(ret)
}