mirror of
https://github.com/wassname/chatGPTBox.git
synced 2026-07-01 05:05:51 +08:00
getCoreContentText for any websites using mozilla/readability (#641)
* getCoreContentText for any websites using https://github.com/mozilla/readability * improve use of @mozilla/readability --------- Co-authored-by: josc146 <josStorer@outlook.com>
This commit is contained in:
Generated
+9
@@ -6,6 +6,7 @@
|
||||
"": {
|
||||
"name": "chatgptbox",
|
||||
"dependencies": {
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"@nem035/gpt-3-encoder": "^1.1.7",
|
||||
"@picocss/pico": "^1.5.9",
|
||||
"@primer/octicons-react": "^18.3.0",
|
||||
@@ -2077,6 +2078,14 @@
|
||||
"@jridgewell/sourcemap-codec": "^1.4.14"
|
||||
}
|
||||
},
|
||||
"node_modules/@mozilla/readability": {
|
||||
"version": "0.5.0",
|
||||
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
|
||||
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@nem035/gpt-3-encoder": {
|
||||
"version": "1.1.7",
|
||||
"resolved": "https://registry.npmjs.org/@nem035/gpt-3-encoder/-/gpt-3-encoder-1.1.7.tgz",
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
"lint"
|
||||
],
|
||||
"dependencies": {
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"@nem035/gpt-3-encoder": "^1.1.7",
|
||||
"@picocss/pico": "^1.5.9",
|
||||
"@primer/octicons-react": "^18.3.0",
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs'
|
||||
|
||||
function getArea(e) {
|
||||
const rect = e.getBoundingClientRect()
|
||||
return rect.width * rect.height
|
||||
}
|
||||
import { Readability, isProbablyReaderable } from '@mozilla/readability'
|
||||
|
||||
const adapters = {
|
||||
'scholar.google': ['#gs_res_ccl_mid'],
|
||||
@@ -17,6 +13,11 @@ const adapters = {
|
||||
'new.qq.com': ['.content-article'],
|
||||
}
|
||||
|
||||
/**
 * Compute the area of an element's bounding rectangle.
 *
 * @param {{ getBoundingClientRect: () => { width: number, height: number } }} e
 *   Object exposing `getBoundingClientRect()`.
 * @returns {number} The rectangle's width multiplied by its height.
 */
function getArea(e) {
  const { width, height } = e.getBoundingClientRect()
  return width * height
}
|
||||
|
||||
function findLargestElement(e) {
|
||||
if (!e) {
|
||||
return null
|
||||
@@ -42,22 +43,39 @@ function findLargestElement(e) {
|
||||
return largestElement
|
||||
}
|
||||
|
||||
export function getCoreContentText() {
|
||||
function getTextFrom(e) {
|
||||
return e.innerText || e.textContent
|
||||
}
|
||||
/**
 * Read the text carried by a node-like object.
 *
 * Returns `innerText` when it is truthy; otherwise falls back to
 * `textContent` (so an empty or missing `innerText` yields `textContent`).
 *
 * @param {{ innerText?: string, textContent?: string | null }} e - Node to read from.
 * @returns {string | null | undefined} The selected text value.
 */
function getTextFrom(e) {
  const preferred = e.innerText
  if (preferred) {
    return preferred
  }
  return e.textContent
}
|
||||
|
||||
/**
 * Normalize extracted page text before it is handed on to the caller.
 *
 * Whitespace is collapsed rather than deleted, so neighbouring words and
 * paragraphs are never fused together:
 *  - any run of line breaks (with surrounding spaces/tabs) becomes one `\n`
 *  - any remaining run of spaces/tabs becomes one space
 *  - doubled commas (`,,`) are dropped
 *  - leading/trailing whitespace is trimmed
 *
 * @param {string | null | undefined} text - Raw text; nullish is treated as ''.
 * @returns {string} The compacted text.
 */
function postProcessText(text) {
  return (
    String(text ?? '')
      // Blank lines and whitespace hugging a line break collapse to a single break.
      .replace(/[ \t\r]*\n[ \t\r\n]*/g, '\n')
      // Horizontal whitespace inside a line collapses to a single space.
      .replace(/[ \t]+/g, ' ')
      .replaceAll(',,', '')
      // Trim last: removing ',,' can expose whitespace at either end.
      .trim()
  )
}
|
||||
|
||||
export function getCoreContentText() {
|
||||
for (const [siteName, selectors] of Object.entries(adapters)) {
|
||||
if (location.hostname.includes(siteName)) {
|
||||
const element = getPossibleElementByQuerySelector(selectors)
|
||||
if (element) return getTextFrom(element)
|
||||
if (element) return postProcessText(getTextFrom(element))
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
const element = document.querySelector('article')
|
||||
if (element) {
|
||||
return getTextFrom(element)
|
||||
return postProcessText(getTextFrom(element))
|
||||
}
|
||||
|
||||
if (isProbablyReaderable(document)) {
|
||||
let article = new Readability(document.cloneNode(true), {
|
||||
keepClasses: true,
|
||||
}).parse()
|
||||
console.log('readerable')
|
||||
return postProcessText(article.textContent)
|
||||
}
|
||||
|
||||
const largestElement = findLargestElement(document.body)
|
||||
@@ -79,5 +97,5 @@ export function getCoreContentText() {
|
||||
ret = getTextFrom(largestElement)
|
||||
console.log('use first')
|
||||
}
|
||||
return ret.trim().replaceAll(' ', '').replaceAll('\n\n', '').replaceAll(',,', '')
|
||||
return postProcessText(ret)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user