mirror of
https://github.com/wassname/chatGPTBox.git
synced 2026-07-02 04:57:11 +08:00
getCoreContentText for any websites using mozilla/readability (#641)
* getCoreContentText for any websites using https://github.com/mozilla/readability * improve use of @mozilla/readability --------- Co-authored-by: josc146 <josStorer@outlook.com>
This commit is contained in:
Generated
+9
@@ -6,6 +6,7 @@
|
|||||||
"": {
|
"": {
|
||||||
"name": "chatgptbox",
|
"name": "chatgptbox",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@mozilla/readability": "^0.5.0",
|
||||||
"@nem035/gpt-3-encoder": "^1.1.7",
|
"@nem035/gpt-3-encoder": "^1.1.7",
|
||||||
"@picocss/pico": "^1.5.9",
|
"@picocss/pico": "^1.5.9",
|
||||||
"@primer/octicons-react": "^18.3.0",
|
"@primer/octicons-react": "^18.3.0",
|
||||||
@@ -2077,6 +2078,14 @@
|
|||||||
"@jridgewell/sourcemap-codec": "^1.4.14"
|
"@jridgewell/sourcemap-codec": "^1.4.14"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@mozilla/readability": {
|
||||||
|
"version": "0.5.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
|
||||||
|
"integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=14.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@nem035/gpt-3-encoder": {
|
"node_modules/@nem035/gpt-3-encoder": {
|
||||||
"version": "1.1.7",
|
"version": "1.1.7",
|
||||||
"resolved": "https://registry.npmjs.org/@nem035/gpt-3-encoder/-/gpt-3-encoder-1.1.7.tgz",
|
"resolved": "https://registry.npmjs.org/@nem035/gpt-3-encoder/-/gpt-3-encoder-1.1.7.tgz",
|
||||||
|
|||||||
@@ -19,6 +19,7 @@
|
|||||||
"lint"
|
"lint"
|
||||||
],
|
],
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@mozilla/readability": "^0.5.0",
|
||||||
"@nem035/gpt-3-encoder": "^1.1.7",
|
"@nem035/gpt-3-encoder": "^1.1.7",
|
||||||
"@picocss/pico": "^1.5.9",
|
"@picocss/pico": "^1.5.9",
|
||||||
"@primer/octicons-react": "^18.3.0",
|
"@primer/octicons-react": "^18.3.0",
|
||||||
|
|||||||
@@ -1,9 +1,5 @@
|
|||||||
import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs'
|
import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs'
|
||||||
|
import { Readability, isProbablyReaderable } from '@mozilla/readability'
|
||||||
function getArea(e) {
|
|
||||||
const rect = e.getBoundingClientRect()
|
|
||||||
return rect.width * rect.height
|
|
||||||
}
|
|
||||||
|
|
||||||
const adapters = {
|
const adapters = {
|
||||||
'scholar.google': ['#gs_res_ccl_mid'],
|
'scholar.google': ['#gs_res_ccl_mid'],
|
||||||
@@ -17,6 +13,11 @@ const adapters = {
|
|||||||
'new.qq.com': ['.content-article'],
|
'new.qq.com': ['.content-article'],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function getArea(e) {
|
||||||
|
const rect = e.getBoundingClientRect()
|
||||||
|
return rect.width * rect.height
|
||||||
|
}
|
||||||
|
|
||||||
function findLargestElement(e) {
|
function findLargestElement(e) {
|
||||||
if (!e) {
|
if (!e) {
|
||||||
return null
|
return null
|
||||||
@@ -42,22 +43,39 @@ function findLargestElement(e) {
|
|||||||
return largestElement
|
return largestElement
|
||||||
}
|
}
|
||||||
|
|
||||||
export function getCoreContentText() {
|
function getTextFrom(e) {
|
||||||
function getTextFrom(e) {
|
return e.innerText || e.textContent
|
||||||
return e.innerText || e.textContent
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
function postProcessText(text) {
|
||||||
|
return text
|
||||||
|
.trim()
|
||||||
|
.replaceAll(' ', '')
|
||||||
|
.replaceAll('\t', '')
|
||||||
|
.replaceAll('\n\n', '')
|
||||||
|
.replaceAll(',,', '')
|
||||||
|
}
|
||||||
|
|
||||||
|
export function getCoreContentText() {
|
||||||
for (const [siteName, selectors] of Object.entries(adapters)) {
|
for (const [siteName, selectors] of Object.entries(adapters)) {
|
||||||
if (location.hostname.includes(siteName)) {
|
if (location.hostname.includes(siteName)) {
|
||||||
const element = getPossibleElementByQuerySelector(selectors)
|
const element = getPossibleElementByQuerySelector(selectors)
|
||||||
if (element) return getTextFrom(element)
|
if (element) return postProcessText(getTextFrom(element))
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const element = document.querySelector('article')
|
const element = document.querySelector('article')
|
||||||
if (element) {
|
if (element) {
|
||||||
return getTextFrom(element)
|
return postProcessText(getTextFrom(element))
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isProbablyReaderable(document)) {
|
||||||
|
let article = new Readability(document.cloneNode(true), {
|
||||||
|
keepClasses: true,
|
||||||
|
}).parse()
|
||||||
|
console.log('readerable')
|
||||||
|
return postProcessText(article.textContent)
|
||||||
}
|
}
|
||||||
|
|
||||||
const largestElement = findLargestElement(document.body)
|
const largestElement = findLargestElement(document.body)
|
||||||
@@ -79,5 +97,5 @@ export function getCoreContentText() {
|
|||||||
ret = getTextFrom(largestElement)
|
ret = getTextFrom(largestElement)
|
||||||
console.log('use first')
|
console.log('use first')
|
||||||
}
|
}
|
||||||
return ret.trim().replaceAll(' ', '').replaceAll('\n\n', '').replaceAll(',,', '')
|
return postProcessText(ret)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user