From 95544925278508152c10d41a51a87959dadbed68 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Tue, 29 Oct 2024 12:55:26 +0800 Subject: [PATCH] crop transcript in middle instead of decimate it, use \n instead of , --- .../site-adapters/youtube/index.mjs | 2 +- src/utils/crop-text.mjs | 65 ++++++++++--------- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/content-script/site-adapters/youtube/index.mjs b/src/content-script/site-adapters/youtube/index.mjs index 46f1a7f..02f91dd 100644 --- a/src/content-script/site-adapters/youtube/index.mjs +++ b/src/content-script/site-adapters/youtube/index.mjs @@ -48,7 +48,7 @@ export default { let subtitleContent = '' while (subtitleData.indexOf('">') !== -1) { subtitleData = subtitleData.substring(subtitleData.indexOf('">') + 2) - subtitleContent += subtitleData.substring(0, subtitleData.indexOf('<')) + ',' + subtitleContent += subtitleData.substring(0, subtitleData.indexOf('<')) + '\n' } subtitleContent = replaceHtmlEntities(subtitleContent) diff --git a/src/utils/crop-text.mjs b/src/utils/crop-text.mjs index 7546a00..79f6f83 100644 --- a/src/utils/crop-text.mjs +++ b/src/utils/crop-text.mjs @@ -28,6 +28,7 @@ const clamp = (v, min, max) => { return Math.min(Math.max(v, min), max) } +/** this function will crop text by keeping the beginning and end */ export async function cropText( text, maxLength = 4000, @@ -48,47 +49,53 @@ export async function cropText( maxLength -= 100 + clamp(userConfig.maxResponseTokenLength, 1, maxLength - 1000) } - const splits = text.split(/[,,。??!!;;]/).map((s) => s.trim()) + const splits = text.split(/[,,。??!!;;\n]/).map((s) => s.trim()) const splitsLength = splits.map((s) => (tiktoken ? encode(s).length : s.length)) - const length = splitsLength.reduce((sum, length) => sum + length, 0) - - const cropLength = length - startLength - endLength const cropTargetLength = maxLength - startLength - endLength - const cropPercentage = cropTargetLength / cropLength - const cropStep = Math.max(0, 1 / cropPercentage - 1) - - if (cropStep === 0) return text + let firstHalfTokens = 0 + let secondHalfTokens = 0 + const halfTargetTokens = Math.floor(cropTargetLength / 2) + let middleIndex = -1 + let endStartIndex = splits.length + let totalTokens = splitsLength.reduce((sum, length) => sum + length + 1, 0) + let croppedTokens = 0 let croppedText = '' let currentLength = 0 - let currentIndex = 0 - let currentStep = 0 - for (; currentIndex < splits.length; currentIndex++) { - if (currentLength + splitsLength[currentIndex] + 1 <= startLength) { - croppedText += splits[currentIndex] + ',' - currentLength += splitsLength[currentIndex] + 1 - } else if (currentLength + splitsLength[currentIndex] + 1 + endLength <= maxLength) { - if (currentStep < cropStep) { - currentStep++ - } else { - croppedText += splits[currentIndex] + ',' - currentLength += splitsLength[currentIndex] + 1 - currentStep = currentStep - cropStep - } + // First pass: find the middle + for (let i = 0; i < splits.length; i++) { + if (firstHalfTokens < halfTargetTokens) { + firstHalfTokens += splitsLength[i] + 1 } else { + middleIndex = i break } } - let endPart = '' - let endPartLength = 0 - for (let i = splits.length - 1; endPartLength + splitsLength[i] <= endLength; i--) { - endPart = splits[i] + ',' + endPart - endPartLength += splitsLength[i] + 1 + // Second pass: find the start of the end section + for (let i = splits.length - 1; i >= middleIndex; i--) { + secondHalfTokens += splitsLength[i] + 1 + if (secondHalfTokens >= halfTargetTokens) { + endStartIndex = i + break + } } - currentLength += endPartLength - croppedText += endPart + + // Calculate cropped tokens + croppedTokens = totalTokens - firstHalfTokens - secondHalfTokens + + // Construct the cropped text + croppedText = splits.slice(0, middleIndex).join('\n') + if (middleIndex !== endStartIndex) { + croppedText += `\n\n**Important disclaimer**, this text is incomplete! ${croppedTokens} or ${ + (croppedTokens / totalTokens).toFixed(2) * 100 + }% of tokens have been removed from this location in the text due to lack limited model context\n\n` + } + croppedText += splits.slice(endStartIndex).join('\n') + + currentLength = firstHalfTokens + secondHalfTokens + (middleIndex !== endStartIndex ? 9 : 0) // 9 is the length of "\n[cropped]\n" + // ... existing code ... console.log( `input maxLength: ${maxLength}\n` +