mirror of
https://github.com/wassname/pi-telegram.git
synced 2026-06-27 14:45:47 +08:00
Stop fetch retry from killing the polling loop
Two real bugs surfaced after the original retry helper: 1. Healthy long-polls were tripping our 15s per-attempt timeout. The getUpdates request asks Telegram for a 30s server-side long-poll, so our internal timeout aborted every healthy connection and turned it into ABORT_ERR -> retry -> exhausted -> "disconnected", with no auto-reconnect. The fix: long-poll bypasses the retry helper and uses a 60s per-attempt timeout, since the poll loop already retries by re-entering after sleep(). 2. Our own internal AbortController timeout produced a DOMException AbortError indistinguishable from a caller-abort. The poll loop's shouldStopTelegramPolling treated that as "user wants to stop" and exited. Now fetchWithRetry normalizes its own timeout into a tagged Error with code ATTEMPT_TIMEOUT, so only real caller-aborts surface as AbortError upstream. Also: per-attempt timeout default dropped 15s -> 5s, retry budget dropped from [500, 2000] to [500] (so 2 attempts, not 3) for outbound sends, since they serialize and a long retry tail makes the bridge feel hung. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -2069,8 +2069,14 @@ export default function (pi: ExtensionAPI) {
|
||||
);
|
||||
},
|
||||
getUpdates: async (body, pollSignal) => {
|
||||
// Long-poll: server-side timeout is 30s. We bypass our retry helper
|
||||
// (the poll loop already retries by re-entering after a sleep) and
|
||||
// give the per-attempt timeout enough headroom that a healthy poll
|
||||
// can never trip it.
|
||||
return callTelegramApi<TelegramUpdate[]>("getUpdates", body, {
|
||||
signal: pollSignal,
|
||||
retry: false,
|
||||
attemptTimeoutMs: 60_000,
|
||||
});
|
||||
},
|
||||
persistConfig: async () => {
|
||||
|
||||
+50
-19
@@ -29,7 +29,7 @@ export interface TelegramApiClient {
|
||||
call: <TResponse>(
|
||||
method: string,
|
||||
body: Record<string, unknown>,
|
||||
options?: { signal?: AbortSignal },
|
||||
options?: TelegramFetchOptions,
|
||||
) => Promise<TResponse>;
|
||||
callMultipart: <TResponse>(
|
||||
method: string,
|
||||
@@ -56,7 +56,9 @@ function sanitizeFileName(name: string): string {
|
||||
|
||||
// Network-layer codes that warrant a retry. HTTP 4xx/5xx are NOT retried here -
|
||||
// those go through `data.ok` / `response.ok` and are surfaced to callers so
|
||||
// rate-limits and logic errors stay loud.
|
||||
// rate-limits and logic errors stay loud. ATTEMPT_TIMEOUT is our own per-attempt
|
||||
// timeout firing; we treat it as transient but rethrow as a non-AbortError so
|
||||
// it cannot masquerade as a caller-abort upstream (e.g. polling loop).
|
||||
const TRANSIENT_FETCH_CODES = new Set([
|
||||
"UND_ERR_CONNECT_TIMEOUT",
|
||||
"UND_ERR_SOCKET",
|
||||
@@ -66,18 +68,27 @@ const TRANSIENT_FETCH_CODES = new Set([
|
||||
"ETIMEDOUT",
|
||||
"ENOTFOUND",
|
||||
"EAI_AGAIN",
|
||||
"ABORT_ERR", // our own per-attempt timeout, see below
|
||||
"ATTEMPT_TIMEOUT",
|
||||
]);
|
||||
|
||||
const FETCH_ATTEMPT_TIMEOUT_MS = 15_000;
|
||||
const FETCH_RETRY_DELAYS_MS = [500, 2000];
|
||||
const DEFAULT_FETCH_ATTEMPT_TIMEOUT_MS = 5_000;
|
||||
const DEFAULT_FETCH_RETRY_DELAYS_MS = [500];
|
||||
|
||||
export interface TelegramFetchOptions {
|
||||
signal?: AbortSignal;
|
||||
// Override per-attempt timeout. Long-poll callers (getUpdates) should pass
|
||||
// a value larger than the server-side timeout (~30s) plus margin so the
|
||||
// healthy long-poll completes without us aborting it.
|
||||
attemptTimeoutMs?: number;
|
||||
// When false, no in-memory retry is performed. Outer loops (polling) that
|
||||
// already retry by re-entering should disable this to avoid double-retry.
|
||||
retry?: boolean;
|
||||
}
|
||||
|
||||
function transientCode(err: unknown): string | undefined {
|
||||
const e = err as { code?: string; cause?: { code?: string }; name?: string };
|
||||
const e = err as { code?: string; cause?: { code?: string } };
|
||||
const code = e?.code ?? e?.cause?.code;
|
||||
if (code && TRANSIENT_FETCH_CODES.has(code)) return code;
|
||||
// AbortError from our timeout has name="AbortError", no code
|
||||
if (e?.name === "AbortError") return "ABORT_ERR";
|
||||
return undefined;
|
||||
}
|
||||
|
||||
@@ -86,29 +97,45 @@ function transientCode(err: unknown): string | undefined {
|
||||
* AbortController timeout so a stuck connection cannot wedge the bridge forever.
|
||||
*
|
||||
* The caller's own AbortSignal (if any) is honored - if it aborts, we re-throw
|
||||
* immediately and do not retry.
|
||||
* the original AbortError immediately and do not retry. When OUR timeout fires
|
||||
* (caller did not abort), we re-throw as a plain Error tagged with code
|
||||
* "ATTEMPT_TIMEOUT" so upstream code can't confuse it with a caller-abort.
|
||||
*/
|
||||
async function fetchWithRetry(
|
||||
url: string,
|
||||
init: RequestInit,
|
||||
callerSignal?: AbortSignal,
|
||||
options?: { attemptTimeoutMs?: number; retry?: boolean },
|
||||
): Promise<Response> {
|
||||
for (let attempt = 0; attempt <= FETCH_RETRY_DELAYS_MS.length; attempt++) {
|
||||
const attemptTimeoutMs =
|
||||
options?.attemptTimeoutMs ?? DEFAULT_FETCH_ATTEMPT_TIMEOUT_MS;
|
||||
const retryDelays =
|
||||
options?.retry === false ? [] : DEFAULT_FETCH_RETRY_DELAYS_MS;
|
||||
for (let attempt = 0; attempt <= retryDelays.length; attempt++) {
|
||||
const timeoutCtl = new AbortController();
|
||||
const timer = setTimeout(
|
||||
() => timeoutCtl.abort(),
|
||||
FETCH_ATTEMPT_TIMEOUT_MS,
|
||||
);
|
||||
const timer = setTimeout(() => timeoutCtl.abort(), attemptTimeoutMs);
|
||||
const onCallerAbort = () => timeoutCtl.abort();
|
||||
callerSignal?.addEventListener("abort", onCallerAbort, { once: true });
|
||||
let raised: unknown;
|
||||
try {
|
||||
return await fetch(url, { ...init, signal: timeoutCtl.signal });
|
||||
} catch (err) {
|
||||
if (callerSignal?.aborted) throw err;
|
||||
const code = transientCode(err);
|
||||
const isLast = attempt === FETCH_RETRY_DELAYS_MS.length;
|
||||
if (!code || isLast) throw err;
|
||||
const base = FETCH_RETRY_DELAYS_MS[attempt];
|
||||
// Our own timeout fired - normalize to a non-AbortError so polling
|
||||
// loops that key off DOMException("AbortError") don't false-positive.
|
||||
const isOurTimeout =
|
||||
(err as { name?: string })?.name === "AbortError" &&
|
||||
timeoutCtl.signal.aborted;
|
||||
raised = isOurTimeout
|
||||
? Object.assign(
|
||||
new Error(`fetch attempt timed out after ${attemptTimeoutMs}ms`),
|
||||
{ code: "ATTEMPT_TIMEOUT" },
|
||||
)
|
||||
: err;
|
||||
const code = transientCode(raised);
|
||||
const isLast = attempt === retryDelays.length;
|
||||
if (!code || isLast) throw raised;
|
||||
const base = retryDelays[attempt];
|
||||
const jitter = Math.round(base * (0.8 + Math.random() * 0.4));
|
||||
console.warn(
|
||||
`[pi-telegram] transient fetch error ${code} on ${url.replace(/bot[^/]+/, "bot***")}, retry ${attempt + 1} in ${jitter}ms`,
|
||||
@@ -150,7 +177,7 @@ export async function callTelegram<TResponse>(
|
||||
botToken: string | undefined,
|
||||
method: string,
|
||||
body: Record<string, unknown>,
|
||||
options?: { signal?: AbortSignal },
|
||||
options?: TelegramFetchOptions,
|
||||
): Promise<TResponse> {
|
||||
if (!botToken) {
|
||||
throw new Error("Telegram bot token is not configured");
|
||||
@@ -163,6 +190,10 @@ export async function callTelegram<TResponse>(
|
||||
body: JSON.stringify(body),
|
||||
},
|
||||
options?.signal,
|
||||
{
|
||||
attemptTimeoutMs: options?.attemptTimeoutMs,
|
||||
retry: options?.retry,
|
||||
},
|
||||
);
|
||||
const data = (await response.json()) as TelegramApiResponse<TResponse>;
|
||||
if (!data.ok || data.result === undefined) {
|
||||
|
||||
@@ -54,6 +54,9 @@ export function shouldStopTelegramPolling(
|
||||
signalAborted: boolean,
|
||||
error: unknown,
|
||||
): boolean {
|
||||
// AbortError-from-our-own-timeout is normalized in fetchWithRetry so it
|
||||
// can't reach here as a DOMException. Any AbortError that does reach here
|
||||
// is therefore a real caller-abort and should stop the loop.
|
||||
return (
|
||||
signalAborted ||
|
||||
(error instanceof DOMException && error.name === "AbortError")
|
||||
|
||||
Reference in New Issue
Block a user