Stop fetch retry from killing the polling loop

Two real bugs surfaced after the original retry helper:

1. Healthy long-polls were tripping our 15s per-attempt timeout. The
   getUpdates request asks Telegram for a 30s server-side long-poll, so
   our internal timeout aborted every healthy connection and turned it
   into ABORT_ERR -> retry -> exhausted -> "disconnected", with no
   auto-reconnect. The fix: long-poll bypasses the retry helper and uses
   a 60s per-attempt timeout, since the poll loop already retries by
   re-entering after sleep().

2. Our own internal AbortController timeout produced a DOMException
   AbortError indistinguishable from a caller-abort. The poll loop's
   shouldStopTelegramPolling treated that as "user wants to stop" and
   exited. Now fetchWithRetry normalizes its own timeout into a tagged
   Error with code ATTEMPT_TIMEOUT, so only real caller-aborts surface
   as AbortError upstream.

Also: per-attempt timeout default dropped 15s -> 5s, retry budget
dropped from [500, 2000] to [500] (so 2 attempts, not 3) for outbound
sends, since they serialize and a long retry tail makes the bridge feel
hung.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
wassname
2026-05-01 18:51:35 +08:00
parent 66898e02b4
commit b2444fd3cd
3 changed files with 59 additions and 19 deletions
+50 -19
View File
@@ -29,7 +29,7 @@ export interface TelegramApiClient {
call: <TResponse>(
method: string,
body: Record<string, unknown>,
options?: { signal?: AbortSignal },
options?: TelegramFetchOptions,
) => Promise<TResponse>;
callMultipart: <TResponse>(
method: string,
@@ -56,7 +56,9 @@ function sanitizeFileName(name: string): string {
// Network-layer codes that warrant a retry. HTTP 4xx/5xx are NOT retried here -
// those go through `data.ok` / `response.ok` and are surfaced to callers so
// rate-limits and logic errors stay loud.
// rate-limits and logic errors stay loud. ATTEMPT_TIMEOUT is our own per-attempt
// timeout firing; we treat it as transient but rethrow as a non-AbortError so
// it cannot masquerade as a caller-abort upstream (e.g. polling loop).
const TRANSIENT_FETCH_CODES = new Set([
"UND_ERR_CONNECT_TIMEOUT",
"UND_ERR_SOCKET",
@@ -66,18 +68,27 @@ const TRANSIENT_FETCH_CODES = new Set([
"ETIMEDOUT",
"ENOTFOUND",
"EAI_AGAIN",
"ABORT_ERR", // our own per-attempt timeout, see below
"ATTEMPT_TIMEOUT",
]);
const FETCH_ATTEMPT_TIMEOUT_MS = 15_000;
const FETCH_RETRY_DELAYS_MS = [500, 2000];
const DEFAULT_FETCH_ATTEMPT_TIMEOUT_MS = 5_000;
const DEFAULT_FETCH_RETRY_DELAYS_MS = [500];
export interface TelegramFetchOptions {
signal?: AbortSignal;
// Override per-attempt timeout. Long-poll callers (getUpdates) should pass
// a value larger than the server-side timeout (~30s) plus margin so the
// healthy long-poll completes without us aborting it.
attemptTimeoutMs?: number;
// When false, no in-memory retry is performed. Outer loops (polling) that
// already retry by re-entering should disable this to avoid double-retry.
retry?: boolean;
}
function transientCode(err: unknown): string | undefined {
const e = err as { code?: string; cause?: { code?: string }; name?: string };
const e = err as { code?: string; cause?: { code?: string } };
const code = e?.code ?? e?.cause?.code;
if (code && TRANSIENT_FETCH_CODES.has(code)) return code;
// AbortError from our timeout has name="AbortError", no code
if (e?.name === "AbortError") return "ABORT_ERR";
return undefined;
}
@@ -86,29 +97,45 @@ function transientCode(err: unknown): string | undefined {
* AbortController timeout so a stuck connection cannot wedge the bridge forever.
*
* The caller's own AbortSignal (if any) is honored - if it aborts, we re-throw
* immediately and do not retry.
* the original AbortError immediately and do not retry. When OUR timeout fires
* (caller did not abort), we re-throw as a plain Error tagged with code
* "ATTEMPT_TIMEOUT" so upstream code can't confuse it with a caller-abort.
*/
async function fetchWithRetry(
url: string,
init: RequestInit,
callerSignal?: AbortSignal,
options?: { attemptTimeoutMs?: number; retry?: boolean },
): Promise<Response> {
for (let attempt = 0; attempt <= FETCH_RETRY_DELAYS_MS.length; attempt++) {
const attemptTimeoutMs =
options?.attemptTimeoutMs ?? DEFAULT_FETCH_ATTEMPT_TIMEOUT_MS;
const retryDelays =
options?.retry === false ? [] : DEFAULT_FETCH_RETRY_DELAYS_MS;
for (let attempt = 0; attempt <= retryDelays.length; attempt++) {
const timeoutCtl = new AbortController();
const timer = setTimeout(
() => timeoutCtl.abort(),
FETCH_ATTEMPT_TIMEOUT_MS,
);
const timer = setTimeout(() => timeoutCtl.abort(), attemptTimeoutMs);
const onCallerAbort = () => timeoutCtl.abort();
callerSignal?.addEventListener("abort", onCallerAbort, { once: true });
let raised: unknown;
try {
return await fetch(url, { ...init, signal: timeoutCtl.signal });
} catch (err) {
if (callerSignal?.aborted) throw err;
const code = transientCode(err);
const isLast = attempt === FETCH_RETRY_DELAYS_MS.length;
if (!code || isLast) throw err;
const base = FETCH_RETRY_DELAYS_MS[attempt];
// Our own timeout fired - normalize to a non-AbortError so polling
// loops that key off DOMException("AbortError") don't false-positive.
const isOurTimeout =
(err as { name?: string })?.name === "AbortError" &&
timeoutCtl.signal.aborted;
raised = isOurTimeout
? Object.assign(
new Error(`fetch attempt timed out after ${attemptTimeoutMs}ms`),
{ code: "ATTEMPT_TIMEOUT" },
)
: err;
const code = transientCode(raised);
const isLast = attempt === retryDelays.length;
if (!code || isLast) throw raised;
const base = retryDelays[attempt];
const jitter = Math.round(base * (0.8 + Math.random() * 0.4));
console.warn(
`[pi-telegram] transient fetch error ${code} on ${url.replace(/bot[^/]+/, "bot***")}, retry ${attempt + 1} in ${jitter}ms`,
@@ -150,7 +177,7 @@ export async function callTelegram<TResponse>(
botToken: string | undefined,
method: string,
body: Record<string, unknown>,
options?: { signal?: AbortSignal },
options?: TelegramFetchOptions,
): Promise<TResponse> {
if (!botToken) {
throw new Error("Telegram bot token is not configured");
@@ -163,6 +190,10 @@ export async function callTelegram<TResponse>(
body: JSON.stringify(body),
},
options?.signal,
{
attemptTimeoutMs: options?.attemptTimeoutMs,
retry: options?.retry,
},
);
const data = (await response.json()) as TelegramApiResponse<TResponse>;
if (!data.ok || data.result === undefined) {
+3
View File
@@ -54,6 +54,9 @@ export function shouldStopTelegramPolling(
signalAborted: boolean,
error: unknown,
): boolean {
// AbortError-from-our-own-timeout is normalized in fetchWithRetry so it
// can't reach here as a DOMException. Any AbortError that does reach here
// is therefore a real caller-abort and should stop the loop.
return (
signalAborted ||
(error instanceof DOMException && error.name === "AbortError")