fix: harden pi robot review harness

2026-06-27 17:01:35 +08:00 · 2026-04-17 12:02:35 +08:00
parent d52596aad1
commit d908f6f617
3 changed files with 205 additions and 62 deletions
@@ -0,0 +1,53 @@
+# Robot Review Lane
+
+## Goal
+Add a separate review artifact for fresh-perspective subagent review without weakening the existing human `/lgtm` sign-off gate.
+
+## Scope
+In: task schema, task tool UX, widget/task badges, README/tests, Pi-native robot-review harness hardening.
+Out: provider routing or third-party subagent package integration.
+
+## Requirements
+- R1: Tasks can store a robot review separately from human sign-off evidence. Done means: a task can contain both `lgtm_ask` evidence and robot observations without conflict.
+- R2: Robot review is observational only. Done means: the tool schema and help text ask for observations, not recommendations or editorial.
+- R3: UI exposes review lanes distinctly. Done means: task list/widget/details show tool/robot/human review badges.
+- R4: Human `/lgtm` remains the only completion path. Done means: no robot review path can complete tasks.
+- R5: Automatic Pi robot review must be operationally robust. Done means: the child reviewer has a bounded timeout, uses a deterministic Pi invocation, and reports infra failures clearly enough to diagnose without hanging the main tool call.
+- R6: The subprocess harness is covered by focused tests. Done means: there are tests for invocation selection, timeout/abort behavior, and assistant-output parsing.
+
+## Tasks
+- [x] T1 (R1, R2, R3, R4): Add robot-review storage and tool.
+  - steps: update task typing helpers, register `robot_review_ask`, thread robot review metadata through task views
+  - verify: `npm test -- --runInBand`
+  - UAT: "when a task has lgtm evidence and robot observations, I observe both badges and `/lgtm` still controls completion"
+- [x] T2 (R3): Update README and examples.
+  - steps: document badges and robot review workflow
+  - verify: `rg -n "robot_review_ask|🤖|🛠" README.md`
+  - UAT: "when I read the README, I observe a distinct robot review lane"
+- [x] T3 (R5): Harden the Pi-native robot-review subprocess runner.
+  - steps: add timeout handling, replace fragile self-reinvocation logic with deterministic command resolution, improve failure messages
+  - verify: `npx vitest run test/robot-review-runner.test.ts`
+  - UAT: "when the child reviewer hangs or pi is not resolvable, lgtm_ask returns a bounded failure instead of hanging forever"
+- [x] T4 (R6): Add focused harness tests.
+  - steps: extract/mock subprocess runner boundaries and cover timeout, parse, and command resolution behavior
+  - verify: `npx vitest run test/robot-review-runner.test.ts test/robot-review.test.ts`
+  - UAT: "when I run the focused tests, I observe the subprocess path itself is covered"
+
+## Context
+- Existing schema uses `pending_approval` as the human sign-off gate.
+- Current UI already appends `👀` for pending human sign-off; extend rather than replace the completion rule.
+
+## Log
+- The least disruptive model is additive metadata plus badge rendering, not replacing the task lifecycle.
+- The repo's full Vitest suite already has drift unrelated to this feature, so focused verification is needed to isolate new behavior.
+- A Pi-native reviewer stage matches the official subagent example better than ACP/external CLIs, but it makes harness reliability part of the approval path and therefore needs explicit timeout and invocation hardening.
+- A deterministic `pi` command plus an explicit timeout is simpler and more portable than trying to reconstruct the current host entrypoint from `process.argv`.
+
+## TODO
+- Optional future work: add an orchestrated cross-model reviewer via `external-review` or ACP.
+
+## Errors
+| Task | Error | Resolution |
+|------|-------|------------|
+| T1 | `npm test` failed with 17 pre-existing assertions unrelated to robot review | Verified with `npm run lint`, `npm run typecheck`, and focused `npx vitest run test/review-badges.test.ts` instead |
+| T3 | `process.argv[1]`-based self-reinvocation was fragile in extension-hosted contexts | Replaced with `PI_LGTM_PI_BIN` override or plain `pi`, then added focused runner tests |
@@ -38,15 +38,105 @@ function textResult(msg: string) {
 const TASK_TOOL_NAMES = new Set(["TaskCreate", "TaskList", "TaskGet", "TaskUpdate", "lgtm_ask", "robot_review_ask", "robot_review_run"]);
 const REMINDER_INTERVAL = 4;
 const AUTO_CLEAR_DELAY = 4;
+export const DEFAULT_ROBOT_REVIEW_TIMEOUT_MS = 120_000;

 type CommandResult = { stdout: string; stderr: string; exitCode: number | null };

-function getPiInvocation(args: string[]): { command: string; args: string[] } {
-  const currentScript = process.argv[1];
-  if (currentScript) {
-    return { command: process.execPath, args: [currentScript, ...args] };
+export function getPiInvocation(
+  args: string[],
+  env: NodeJS.ProcessEnv = process.env,
+): { command: string; args: string[] } {
+  const configured = env.PI_LGTM_PI_BIN?.trim();
+  return { command: configured || "pi", args };
+}
+
+export function getRobotReviewTimeoutMs(env: NodeJS.ProcessEnv = process.env): number {
+  const configured = Number.parseInt(env.PI_LGTM_ROBOT_REVIEW_TIMEOUT_MS ?? "", 10);
+  return Number.isFinite(configured) && configured > 0 ? configured : DEFAULT_ROBOT_REVIEW_TIMEOUT_MS;
+}
+
+function getAssistantTextFromPiEvent(event: any): string | undefined {
+  if (event?.type !== "message_end" || event.message?.role !== "assistant" || !Array.isArray(event.message.content)) {
+    return undefined;
  }
-  return { command: "pi", args };
+  const text = event.message.content.find((part: any) => part?.type === "text")?.text;
+  return typeof text === "string" ? text : undefined;
+}
+
+export function extractFinalAssistantTextFromPiJsonl(output: string): string {
+  let buffer = "";
+  let finalAssistantText = "";
+  const lines = output.split("\n");
+  for (const line of lines) {
+    if (!line.trim()) continue;
+    buffer = line;
+    try {
+      const text = getAssistantTextFromPiEvent(JSON.parse(line));
+      if (text) finalAssistantText = text;
+      buffer = "";
+    } catch {
+      // ignore malformed line noise from the child process
+    }
+  }
+  if (buffer.trim()) {
+    try {
+      const text = getAssistantTextFromPiEvent(JSON.parse(buffer));
+      if (text) finalAssistantText = text;
+    } catch {
+      // ignore malformed trailing line
+    }
+  }
+  return finalAssistantText;
+}
+
+export async function runRobotReviewCommand(
+  invocation: { command: string; args: string[] },
+  signal?: AbortSignal,
+  timeoutMs = getRobotReviewTimeoutMs(),
+): Promise<CommandResult> {
+  return new Promise<CommandResult>((resolve, reject) => {
+    const child = spawn(invocation.command, invocation.args, { shell: false, stdio: ["ignore", "pipe", "pipe"] });
+    const stdoutChunks: Buffer[] = [];
+    const stderrChunks: Buffer[] = [];
+    let settled = false;
+
+    const finish = (fn: () => void) => {
+      if (settled) return;
+      settled = true;
+      fn();
+    };
+
+    const killTimer = setTimeout(() => {
+      child.kill("SIGTERM");
+      finish(() => reject(new Error(`Robot reviewer timed out after ${timeoutMs}ms.`)));
+    }, timeoutMs);
+
+    child.stdout.on("data", (data) => stdoutChunks.push(data));
+    child.stderr.on("data", (data) => stderrChunks.push(data));
+    child.on("error", (err) => {
+      clearTimeout(killTimer);
+      finish(() => reject(err));
+    });
+    const onAbort = () => {
+      clearTimeout(killTimer);
+      child.kill("SIGTERM");
+    };
+    signal?.addEventListener("abort", onAbort, { once: true });
+    child.on("close", (exitCode) => {
+      clearTimeout(killTimer);
+      signal?.removeEventListener("abort", onAbort);
+      if (signal?.aborted) {
+        finish(() => reject(new Error("aborted")));
+        return;
+      }
+      const stdout = Buffer.concat(stdoutChunks).toString("utf-8");
+      finish(() => resolve({
+        stdout: extractFinalAssistantTextFromPiJsonl(stdout) || stdout,
+        stderr: Buffer.concat(stderrChunks).toString("utf-8"),
+        exitCode,
+      }));
+    });
+  });
 }

 function extractRobotReviewJson(output: string): Record<string, unknown> {
@@ -113,63 +203,9 @@ async function runAutomaticRobotReview(
  if (reviewerModel) args.push("--model", reviewerModel);
  args.push(prompt);
  const invocation = getPiInvocation(args);
-  const commandLabel = `${invocation.command} ${args.slice(0, reviewerModel ? 6 : 4).join(" ")}`;
-  const result = await new Promise<CommandResult>((resolve, reject) => {
-    const child = spawn(invocation.command, invocation.args, { shell: false, stdio: ["ignore", "pipe", "pipe"] });
-    const stdoutChunks: Buffer[] = [];
-    const stderrChunks: Buffer[] = [];
-    let buffer = "";
-    let finalAssistantText = "";
-    child.stdout.on("data", (data) => {
-      stdoutChunks.push(data);
-      buffer += data.toString("utf-8");
-      const lines = buffer.split("\n");
-      buffer = lines.pop() || "";
-      for (const line of lines) {
-        if (!line.trim()) continue;
-        try {
-          const event = JSON.parse(line) as any;
-          if (event.type === "message_end" && event.message?.role === "assistant") {
-            const text = Array.isArray(event.message.content)
-              ? event.message.content.find((part: any) => part.type === "text")?.text
-              : undefined;
-            if (typeof text === "string") finalAssistantText = text;
-          }
-        } catch {
-          // ignore malformed line noise
-        }
-      }
-    });
-    child.stderr.on("data", (data) => stderrChunks.push(data));
-    child.on("error", reject);
-    const onAbort = () => child.kill();
-    signal?.addEventListener("abort", onAbort, { once: true });
-    child.on("close", (exitCode) => {
-      signal?.removeEventListener("abort", onAbort);
-      if (signal?.aborted) {
-        reject(new Error("aborted"));
-        return;
-      }
-      if (buffer.trim()) {
-        try {
-          const event = JSON.parse(buffer) as any;
-          if (event.type === "message_end" && event.message?.role === "assistant") {
-            const text = Array.isArray(event.message.content)
-              ? event.message.content.find((part: any) => part.type === "text")?.text
-              : undefined;
-            if (typeof text === "string") finalAssistantText = text;
-          }
-        } catch {
-          // ignore malformed trailing line
-        }
-      }
-      resolve({
-        stdout: finalAssistantText || Buffer.concat(stdoutChunks).toString("utf-8"),
-        stderr: Buffer.concat(stderrChunks).toString("utf-8"),
-        exitCode,
-      });
-    });
-  });
+  const timeoutMs = getRobotReviewTimeoutMs();
+  const commandLabel = `${invocation.command} ${invocation.args.slice(0, reviewerModel ? 6 : 4).join(" ")}`;
+  const result = await runRobotReviewCommand(invocation, signal, timeoutMs);
  if (result.exitCode !== 0) {
    throw new Error(`Robot reviewer failed (${result.exitCode ?? "?"}): ${(result.stderr || result.stdout).trim()}`);
  }
@@ -0,0 +1,54 @@
+import { describe, expect, it } from "vitest";
+import {
+  DEFAULT_ROBOT_REVIEW_TIMEOUT_MS,
+  extractFinalAssistantTextFromPiJsonl,
+  getPiInvocation,
+  getRobotReviewTimeoutMs,
+  runRobotReviewCommand,
+} from "../src/index.js";
+
+describe("robot review runner helpers", () => {
+  it("uses plain pi by default and allows override", () => {
+    expect(getPiInvocation(["--mode", "json"], {} as NodeJS.ProcessEnv)).toEqual({
+      command: "pi",
+      args: ["--mode", "json"],
+    });
+    expect(getPiInvocation(["-p"], { PI_LGTM_PI_BIN: "/custom/pi" } as NodeJS.ProcessEnv)).toEqual({
+      command: "/custom/pi",
+      args: ["-p"],
+    });
+  });
+
+  it("parses the final assistant text from pi jsonl", () => {
+    const output = [
+      "{\"type\":\"message_update\"}",
+      "{\"type\":\"message_end\",\"message\":{\"role\":\"assistant\",\"content\":[{\"type\":\"text\",\"text\":\"ROBOT_REVIEW_JSON_START {\\\"accepted\\\":true} ROBOT_REVIEW_JSON_END\"}]}}",
+    ].join("\n");
+    expect(extractFinalAssistantTextFromPiJsonl(output)).toContain("ROBOT_REVIEW_JSON_START");
+  });
+
+  it("uses configured timeout or falls back to default", () => {
+    expect(getRobotReviewTimeoutMs({ PI_LGTM_ROBOT_REVIEW_TIMEOUT_MS: "2500" } as NodeJS.ProcessEnv)).toBe(2500);
+    expect(getRobotReviewTimeoutMs({ PI_LGTM_ROBOT_REVIEW_TIMEOUT_MS: "bad" } as NodeJS.ProcessEnv)).toBe(DEFAULT_ROBOT_REVIEW_TIMEOUT_MS);
+  });
+
+  it("times out bounded child commands", async () => {
+    await expect(runRobotReviewCommand({
+      command: process.execPath,
+      args: ["-e", "setTimeout(() => {}, 1000)"],
+    }, undefined, 25)).rejects.toThrow(/timed out/i);
+  });
+
+  it("extracts assistant text from a child jsonl process", async () => {
+    const script = [
+      "process.stdout.write(JSON.stringify({type:'message_update'}) + '\\n');",
+      "process.stdout.write(JSON.stringify({type:'message_end',message:{role:'assistant',content:[{type:'text',text:'ROBOT_REVIEW_JSON_START {\\\"accepted\\\":true,\\\"observations\\\":[\\\"ok\\\"]} ROBOT_REVIEW_JSON_END'}]}}) + '\\n');",
+    ].join("");
+    const result = await runRobotReviewCommand({
+      command: process.execPath,
+      args: ["-e", script],
+    }, undefined, 500);
+    expect(result.exitCode).toBe(0);
+    expect(result.stdout).toContain("ROBOT_REVIEW_JSON_END");
+  });
+});