Simplify automatic robot review

2026-06-27 15:31:29 +08:00 · 2026-06-13 23:07:40 +08:00
parent 8b7159cc6d
commit 9423d299a3
5 changed files with 98 additions and 58 deletions
@@ -92,7 +92,7 @@ After calling this, the task shows `👀` and is only completable via `/lgtm <id

 The tool result includes a non-blocking self-check prompt asking whether the evidence directly addresses the `done_criterion` and whether a skeptical reviewer would find it convincing.

-`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A failing or errored robot review clears `pending_approval` until the evidence is strengthened and reviewed again.
+`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A robot review that rejects the evidence clears `pending_approval` until the evidence is strengthened and reviewed again. A reviewer crash, auth failure, timeout, or malformed output is recorded as a warning and leaves human sign-off open.

 ### `lgtm_supersede`

@@ -122,21 +122,15 @@ Use this from a separate subagent or other model when possible. Reviews append a

 ### `robot_review_run`

-Run the configured automatic robot reviewer against the current task evidence.
+Run the automatic robot reviewer against the current task evidence using the current session model.

 Default reviewer stage:

 ```bash
-pi --mode json -p --no-session
+pi --mode json -p --no-session --no-tools --no-extensions --model <current-session-model>
 ```

-Override with:
-
-```bash
-PI_LGTM_ROBOT_REVIEW_MODEL='openai/gpt-5'
-```
-
-This appends a new robot-review iteration. The reviewer returns an explicit `accepted` boolean as well as detailed observations, blind spots, and missing evidence. If the latest robot review rejects the evidence, `/lgtm` is blocked until stronger evidence is submitted and reviewed again.
+This appends a new robot-review iteration. The reviewer returns an explicit `accepted` boolean as well as detailed observations, blind spots, and missing evidence. If the latest robot review rejects the evidence, `/lgtm` is blocked until stronger evidence is submitted and reviewed again. If the reviewer process fails to run or returns malformed output, the failure is recorded but human sign-off stays open.

 ## Commands

@@ -154,8 +148,8 @@ Interactive menu: view tasks, create task, clear completed/all.
 pending -> in_progress -> (lgtm_ask)
                       -> current evidence iteration N
                       -> robot review iteration(s) on current evidence 🤖
-                       -> pending_approval 👀   if latest robot review passes
-                       -> reviewer_failed_to_run | reviewer_rejected
+                       -> pending_approval 👀   if latest robot review passes, or reviewer infra fails
+                       -> reviewer_rejected
                       -> lgtm_supersede or newer lgtm_ask -> superseded history + fresh current evidence
                       -> (/lgtm) -> completed
                       -> deleted
@@ -77,27 +77,20 @@ export function getRobotReviewTimeoutMs(env: NodeJS.ProcessEnv = process.env): n
  return Number.isFinite(configured) && configured > 0 ? configured : DEFAULT_ROBOT_REVIEW_TIMEOUT_MS;
 }

-/**
- * Pick a reviewer model from a different provider than the current one.
- * Prefers cheap/fast models suitable for review tasks.
- * Returns undefined if no alternate provider is available (falls back to same model).
- */
-export function pickAlternateReviewerModel(currentProviderId?: string): string | undefined {
-  // Ordered by: cheap, fast, good enough for structured review
-  const providerModels: Record<string, string> = {
-    anthropic: "anthropic/claude-haiku-4-5",
-    "github-copilot": "github-copilot/gemini-3-flash-preview",
-    openrouter: "openrouter/google/gemini-2.5-flash",
-  };
-  const providers = Object.keys(providerModels);
-  const current = currentProviderId ?? "";
-
-  // Try a different provider first
-  for (const p of providers) {
-    if (p !== current) return providerModels[p];
-  }
-  // All same? Just return the first non-current
-  return providers.length > 0 ? providerModels[providers[0]] : undefined;
+/** Format pi's current model object as the CLI's provider/model reference. */
+export function getCurrentModelRef(model: unknown): string | undefined {
+  if (!model || typeof model !== "object") return undefined;
+  const provider = typeof (model as any).provider === "string"
+    ? (model as any).provider
+    : typeof (model as any).providerId === "string"
+      ? (model as any).providerId
+      : undefined;
+  const id = typeof (model as any).id === "string"
+    ? (model as any).id
+    : typeof (model as any).modelId === "string"
+      ? (model as any).modelId
+      : undefined;
+  return provider && id ? `${provider}/${id}` : undefined;
 }

 function getAssistantTextFromPiEvent(event: any): string | undefined {
@@ -583,16 +576,17 @@ function buildRobotReviewPrompt(task: any): string {
 async function runAutomaticRobotReview(
  task: any,
  signal?: AbortSignal,
-  currentProviderId?: string,
+  currentModelRef?: string,
 ): Promise<{ review: Omit<RobotReviewRecord, "iteration">; command: string }> {
+  if (!currentModelRef) {
+    throw new Error("Automatic robot review requires an active current session model.");
+  }
  const prompt = buildRobotReviewPrompt(task);
-  const args = ["--mode", "json", "-p", "--no-session", "--no-tools", "--no-extensions"];
-  const reviewerModel = process.env.PI_LGTM_ROBOT_REVIEW_MODEL?.trim() || pickAlternateReviewerModel(currentProviderId);
-  if (reviewerModel) args.push("--model", reviewerModel);
+  const args = ["--mode", "json", "-p", "--no-session", "--no-tools", "--no-extensions", "--model", currentModelRef];
  args.push(prompt);
  const invocation = getPiInvocation(args);
  const timeoutMs = getRobotReviewTimeoutMs();
-  const commandLabel = `${invocation.command} ${invocation.args.slice(0, reviewerModel ? 6 : 4).join(" ")}`;
+  const commandLabel = `${invocation.command} ${invocation.args.slice(0, -1).join(" ")}`;
  const result = await runRobotReviewCommand(invocation, signal, timeoutMs);
  if (result.exitCode !== 0) {
    const error = new Error(`Robot reviewer failed (${result.exitCode ?? "?"}): ${(result.stderr || result.stdout).trim()}`) as Error & { rawOutput?: string };
@@ -710,14 +704,11 @@ export default function (pi: ExtensionAPI) {
  let currentTurn = 0;
  let lastTaskToolUseTurn = 0;
  let reminderInjectedThisCycle = false;
-  let currentProvider: string | undefined;

  pi.on("turn_start", async (_event, ctx) => {
    currentTurn++;
    widget.setUICtx(ctx.ui as UICtx);
    upgradeStoreIfNeeded(ctx);
-    const model = ctx.model;
-    if (model) currentProvider = (model as any).providerId ?? (model as any).provider;
    if (autoClear.onTurnStart(currentTurn)) widget.update();
  });

@@ -1035,7 +1026,7 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma
      supersede_reason: Type.Optional(Type.String({ description: "Why this evidence replaces an older submission on the same task." })),
    }),

-    async execute(_toolCallId, params, signal, _onUpdate, _ctx) {
+    async execute(_toolCallId, params, signal, _onUpdate, ctx) {
      const task = store.get(params.taskId);
      if (!task) return Promise.resolve(textResult(`Task #${params.taskId} not found`));
      if (task.status === "completed") return Promise.resolve(textResult(`Task #${params.taskId} already completed`));
@@ -1065,7 +1056,7 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma
      const refreshedTask = store.get(params.taskId);
      if (!refreshedTask) return textResult(`Task #${params.taskId} not found after evidence update`);
      try {
-        const { review, command } = await runAutomaticRobotReview(refreshedTask, signal, currentProvider);
+        const { review, command } = await runAutomaticRobotReview(refreshedTask, signal, getCurrentModelRef(ctx.model));
        store.update(params.taskId, {
          pending_approval: shouldOpenHumanSignoffGate(refreshedTask, review.accepted),
          metadata: {
@@ -1091,13 +1082,13 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma
        }
      } catch (err: any) {
        store.update(params.taskId, {
-          pending_approval: false,
+          pending_approval: refreshedTask.pending_approval,
          metadata: getAutomaticReviewFailureMetadata(err.message, err.rawOutput),
        });
        robotReviewNote =
          `\n\n### Automatic robot review\n` +
          `Reviewer failed: ${err.message}\n` +
-          `Human sign-off is blocked until the reviewer stage succeeds.` +
+          `Human sign-off is still allowed because reviewer failures are warnings, not evidence rejections.` +
          (typeof err.rawOutput === "string" && err.rawOutput.trim()
            ? `\n\n${formatReviewTextBlock("Reviewer raw output", err.rawOutput.trim())}`
            : "");
@@ -1233,7 +1224,7 @@ This does not complete the task. Human /lgtm remains the only completion path.`,
  pi.registerTool({
    name: "robot_review_run",
    label: "robot_review_run",
-    description: `Run the configured automatic robot reviewer against the current task evidence.
+    description: `Run the automatic robot reviewer against the current task evidence using the current session model.

 Runs the same Pi-native reviewer stage used automatically by \`lgtm_ask\`.

@@ -1250,7 +1241,7 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
      }

      try {
-        const { review, command } = await runAutomaticRobotReview(task, signal, currentProvider);
+        const { review, command } = await runAutomaticRobotReview(task, signal, getCurrentModelRef(_ctx.model));
        store.update(params.taskId, {
          pending_approval: shouldOpenHumanSignoffGate(task, review.accepted),
          metadata: {
@@ -1277,13 +1268,14 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
        );
      } catch (err: any) {
        store.update(params.taskId, {
-          pending_approval: false,
+          pending_approval: task.pending_approval,
          metadata: getAutomaticReviewFailureMetadata(err.message, err.rawOutput),
        });
        widget.update();
        return textResult(
          `## Automatic robot review for task #${task.id}: ${task.subject}\n` +
          `Reviewer failed: ${err.message}\n\n` +
+          `Automatic review failures are warnings, not evidence rejections, so human sign-off is still allowed.\n\n` +
          `Gate status: ${getGateStatus(store.get(params.taskId) ?? task)}` +
          (typeof err.rawOutput === "string" && err.rawOutput.trim()
            ? `\n\n${formatReviewTextBlock("Reviewer raw output", err.rawOutput.trim())}`
@@ -1463,12 +1455,17 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
    if (noEvidence) {
      title = `⚠ Task #${taskId} has no agent-submitted evidence.\nSign off anyway?\nDone: ${task.done_criterion}`;
      signLabel = "✓ Override — sign off without evidence";
-    } else if (reviewerFailed) {
-      title = `⚠ Task #${taskId} automatic robot review failed.\nSign off anyway?\nDone: ${task.done_criterion}`;
-      signLabel = "✓ Override — sign off despite reviewer failure";
    } else if (robotRejected) {
      title = `⚠ Task #${taskId} robot review rejected the evidence.\nSign off anyway?\nDone: ${task.done_criterion}`;
      signLabel = "✓ Override — sign off despite rejected review";
+    } else if (reviewerFailed) {
+      if (task.pending_approval) {
+        title = `⚠ Task #${taskId} automatic robot review failed, but human sign-off is still allowed.\nContinue?\nDone: ${task.done_criterion}`;
+        signLabel = "✓ LGTM — sign off despite reviewer warning";
+      } else {
+        title = `⚠ Task #${taskId} automatic robot review failed.\nSign off anyway?\nDone: ${task.done_criterion}`;
+        signLabel = "✓ Override — sign off despite reviewer failure";
+      }
    }
    const confirm = await ctx.ui.select(title, [signLabel, "✗ Cancel"]);
    if (confirm !== signLabel) return;
@@ -1505,7 +1502,7 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
        };
        for (const t of open) groups[getDisplayStatus(t)].push(t);
        const groupLabel: Record<DisplayStatus, string> = {
-          awaiting_signoff: "READY (robot ✓)",
+          awaiting_signoff: "READY (human sign-off open)",
          in_progress: "ACTIVE (no /lgtm evidence)",
          pending: "PENDING (not started)",
          completed: "DONE",
@@ -1516,7 +1513,11 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
          if (inBucket.length === 0) continue;
          lines.push(`  ${groupLabel[status]}:`);
          for (const t of inBucket) {
-            const warn = status === "awaiting_signoff" && !latestRobotReviewPasses(t) ? " ⚠ robot rejected" : "";
+            const warn = status === "awaiting_signoff"
+              ? (typeof t.metadata?.robot_review_last_error === "string"
+                ? " ⚠ reviewer warning"
+                : (!latestRobotReviewPasses(t) ? " ⚠ robot rejected" : ""))
+              : "";
            lines.push(`    #${t.id} ${t.subject}${warn}`);
          }
        }
@@ -57,10 +57,10 @@ export function getCompletionMode(task: Task): CompletionMode {

 export function getReviewState(task: Task): ReviewState {
  if (task.status === "completed") return "human_signed_off";
-  if (task.pending_approval && hasCurrentEvidence(task)) return "ready_for_human";
-  if (typeof task.metadata?.robot_review_last_error === "string") return "reviewer_failed_to_run";
  const latest = getLatestRobotReview(task);
  if (latest && !latest.accepted) return "reviewer_rejected";
+  if (task.pending_approval && hasCurrentEvidence(task)) return "ready_for_human";
+  if (typeof task.metadata?.robot_review_last_error === "string") return "reviewer_failed_to_run";
  if (hasCurrentEvidence(task)) return "evidence_submitted";
  if (hasEvidenceHistory(task)) return "superseded";
  return "no_evidence";
@@ -70,7 +70,12 @@ export function getGateStatus(task: Task): string {
  const state = getReviewState(task);
  if (state === "human_signed_off") return "human signed off";
  if (state === "no_evidence") return "no lgtm evidence submitted";
-  if (state === "ready_for_human") return `ready for human sign-off via /lgtm ${task.id}`;
+  if (state === "ready_for_human") {
+    if (typeof task.metadata?.robot_review_last_error === "string") {
+      return `warning: automatic robot review failed, human sign-off still allowed via /lgtm ${task.id}: ${task.metadata.robot_review_last_error}`;
+    }
+    return `ready for human sign-off via /lgtm ${task.id}`;
+  }
  if (state === "reviewer_failed_to_run") {
    return `blocked: automatic robot review failed: ${task.metadata.robot_review_last_error}`;
  }
@@ -82,13 +82,23 @@ describe("getGateStatus", () => {
    }))).toBe("ready for human sign-off via /lgtm 1");
  });

-  it("reports reviewer failure separately from rejected evidence", () => {
+  it("reports blocking reviewer failure when human sign-off is closed", () => {
    expect(getGateStatus(makeTask({
      metadata: {
        lgtm_evidence: "ok",
        robot_review_last_error: "Unexpected token 'a'",
      },
-    }))).toContain("automatic robot review failed");
+    }))).toContain("blocked: automatic robot review failed");
+  });
+
+  it("reports reviewer failure as a warning when human sign-off stays open", () => {
+    expect(getGateStatus(makeTask({
+      pending_approval: true,
+      metadata: {
+        lgtm_evidence: "ok",
+        robot_review_last_error: "Unexpected token 'a'",
+      },
+    }))).toContain("warning: automatic robot review failed");
  });

  it("reports rejected robot review when latest review does not accept", () => {
@@ -111,6 +121,29 @@ describe("getGateStatus", () => {
      },
    }))).toBe("blocked: latest robot review rejected the evidence");
  });
+
+  it("keeps rejection higher priority than a later reviewer warning", () => {
+    expect(getGateStatus(makeTask({
+      pending_approval: true,
+      metadata: {
+        lgtm_evidence: "ok",
+        robot_review_last_error: "timeout",
+        robot_reviews: [{
+          iteration: 1,
+          reviewer: "opencode",
+          scope: "task evidence",
+          observations: ["Observed missing output"],
+          blind_spots: "none",
+          accepted: false,
+          evidence_complete: false,
+          evidence_convincing: false,
+          missing_evidence: ["literal output"],
+          submitted_at: "2026-04-17T00:00:00.000Z",
+          mode: "manual",
+        }],
+      },
+    }))).toBe("blocked: latest robot review rejected the evidence");
+  });
 });

 describe("getDisplayStatus", () => {
@@ -3,6 +3,7 @@ import {
  DEFAULT_ROBOT_REVIEW_TIMEOUT_MS,
  extractFinalAssistantTextFromPiJsonl,
  extractRobotReviewJson,
+  getCurrentModelRef,
  getPiInvocation,
  getRobotReviewTimeoutMs,
  runRobotReviewCommand,
@@ -49,6 +50,12 @@ describe("robot review runner helpers", () => {
    expect(getRobotReviewTimeoutMs({ PI_LGTM_ROBOT_REVIEW_TIMEOUT_MS: "bad" } as NodeJS.ProcessEnv)).toBe(DEFAULT_ROBOT_REVIEW_TIMEOUT_MS);
  });

+  it("formats the current model as the reviewer model ref", () => {
+    expect(getCurrentModelRef({ provider: "openai", id: "gpt-5" })).toBe("openai/gpt-5");
+    expect(getCurrentModelRef({ providerId: "anthropic", modelId: "claude-haiku" })).toBe("anthropic/claude-haiku");
+    expect(getCurrentModelRef({ provider: "openai" })).toBeUndefined();
+  });
+
  it("times out bounded child commands", async () => {
    await expect(runRobotReviewCommand({
      command: process.execPath,