feat: run robot review via pi harness

2026-06-27 17:01:35 +08:00 · 2026-04-17 11:10:13 +08:00
parent 0ba0173615
commit d52596aad1
2 changed files with 103 additions and 70 deletions
@@ -89,7 +89,7 @@ After calling this, the task shows `👀` and is only completable via `/lgtm <id

 The tool result includes a non-blocking self-check prompt asking whether the evidence directly addresses the `done_criterion` and whether a skeptical reviewer would find it convincing.

-`lgtm_ask` also accepts `run_robot_review` (optional). If true, or if `PI_LGTM_AUTO_ROBOT_REVIEW=1`, the extension runs the configured robot reviewer immediately after storing evidence. A failing robot review clears `pending_approval` until the evidence is strengthened.
+`lgtm_ask` always runs the robot-review stage immediately after storing evidence. If the reviewer rejects the evidence, or the reviewer run itself errors, `pending_approval` is cleared until the evidence is strengthened and reviewed again.

 ### `robot_review_ask`

@@ -115,17 +115,16 @@ Use this from a separate subagent or other model when possible. Reviews append a

 Run the configured automatic robot reviewer against the current task evidence.

-Default reviewer command:
+Default reviewer invocation:

 ```bash
-acpx --approve-reads --non-interactive-permissions deny opencode exec
+pi --mode json -p --no-session
 ```

 Override with:

 ```bash
-PI_LGTM_ROBOT_REVIEW_CMD='acpx --approve-reads --non-interactive-permissions deny codex exec'
-PI_LGTM_AUTO_ROBOT_REVIEW=1
+PI_LGTM_ROBOT_REVIEW_MODEL='openai/gpt-5'
 ```

 This appends a new robot-review iteration. The reviewer returns an explicit `accepted` boolean as well as detailed observations, blind spots, and missing evidence. If the latest robot review rejects the evidence, `/lgtm` is blocked until stronger evidence is submitted and reviewed again.
@@ -41,33 +41,12 @@ const AUTO_CLEAR_DELAY = 4;

 type CommandResult = { stdout: string; stderr: string; exitCode: number | null };

-function shellQuote(text: string): string {
-  return JSON.stringify(text);
-}
-
-function runShellCommand(command: string, signal?: AbortSignal): Promise<CommandResult> {
-  return new Promise((resolve, reject) => {
-    const child = spawn("bash", ["-lc", command], { stdio: ["ignore", "pipe", "pipe"] });
-    const stdoutChunks: Buffer[] = [];
-    const stderrChunks: Buffer[] = [];
-    child.stdout.on("data", (data) => stdoutChunks.push(data));
-    child.stderr.on("data", (data) => stderrChunks.push(data));
-    child.on("error", reject);
-    const onAbort = () => child.kill();
-    signal?.addEventListener("abort", onAbort, { once: true });
-    child.on("close", (exitCode) => {
-      signal?.removeEventListener("abort", onAbort);
-      if (signal?.aborted) {
-        reject(new Error("aborted"));
-        return;
-      }
-      resolve({
-        stdout: Buffer.concat(stdoutChunks).toString("utf-8"),
-        stderr: Buffer.concat(stderrChunks).toString("utf-8"),
-        exitCode,
-      });
-    });
-  });
+function getPiInvocation(args: string[]): { command: string; args: string[] } {
+  const currentScript = process.argv[1];
+  if (currentScript) {
+    return { command: process.execPath, args: [currentScript, ...args] };
+  }
+  return { command: "pi", args };
 }

 function extractRobotReviewJson(output: string): Record<string, unknown> {
@@ -128,11 +107,69 @@ async function runAutomaticRobotReview(
  task: any,
  signal?: AbortSignal,
 ): Promise<{ review: Omit<RobotReviewRecord, "iteration">; command: string }> {
-  const reviewerCommand = process.env.PI_LGTM_ROBOT_REVIEW_CMD?.trim()
-    || "acpx --approve-reads --non-interactive-permissions deny opencode exec";
  const prompt = buildRobotReviewPrompt(task);
-  const command = `${reviewerCommand} ${shellQuote(prompt)}`;
-  const result = await runShellCommand(command, signal);
+  const args = ["--mode", "json", "-p", "--no-session"];
+  const reviewerModel = process.env.PI_LGTM_ROBOT_REVIEW_MODEL?.trim();
+  if (reviewerModel) args.push("--model", reviewerModel);
+  args.push(prompt);
+  const invocation = getPiInvocation(args);
+  const commandLabel = `${invocation.command} ${args.slice(0, reviewerModel ? 6 : 4).join(" ")}`;
+  const result = await new Promise<CommandResult>((resolve, reject) => {
+    const child = spawn(invocation.command, invocation.args, { shell: false, stdio: ["ignore", "pipe", "pipe"] });
+    const stdoutChunks: Buffer[] = [];
+    const stderrChunks: Buffer[] = [];
+    let buffer = "";
+    let finalAssistantText = "";
+    child.stdout.on("data", (data) => {
+      stdoutChunks.push(data);
+      buffer += data.toString("utf-8");
+      const lines = buffer.split("\n");
+      buffer = lines.pop() || "";
+      for (const line of lines) {
+        if (!line.trim()) continue;
+        try {
+          const event = JSON.parse(line) as any;
+          if (event.type === "message_end" && event.message?.role === "assistant") {
+            const text = Array.isArray(event.message.content)
+              ? event.message.content.find((part: any) => part.type === "text")?.text
+              : undefined;
+            if (typeof text === "string") finalAssistantText = text;
+          }
+        } catch {
+          // ignore malformed line noise
+        }
+      }
+    });
+    child.stderr.on("data", (data) => stderrChunks.push(data));
+    child.on("error", reject);
+    const onAbort = () => child.kill();
+    signal?.addEventListener("abort", onAbort, { once: true });
+    child.on("close", (exitCode) => {
+      signal?.removeEventListener("abort", onAbort);
+      if (signal?.aborted) {
+        reject(new Error("aborted"));
+        return;
+      }
+      if (buffer.trim()) {
+        try {
+          const event = JSON.parse(buffer) as any;
+          if (event.type === "message_end" && event.message?.role === "assistant") {
+            const text = Array.isArray(event.message.content)
+              ? event.message.content.find((part: any) => part.type === "text")?.text
+              : undefined;
+            if (typeof text === "string") finalAssistantText = text;
+          }
+        } catch {
+          // ignore malformed trailing line
+        }
+      }
+      resolve({
+        stdout: finalAssistantText || Buffer.concat(stdoutChunks).toString("utf-8"),
+        stderr: Buffer.concat(stderrChunks).toString("utf-8"),
+        exitCode,
+      });
+    });
+  });
  if (result.exitCode !== 0) {
    throw new Error(`Robot reviewer failed (${result.exitCode ?? "?"}): ${(result.stderr || result.stdout).trim()}`);
  }
@@ -143,9 +180,9 @@ async function runAutomaticRobotReview(
    ? parsed.missing_evidence.filter((item): item is string => typeof item === "string")
    : [];
  return {
-    command: reviewerCommand,
+    command: commandLabel,
    review: {
-      reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : reviewerCommand,
+      reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : commandLabel,
      scope: typeof parsed.scope === "string" ? parsed.scope : "task evidence package",
      observations,
      blind_spots: typeof parsed.blind_spots === "string" ? parsed.blind_spots : "not stated",
@@ -502,7 +539,6 @@ After this, task enters pending sign-off state — only completable via /lgtm <i
      falsification_test: Type.String({ description: "What you ran and what you got, presented so both you and the human can sanity-check it. State: what you ran (command/experiment/log check), the actual output or result, and why that result could not occur if a failure mode were real. Must be traceable: include file paths, log snippets, counts, or commit. The human should be able to verify without re-running anything." }),
      verification_hints: Type.Array(Type.String(), { description: "Where to look and what to check. Descriptions of evidence locations, not bare file paths. E.g. 'lines 45-60 in src/loss.py show the gradient check' not 'src/loss.py'." }),
      remaining_uncertainty: Type.String({ description: "What's NOT tested, known limitations, edge cases deferred. If you can't articulate uncertainty, you haven't thought hard enough." }),
-      run_robot_review: Type.Optional(Type.Boolean({ description: "If true, run the configured automatic robot reviewer immediately after storing evidence." })),
    }),

    async execute(_toolCallId, params, signal, _onUpdate, _ctx) {
@@ -525,34 +561,33 @@ After this, task enters pending sign-off state — only completable via /lgtm <i
        },
      });
      let robotReviewNote = "";
-      const shouldRunRobotReview = params.run_robot_review ?? process.env.PI_LGTM_AUTO_ROBOT_REVIEW === "1";
-      if (shouldRunRobotReview) {
-        const refreshedTask = store.get(params.taskId);
-        if (!refreshedTask) return textResult(`Task #${params.taskId} not found after evidence update`);
-        try {
-          const { review, command } = await runAutomaticRobotReview(refreshedTask, signal);
-          store.update(params.taskId, {
-            pending_approval: review.evidence_complete && review.evidence_convincing,
-            metadata: appendRobotReviewMetadata(refreshedTask, review),
-          });
-          robotReviewNote =
-            `\n\n### Automatic robot review\n` +
-            `Reviewer command: ${command}\n` +
-            `Evidence complete: ${review.evidence_complete ? "yes" : "no"}\n` +
-            `Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}\n` +
-            `${review.observations.map(o => `- ${o}`).join("\n")}`;
-          if (review.missing_evidence.length > 0) {
-            robotReviewNote += `\nMissing evidence:\n${review.missing_evidence.map(item => `- ${item}`).join("\n")}`;
-          }
-          if (!(review.evidence_complete && review.evidence_convincing)) {
-            robotReviewNote += `\nResult: human sign-off has been held back until the evidence is strengthened and reviewed again.`;
-          }
-        } catch (err: any) {
-          robotReviewNote =
-            `\n\n### Automatic robot review\n` +
-            `Reviewer failed: ${err.message}\n` +
-            `Task remains pending human sign-off; rerun with stronger evidence or call \`robot_review_run\` after fixing reviewer setup.`;
+      const refreshedTask = store.get(params.taskId);
+      if (!refreshedTask) return textResult(`Task #${params.taskId} not found after evidence update`);
+      try {
+        const { review, command } = await runAutomaticRobotReview(refreshedTask, signal);
+        store.update(params.taskId, {
+          pending_approval: review.accepted,
+          metadata: appendRobotReviewMetadata(refreshedTask, review),
+        });
+        robotReviewNote =
+          `\n\n### Automatic robot review\n` +
+          `Reviewer: ${command}\n` +
+          `Accepted: ${review.accepted ? "yes" : "no"}\n` +
+          `Evidence complete: ${review.evidence_complete ? "yes" : "no"}\n` +
+          `Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}\n` +
+          `${review.observations.map(o => `- ${o}`).join("\n")}`;
+        if (review.missing_evidence.length > 0) {
+          robotReviewNote += `\nMissing evidence:\n${review.missing_evidence.map(item => `- ${item}`).join("\n")}`;
        }
+        if (!review.accepted) {
+          robotReviewNote += `\nResult: human sign-off has been held back until the evidence is strengthened and reviewed again.`;
+        }
+      } catch (err: any) {
+        store.update(params.taskId, { pending_approval: false });
+        robotReviewNote =
+          `\n\n### Automatic robot review\n` +
+          `Reviewer failed: ${err.message}\n` +
+          `Human sign-off is blocked until the reviewer stage succeeds.`;
      }
      widget.update();

@@ -574,7 +609,7 @@ After this, task enters pending sign-off state — only completable via /lgtm <i
        uncertaintySection +
        robotReviewNote +
        `\n\n---\n` +
-        `Task #${task.id} is now ${shouldRunRobotReview && !store.get(task.id)?.pending_approval ? "not yet ready for human sign-off" : `pending human sign-off via \`/lgtm ${task.id}\``}.\n\n` +
+        `Task #${task.id} is now ${store.get(task.id)?.pending_approval ? `pending human sign-off via \`/lgtm ${task.id}\`` : "not yet ready for human sign-off"}.\n\n` +
        `**Self-check (non-blocking):** Look at this as the human will see it. ` +
        `Does the evidence directly address the done_criterion "${task.done_criterion}"? ` +
        `Would a skeptical reviewer find this convincing, or would they immediately ask ` +
@@ -654,8 +689,7 @@ This does not complete the task. Human /lgtm remains the only completion path.`,
    label: "robot_review_run",
    description: `Run the configured automatic robot reviewer against the current task evidence.

-Uses PI_LGTM_ROBOT_REVIEW_CMD if set, otherwise defaults to:
-\`acpx --approve-reads --non-interactive-permissions deny opencode exec\`
+Runs the same Pi-native reviewer stage used automatically by \`lgtm_ask\`.

 This appends a new robot-review iteration. If the reviewer marks evidence incomplete or unconvincing, pending human sign-off is cleared until stronger evidence is submitted and reviewed again.`,
    parameters: Type.Object({
@@ -671,7 +705,7 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp

      const { review, command } = await runAutomaticRobotReview(task, signal);
      store.update(params.taskId, {
-        pending_approval: review.evidence_complete && review.evidence_convincing ? task.pending_approval : false,
+        pending_approval: review.accepted ? task.pending_approval : false,
        metadata: appendRobotReviewMetadata(task, review),
      });
      widget.update();