feat: run robot review via pi harness

2026-06-27 17:01:35 +08:00 · 2026-04-17 11:10:13 +08:00
parent 0ba0173615
commit d52596aad1
2 changed files with 103 additions and 70 deletions
@@ -89,7 +89,7 @@ After calling this, the task shows `👀` and is only completable via `/lgtm <id
 The tool result includes a non-blocking self-check prompt asking whether the evidence directly addresses the `done_criterion` and whether a skeptical reviewer would find it convincing.
-`lgtm_ask` also accepts `run_robot_review` (optional). If true, or if `PI_LGTM_AUTO_ROBOT_REVIEW=1`, the extension runs the configured robot reviewer immediately after storing evidence. A failing robot review clears `pending_approval` until the evidence is strengthened.
+`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A failing or errored robot review clears `pending_approval` until the evidence is strengthened and reviewed again.
 ### `robot_review_ask`
@@ -115,17 +115,16 @@ Use this from a separate subagent or other model when possible. Reviews append a
 Run the configured automatic robot reviewer against the current task evidence.
-Default reviewer command:
+Default reviewer stage:
 ```bash
-acpx --approve-reads --non-interactive-permissions deny opencode exec
+pi --mode json -p --no-session
 ```
 Override with:
 ```bash
-PI_LGTM_ROBOT_REVIEW_CMD='acpx --approve-reads --non-interactive-permissions deny codex exec'
+PI_LGTM_ROBOT_REVIEW_MODEL='openai/gpt-5'
 PI_LGTM_AUTO_ROBOT_REVIEW=1
 ```
 This appends a new robot-review iteration. The reviewer returns an explicit `accepted` boolean as well as detailed observations, blind spots, and missing evidence. If the latest robot review rejects the evidence, `/lgtm` is blocked until stronger evidence is submitted and reviewed again.
@@ -41,33 +41,12 @@ const AUTO_CLEAR_DELAY = 4;
 type CommandResult = { stdout: string; stderr: string; exitCode: number | null };
-function shellQuote(text: string): string {
+function getPiInvocation(args: string[]): { command: string; args: string[] } {
-  return JSON.stringify(text);
+  const currentScript = process.argv[1];
-}
+  if (currentScript) {
-
+    return { command: process.execPath, args: [currentScript, ...args] };
-function runShellCommand(command: string, signal?: AbortSignal): Promise<CommandResult> {
+  }
-  return new Promise((resolve, reject) => {
+  return { command: "pi", args };
    const child = spawn("bash", ["-lc", command], { stdio: ["ignore", "pipe", "pipe"] });
    const stdoutChunks: Buffer[] = [];
    const stderrChunks: Buffer[] = [];
    child.stdout.on("data", (data) => stdoutChunks.push(data));
    child.stderr.on("data", (data) => stderrChunks.push(data));
    child.on("error", reject);
    const onAbort = () => child.kill();
    signal?.addEventListener("abort", onAbort, { once: true });
    child.on("close", (exitCode) => {
      signal?.removeEventListener("abort", onAbort);
      if (signal?.aborted) {
        reject(new Error("aborted"));
        return;
      }
      resolve({
        stdout: Buffer.concat(stdoutChunks).toString("utf-8"),
        stderr: Buffer.concat(stderrChunks).toString("utf-8"),
        exitCode,
      });
    });
  });
 }
 function extractRobotReviewJson(output: string): Record<string, unknown> {
@@ -128,11 +107,69 @@ async function runAutomaticRobotReview(
  task: any,
  signal?: AbortSignal,
 ): Promise<{ review: Omit<RobotReviewRecord, "iteration">; command: string }> {
  const reviewerCommand = process.env.PI_LGTM_ROBOT_REVIEW_CMD?.trim()
    || "acpx --approve-reads --non-interactive-permissions deny opencode exec";
  const prompt = buildRobotReviewPrompt(task);
-  const command = `${reviewerCommand} ${shellQuote(prompt)}`;
+  const args = ["--mode", "json", "-p", "--no-session"];
-  const result = await runShellCommand(command, signal);
+  const reviewerModel = process.env.PI_LGTM_ROBOT_REVIEW_MODEL?.trim();
  if (reviewerModel) args.push("--model", reviewerModel);
  args.push(prompt);
  const invocation = getPiInvocation(args);
  const commandLabel = `${invocation.command} ${args.slice(0, reviewerModel ? 6 : 4).join(" ")}`;
  const result = await new Promise<CommandResult>((resolve, reject) => {
    const child = spawn(invocation.command, invocation.args, { shell: false, stdio: ["ignore", "pipe", "pipe"] });
    const stdoutChunks: Buffer[] = [];
    const stderrChunks: Buffer[] = [];
    let buffer = "";
    let finalAssistantText = "";
    child.stdout.on("data", (data) => {
      stdoutChunks.push(data);
      buffer += data.toString("utf-8");
      const lines = buffer.split("\n");
      buffer = lines.pop() || "";
      for (const line of lines) {
        if (!line.trim()) continue;
        try {
          const event = JSON.parse(line) as any;
          if (event.type === "message_end" && event.message?.role === "assistant") {
            const text = Array.isArray(event.message.content)
              ? event.message.content.find((part: any) => part.type === "text")?.text
              : undefined;
            if (typeof text === "string") finalAssistantText = text;
          }
        } catch {
          // ignore malformed line noise
        }
      }
    });
    child.stderr.on("data", (data) => stderrChunks.push(data));
    child.on("error", reject);
    const onAbort = () => child.kill();
    signal?.addEventListener("abort", onAbort, { once: true });
    child.on("close", (exitCode) => {
      signal?.removeEventListener("abort", onAbort);
      if (signal?.aborted) {
        reject(new Error("aborted"));
        return;
      }
      if (buffer.trim()) {
        try {
          const event = JSON.parse(buffer) as any;
          if (event.type === "message_end" && event.message?.role === "assistant") {
            const text = Array.isArray(event.message.content)
              ? event.message.content.find((part: any) => part.type === "text")?.text
              : undefined;
            if (typeof text === "string") finalAssistantText = text;
          }
        } catch {
          // ignore malformed trailing line
        }
      }
      resolve({
        stdout: finalAssistantText || Buffer.concat(stdoutChunks).toString("utf-8"),
        stderr: Buffer.concat(stderrChunks).toString("utf-8"),
        exitCode,
      });
    });
  });
  if (result.exitCode !== 0) {
    throw new Error(`Robot reviewer failed (${result.exitCode ?? "?"}): ${(result.stderr || result.stdout).trim()}`);
  }
@@ -143,9 +180,9 @@ async function runAutomaticRobotReview(
    ? parsed.missing_evidence.filter((item): item is string => typeof item === "string")
    : [];
  return {
-    command: reviewerCommand,
+    command: commandLabel,
    review: {
-      reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : reviewerCommand,
+      reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : commandLabel,
      scope: typeof parsed.scope === "string" ? parsed.scope : "task evidence package",
      observations,
      blind_spots: typeof parsed.blind_spots === "string" ? parsed.blind_spots : "not stated",
@@ -502,7 +539,6 @@ After this, task enters pending sign-off state — only completable via /lgtm <i
      falsification_test: Type.String({ description: "What you ran and what you got, presented so both you and the human can sanity-check it. State: what you ran (command/experiment/log check), the actual output or result, and why that result could not occur if a failure mode were real. Must be traceable: include file paths, log snippets, counts, or commit. The human should be able to verify without re-running anything." }),
      verification_hints: Type.Array(Type.String(), { description: "Where to look and what to check. Descriptions of evidence locations, not bare file paths. E.g. 'lines 45-60 in src/loss.py show the gradient check' not 'src/loss.py'." }),
      remaining_uncertainty: Type.String({ description: "What's NOT tested, known limitations, edge cases deferred. If you can't articulate uncertainty, you haven't thought hard enough." }),
      run_robot_review: Type.Optional(Type.Boolean({ description: "If true, run the configured automatic robot reviewer immediately after storing evidence." })),
    }),
    async execute(_toolCallId, params, signal, _onUpdate, _ctx) {
@@ -525,34 +561,33 @@ After this, task enters pending sign-off state — only completable via /lgtm <i
        },
      });
      let robotReviewNote = "";
-      const shouldRunRobotReview = params.run_robot_review ?? process.env.PI_LGTM_AUTO_ROBOT_REVIEW === "1";
+      const refreshedTask = store.get(params.taskId);
-      if (shouldRunRobotReview) {
+      if (!refreshedTask) return textResult(`Task #${params.taskId} not found after evidence update`);
-        const refreshedTask = store.get(params.taskId);
+      try {
-        if (!refreshedTask) return textResult(`Task #${params.taskId} not found after evidence update`);
+        const { review, command } = await runAutomaticRobotReview(refreshedTask, signal);
-        try {
+        store.update(params.taskId, {
-          const { review, command } = await runAutomaticRobotReview(refreshedTask, signal);
+          pending_approval: review.accepted,
-          store.update(params.taskId, {
+          metadata: appendRobotReviewMetadata(refreshedTask, review),
-            pending_approval: review.evidence_complete && review.evidence_convincing,
+        });
-            metadata: appendRobotReviewMetadata(refreshedTask, review),
+        robotReviewNote =
-          });
+          `\n\n### Automatic robot review\n` +
-          robotReviewNote =
+          `Reviewer: ${command}\n` +
-            `\n\n### Automatic robot review\n` +
+          `Accepted: ${review.accepted ? "yes" : "no"}\n` +
-            `Reviewer command: ${command}\n` +
+          `Evidence complete: ${review.evidence_complete ? "yes" : "no"}\n` +
-            `Evidence complete: ${review.evidence_complete ? "yes" : "no"}\n` +
+          `Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}\n` +
-            `Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}\n` +
+          `${review.observations.map(o => `- ${o}`).join("\n")}`;
-            `${review.observations.map(o => `- ${o}`).join("\n")}`;
+        if (review.missing_evidence.length > 0) {
-          if (review.missing_evidence.length > 0) {
+          robotReviewNote += `\nMissing evidence:\n${review.missing_evidence.map(item => `- ${item}`).join("\n")}`;
            robotReviewNote += `\nMissing evidence:\n${review.missing_evidence.map(item => `- ${item}`).join("\n")}`;
          }
          if (!(review.evidence_complete && review.evidence_convincing)) {
            robotReviewNote += `\nResult: human sign-off has been held back until the evidence is strengthened and reviewed again.`;
          }
        } catch (err: any) {
          robotReviewNote =
            `\n\n### Automatic robot review\n` +
            `Reviewer failed: ${err.message}\n` +
            `Task remains pending human sign-off; rerun with stronger evidence or call \`robot_review_run\` after fixing reviewer setup.`;
        }
        if (!review.accepted) {
          robotReviewNote += `\nResult: human sign-off has been held back until the evidence is strengthened and reviewed again.`;
        }
      } catch (err: any) {
        store.update(params.taskId, { pending_approval: false });
        robotReviewNote =
          `\n\n### Automatic robot review\n` +
          `Reviewer failed: ${err.message}\n` +
          `Human sign-off is blocked until the reviewer stage succeeds.`;
      }
      widget.update();
@@ -574,7 +609,7 @@ After this, task enters pending sign-off state — only completable via /lgtm <i
        uncertaintySection +
        robotReviewNote +
        `\n\n---\n` +
-        `Task #${task.id} is now ${shouldRunRobotReview && !store.get(task.id)?.pending_approval ? "not yet ready for human sign-off" : `pending human sign-off via \`/lgtm ${task.id}\``}.\n\n` +
+        `Task #${task.id} is now ${store.get(task.id)?.pending_approval ? `pending human sign-off via \`/lgtm ${task.id}\`` : "not yet ready for human sign-off"}.\n\n` +
        `**Self-check (non-blocking):** Look at this as the human will see it. ` +
        `Does the evidence directly address the done_criterion "${task.done_criterion}"? ` +
        `Would a skeptical reviewer find this convincing, or would they immediately ask ` +
@@ -654,8 +689,7 @@ This does not complete the task. Human /lgtm remains the only completion path.`,
    label: "robot_review_run",
    description: `Run the configured automatic robot reviewer against the current task evidence.
-Uses PI_LGTM_ROBOT_REVIEW_CMD if set, otherwise defaults to:
+Runs the same Pi-native reviewer stage used automatically by \`lgtm_ask\`.
 \`acpx --approve-reads --non-interactive-permissions deny opencode exec\`
 This appends a new robot-review iteration. If the reviewer marks evidence incomplete or unconvincing, pending human sign-off is cleared until stronger evidence is submitted and reviewed again.`,
    parameters: Type.Object({
@@ -671,7 +705,7 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
      const { review, command } = await runAutomaticRobotReview(task, signal);
      store.update(params.taskId, {
-        pending_approval: review.evidence_complete && review.evidence_convincing ? task.pending_approval : false,
+        pending_approval: review.accepted ? task.pending_approval : false,
        metadata: appendRobotReviewMetadata(task, review),
      });
      widget.update();