feat: run robot review via pi harness

This commit is contained in:
wassname
2026-04-17 11:10:13 +08:00
parent 0ba0173615
commit d52596aad1
2 changed files with 103 additions and 70 deletions
+4 -5
View File
@@ -89,7 +89,7 @@ After calling this, the task shows `👀` and is only completable via `/lgtm <id
The tool result includes a non-blocking self-check prompt asking whether the evidence directly addresses the `done_criterion` and whether a skeptical reviewer would find it convincing.
`lgtm_ask` also accepts `run_robot_review` (optional). If true, or if `PI_LGTM_AUTO_ROBOT_REVIEW=1`, the extension runs the configured robot reviewer immediately after storing evidence. A failing robot review clears `pending_approval` until the evidence is strengthened.
`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A failing or errored robot review clears `pending_approval` until the evidence is strengthened and reviewed again.
### `robot_review_ask`
@@ -115,17 +115,16 @@ Use this from a separate subagent or other model when possible. Reviews append a
Run the configured automatic robot reviewer against the current task evidence.
Default reviewer command:
Default reviewer stage:
```bash
acpx --approve-reads --non-interactive-permissions deny opencode exec
pi --mode json -p --no-session
```
Override with:
```bash
PI_LGTM_ROBOT_REVIEW_CMD='acpx --approve-reads --non-interactive-permissions deny codex exec'
PI_LGTM_AUTO_ROBOT_REVIEW=1
PI_LGTM_ROBOT_REVIEW_MODEL='openai/gpt-5'
```
This appends a new robot-review iteration. The reviewer returns an explicit `accepted` boolean as well as detailed observations, blind spots, and missing evidence. If the latest robot review rejects the evidence, `/lgtm` is blocked until stronger evidence is submitted and reviewed again.
+99 -65
View File
@@ -41,33 +41,12 @@ const AUTO_CLEAR_DELAY = 4;
type CommandResult = { stdout: string; stderr: string; exitCode: number | null };
/**
 * Quote `text` so a POSIX shell (the result is interpolated into a `bash -lc` command line)
 * treats it as exactly one literal argument.
 *
 * Single quotes are used because nothing is expanded inside them: `$VAR`, `$(...)`,
 * backticks, backslashes and newlines all pass through verbatim. JSON-style double
 * quoting does not provide that guarantee, since the shell still performs parameter
 * expansion and command substitution inside double quotes.
 *
 * @param text Arbitrary text to pass as a single shell argument.
 * @returns The text wrapped in single quotes, with embedded single quotes escaped.
 */
function shellQuote(text: string): string {
  // A single quote cannot appear inside a single-quoted string, so each one is emitted
  // as: close quote, backslash-escaped quote, reopen quote  ->  '\''
  return `'${text.replace(/'/g, `'\\''`)}'`;
}
function runShellCommand(command: string, signal?: AbortSignal): Promise<CommandResult> {
return new Promise((resolve, reject) => {
const child = spawn("bash", ["-lc", command], { stdio: ["ignore", "pipe", "pipe"] });
const stdoutChunks: Buffer[] = [];
const stderrChunks: Buffer[] = [];
child.stdout.on("data", (data) => stdoutChunks.push(data));
child.stderr.on("data", (data) => stderrChunks.push(data));
child.on("error", reject);
const onAbort = () => child.kill();
signal?.addEventListener("abort", onAbort, { once: true });
child.on("close", (exitCode) => {
signal?.removeEventListener("abort", onAbort);
if (signal?.aborted) {
reject(new Error("aborted"));
return;
}
resolve({
stdout: Buffer.concat(stdoutChunks).toString("utf-8"),
stderr: Buffer.concat(stderrChunks).toString("utf-8"),
exitCode,
});
});
});
/**
 * Build the executable/argv pair used to launch a child process with `args`.
 *
 * If `process.argv[1]` is non-empty, the current runtime binary (`process.execPath`)
 * is reused and that script path is placed ahead of `args`. Otherwise the bare
 * command name `pi` is returned with `args` passed through unchanged.
 *
 * NOTE(review): this presumes `process.argv[1]` is the Pi CLI entry script whenever
 * it is set — confirm for every host that loads this code.
 *
 * @param args Arguments to hand to the spawned process.
 * @returns The command to execute and the full argument list for it.
 */
function getPiInvocation(args: string[]): { command: string; args: string[] } {
  const entryScript = process.argv[1];
  return entryScript
    ? { command: process.execPath, args: [entryScript].concat(args) }
    : { command: "pi", args };
}
function extractRobotReviewJson(output: string): Record<string, unknown> {
@@ -128,11 +107,69 @@ async function runAutomaticRobotReview(
task: any,
signal?: AbortSignal,
): Promise<{ review: Omit<RobotReviewRecord, "iteration">; command: string }> {
const reviewerCommand = process.env.PI_LGTM_ROBOT_REVIEW_CMD?.trim()
|| "acpx --approve-reads --non-interactive-permissions deny opencode exec";
const prompt = buildRobotReviewPrompt(task);
const command = `${reviewerCommand} ${shellQuote(prompt)}`;
const result = await runShellCommand(command, signal);
const args = ["--mode", "json", "-p", "--no-session"];
const reviewerModel = process.env.PI_LGTM_ROBOT_REVIEW_MODEL?.trim();
if (reviewerModel) args.push("--model", reviewerModel);
args.push(prompt);
const invocation = getPiInvocation(args);
const commandLabel = `${invocation.command} ${args.slice(0, reviewerModel ? 6 : 4).join(" ")}`;
const result = await new Promise<CommandResult>((resolve, reject) => {
const child = spawn(invocation.command, invocation.args, { shell: false, stdio: ["ignore", "pipe", "pipe"] });
const stdoutChunks: Buffer[] = [];
const stderrChunks: Buffer[] = [];
let buffer = "";
let finalAssistantText = "";
child.stdout.on("data", (data) => {
stdoutChunks.push(data);
buffer += data.toString("utf-8");
const lines = buffer.split("\n");
buffer = lines.pop() || "";
for (const line of lines) {
if (!line.trim()) continue;
try {
const event = JSON.parse(line) as any;
if (event.type === "message_end" && event.message?.role === "assistant") {
const text = Array.isArray(event.message.content)
? event.message.content.find((part: any) => part.type === "text")?.text
: undefined;
if (typeof text === "string") finalAssistantText = text;
}
} catch {
// ignore malformed line noise
}
}
});
child.stderr.on("data", (data) => stderrChunks.push(data));
child.on("error", reject);
const onAbort = () => child.kill();
signal?.addEventListener("abort", onAbort, { once: true });
child.on("close", (exitCode) => {
signal?.removeEventListener("abort", onAbort);
if (signal?.aborted) {
reject(new Error("aborted"));
return;
}
if (buffer.trim()) {
try {
const event = JSON.parse(buffer) as any;
if (event.type === "message_end" && event.message?.role === "assistant") {
const text = Array.isArray(event.message.content)
? event.message.content.find((part: any) => part.type === "text")?.text
: undefined;
if (typeof text === "string") finalAssistantText = text;
}
} catch {
// ignore malformed trailing line
}
}
resolve({
stdout: finalAssistantText || Buffer.concat(stdoutChunks).toString("utf-8"),
stderr: Buffer.concat(stderrChunks).toString("utf-8"),
exitCode,
});
});
});
if (result.exitCode !== 0) {
throw new Error(`Robot reviewer failed (${result.exitCode ?? "?"}): ${(result.stderr || result.stdout).trim()}`);
}
@@ -143,9 +180,9 @@ async function runAutomaticRobotReview(
? parsed.missing_evidence.filter((item): item is string => typeof item === "string")
: [];
return {
command: reviewerCommand,
command: commandLabel,
review: {
reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : reviewerCommand,
reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : commandLabel,
scope: typeof parsed.scope === "string" ? parsed.scope : "task evidence package",
observations,
blind_spots: typeof parsed.blind_spots === "string" ? parsed.blind_spots : "not stated",
@@ -502,7 +539,6 @@ After this, task enters pending sign-off state — only completable via /lgtm <i
falsification_test: Type.String({ description: "What you ran and what you got, presented so both you and the human can sanity-check it. State: what you ran (command/experiment/log check), the actual output or result, and why that result could not occur if a failure mode were real. Must be traceable: include file paths, log snippets, counts, or commit. The human should be able to verify without re-running anything." }),
verification_hints: Type.Array(Type.String(), { description: "Where to look and what to check. Descriptions of evidence locations, not bare file paths. E.g. 'lines 45-60 in src/loss.py show the gradient check' not 'src/loss.py'." }),
remaining_uncertainty: Type.String({ description: "What's NOT tested, known limitations, edge cases deferred. If you can't articulate uncertainty, you haven't thought hard enough." }),
run_robot_review: Type.Optional(Type.Boolean({ description: "If true, run the configured automatic robot reviewer immediately after storing evidence." })),
}),
async execute(_toolCallId, params, signal, _onUpdate, _ctx) {
@@ -525,34 +561,33 @@ After this, task enters pending sign-off state — only completable via /lgtm <i
},
});
let robotReviewNote = "";
const shouldRunRobotReview = params.run_robot_review ?? process.env.PI_LGTM_AUTO_ROBOT_REVIEW === "1";
if (shouldRunRobotReview) {
const refreshedTask = store.get(params.taskId);
if (!refreshedTask) return textResult(`Task #${params.taskId} not found after evidence update`);
try {
const { review, command } = await runAutomaticRobotReview(refreshedTask, signal);
store.update(params.taskId, {
pending_approval: review.evidence_complete && review.evidence_convincing,
metadata: appendRobotReviewMetadata(refreshedTask, review),
});
robotReviewNote =
`\n\n### Automatic robot review\n` +
`Reviewer command: ${command}\n` +
`Evidence complete: ${review.evidence_complete ? "yes" : "no"}\n` +
`Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}\n` +
`${review.observations.map(o => `- ${o}`).join("\n")}`;
if (review.missing_evidence.length > 0) {
robotReviewNote += `\nMissing evidence:\n${review.missing_evidence.map(item => `- ${item}`).join("\n")}`;
}
if (!(review.evidence_complete && review.evidence_convincing)) {
robotReviewNote += `\nResult: human sign-off has been held back until the evidence is strengthened and reviewed again.`;
}
} catch (err: any) {
robotReviewNote =
`\n\n### Automatic robot review\n` +
`Reviewer failed: ${err.message}\n` +
`Task remains pending human sign-off; rerun with stronger evidence or call \`robot_review_run\` after fixing reviewer setup.`;
const refreshedTask = store.get(params.taskId);
if (!refreshedTask) return textResult(`Task #${params.taskId} not found after evidence update`);
try {
const { review, command } = await runAutomaticRobotReview(refreshedTask, signal);
store.update(params.taskId, {
pending_approval: review.accepted,
metadata: appendRobotReviewMetadata(refreshedTask, review),
});
robotReviewNote =
`\n\n### Automatic robot review\n` +
`Reviewer: ${command}\n` +
`Accepted: ${review.accepted ? "yes" : "no"}\n` +
`Evidence complete: ${review.evidence_complete ? "yes" : "no"}\n` +
`Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}\n` +
`${review.observations.map(o => `- ${o}`).join("\n")}`;
if (review.missing_evidence.length > 0) {
robotReviewNote += `\nMissing evidence:\n${review.missing_evidence.map(item => `- ${item}`).join("\n")}`;
}
if (!review.accepted) {
robotReviewNote += `\nResult: human sign-off has been held back until the evidence is strengthened and reviewed again.`;
}
} catch (err: any) {
store.update(params.taskId, { pending_approval: false });
robotReviewNote =
`\n\n### Automatic robot review\n` +
`Reviewer failed: ${err.message}\n` +
`Human sign-off is blocked until the reviewer stage succeeds.`;
}
widget.update();
@@ -574,7 +609,7 @@ After this, task enters pending sign-off state — only completable via /lgtm <i
uncertaintySection +
robotReviewNote +
`\n\n---\n` +
`Task #${task.id} is now ${shouldRunRobotReview && !store.get(task.id)?.pending_approval ? "not yet ready for human sign-off" : `pending human sign-off via \`/lgtm ${task.id}\``}.\n\n` +
`Task #${task.id} is now ${store.get(task.id)?.pending_approval ? `pending human sign-off via \`/lgtm ${task.id}\`` : "not yet ready for human sign-off"}.\n\n` +
`**Self-check (non-blocking):** Look at this as the human will see it. ` +
`Does the evidence directly address the done_criterion "${task.done_criterion}"? ` +
`Would a skeptical reviewer find this convincing, or would they immediately ask ` +
@@ -654,8 +689,7 @@ This does not complete the task. Human /lgtm remains the only completion path.`,
label: "robot_review_run",
description: `Run the configured automatic robot reviewer against the current task evidence.
Uses PI_LGTM_ROBOT_REVIEW_CMD if set, otherwise defaults to:
\`acpx --approve-reads --non-interactive-permissions deny opencode exec\`
Runs the same Pi-native reviewer stage used automatically by \`lgtm_ask\`.
This appends a new robot-review iteration. If the reviewer marks evidence incomplete or unconvincing, pending human sign-off is cleared until stronger evidence is submitted and reviewed again.`,
parameters: Type.Object({
@@ -671,7 +705,7 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
const { review, command } = await runAutomaticRobotReview(task, signal);
store.update(params.taskId, {
pending_approval: review.evidence_complete && review.evidence_convincing ? task.pending_approval : false,
pending_approval: review.accepted ? task.pending_approval : false,
metadata: appendRobotReviewMetadata(task, review),
});
widget.update();