Simplify automatic robot review

This commit is contained in:
wassname
2026-06-13 23:07:40 +08:00
parent 8b7159cc6d
commit 9423d299a3
5 changed files with 98 additions and 58 deletions
+6 -12
View File
@@ -92,7 +92,7 @@ After calling this, the task shows `👀` and is only completable via `/lgtm <id
The tool result includes a non-blocking self-check prompt asking whether the evidence directly addresses the `done_criterion` and whether a skeptical reviewer would find it convincing. The tool result includes a non-blocking self-check prompt asking whether the evidence directly addresses the `done_criterion` and whether a skeptical reviewer would find it convincing.
`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A failing or errored robot review clears `pending_approval` until the evidence is strengthened and reviewed again. `lgtm_ask` always runs the robot-review stage immediately after storing evidence. A robot review that rejects the evidence clears `pending_approval` until the evidence is strengthened and reviewed again. A reviewer crash, auth failure, timeout, or malformed output is recorded as a warning and leaves human sign-off open.
### `lgtm_supersede` ### `lgtm_supersede`
@@ -122,21 +122,15 @@ Use this from a separate subagent or other model when possible. Reviews append a
### `robot_review_run` ### `robot_review_run`
Run the configured automatic robot reviewer against the current task evidence. Run the automatic robot reviewer against the current task evidence using the current session model.
Default reviewer stage: Default reviewer stage:
```bash ```bash
pi --mode json -p --no-session pi --mode json -p --no-session --no-tools --no-extensions --model <current-session-model>
``` ```
Override with: This appends a new robot-review iteration. The reviewer returns an explicit `accepted` boolean as well as detailed observations, blind spots, and missing evidence. If the latest robot review rejects the evidence, `/lgtm` is blocked until stronger evidence is submitted and reviewed again. If the reviewer process fails to run or returns malformed output, the failure is recorded but human sign-off stays open.
```bash
PI_LGTM_ROBOT_REVIEW_MODEL='openai/gpt-5'
```
This appends a new robot-review iteration. The reviewer returns an explicit `accepted` boolean as well as detailed observations, blind spots, and missing evidence. If the latest robot review rejects the evidence, `/lgtm` is blocked until stronger evidence is submitted and reviewed again.
## Commands ## Commands
@@ -154,8 +148,8 @@ Interactive menu: view tasks, create task, clear completed/all.
pending -> in_progress -> (lgtm_ask) pending -> in_progress -> (lgtm_ask)
-> current evidence iteration N -> current evidence iteration N
-> robot review iteration(s) on current evidence 🤖 -> robot review iteration(s) on current evidence 🤖
-> pending_approval 👀 if latest robot review passes -> pending_approval 👀 if latest robot review passes, or reviewer infra fails
-> reviewer_failed_to_run | reviewer_rejected -> reviewer_rejected
-> lgtm_supersede or newer lgtm_ask -> superseded history + fresh current evidence -> lgtm_supersede or newer lgtm_ask -> superseded history + fresh current evidence
-> (/lgtm) -> completed -> (/lgtm) -> completed
-> deleted -> deleted
+42 -41
View File
@@ -77,27 +77,20 @@ export function getRobotReviewTimeoutMs(env: NodeJS.ProcessEnv = process.env): n
return Number.isFinite(configured) && configured > 0 ? configured : DEFAULT_ROBOT_REVIEW_TIMEOUT_MS; return Number.isFinite(configured) && configured > 0 ? configured : DEFAULT_ROBOT_REVIEW_TIMEOUT_MS;
} }
/** /** Format pi's current model object as the CLI's provider/model reference. */
* Pick a reviewer model from a different provider than the current one. export function getCurrentModelRef(model: unknown): string | undefined {
* Prefers cheap/fast models suitable for review tasks. if (!model || typeof model !== "object") return undefined;
* Returns undefined if no alternate provider is available (falls back to same model). const provider = typeof (model as any).provider === "string"
*/ ? (model as any).provider
export function pickAlternateReviewerModel(currentProviderId?: string): string | undefined { : typeof (model as any).providerId === "string"
// Ordered by: cheap, fast, good enough for structured review ? (model as any).providerId
const providerModels: Record<string, string> = { : undefined;
anthropic: "anthropic/claude-haiku-4-5", const id = typeof (model as any).id === "string"
"github-copilot": "github-copilot/gemini-3-flash-preview", ? (model as any).id
openrouter: "openrouter/google/gemini-2.5-flash", : typeof (model as any).modelId === "string"
}; ? (model as any).modelId
const providers = Object.keys(providerModels); : undefined;
const current = currentProviderId ?? ""; return provider && id ? `${provider}/${id}` : undefined;
// Try a different provider first
for (const p of providers) {
if (p !== current) return providerModels[p];
}
// All same? Just return the first non-current
return providers.length > 0 ? providerModels[providers[0]] : undefined;
} }
function getAssistantTextFromPiEvent(event: any): string | undefined { function getAssistantTextFromPiEvent(event: any): string | undefined {
@@ -583,16 +576,17 @@ function buildRobotReviewPrompt(task: any): string {
async function runAutomaticRobotReview( async function runAutomaticRobotReview(
task: any, task: any,
signal?: AbortSignal, signal?: AbortSignal,
currentProviderId?: string, currentModelRef?: string,
): Promise<{ review: Omit<RobotReviewRecord, "iteration">; command: string }> { ): Promise<{ review: Omit<RobotReviewRecord, "iteration">; command: string }> {
if (!currentModelRef) {
throw new Error("Automatic robot review requires an active current session model.");
}
const prompt = buildRobotReviewPrompt(task); const prompt = buildRobotReviewPrompt(task);
const args = ["--mode", "json", "-p", "--no-session", "--no-tools", "--no-extensions"]; const args = ["--mode", "json", "-p", "--no-session", "--no-tools", "--no-extensions", "--model", currentModelRef];
const reviewerModel = process.env.PI_LGTM_ROBOT_REVIEW_MODEL?.trim() || pickAlternateReviewerModel(currentProviderId);
if (reviewerModel) args.push("--model", reviewerModel);
args.push(prompt); args.push(prompt);
const invocation = getPiInvocation(args); const invocation = getPiInvocation(args);
const timeoutMs = getRobotReviewTimeoutMs(); const timeoutMs = getRobotReviewTimeoutMs();
const commandLabel = `${invocation.command} ${invocation.args.slice(0, reviewerModel ? 6 : 4).join(" ")}`; const commandLabel = `${invocation.command} ${invocation.args.slice(0, -1).join(" ")}`;
const result = await runRobotReviewCommand(invocation, signal, timeoutMs); const result = await runRobotReviewCommand(invocation, signal, timeoutMs);
if (result.exitCode !== 0) { if (result.exitCode !== 0) {
const error = new Error(`Robot reviewer failed (${result.exitCode ?? "?"}): ${(result.stderr || result.stdout).trim()}`) as Error & { rawOutput?: string }; const error = new Error(`Robot reviewer failed (${result.exitCode ?? "?"}): ${(result.stderr || result.stdout).trim()}`) as Error & { rawOutput?: string };
@@ -710,14 +704,11 @@ export default function (pi: ExtensionAPI) {
let currentTurn = 0; let currentTurn = 0;
let lastTaskToolUseTurn = 0; let lastTaskToolUseTurn = 0;
let reminderInjectedThisCycle = false; let reminderInjectedThisCycle = false;
let currentProvider: string | undefined;
pi.on("turn_start", async (_event, ctx) => { pi.on("turn_start", async (_event, ctx) => {
currentTurn++; currentTurn++;
widget.setUICtx(ctx.ui as UICtx); widget.setUICtx(ctx.ui as UICtx);
upgradeStoreIfNeeded(ctx); upgradeStoreIfNeeded(ctx);
const model = ctx.model;
if (model) currentProvider = (model as any).providerId ?? (model as any).provider;
if (autoClear.onTurnStart(currentTurn)) widget.update(); if (autoClear.onTurnStart(currentTurn)) widget.update();
}); });
@@ -1035,7 +1026,7 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma
supersede_reason: Type.Optional(Type.String({ description: "Why this evidence replaces an older submission on the same task." })), supersede_reason: Type.Optional(Type.String({ description: "Why this evidence replaces an older submission on the same task." })),
}), }),
async execute(_toolCallId, params, signal, _onUpdate, _ctx) { async execute(_toolCallId, params, signal, _onUpdate, ctx) {
const task = store.get(params.taskId); const task = store.get(params.taskId);
if (!task) return Promise.resolve(textResult(`Task #${params.taskId} not found`)); if (!task) return Promise.resolve(textResult(`Task #${params.taskId} not found`));
if (task.status === "completed") return Promise.resolve(textResult(`Task #${params.taskId} already completed`)); if (task.status === "completed") return Promise.resolve(textResult(`Task #${params.taskId} already completed`));
@@ -1065,7 +1056,7 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma
const refreshedTask = store.get(params.taskId); const refreshedTask = store.get(params.taskId);
if (!refreshedTask) return textResult(`Task #${params.taskId} not found after evidence update`); if (!refreshedTask) return textResult(`Task #${params.taskId} not found after evidence update`);
try { try {
const { review, command } = await runAutomaticRobotReview(refreshedTask, signal, currentProvider); const { review, command } = await runAutomaticRobotReview(refreshedTask, signal, getCurrentModelRef(ctx.model));
store.update(params.taskId, { store.update(params.taskId, {
pending_approval: shouldOpenHumanSignoffGate(refreshedTask, review.accepted), pending_approval: shouldOpenHumanSignoffGate(refreshedTask, review.accepted),
metadata: { metadata: {
@@ -1091,13 +1082,13 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma
} }
} catch (err: any) { } catch (err: any) {
store.update(params.taskId, { store.update(params.taskId, {
pending_approval: false, pending_approval: refreshedTask.pending_approval,
metadata: getAutomaticReviewFailureMetadata(err.message, err.rawOutput), metadata: getAutomaticReviewFailureMetadata(err.message, err.rawOutput),
}); });
robotReviewNote = robotReviewNote =
`\n\n### Automatic robot review\n` + `\n\n### Automatic robot review\n` +
`Reviewer failed: ${err.message}\n` + `Reviewer failed: ${err.message}\n` +
`Human sign-off is blocked until the reviewer stage succeeds.` + `Human sign-off is still allowed because reviewer failures are warnings, not evidence rejections.` +
(typeof err.rawOutput === "string" && err.rawOutput.trim() (typeof err.rawOutput === "string" && err.rawOutput.trim()
? `\n\n${formatReviewTextBlock("Reviewer raw output", err.rawOutput.trim())}` ? `\n\n${formatReviewTextBlock("Reviewer raw output", err.rawOutput.trim())}`
: ""); : "");
@@ -1233,7 +1224,7 @@ This does not complete the task. Human /lgtm remains the only completion path.`,
pi.registerTool({ pi.registerTool({
name: "robot_review_run", name: "robot_review_run",
label: "robot_review_run", label: "robot_review_run",
description: `Run the configured automatic robot reviewer against the current task evidence. description: `Run the automatic robot reviewer against the current task evidence using the current session model.
Runs the same Pi-native reviewer stage used automatically by \`lgtm_ask\`. Runs the same Pi-native reviewer stage used automatically by \`lgtm_ask\`.
@@ -1250,7 +1241,7 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
} }
try { try {
const { review, command } = await runAutomaticRobotReview(task, signal, currentProvider); const { review, command } = await runAutomaticRobotReview(task, signal, getCurrentModelRef(_ctx.model));
store.update(params.taskId, { store.update(params.taskId, {
pending_approval: shouldOpenHumanSignoffGate(task, review.accepted), pending_approval: shouldOpenHumanSignoffGate(task, review.accepted),
metadata: { metadata: {
@@ -1277,13 +1268,14 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
); );
} catch (err: any) { } catch (err: any) {
store.update(params.taskId, { store.update(params.taskId, {
pending_approval: false, pending_approval: task.pending_approval,
metadata: getAutomaticReviewFailureMetadata(err.message, err.rawOutput), metadata: getAutomaticReviewFailureMetadata(err.message, err.rawOutput),
}); });
widget.update(); widget.update();
return textResult( return textResult(
`## Automatic robot review for task #${task.id}: ${task.subject}\n` + `## Automatic robot review for task #${task.id}: ${task.subject}\n` +
`Reviewer failed: ${err.message}\n\n` + `Reviewer failed: ${err.message}\n\n` +
`Automatic review failures are warnings, not evidence rejections, so human sign-off is still allowed.\n\n` +
`Gate status: ${getGateStatus(store.get(params.taskId) ?? task)}` + `Gate status: ${getGateStatus(store.get(params.taskId) ?? task)}` +
(typeof err.rawOutput === "string" && err.rawOutput.trim() (typeof err.rawOutput === "string" && err.rawOutput.trim()
? `\n\n${formatReviewTextBlock("Reviewer raw output", err.rawOutput.trim())}` ? `\n\n${formatReviewTextBlock("Reviewer raw output", err.rawOutput.trim())}`
@@ -1463,12 +1455,17 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
if (noEvidence) { if (noEvidence) {
title = `⚠ Task #${taskId} has no agent-submitted evidence.\nSign off anyway?\nDone: ${task.done_criterion}`; title = `⚠ Task #${taskId} has no agent-submitted evidence.\nSign off anyway?\nDone: ${task.done_criterion}`;
signLabel = "✓ Override — sign off without evidence"; signLabel = "✓ Override — sign off without evidence";
} else if (reviewerFailed) {
title = `⚠ Task #${taskId} automatic robot review failed.\nSign off anyway?\nDone: ${task.done_criterion}`;
signLabel = "✓ Override — sign off despite reviewer failure";
} else if (robotRejected) { } else if (robotRejected) {
title = `⚠ Task #${taskId} robot review rejected the evidence.\nSign off anyway?\nDone: ${task.done_criterion}`; title = `⚠ Task #${taskId} robot review rejected the evidence.\nSign off anyway?\nDone: ${task.done_criterion}`;
signLabel = "✓ Override — sign off despite rejected review"; signLabel = "✓ Override — sign off despite rejected review";
} else if (reviewerFailed) {
if (task.pending_approval) {
title = `⚠ Task #${taskId} automatic robot review failed, but human sign-off is still allowed.\nContinue?\nDone: ${task.done_criterion}`;
signLabel = "✓ LGTM — sign off despite reviewer warning";
} else {
title = `⚠ Task #${taskId} automatic robot review failed.\nSign off anyway?\nDone: ${task.done_criterion}`;
signLabel = "✓ Override — sign off despite reviewer failure";
}
} }
const confirm = await ctx.ui.select(title, [signLabel, "✗ Cancel"]); const confirm = await ctx.ui.select(title, [signLabel, "✗ Cancel"]);
if (confirm !== signLabel) return; if (confirm !== signLabel) return;
@@ -1505,7 +1502,7 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
}; };
for (const t of open) groups[getDisplayStatus(t)].push(t); for (const t of open) groups[getDisplayStatus(t)].push(t);
const groupLabel: Record<DisplayStatus, string> = { const groupLabel: Record<DisplayStatus, string> = {
awaiting_signoff: "READY (robot ✓)", awaiting_signoff: "READY (human sign-off open)",
in_progress: "ACTIVE (no /lgtm evidence)", in_progress: "ACTIVE (no /lgtm evidence)",
pending: "PENDING (not started)", pending: "PENDING (not started)",
completed: "DONE", completed: "DONE",
@@ -1516,7 +1513,11 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp
if (inBucket.length === 0) continue; if (inBucket.length === 0) continue;
lines.push(` ${groupLabel[status]}:`); lines.push(` ${groupLabel[status]}:`);
for (const t of inBucket) { for (const t of inBucket) {
const warn = status === "awaiting_signoff" && !latestRobotReviewPasses(t) ? " ⚠ robot rejected" : ""; const warn = status === "awaiting_signoff"
? (typeof t.metadata?.robot_review_last_error === "string"
? " ⚠ reviewer warning"
: (!latestRobotReviewPasses(t) ? " ⚠ robot rejected" : ""))
: "";
lines.push(` #${t.id} ${t.subject}${warn}`); lines.push(` #${t.id} ${t.subject}${warn}`);
} }
} }
+8 -3
View File
@@ -57,10 +57,10 @@ export function getCompletionMode(task: Task): CompletionMode {
export function getReviewState(task: Task): ReviewState { export function getReviewState(task: Task): ReviewState {
if (task.status === "completed") return "human_signed_off"; if (task.status === "completed") return "human_signed_off";
if (task.pending_approval && hasCurrentEvidence(task)) return "ready_for_human";
if (typeof task.metadata?.robot_review_last_error === "string") return "reviewer_failed_to_run";
const latest = getLatestRobotReview(task); const latest = getLatestRobotReview(task);
if (latest && !latest.accepted) return "reviewer_rejected"; if (latest && !latest.accepted) return "reviewer_rejected";
if (task.pending_approval && hasCurrentEvidence(task)) return "ready_for_human";
if (typeof task.metadata?.robot_review_last_error === "string") return "reviewer_failed_to_run";
if (hasCurrentEvidence(task)) return "evidence_submitted"; if (hasCurrentEvidence(task)) return "evidence_submitted";
if (hasEvidenceHistory(task)) return "superseded"; if (hasEvidenceHistory(task)) return "superseded";
return "no_evidence"; return "no_evidence";
@@ -70,7 +70,12 @@ export function getGateStatus(task: Task): string {
const state = getReviewState(task); const state = getReviewState(task);
if (state === "human_signed_off") return "human signed off"; if (state === "human_signed_off") return "human signed off";
if (state === "no_evidence") return "no lgtm evidence submitted"; if (state === "no_evidence") return "no lgtm evidence submitted";
if (state === "ready_for_human") return `ready for human sign-off via /lgtm ${task.id}`; if (state === "ready_for_human") {
if (typeof task.metadata?.robot_review_last_error === "string") {
return `warning: automatic robot review failed, human sign-off still allowed via /lgtm ${task.id}: ${task.metadata.robot_review_last_error}`;
}
return `ready for human sign-off via /lgtm ${task.id}`;
}
if (state === "reviewer_failed_to_run") { if (state === "reviewer_failed_to_run") {
return `blocked: automatic robot review failed: ${task.metadata.robot_review_last_error}`; return `blocked: automatic robot review failed: ${task.metadata.robot_review_last_error}`;
} }
+35 -2
View File
@@ -82,13 +82,23 @@ describe("getGateStatus", () => {
}))).toBe("ready for human sign-off via /lgtm 1"); }))).toBe("ready for human sign-off via /lgtm 1");
}); });
it("reports reviewer failure separately from rejected evidence", () => { it("reports blocking reviewer failure when human sign-off is closed", () => {
expect(getGateStatus(makeTask({ expect(getGateStatus(makeTask({
metadata: { metadata: {
lgtm_evidence: "ok", lgtm_evidence: "ok",
robot_review_last_error: "Unexpected token 'a'", robot_review_last_error: "Unexpected token 'a'",
}, },
}))).toContain("automatic robot review failed"); }))).toContain("blocked: automatic robot review failed");
});
it("reports reviewer failure as a warning when human sign-off stays open", () => {
expect(getGateStatus(makeTask({
pending_approval: true,
metadata: {
lgtm_evidence: "ok",
robot_review_last_error: "Unexpected token 'a'",
},
}))).toContain("warning: automatic robot review failed");
}); });
it("reports rejected robot review when latest review does not accept", () => { it("reports rejected robot review when latest review does not accept", () => {
@@ -111,6 +121,29 @@ describe("getGateStatus", () => {
}, },
}))).toBe("blocked: latest robot review rejected the evidence"); }))).toBe("blocked: latest robot review rejected the evidence");
}); });
it("keeps rejection higher priority than a later reviewer warning", () => {
expect(getGateStatus(makeTask({
pending_approval: true,
metadata: {
lgtm_evidence: "ok",
robot_review_last_error: "timeout",
robot_reviews: [{
iteration: 1,
reviewer: "opencode",
scope: "task evidence",
observations: ["Observed missing output"],
blind_spots: "none",
accepted: false,
evidence_complete: false,
evidence_convincing: false,
missing_evidence: ["literal output"],
submitted_at: "2026-04-17T00:00:00.000Z",
mode: "manual",
}],
},
}))).toBe("blocked: latest robot review rejected the evidence");
});
}); });
describe("getDisplayStatus", () => { describe("getDisplayStatus", () => {
+7
View File
@@ -3,6 +3,7 @@ import {
DEFAULT_ROBOT_REVIEW_TIMEOUT_MS, DEFAULT_ROBOT_REVIEW_TIMEOUT_MS,
extractFinalAssistantTextFromPiJsonl, extractFinalAssistantTextFromPiJsonl,
extractRobotReviewJson, extractRobotReviewJson,
getCurrentModelRef,
getPiInvocation, getPiInvocation,
getRobotReviewTimeoutMs, getRobotReviewTimeoutMs,
runRobotReviewCommand, runRobotReviewCommand,
@@ -49,6 +50,12 @@ describe("robot review runner helpers", () => {
expect(getRobotReviewTimeoutMs({ PI_LGTM_ROBOT_REVIEW_TIMEOUT_MS: "bad" } as NodeJS.ProcessEnv)).toBe(DEFAULT_ROBOT_REVIEW_TIMEOUT_MS); expect(getRobotReviewTimeoutMs({ PI_LGTM_ROBOT_REVIEW_TIMEOUT_MS: "bad" } as NodeJS.ProcessEnv)).toBe(DEFAULT_ROBOT_REVIEW_TIMEOUT_MS);
}); });
it("formats the current model as the reviewer model ref", () => {
expect(getCurrentModelRef({ provider: "openai", id: "gpt-5" })).toBe("openai/gpt-5");
expect(getCurrentModelRef({ providerId: "anthropic", modelId: "claude-haiku" })).toBe("anthropic/claude-haiku");
expect(getCurrentModelRef({ provider: "openai" })).toBeUndefined();
});
it("times out bounded child commands", async () => { it("times out bounded child commands", async () => {
await expect(runRobotReviewCommand({ await expect(runRobotReviewCommand({
command: process.execPath, command: process.execPath,