mirror of
https://github.com/wassname/pi-goals.git
synced 2026-06-27 15:16:14 +08:00
Judge: read-only + bash (no edit/write), renderCall/renderResult, streaming progress
- Judge gets read, bash, grep, find, ls but edit+write are blocked via --exclude-tools - Added renderCall: shows goal name while running - Added renderResult: shows accept/reject icon, model, duration, collapsed/expanded view - Wired onUpdate through decideSignOff -> runJudge so the TUI shows progress while judging - Added SignOffDetails type for structured metadata - Added 120s timeout on judge subprocess
This commit is contained in:
+112
-28
@@ -36,6 +36,8 @@ import { spawn, spawnSync } from "node:child_process";
|
|||||||
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
||||||
import { basename, join, resolve } from "node:path";
|
import { basename, join, resolve } from "node:path";
|
||||||
import type { ExtensionAPI, ExtensionCommandContext, ExtensionContext } from "@earendil-works/pi-coding-agent";
|
import type { ExtensionAPI, ExtensionCommandContext, ExtensionContext } from "@earendil-works/pi-coding-agent";
|
||||||
|
import { getMarkdownTheme } from "@earendil-works/pi-coding-agent";
|
||||||
|
import { Container, Markdown, Spacer, Text } from "@earendil-works/pi-tui";
|
||||||
import { Type } from "@sinclair/typebox";
|
import { Type } from "@sinclair/typebox";
|
||||||
import { counts, findGoal, type Goal, type PlanDoc, parse, recordSignOff, type SignOff } from "./plan-file.js";
|
import { counts, findGoal, type Goal, type PlanDoc, parse, recordSignOff, type SignOff } from "./plan-file.js";
|
||||||
import {
|
import {
|
||||||
@@ -52,7 +54,11 @@ const STATE = "pi-goals-state";
|
|||||||
const PLAN_CONTEXT = "pi-goals-context"; // injected plan-mode guidance, stripped from history later
|
const PLAN_CONTEXT = "pi-goals-context"; // injected plan-mode guidance, stripped from history later
|
||||||
const STATUS_KEY = "pi-goals";
|
const STATUS_KEY = "pi-goals";
|
||||||
const WIDGET_KEY = "pi-goals-widget";
|
const WIDGET_KEY = "pi-goals-widget";
|
||||||
const READ_ONLY_TOOLS = ["read", "grep", "find", "ls", "bash"];
|
// Tools the sign-off judge gets: read-only inspection + bash (for git log, cat, running scripts to
|
||||||
|
// inspect). File mutators (edit, write) are blocked so the judge cannot modify anything.
|
||||||
|
// Names match pi's internal tool registry (grep→ffgrep, find→fffind, etc.).
|
||||||
|
const JUDGE_TOOLS = ["read", "bash", "grep", "find", "ls"];
|
||||||
|
const JUDGE_BLOCKED_TOOLS = ["edit", "write"];
|
||||||
// File mutators blocked while drafting goals (read-only plan mode, like narumiruna/pi-plan-mode), so
|
// File mutators blocked while drafting goals (read-only plan mode, like narumiruna/pi-plan-mode), so
|
||||||
// code isn't written before goals are agreed. The one allowed write is goals.md itself (the
|
// code isn't written before goals are agreed. The one allowed write is goals.md itself (the
|
||||||
// deliverable). A read-only task (a pure search) can still be explored in plan mode by nature.
|
// deliverable). A read-only task (a pure search) can still be explored in plan mode by nature.
|
||||||
@@ -270,18 +276,12 @@ export default function piGoalsExtension(pi: ExtensionAPI): void {
|
|||||||
|
|
||||||
pi.registerTool({
|
pi.registerTool({
|
||||||
name: "CompleteGoal",
|
name: "CompleteGoal",
|
||||||
// FIXME(label): "Complete goal" is ambiguous in the TUI — the user sees it running with no
|
label: "Goal signoff",
|
||||||
// hint that it spawns a read-only subagent. Change to "Subagent goal signoff" or similar so the
|
|
||||||
// running tool reads as an action, not just a checkbox flip.
|
|
||||||
// FIXME(progress): while the judge runs (potentially 10-120s), the user only sees "Working". Surface
|
|
||||||
// judge progress — e.g. set the tool label to "Subagent goal signoff (judging...)" via onUpdate,
|
|
||||||
// or emit a status notification so the user knows what's happening and can tell a hang from slowness.
|
|
||||||
label: "Complete goal",
|
|
||||||
description: completeGoalDescription,
|
description: completeGoalDescription,
|
||||||
parameters: Type.Object({
|
parameters: Type.Object({
|
||||||
goal: Type.String({ description: completeGoalParamDescription }),
|
goal: Type.String({ description: completeGoalParamDescription }),
|
||||||
}),
|
}),
|
||||||
async execute(_id, params, signal, _onUpdate, ctx) {
|
async execute(_id, params, signal, onUpdate, ctx) {
|
||||||
const content = readPlan(ctx);
|
const content = readPlan(ctx);
|
||||||
const goal = findGoal(parse(content), params.goal);
|
const goal = findGoal(parse(content), params.goal);
|
||||||
if (!goal) return text(`No goal "${params.goal}" in goals.md. Use the exact text after "goal:".`, true);
|
if (!goal) return text(`No goal "${params.goal}" in goals.md. Use the exact text after "goal:".`, true);
|
||||||
@@ -289,15 +289,68 @@ export default function piGoalsExtension(pi: ExtensionAPI): void {
|
|||||||
return text(`Goal "${goal.subject}" has no evidence yet. Add an evidence: list to the goal in goals.md (artifacts + a short read showing the discriminator is satisfied), then call CompleteGoal.`, true);
|
return text(`Goal "${goal.subject}" has no evidence yet. Add an evidence: list to the goal in goals.md (artifacts + a short read showing the discriminator is satisfied), then call CompleteGoal.`, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Decide the outcome (the I/O); recordSignOff applies it to the file (the pure write).
|
const handleUpdate = (partial: { content: Array<{ type: "text"; text: string }>; details: SignOffDetails }) => {
|
||||||
// Evidence and the artifacts to inspect both come from the goal's evidence: block (single source of truth).
|
onUpdate?.(partial);
|
||||||
const { outcome, reasoning } = await decideSignOff(goal, goal.evidence.join("\n"), goal.evidence, state.judgeModel, ctx.cwd, signal);
|
};
|
||||||
|
|
||||||
|
const { outcome, reasoning, durationMs } = await decideSignOff(goal, goal.evidence.join("\n"), goal.evidence, state.judgeModel, ctx.cwd, signal, handleUpdate);
|
||||||
const res = recordSignOff(content, goal.subject, stamp(), outcome);
|
const res = recordSignOff(content, goal.subject, stamp(), outcome);
|
||||||
if (res.content !== content) writePlan(ctx, res.content);
|
if (res.content !== content) writePlan(ctx, res.content);
|
||||||
updateWidget(ctx);
|
updateWidget(ctx);
|
||||||
// Surface the sign-off judge's actual reasoning, not just the verdict, so it's visible (was a gap).
|
|
||||||
const detail = reasoning ? `\n\n--- sign-off judge ---\n${reasoning}` : "";
|
const detail = reasoning ? `\n\n--- sign-off judge ---\n${reasoning}` : "";
|
||||||
return text(res.message + detail, res.isError);
|
const outcomeLabel = outcome.kind === "accepted" ? "accepted" : outcome.kind === "verify_failed" ? "verify_failed" : "rejected";
|
||||||
|
const details: SignOffDetails = {
|
||||||
|
goal: goal.subject,
|
||||||
|
outcome: outcomeLabel,
|
||||||
|
durationMs,
|
||||||
|
verifyCommand: goal.verify ?? undefined,
|
||||||
|
verifyExitCode: outcome.kind === "verify_failed" ? outcome.exitCode : undefined,
|
||||||
|
judgeModel: state.judgeModel ?? undefined,
|
||||||
|
reasoning,
|
||||||
|
isError: res.isError,
|
||||||
|
};
|
||||||
|
return textWithDetails(res.message + detail, details, res.isError);
|
||||||
|
},
|
||||||
|
|
||||||
|
renderCall(args, theme) {
|
||||||
|
const goalText = args.goal.length > 80 ? `${args.goal.slice(0, 80)}...` : args.goal;
|
||||||
|
return new Text(
|
||||||
|
`${theme.fg("toolTitle", theme.bold("goal signoff "))}${theme.fg("dim", goalText)}`,
|
||||||
|
0, 0,
|
||||||
|
);
|
||||||
|
},
|
||||||
|
|
||||||
|
renderResult(result, { expanded }, theme) {
|
||||||
|
const details = result.details as SignOffDetails | undefined;
|
||||||
|
const body = result.content[0]?.type === "text" ? result.content[0].text : "(no output)";
|
||||||
|
if (!details || details.outcome === "running") return new Text(body, 0, 0);
|
||||||
|
|
||||||
|
const icon = details.outcome === "accepted" ? theme.fg("success", "✔") : theme.fg("error", "✗");
|
||||||
|
const outcomeText = details.outcome === "accepted" ? "accepted" : details.outcome === "verify_failed" ? `verify failed (exit ${details.verifyExitCode})` : "rejected";
|
||||||
|
const header = `${icon} ${theme.fg("toolTitle", theme.bold("goal signoff "))}${theme.fg("accent", outcomeText)}`;
|
||||||
|
const duration = details.durationMs < 1000 ? `${details.durationMs}ms` : `${(details.durationMs / 1000).toFixed(1)}s`;
|
||||||
|
const sub = [details.judgeModel, duration].filter(Boolean).join(" · ");
|
||||||
|
|
||||||
|
if (!expanded) {
|
||||||
|
let text = header;
|
||||||
|
if (sub) text += `\n${theme.fg("dim", sub)}`;
|
||||||
|
text += `\n\n${theme.fg("toolOutput", body.slice(0, 500))}`;
|
||||||
|
if (body.length > 500) text += theme.fg("dim", "...");
|
||||||
|
text += `\n${theme.fg("muted", "(Ctrl+O to expand)")}`;
|
||||||
|
return new Text(text, 0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
const container = new Container();
|
||||||
|
container.addChild(new Text(header, 0, 0));
|
||||||
|
if (sub) container.addChild(new Text(theme.fg("dim", sub), 0, 0));
|
||||||
|
if (details.verifyCommand) {
|
||||||
|
container.addChild(new Spacer(1));
|
||||||
|
container.addChild(new Text(theme.fg("muted", `verify: ${details.verifyCommand}`), 0, 0));
|
||||||
|
}
|
||||||
|
container.addChild(new Spacer(1));
|
||||||
|
container.addChild(new Text(theme.fg("muted", "Judge"), 0, 0));
|
||||||
|
container.addChild(new Markdown(body.trim(), 0, 0, getMarkdownTheme()));
|
||||||
|
return container;
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -384,10 +437,27 @@ export default function piGoalsExtension(pi: ExtensionAPI): void {
|
|||||||
|
|
||||||
// --- helpers (module scope; pure enough to keep out of the closure) -------------------------------
|
// --- helpers (module scope; pure enough to keep out of the closure) -------------------------------
|
||||||
|
|
||||||
|
/** Structured details returned by CompleteGoal so renderCall/renderResult can show metadata. */
|
||||||
|
interface SignOffDetails {
|
||||||
|
goal: string;
|
||||||
|
outcome: "accepted" | "rejected" | "verify_failed" | "running";
|
||||||
|
phase?: string; // "verifying" | "spawning" | "judging" — while running
|
||||||
|
durationMs: number;
|
||||||
|
verifyCommand?: string;
|
||||||
|
verifyExitCode?: number;
|
||||||
|
judgeModel?: string;
|
||||||
|
reasoning: string;
|
||||||
|
isError?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
function text(s: string, isError = false) {
|
function text(s: string, isError = false) {
|
||||||
return { content: [{ type: "text" as const, text: s }], details: { isError }, isError };
|
return { content: [{ type: "text" as const, text: s }], details: { isError }, isError };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function textWithDetails(s: string, details: SignOffDetails, isError = false) {
|
||||||
|
return { content: [{ type: "text" as const, text: s }], details, isError };
|
||||||
|
}
|
||||||
|
|
||||||
function stamp(): string {
|
function stamp(): string {
|
||||||
return new Date().toISOString().slice(0, 16).replace("T", " ");
|
return new Date().toISOString().slice(0, 16).replace("T", " ");
|
||||||
}
|
}
|
||||||
@@ -401,20 +471,30 @@ async function decideSignOff(
|
|||||||
judgeModel: string | null,
|
judgeModel: string | null,
|
||||||
cwd: string,
|
cwd: string,
|
||||||
signal: AbortSignal | undefined,
|
signal: AbortSignal | undefined,
|
||||||
): Promise<{ outcome: SignOff; reasoning: string }> {
|
onUpdate?: (partial: { content: Array<{ type: "text"; text: string }>; details: SignOffDetails }) => void,
|
||||||
|
): Promise<{ outcome: SignOff; reasoning: string; durationMs: number }> {
|
||||||
|
const startedAt = Date.now();
|
||||||
|
const emit = (phase: string, text: string) => {
|
||||||
|
onUpdate?.({
|
||||||
|
content: [{ type: "text" as const, text }],
|
||||||
|
details: { goal: goal.subject, outcome: "running", phase, durationMs: Date.now() - startedAt, verifyCommand: goal.verify ?? undefined, judgeModel: judgeModel ?? undefined, reasoning: "" },
|
||||||
|
});
|
||||||
|
};
|
||||||
let verifyResult: { command: string; exitCode: number; outputTail: string } | null = null;
|
let verifyResult: { command: string; exitCode: number; outputTail: string } | null = null;
|
||||||
if (goal.verify) {
|
if (goal.verify) {
|
||||||
|
emit("verifying", `Running verify: ${goal.verify}`);
|
||||||
verifyResult = runVerify(goal.verify, cwd, signal);
|
verifyResult = runVerify(goal.verify, cwd, signal);
|
||||||
if (verifyResult.exitCode !== 0) {
|
if (verifyResult.exitCode !== 0) {
|
||||||
return {
|
return {
|
||||||
outcome: { kind: "verify_failed", exitCode: verifyResult.exitCode, outputTail: verifyResult.outputTail },
|
outcome: { kind: "verify_failed", exitCode: verifyResult.exitCode, outputTail: verifyResult.outputTail },
|
||||||
reasoning: `verify \`${goal.verify}\` exited ${verifyResult.exitCode}:\n${verifyResult.outputTail}`,
|
reasoning: `verify \`${goal.verify}\` exited ${verifyResult.exitCode}:\n${verifyResult.outputTail}`,
|
||||||
|
durationMs: Date.now() - startedAt,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const verdict = await runJudge(goal, evidence, paths, verifyResult, judgeModel, cwd, signal);
|
const verdict = await runJudge(goal, evidence, paths, verifyResult, judgeModel, cwd, signal, onUpdate);
|
||||||
const outcome: SignOff = verdict.accept ? { kind: "accepted" } : { kind: "rejected", missing: verdict.missing };
|
const outcome: SignOff = verdict.accept ? { kind: "accepted" } : { kind: "rejected", missing: verdict.missing };
|
||||||
return { outcome, reasoning: verdict.reasoning };
|
return { outcome, reasoning: verdict.reasoning, durationMs: verdict.durationMs };
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Run the goal's verify command. It is agent-authored and trusted (single-user machine, guide-not-guard). */
|
/** Run the goal's verify command. It is agent-authored and trusted (single-user machine, guide-not-guard). */
|
||||||
@@ -442,7 +522,15 @@ async function runJudge(
|
|||||||
judgeModel: string | null,
|
judgeModel: string | null,
|
||||||
cwd: string,
|
cwd: string,
|
||||||
signal: AbortSignal | undefined,
|
signal: AbortSignal | undefined,
|
||||||
): Promise<{ accept: boolean; missing: string; reasoning: string }> {
|
onUpdate?: (partial: { content: Array<{ type: "text"; text: string }>; details: SignOffDetails }) => void,
|
||||||
|
): Promise<{ accept: boolean; missing: string; reasoning: string; durationMs: number }> {
|
||||||
|
const startedAt = Date.now();
|
||||||
|
const emit = (phase: string, text: string) => {
|
||||||
|
onUpdate?.({
|
||||||
|
content: [{ type: "text" as const, text }],
|
||||||
|
details: { goal: goal.subject, outcome: "running", phase, durationMs: Date.now() - startedAt, verifyCommand: goal.verify ?? undefined, judgeModel: judgeModel ?? undefined, reasoning: "" },
|
||||||
|
});
|
||||||
|
};
|
||||||
const task = evidenceJudgeUser({
|
const task = evidenceJudgeUser({
|
||||||
subject: goal.subject,
|
subject: goal.subject,
|
||||||
discriminator: goal.discriminator,
|
discriminator: goal.discriminator,
|
||||||
@@ -452,19 +540,15 @@ async function runJudge(
|
|||||||
evidence,
|
evidence,
|
||||||
paths,
|
paths,
|
||||||
});
|
});
|
||||||
const args = ["-p", "--no-session", "--tools", READ_ONLY_TOOLS.join(","), "--append-system-prompt", evidenceJudgeSystem];
|
const args = ["-p", "--no-session", "--tools", JUDGE_TOOLS.join(","), "--exclude-tools", JUDGE_BLOCKED_TOOLS.join(","), "--append-system-prompt", evidenceJudgeSystem];
|
||||||
if (judgeModel) args.push("--model", judgeModel);
|
if (judgeModel) args.push("--model", judgeModel);
|
||||||
args.push(task);
|
args.push(task);
|
||||||
|
|
||||||
|
emit("spawning", `Spawning read-only judge for: ${goal.subject}`);
|
||||||
const inv = getPiInvocation(args);
|
const inv = getPiInvocation(args);
|
||||||
// FIXME(hang): no timeout — if the judge subprocess stalls, this promise never resolves and the
|
|
||||||
// user sees "Working" indefinitely with no way to know what's happening. Add a setTimeout that
|
|
||||||
// resolves with a reject verdict after, say, 120s, and surface progress (e.g. stderr lines) so
|
|
||||||
// the user can tell the judge is still alive. See also: no progress indication in TUI while judging.
|
|
||||||
// FIXME(side-effect): pi -p --no-session clones the repo into the PARENT of cwd (so alongside
|
// FIXME(side-effect): pi -p --no-session clones the repo into the PARENT of cwd (so alongside
|
||||||
// the working dir), leaving a stale pi-plan/ directory. The judge should run in a temp dir or
|
// the working dir), leaving a stale directory. The judge should run in a temp dir or inside the
|
||||||
// inside the existing repo checkout so it doesn't pollute the user's workspace with side-effect
|
// existing repo checkout so it doesn't pollute the user's workspace.
|
||||||
// clones that then fail the next sign-off (the judge found its own clone and rejected the goal).
|
|
||||||
const JUDGE_TIMEOUT_MS = 120_000;
|
const JUDGE_TIMEOUT_MS = 120_000;
|
||||||
const output = await new Promise<string>((resolve) => {
|
const output = await new Promise<string>((resolve) => {
|
||||||
let settled = false;
|
let settled = false;
|
||||||
@@ -472,7 +556,7 @@ async function runJudge(
|
|||||||
if (!settled) {
|
if (!settled) {
|
||||||
settled = true;
|
settled = true;
|
||||||
proc.kill();
|
proc.kill();
|
||||||
resolve(`VERDICT: reject\nmissing: judge timed out after ${JUDGE_TIMEOUT_MS / 1000}s — the subagent did not return a verdict`);
|
resolve(`VERDICT: reject\nmissing: judge timed out after ${JUDGE_TIMEOUT_MS / 1000}s`);
|
||||||
}
|
}
|
||||||
}, JUDGE_TIMEOUT_MS);
|
}, JUDGE_TIMEOUT_MS);
|
||||||
const proc = spawn(inv.command, inv.args, { cwd, shell: false, stdio: ["ignore", "pipe", "pipe"], signal });
|
const proc = spawn(inv.command, inv.args, { cwd, shell: false, stdio: ["ignore", "pipe", "pipe"], signal });
|
||||||
@@ -506,5 +590,5 @@ async function runJudge(
|
|||||||
// end, so keep the tail when it's long.
|
// end, so keep the tail when it's long.
|
||||||
const trimmed = clean.trim();
|
const trimmed = clean.trim();
|
||||||
const reasoning = trimmed.length > 1800 ? `...\n${trimmed.slice(-1800)}` : trimmed;
|
const reasoning = trimmed.length > 1800 ? `...\n${trimmed.slice(-1800)}` : trimmed;
|
||||||
return { accept, missing, reasoning };
|
return { accept, missing, reasoning, durationMs: Date.now() - startedAt };
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user