pi-plan: plan-mode goals + evidence in one plan.md, subagent sign-off

Small, guide-not-gate plan/goal tracker for pi. The agent edits plan.md with its normal Edit tool; CompleteGoal is the one blessed path that runs verify + a read-only judge and records the result. Plan mode drafts goals (done_when + failure_modes + subtasks), a per-turn injection keeps the active goal alive through compaction, and a reminder drives upkeep + autonomy. - src/plan-file.ts: pure parse + the two writes CompleteGoal needs + recordSignOff - src/index.ts: plan mode, review menu, injection, reminder, widget, CompleteGoal, oracle spawn - src/prompts.ts: all model-facing text in flow order - test/: 15 unit tests (parser, disambiguation, sign-off record logic) Co-Authored-By: Claudypoo <288921227+claudypoo@users.noreply.github.com>
2026-06-27 18:05:50 +08:00 · 2026-06-15 18:15:03 +08:00
commit d97b532d7b
10 changed files with 1449 additions and 0 deletions
@@ -0,0 +1,380 @@
+/**
+ * pi-plan — plan mode that sets up goals with evidence, tracked in one plan.md, signed off by a
+ * read-only subagent check. A successor to pi-lgtm, kept deliberately small (≈ burneikis/pi-plan
+ * plus the additions: goals + failure_modes + subtasks, a sign-off check, a widget, a reminder).
+ *
+ * Philosophy (spec D3): the form guides, it does not gate. The agent edits plan.md with its normal
+ * Edit tool. The one blessed tool is CompleteGoal, which runs the sign-off check and records it. The
+ * reminder + the injected plan + git/widget visibility carry the process; we trust the agent's
+ * judgement rather than guarding it.
+ *
+ * Flow:
+ *   /plan <objective>  -> plan mode: agent explores, drafts goals into plan.md (planDrafting guides)
+ *   agent_end          -> review menu (Ready / Edit / $EDITOR / Cancel); Ready offers compaction
+ *   execution          -> each turn, inject the plan summary (survives compaction) + a reminder;
+ *                         agent works goals, ticks subtasks, appends ## Log, calls CompleteGoal
+ *   CompleteGoal       -> optional deterministic verify, then a read-only oracle judge -> accept
+ *                         flips status:done + logs; reject returns what's missing
+ *
+ * All model-facing text lives in prompts.ts, in flow order.
+ */
+
+import { spawn, spawnSync } from "node:child_process";
+import { existsSync, readFileSync, writeFileSync } from "node:fs";
+import { basename, join } from "node:path";
+import type { ExtensionAPI, ExtensionCommandContext, ExtensionContext } from "@earendil-works/pi-coding-agent";
+import { Type } from "@sinclair/typebox";
+import { counts, findGoal, type Goal, type PlanDoc, parse, recordSignOff, type SignOff } from "./plan-file.js";
+import { evidenceJudgeSystem, evidenceJudgeUser, planDrafting, planInjection, reminder } from "./prompts.js";
+
+// Custom-entry type under which PlanState snapshots are appended to the session (written by
+// persist, read back in the session_start hook).
+const STATE = "pi-plan-state";
+const PLAN_CONTEXT = "pi-plan-context"; // injected plan-mode guidance, stripped from history later
+// Keys passed to ctx.ui.setStatus / ctx.ui.setWidget respectively.
+const STATUS_KEY = "pi-plan";
+const WIDGET_KEY = "pi-plan-widget";
+// Tool allow-list handed to the judge subprocess via --tools.
+// NOTE(review): "bash" is not inherently read-only, and the judge prompt only names
+// read/grep/find/ls — confirm that including it here is intended.
+const READ_ONLY_TOOLS = ["read", "grep", "find", "ls", "bash"];
+
+/** Extension state; snapshots are appended to the session and merged back on session_start. */
+interface PlanState {
+	/** True between `/plan <objective>` and leaving the review menu (Ready or Cancel). */
+	isPlanMode: boolean;
+	/** The text given to `/plan <objective>`; reset to null by `/plan clear`. */
+	objective: string | null;
+	/** Optional model ref for the sign-off judge; unset => the subprocess uses pi's default model. */
+	judgeModel: string | null;
+}
+
+export default function piPlanExtension(pi: ExtensionAPI): void {
+	let state: PlanState = { isPlanMode: false, objective: null, judgeModel: null };
+	// Reminder cadence: fire when an active goal exists but plan.md was not touched since last turn.
+	let lastInjectedPlan = "";
+
+	const planPath = (ctx: ExtensionContext) => join(ctx.cwd, "plan.md");
+	const readPlan = (ctx: ExtensionContext): string => (existsSync(planPath(ctx)) ? readFileSync(planPath(ctx), "utf-8") : "");
+
+	/** Append the current PlanState to the session as a custom entry so session_start can restore it. */
+	function persist(): void {
+		const snapshot = state;
+		pi.appendEntry<PlanState>(STATE, snapshot);
+	}
+
+	/**
+	 * Refresh the status line and the widget. Plan mode shows a fixed "planning" banner; otherwise
+	 * plan.md is parsed and either goal progress is shown or, when it has no goals, both are cleared.
+	 */
+	function updateWidget(ctx: ExtensionContext): void {
+		if (!state.isPlanMode) {
+			const doc = parse(readPlan(ctx));
+			if (doc.goals.length > 0) {
+				const { done } = counts(doc);
+				const progress = `◷ ${done}/${doc.goals.length} goals`;
+				ctx.ui.setStatus(STATUS_KEY, ctx.ui.theme.fg("accent", progress));
+				ctx.ui.setWidget(WIDGET_KEY, goalWidgetLines(doc));
+			} else {
+				// Passing undefined clears the status and the widget.
+				ctx.ui.setStatus(STATUS_KEY, undefined);
+				ctx.ui.setWidget(WIDGET_KEY, undefined);
+			}
+			return;
+		}
+		ctx.ui.setStatus(STATUS_KEY, ctx.ui.theme.fg("warning", "planning"));
+		ctx.ui.setWidget(WIDGET_KEY, ["pi-plan: drafting goals", "Write goals to plan.md, then review."]);
+	}
+
+	/**
+	 * Build the widget text: a title line, one line per goal that is not done (status glyph,
+	 * subject, count of unticked subtasks), and a trailing tally of hidden done goals if any.
+	 */
+	function goalWidgetLines(doc: PlanDoc): string[] {
+		const glyphs: Record<Goal["status"], string> = { done: "✔", active: "▸", open: "◻", cancelled: "✗" };
+		const title = `Plan: ${doc.objective || "(untitled)"}`;
+		const rows = doc.goals
+			.filter((goal) => goal.status !== "done") // finished goals stay in the file but are hidden here
+			.map((goal) => {
+				const remaining = goal.subtasks.reduce((n, sub) => (sub.done ? n : n + 1), 0);
+				const suffix = remaining ? ` (${remaining} todo)` : "";
+				return `${glyphs[goal.status]} ${goal.subject}${suffix}`;
+			});
+		const finished = counts(doc).done;
+		return finished ? [title, ...rows, `(${finished} done, hidden)`] : [title, ...rows];
+	}
+
+	// --- plan mode: setup -------------------------------------------------------------------------
+
+	// `/plan` dispatch:
+	//   /plan clear          -> empty plan.md (after confirmation)
+	//   /plan judge [model]  -> set or reset the sign-off judge model
+	//   /plan                -> show the current plan
+	//   /plan <objective>    -> enter plan mode for that objective
+	pi.registerCommand("plan", {
+		description: "Plan mode: set up goals (with evidence) in plan.md, then work them. /plan <objective>",
+		handler: async (args, ctx) => {
+			const arg = args.trim();
+			if (arg === "clear") {
+				await clearPlan(ctx);
+				return;
+			}
+			// Match "judge" as a whole word only, so an objective that merely starts with those
+			// letters ("judgement-call refactor") is still treated as an objective.
+			const judgeMatch = /^judge(?:\s+([\s\S]*))?$/.exec(arg);
+			if (judgeMatch) {
+				setJudge((judgeMatch[1] ?? "").trim(), ctx);
+				return;
+			}
+			if (!arg) {
+				showPlan(ctx);
+				return;
+			}
+
+			state = { ...state, isPlanMode: true, objective: arg };
+			persist();
+			updateWidget(ctx);
+			pi.sendUserMessage(
+				`Enter plan mode for this objective: ${arg}\n\nExplore read-only, then write the plan to ${planPath(ctx)}.`,
+				{ deliverAs: "followUp" },
+			);
+		},
+	});
+
+	/** Store the judge model (an empty ref clears it back to null), persist, and notify the user. */
+	function setJudge(ref: string, ctx: ExtensionContext): void {
+		const judgeModel = ref || null;
+		state = { ...state, judgeModel };
+		persist();
+		const notice = ref ? `Sign-off judge model set to ${ref}` : "Sign-off judge reset to the default model";
+		ctx.ui.notify(notice, "info");
+	}
+
+	/**
+	 * Empty plan.md and leave plan mode. Asks for confirmation when a UI is available; without a
+	 * UI it clears straight away. The file is truncated, not deleted.
+	 */
+	async function clearPlan(ctx: ExtensionContext): Promise<void> {
+		const planExists = existsSync(planPath(ctx));
+		if (!planExists) {
+			ctx.ui.notify("No plan.md to clear.", "info");
+			return;
+		}
+		if (ctx.hasUI) {
+			const confirmLabel = "Clear plan.md";
+			const picked = await ctx.ui.select("Clear plan.md? (it stays in git history)", ["Cancel", confirmLabel]);
+			// Anything other than the explicit confirmation (including a dismissed menu) aborts.
+			if (picked !== confirmLabel) return;
+		}
+		writeFileSync(planPath(ctx), "");
+		const cleared: PlanState = { ...state, isPlanMode: false, objective: null };
+		state = cleared;
+		persist();
+		updateWidget(ctx);
+		ctx.ui.notify("Cleared plan.md.", "info");
+	}
+
+	/** Show plan.md verbatim as a notification, or a hint when it is missing or blank. */
+	function showPlan(ctx: ExtensionContext): void {
+		const content = readPlan(ctx);
+		const isBlank = content.trim().length === 0;
+		const message = isBlank ? "No plan yet. Use /plan <objective> to start." : content;
+		ctx.ui.notify(message, "info");
+	}
+
+	// --- review loop (after the agent drafts the plan) --------------------------------------------
+
+	/**
+	 * Review menu shown after the agent has drafted the plan. Loops until the user picks Ready
+	 * (hands over to startExecution), Cancel or dismisses the menu (leaves plan mode, keeps
+	 * plan.md), or submits revision notes (sent to the agent; agent_end re-opens this loop).
+	 * "Open in $EDITOR" edits plan.md in place and then shows the menu again with fresh counts.
+	 */
+	async function reviewLoop(ctx: ExtensionContext, cmdCtx: ExtensionCommandContext): Promise<void> {
+		while (true) {
+			// Re-parse on every pass so the goal count reflects edits made from the menu.
+			const doc = parse(readPlan(ctx));
+			const choice = await ctx.ui.select(`Plan: ${doc.goals.length} goal(s). What next?`, [
+				"Ready — start working the plan",
+				"Edit — ask the agent to revise",
+				"Open in $EDITOR",
+				"Cancel — leave plan mode",
+			]);
+			if (!choice || choice.startsWith("Cancel")) {
+				exitPlanMode(ctx);
+				ctx.ui.notify("Left plan mode. plan.md kept.", "info");
+				return;
+			}
+			if (choice.startsWith("Ready")) return startExecution(ctx, cmdCtx);
+			if (choice.startsWith("Edit")) {
+				const changes = await ctx.ui.editor("What should change about the plan?", "");
+				if (changes?.trim()) {
+					pi.sendUserMessage(`Revise the plan at ${planPath(ctx)} with these changes, same format:\n\n${changes.trim()}`);
+					return; // agent_end re-opens the review loop
+				}
+				continue;
+			}
+			if (choice.startsWith("Open")) {
+				const editor = process.env.EDITOR || process.env.VISUAL || "vi";
+				const target = planPath(ctx);
+				// First try the value as a single executable path (it may contain spaces).
+				let res = spawnSync(editor, [target], { stdio: "inherit" });
+				// $EDITOR is often a command line ("code --wait", "emacsclient -t"); spawning that
+				// as one executable name fails, so retry with the arguments split out.
+				if (res.error && /\s/.test(editor.trim())) {
+					const [cmd, ...editorArgs] = editor.trim().split(/\s+/);
+					res = spawnSync(cmd, [...editorArgs, target], { stdio: "inherit" });
+				}
+				// Tell the user instead of silently re-showing the menu.
+				if (res.error) ctx.ui.notify(`Could not launch editor "${editor}": ${res.error.message}`, "warning");
+			}
+		}
+	}
+
+	/** Leave plan mode: clear the flag, persist the new state, and redraw the widget. */
+	function exitPlanMode(ctx: ExtensionContext): void {
+		const next: PlanState = { ...state, isPlanMode: false };
+		state = next;
+		persist();
+		updateWidget(ctx);
+	}
+
+	/**
+	 * Leave plan mode and kick off execution. With a UI the user chooses between continuing in
+	 * the current context and starting a fresh session (parented to the current session file);
+	 * without a UI the current context is used. Ends by queueing the "work the plan" message.
+	 */
+	async function startExecution(ctx: ExtensionContext, cmdCtx: ExtensionCommandContext): Promise<void> {
+		// Offer a clean execution context (D13): some runs want the fresh handoff, some want to keep it.
+		let fresh = false;
+		if (ctx.hasUI) {
+			const choice = await ctx.ui.select("Start working the plan in...", [
+				"This context (keep history)",
+				"A fresh, compacted context",
+			]);
+			fresh = choice?.startsWith("A fresh") ?? false;
+		}
+		exitPlanMode(ctx);
+		const doc = parse(readPlan(ctx));
+		if (doc.objective) pi.setSessionName(`Plan: ${doc.objective}`);
+
+		if (fresh) {
+			// The agent_end hook reaches this function with an ExtensionContext cast to
+			// ExtensionCommandContext, so newSession is not guaranteed to exist at runtime.
+			if (typeof cmdCtx.newSession !== "function") {
+				ctx.ui.notify("A fresh context is not available from here; continuing in this context.", "warning");
+			} else {
+				const result = await cmdCtx.newSession({ parentSession: ctx.sessionManager.getSessionFile() });
+				if (result.cancelled) {
+					ctx.ui.notify("Execution cancelled.", "warning");
+					return;
+				}
+			}
+		}
+		pi.sendUserMessage(
+			`Work the plan in ${planPath(ctx)}. Pick an open goal, set it active, work its subtasks, and when its done_when is met call CompleteGoal with the evidence. Keep plan.md current as you go.`,
+			{ deliverAs: "followUp" },
+		);
+	}
+
+	// --- the one blessed tool: CompleteGoal -------------------------------------------------------
+
+	// CompleteGoal: look the goal up by id, decide the outcome (verify command, then the judge
+	// subprocess), record that outcome in plan.md, refresh the widget, and return the message.
+	pi.registerTool({
+		name: "CompleteGoal",
+		label: "Complete goal",
+		description:
+			"Sign off a goal once its done_when is met. Runs the goal's verify command (if any) then a " +
+			"read-only subagent that inspects your evidence against the repo. On accept, the goal is marked " +
+			"done and logged; on reject, it stays open and you get what is missing. Point evidence at durable " +
+			"artifacts (saved logs, committed diffs, files), not claims.",
+		parameters: Type.Object({
+			goal_id: Type.String({ description: "The goal's <!-- id --> from plan.md" }),
+			evidence: Type.String({ description: "What shows the done_when is met, and where to verify it" }),
+			paths: Type.Optional(Type.Array(Type.String(), { description: "Durable artifacts the judge should inspect" })),
+		}),
+		async execute(_id, params, signal, _onUpdate, ctx) {
+			const goal = findGoal(parse(readPlan(ctx)), params.goal_id);
+			if (!goal) return text(`No goal #${params.goal_id} in plan.md.`, true);
+
+			// Decide the outcome (the I/O); recordSignOff applies it to the file (the pure write).
+			const outcome = await decideSignOff(goal, params.evidence, params.paths ?? [], state.judgeModel, ctx.cwd, signal);
+			// Verify + judge can run for minutes. Re-read plan.md so the outcome is applied to the
+			// file as it is now, instead of overwriting edits made in the meantime with a stale
+			// snapshot. If the goal has vanished, recordSignOff reports that as an error.
+			const latest = readPlan(ctx);
+			const res = recordSignOff(latest, goal.id, stamp(), outcome);
+			if (res.content !== latest) writeFileSync(planPath(ctx), res.content);
+			updateWidget(ctx);
+			return text(res.message, res.isError);
+		},
+	});
+
+	// --- hooks ------------------------------------------------------------------------------------
+
+	// Before each agent run: in plan mode inject the drafting guidance; otherwise inject a summary
+	// of the plan (nothing when plan.md has no goals), plus the reminder when plan.md is unchanged
+	// since the previous injection.
+	pi.on("before_agent_start", async (_event, ctx) => {
+		const inject = (content: string) => ({ message: { customType: PLAN_CONTEXT, content, display: false } });
+		if (state.isPlanMode) return inject(`${planDrafting}\n\nWrite the plan to ${planPath(ctx)}.`);
+
+		const doc = parse(readPlan(ctx));
+		if (doc.goals.length === 0) return;
+
+		// Prefer a goal marked active; fall back to the first open one.
+		const firstWith = (wanted: Goal["status"]) => doc.goals.find((g) => g.status === wanted);
+		const active = firstWith("active") ?? firstWith("open") ?? null;
+		const tally = counts(doc);
+		const activeGoal = active
+			? {
+					subject: active.subject,
+					done_when: active.done_when,
+					openSubtasks: active.subtasks.filter((s) => !s.done).map((s) => s.text),
+				}
+			: null;
+		const summary = planInjection({
+			objective: doc.objective,
+			activeGoal,
+			lastLogLine: doc.log.at(-1) ?? null,
+			counts: { done: tally.done, open: tally.open + tally.active },
+		});
+		// Reminder fires when there is an active goal but plan.md was untouched since the last turn.
+		const planNow = readPlan(ctx);
+		const untouched = planNow === lastInjectedPlan;
+		lastInjectedPlan = planNow;
+		return inject(active && untouched ? `${summary}\n\n${reminder}` : summary);
+	});
+
+	// When the agent finishes a run in plan mode (and a UI exists), open the review menu, or warn
+	// if plan.md still has no goals.
+	pi.on("agent_end", async (_event, ctx) => {
+		const shouldReview = state.isPlanMode && ctx.hasUI;
+		if (!shouldReview) return;
+		const { goals } = parse(readPlan(ctx));
+		if (goals.length > 0) {
+			await reviewLoop(ctx, ctx as ExtensionCommandContext);
+			return;
+		}
+		ctx.ui.notify("No goals found in plan.md yet — ask the agent to draft them.", "warning");
+	});
+
+	// Drop every injected plan-context message except the last one, so stale summaries neither
+	// bloat history nor reach the model. Messages of any other kind pass through untouched.
+	pi.on("context", async (event) => {
+		const tagged = event.messages.map((m) => (m as { customType?: string }).customType === PLAN_CONTEXT);
+		const keepIdx = tagged.lastIndexOf(true);
+		const messages = event.messages.filter((_m, i) => !tagged[i] || i === keepIdx);
+		return { messages };
+	});
+
+	// On session start, restore state from the most recent persisted PlanState entry (if any) and
+	// redraw the widget.
+	pi.on("session_start", async (_event, ctx) => {
+		const snapshots = ctx.sessionManager
+			.getEntries()
+			.filter((e: { type?: string; customType?: string }) => e.type === "custom" && e.customType === STATE);
+		const last = snapshots[snapshots.length - 1] as { data?: PlanState } | undefined;
+		if (last?.data) {
+			// Merge over the current state so any field missing from the snapshot keeps its value.
+			state = { ...state, ...last.data };
+		}
+		updateWidget(ctx);
+	});
+}
+
+// --- helpers (module scope; pure enough to keep out of the closure) -------------------------------
+
+/** Wrap a string as a tool result with one text block; the error flag is set top-level and in details. */
+function text(s: string, isError = false) {
+	const block = { type: "text" as const, text: s };
+	return { content: [block], details: { isError }, isError };
+}
+
+/** Current time as "YYYY-MM-DD HH:MM", taken from toISOString() and therefore in UTC. */
+function stamp(): string {
+	const iso = new Date().toISOString();
+	const toTheMinute = iso.substring(0, 16); // "YYYY-MM-DDTHH:MM"
+	return toTheMinute.replace("T", " ");
+}
+
+/**
+ * Decide a sign-off outcome. If the goal has a verify command it runs first, and a non-zero exit
+ * returns `verify_failed` without calling the judge. Otherwise the judge's verdict maps to
+ * `accepted` or `rejected` (carrying the judge's "missing" text).
+ */
+async function decideSignOff(
+	goal: Goal,
+	evidence: string,
+	paths: string[],
+	judgeModel: string | null,
+	cwd: string,
+	signal: AbortSignal | undefined,
+): Promise<SignOff> {
+	const verifyResult = goal.verify ? runVerify(goal.verify, cwd, signal) : null;
+	if (verifyResult !== null && verifyResult.exitCode !== 0) {
+		const { exitCode, outputTail } = verifyResult;
+		return { kind: "verify_failed", exitCode, outputTail };
+	}
+	// The judge also receives the (passing) verify result, or null when there was no command.
+	const { accept, missing } = await runJudge(goal, evidence, paths, verifyResult, judgeModel, cwd, signal);
+	if (accept) return { kind: "accepted" };
+	return { kind: "rejected", missing };
+}
+
+/**
+ * Run the goal's verify command through `sh -c` in `cwd` and report its exit code plus the last
+ * 30 lines of combined stdout+stderr. The command is agent-authored and trusted (single-user
+ * machine, guide-not-guard).
+ *
+ * The call is synchronous, so it blocks until the command exits or the 10-minute timeout kills
+ * it; an abort can therefore only be honoured before the child starts. When the child could not
+ * be run to completion (spawn failure, timeout, output over the buffer limit, killed by a
+ * signal), exitCode is 1 and the reason is appended to outputTail.
+ *
+ * @param command Shell command line to run.
+ * @param cwd Working directory for the command.
+ * @param signal Optional abort signal; only checked up front.
+ * @returns The command, its exit code (1 when it did not exit normally), and the output tail.
+ */
+function runVerify(command: string, cwd: string, signal: AbortSignal | undefined): { command: string; exitCode: number; outputTail: string } {
+	if (signal?.aborted) return { command, exitCode: 1, outputTail: "verify skipped: sign-off was aborted" };
+
+	// The default maxBuffer (1 MiB) would kill a chatty test run and report it as a failure, so
+	// allow far more; only the tail is kept anyway.
+	const res = spawnSync("sh", ["-c", command], { cwd, encoding: "utf-8", timeout: 600_000, maxBuffer: 64 * 1024 * 1024 });
+	const out = `${res.stdout ?? ""}${res.stderr ?? ""}`;
+	const tail = out.split("\n").slice(-30).join("\n");
+
+	// Surface why there is no real exit status instead of a bare "exit 1".
+	let note = "";
+	if (res.error) note = `verify did not complete: ${res.error.message}`;
+	else if (res.status === null && res.signal) note = `verify was killed by ${res.signal}`;
+
+	return { command, exitCode: res.status ?? 1, outputTail: note ? `${tail}\n${note}` : tail };
+}
+
+/**
+ * Work out how to launch pi as a subprocess:
+ *  - the running script exists on disk (and is not inside bun's virtual "/$bunfs/root/"):
+ *    re-run it with the current runtime executable;
+ *  - the current executable is not plain node/bun: invoke that executable directly;
+ *  - otherwise fall back to `pi` on PATH.
+ */
+function getPiInvocation(args: string[]): { command: string; args: string[] } {
+	const entry = process.argv[1];
+	const runnableScript = Boolean(entry) && !entry.startsWith("/$bunfs/root/") && existsSync(entry);
+	if (runnableScript) return { command: process.execPath, args: [entry, ...args] };
+
+	const runtime = basename(process.execPath).toLowerCase();
+	const isGenericRuntime = /^(node|bun)(\.exe)?$/.test(runtime);
+	return isGenericRuntime ? { command: "pi", args } : { command: process.execPath, args };
+}
+
+/**
+ * Read the verdict out of the judge's raw output.
+ *
+ * The judge is told to finish with `VERDICT: accept | reject` followed by `missing: ...`, so the
+ * LAST decisive VERDICT line wins and a verbatim echo of the "accept | reject" template is
+ * ignored. Only the token right after "VERDICT:" counts: "VERDICT: reject - cannot accept yet"
+ * is a reject. No decisive line at all is treated as a reject.
+ */
+function parseJudgeVerdict(output: string): { accept: boolean; missing: string } {
+	const lines = output.split("\n");
+	let verdictIdx = -1;
+	let verdict = "";
+	lines.forEach((l, i) => {
+		// Tolerate light markdown decoration around the keyword ("**VERDICT: accept**").
+		const m = /^[\s*_#>`]*VERDICT\s*:[\s*_`]*(accept|reject)(?:ed)?\b(.*)$/i.exec(l);
+		if (!m) return;
+		if (/^\s*\|/.test(m[2])) return; // the prompt's own "accept | reject" template line
+		verdictIdx = i;
+		verdict = m[1].toLowerCase();
+	});
+
+	const accept = verdict === "accept";
+	if (accept) return { accept, missing: "" };
+
+	// Prefer a `missing:` line that follows the verdict; fall back to the first one anywhere,
+	// then to the tail of the output so the agent always gets something actionable.
+	const afterVerdict = verdictIdx === -1 ? "" : lines.slice(verdictIdx + 1).join("\n");
+	const missingMatch = /(?:^|\n)\s*missing\s*:\s*([\s\S]*)$/i.exec(afterVerdict) ?? /missing\s*:\s*([\s\S]*)$/i.exec(output);
+	const missing = missingMatch?.[1].trim() || output.trim().slice(-500) || "judge gave no reason";
+	return { accept, missing };
+}
+
+/**
+ * Stage 2: a pi subprocess (print mode, no session, restricted tool list) inspects the evidence
+ * against the repo and returns a verdict.
+ *
+ * @param goal The goal being signed off.
+ * @param evidence The agent's evidence text.
+ * @param paths Artifacts the judge is asked to inspect.
+ * @param verifyResult Result of the verify command, or null when the goal has none.
+ * @param judgeModel Optional model ref passed as --model; null uses the subprocess default.
+ * @param cwd Working directory for the subprocess.
+ * @param signal Optional abort signal forwarded to spawn.
+ * @returns accept plus, on reject, what is missing. A subprocess that fails to start is a reject.
+ */
+async function runJudge(
+	goal: Goal,
+	evidence: string,
+	paths: string[],
+	verifyResult: { command: string; exitCode: number; outputTail: string } | null,
+	judgeModel: string | null,
+	cwd: string,
+	signal: AbortSignal | undefined,
+): Promise<{ accept: boolean; missing: string }> {
+	const task = evidenceJudgeUser({
+		subject: goal.subject,
+		done_when: goal.done_when,
+		verify: goal.verify ?? null,
+		verifyResult,
+		failure_modes: goal.failure_modes,
+		evidence,
+		paths,
+	});
+	const args = ["-p", "--no-session", "--tools", READ_ONLY_TOOLS.join(","), "--append-system-prompt", evidenceJudgeSystem];
+	if (judgeModel) args.push("--model", judgeModel);
+	args.push(task);
+
+	const inv = getPiInvocation(args);
+	// Never rejects: stdout and stderr are collected together, and a spawn error is turned into a
+	// synthetic reject verdict so the caller sees one uniform shape.
+	const output = await new Promise<string>((resolve) => {
+		const proc = spawn(inv.command, inv.args, { cwd, shell: false, stdio: ["ignore", "pipe", "pipe"], signal });
+		let out = "";
+		proc.stdout.on("data", (d) => (out += d));
+		proc.stderr.on("data", (d) => (out += d));
+		proc.on("close", () => resolve(out));
+		proc.on("error", (e) => resolve(`VERDICT: reject\nmissing: judge subprocess failed: ${e.message}`));
+	});
+
+	return parseJudgeVerdict(output);
+}
@@ -0,0 +1,225 @@
+/**
+ * plan-file.ts — read plan.md, and the two writes CompleteGoal needs. That is all.
+ *
+ * Pure module, no pi deps, so it unit-tests without a runtime. The file is the canonical store and
+ * the agent edits it with its normal Edit tool (create goals, tick subtasks, append log), guided by
+ * the format in prompts.ts and the reminder -- the form guides, it does not gate (spec D3). So this
+ * module does NOT render or create goals; the format's single source of truth is the planDrafting
+ * prompt. The only programmatic writers are setGoalStatus + appendLog, used by CompleteGoal (via
+ * recordSignOff) to record a sign-off outcome; both touch one line so the git diff stays readable.
+ *
+ * Format (spec §4):
+ *
+ *   # Plan: <objective>
+ *
+ *   ## Goal: <subject>
+ *   <!-- id: <slug> -->
+ *   status: open | active | done | cancelled
+ *   done_when: <falsifiable check; plus the symptom if NOT met>
+ *   verify: <shell command, optional>
+ *   failure_modes:
+ *     - <pre-mortem item>
+ *   - [ ] <subtask>
+ *
+ *   ## Log
+ *   - <verbatim append-only line>
+ */
+
+/** Lifecycle value read from a goal's `status:` line. */
+export type GoalStatus = "open" | "active" | "done" | "cancelled";
+
+/** One column-0 `- [ ]` / `- [x]` checkbox line inside a goal section. */
+export interface Subtask {
+	/** Text after the checkbox, trimmed. */
+	text: string;
+	/** True when the box holds `x` or `X`. */
+	done: boolean;
+}
+
+/** One `## Goal:` section as read by parse(). */
+export interface Goal {
+	/** Value of the `<!-- id: ... -->` comment; "" when the section has none. */
+	id: string;
+	/** Text after `## Goal:`, trimmed. */
+	subject: string;
+	/** From the `status:` line; "open" when the section has none. */
+	status: GoalStatus;
+	/** From the `done_when:` line; "" when absent. */
+	done_when: string;
+	/** From the `verify:` line; undefined when the line is absent or empty. */
+	verify?: string;
+	/** "- " items listed under `failure_modes:`. */
+	failure_modes: string[];
+	/** Checkbox lines in file order. */
+	subtasks: Subtask[];
+}
+
+/** Parsed plan.md. */
+export interface PlanDoc {
+	/** Text after `# Plan:`; "" when there is no such line. */
+	objective: string;
+	/** Goals in file order. */
+	goals: Goal[];
+	/** Verbatim ## Log lines, including the leading "- ". */
+	log: string[];
+}
+
+// Line patterns shared by the parser and the two writers. None uses the `m` flag, so each is
+// meant to be applied to one line at a time.
+const GOAL_HEADER = /^##\s+Goal:\s*(.*)$/; // "## Goal: <subject>"; group 1 = subject
+const ANY_HEADER = /^#{1,6}\s/; // any markdown heading, levels 1-6
+const LOG_HEADER = /^##\s+Log\s*$/i; // "## Log", case-insensitive
+const ID_COMMENT = /^<!--\s*id:\s*(.+?)\s*-->$/; // "<!-- id: slug -->"; callers trim the line first
+const CHECKBOX = /^- \[([ xX])\]\s+(.*)$/; // column-0 checkbox; group 1 = mark, group 2 = text
+
+/**
+ * Parse plan.md text into a PlanDoc. Lenient by design: lines it does not recognise are skipped
+ * and it never throws.
+ *
+ * - `# Plan: <objective>` sets the objective (a later one overwrites an earlier one).
+ * - `## Goal: <subject>` starts a goal; `## Log` starts the log; any other heading ends the
+ *   current goal or log section.
+ * - Inside a goal: the id comment, `status:` / `done_when:` / `verify:` / `failure_modes:` keys,
+ *   "- " items under failure_modes, and column-0 checkboxes as subtasks.
+ * - Inside the log: every "- " line is kept verbatim.
+ *
+ * Both LF and CRLF line endings are accepted. A `status:` value is matched case-insensitively
+ * against the four known statuses; an unrecognised value is ignored, so the goal keeps its
+ * previous status ("open" by default) and Goal.status always holds a real GoalStatus.
+ *
+ * @param text Full contents of plan.md ("" is fine).
+ * @returns The objective, the goals in file order, and the verbatim log lines.
+ */
+export function parse(text: string): PlanDoc {
+	// A trailing "\r" would defeat the `(.*)$` patterns below, so strip it while splitting.
+	const lines = text.split(/\r?\n/);
+	const knownStatuses: readonly string[] = ["open", "active", "done", "cancelled"];
+	let objective = "";
+	const goals: Goal[] = [];
+	const log: string[] = [];
+
+	let cur: Goal | null = null;
+	let inFailureModes = false;
+	let inLog = false;
+
+	// Close the goal being built (if any) and reset the per-goal parsing state.
+	const flush = () => {
+		if (cur) goals.push(cur);
+		cur = null;
+		inFailureModes = false;
+	};
+
+	for (const line of lines) {
+		const objMatch = /^#\s+Plan:\s*(.*)$/.exec(line);
+		if (objMatch) {
+			objective = objMatch[1].trim();
+			continue;
+		}
+
+		const goalMatch = GOAL_HEADER.exec(line);
+		if (goalMatch) {
+			flush();
+			inLog = false;
+			cur = { id: "", subject: goalMatch[1].trim(), status: "open", done_when: "", failure_modes: [], subtasks: [] };
+			continue;
+		}
+
+		if (LOG_HEADER.test(line)) {
+			flush();
+			inLog = true;
+			continue;
+		}
+
+		// Any other header ends the current goal / log section.
+		if (ANY_HEADER.test(line)) {
+			flush();
+			inLog = false;
+			continue;
+		}
+
+		if (inLog) {
+			if (/^\s*-\s+/.test(line)) log.push(line);
+			continue;
+		}
+
+		if (!cur) continue;
+
+		const idMatch = ID_COMMENT.exec(line.trim());
+		if (idMatch) {
+			cur.id = idMatch[1];
+			continue;
+		}
+
+		// A checkbox (column 0) is a subtask; checked first so it is never read as a failure mode.
+		const checkbox = CHECKBOX.exec(line);
+		if (checkbox) {
+			inFailureModes = false;
+			cur.subtasks.push({ done: checkbox[1].toLowerCase() === "x", text: checkbox[2].trim() });
+			continue;
+		}
+
+		const kv = /^(status|done_when|verify|failure_modes)\s*:\s*(.*)$/.exec(line);
+		if (kv) {
+			const [, key, value] = kv;
+			if (key === "status") {
+				// Validate rather than cast: the file is hand-edited, so anything can appear here.
+				const status = value.trim().toLowerCase();
+				if (knownStatuses.includes(status)) cur.status = status as GoalStatus;
+			} else if (key === "done_when") cur.done_when = value.trim();
+			else if (key === "verify") cur.verify = value.trim() || undefined;
+			else if (key === "failure_modes") inFailureModes = true;
+			continue;
+		}
+
+		// Indented "- " items under failure_modes: (a column-0 checkbox already returned above).
+		if (inFailureModes) {
+			const fm = /^\s*-\s+(.*)$/.exec(line);
+			if (fm) {
+				cur.failure_modes.push(fm[1].trim());
+				continue;
+			}
+			// Blank lines are tolerated inside the list; any other text ends it.
+			if (line.trim() !== "") inFailureModes = false;
+		}
+	}
+	flush();
+
+	return { objective, goals, log };
+}
+
+/** Return the first goal whose id equals `id` exactly, or undefined when none matches. */
+export function findGoal(doc: PlanDoc, id: string): Goal | undefined {
+	for (const goal of doc.goals) {
+		if (goal.id === id) return goal;
+	}
+	return undefined;
+}
+
+/** Tally goals by status. Only done, active and open are counted; any other status is left out. */
+export function counts(doc: PlanDoc): { done: number; open: number; active: number } {
+	const tally = { done: 0, open: 0, active: 0 };
+	for (const { status } of doc.goals) {
+		switch (status) {
+			case "done":
+				tally.done += 1;
+				break;
+			case "active":
+				tally.active += 1;
+				break;
+			case "open":
+				tally.open += 1;
+				break;
+			default:
+				break;
+		}
+	}
+	return tally;
+}
+
+/**
+ * Flip a goal's `status:` line in place (the one write CompleteGoal needs).
+ *
+ * The goal is located by its `<!-- id: ... -->` comment. Its section runs from the nearest
+ * heading above that comment to the next heading of any kind below it; the first `status:` line
+ * inside that range is rewritten, whether it sits above or below the id comment. The search
+ * never crosses into a neighbouring section, so a goal that has no status line cannot flip the
+ * next goal's status by accident.
+ *
+ * @param text Full plan.md contents.
+ * @param id Goal id to look up.
+ * @param status New status value.
+ * @returns The text with exactly one line changed.
+ * @throws Error when no goal has this id, or when the goal's section has no `status:` line.
+ */
+export function setGoalStatus(text: string, id: string, status: GoalStatus): string {
+	const lines = text.split("\n");
+	const idLine = lines.findIndex((l) => ID_COMMENT.exec(l.trim())?.[1] === id);
+	if (idLine === -1) throw new Error(`Goal #${id} not found`);
+
+	// Section bounds: [first line after the enclosing heading, next heading).
+	let start = idLine;
+	while (start > 0 && !ANY_HEADER.test(lines[start - 1])) start--;
+	let end = idLine + 1;
+	while (end < lines.length && !ANY_HEADER.test(lines[end])) end++;
+
+	for (let i = start; i < end; i++) {
+		// Group 3 keeps a trailing "\r" so CRLF files keep their line endings.
+		const kv = /^(status\s*:\s*)(.*?)(\r?)$/.exec(lines[i]);
+		if (kv) {
+			lines[i] = `${kv[1]}${status}${kv[3]}`;
+			return lines.join("\n");
+		}
+	}
+	throw new Error(`Goal #${id} has no status: line`);
+}
+
+/**
+ * The outcome of a sign-off attempt, decided by CompleteGoal (which runs verify + the judge). Kept
+ * separate from the I/O so the record logic below is pure and testable.
+ *
+ * - verify_failed: the goal's verify command exited non-zero; carries the exit code and output tail.
+ * - rejected: the judge did not accept; `missing` is the judge's explanation.
+ * - accepted: the judge accepted.
+ */
+export type SignOff =
+	| { kind: "verify_failed"; exitCode: number; outputTail: string }
+	| { kind: "rejected"; missing: string }
+	| { kind: "accepted" };
+
+/**
+ * Apply a sign-off outcome to plan.md text and produce the tool message.
+ *
+ * - unknown goal id: text returned unchanged, with an error message;
+ * - verify_failed / rejected: one log line appended, status untouched, isError true;
+ * - accepted: status flipped to done and one log line appended, isError false.
+ *
+ * Does no I/O; `when` is supplied by the caller. setGoalStatus may throw on the accepted path.
+ */
+export function recordSignOff(
+	text: string,
+	goalId: string,
+	when: string,
+	outcome: SignOff,
+): { content: string; message: string; isError: boolean } {
+	const goal = findGoal(parse(text), goalId);
+	if (goal === undefined) {
+		return { content: text, message: `No goal #${goalId} in plan.md.`, isError: true };
+	}
+
+	switch (outcome.kind) {
+		case "verify_failed": {
+			const { exitCode, outputTail } = outcome;
+			return {
+				content: appendLog(text, `${when} reject #${goalId}: verify exit ${exitCode}`),
+				message: `Sign-off rejected: verify failed (exit ${exitCode}).\n${outputTail}`,
+				isError: true,
+			};
+		}
+		case "rejected": {
+			// The log gets a single-line, 200-character digest; the message keeps the full text.
+			const digest = outcome.missing.replace(/\s+/g, " ").trim().slice(0, 200);
+			return {
+				content: appendLog(text, `${when} reject #${goalId}: ${digest}`),
+				message: `Sign-off rejected. Missing:\n${outcome.missing}`,
+				isError: true,
+			};
+		}
+		default: {
+			const withStatus = setGoalStatus(text, goalId, "done");
+			return {
+				content: appendLog(withStatus, `${when} signed off #${goalId}: ${goal.subject} (oracle accept)`),
+				message: `Signed off #${goalId}: ${goal.subject}. Marked done in plan.md.`,
+				isError: false,
+			};
+		}
+	}
+}
+
+/**
+ * Append `- <entry>` to the ## Log section (the other CompleteGoal write). With no ## Log heading,
+ * trailing newlines are trimmed and a new section is added at the end. Otherwise the line goes
+ * right after the last "- " line found before the next heading, or directly under the heading
+ * when the section has no entries yet.
+ */
+export function appendLog(text: string, entry: string): string {
+	const bullet = `- ${entry}`;
+	const rows = text.split("\n");
+	const headerAt = rows.findIndex((row) => LOG_HEADER.test(row));
+	if (headerAt < 0) {
+		const trimmed = text.replace(/\n+$/, "");
+		return `${trimmed}\n\n## Log\n${bullet}\n`;
+	}
+
+	let target = headerAt + 1;
+	let cursor = headerAt + 1;
+	while (cursor < rows.length && !ANY_HEADER.test(rows[cursor])) {
+		if (/^\s*-\s+/.test(rows[cursor])) target = cursor + 1;
+		cursor += 1;
+	}
+	return [...rows.slice(0, target), bullet, ...rows.slice(target)].join("\n");
+}
@@ -0,0 +1,199 @@
+/**
+ * pi-plan — all model-facing text, in flow order.
+ *
+ * Philosophy: the form guides a process; it does not police one. The agent can
+ * edit plan.md freely. These prompts + the plan.md structure make the right path
+ * the easy path. The only step that is genuinely rigorous is the evidence judge
+ * (6), and even that is reached by guiding the agent to call CompleteGoal, not by
+ * trapping it. Bypasses stay visible in the git diff and the widget.
+ *
+ * Flow:
+ *   SETUP (plan mode)     1. planDrafting        — strong/sticky model drafts goals
+ *   EXEC, each turn start 2. planInjection       — "here is your plan, where you are"
+ *   EXEC, periodic        3. reminder            — the typed nudge that drives upkeep + autonomy
+ *   EXEC, loop continue   4. continuation        — keep going toward the active goal
+ *   EXEC, after each turn 5. loopJudge           — continue / pause (cheap, foolable, ok)
+ *   SIGN-OFF              6. evidenceJudge        — read-only verify (rigorous; the one real check)
+ *
+ * Read top to bottom to see the whole process. 5 and 6 are kept adjacent on
+ * purpose: the cheap-foolable vs must-not-be-fooled contrast is the design.
+ */
+
+/* ─────────────────────────────────────────────────────────────────────────
+ * 1. planDrafting  —  SETUP, plan mode
+ *
+ * System guidance for the plan-phase agent. Runs on the plan model (may differ
+ * from the execution model; the choice is sticky — see oracle.json-style config).
+ * This phase is read-only: explore, then draft goals into plan.md. No code yet.
+ * The field requirements here are the whole "elicitation" — get them agreed up
+ * front, because the human reviews this output before any execution.
+ *
+ * The goal shape must carry the <!-- id: ... --> comment: the plan parser reads
+ * a goal's id from it and CompleteGoal looks goals up by that id, so a goal
+ * drafted without one cannot be signed off.
+ * ──────────────────────────────────────────────────────────────────────── */
+export const planDrafting = `\
+You are in plan mode. Explore the repository read-only, then draft a plan into plan.md.
+Do not write or run code in this phase. Produce goals the human will review and approve.
+
+Write each goal in this shape:
+
+## Goal: <one short imperative line>
+<!-- id: <short-unique-slug> -->
+status: open
+done_when: <a falsifiable check, plus the symptom you'd see if it's NOT met>
+verify: <a shell command that exits 0 only when the goal is met — include this whenever
+         success is expressible as tests/lint/build/a threshold; omit it otherwise>
+failure_modes:
+  - <a concrete way this could look done but isn't>
+  - <another>
+  - <if verify exists: "verify passes on a trivial or gamed test">
+- [ ] <first subtask>
+- [ ] <next subtask>
+
+Rules for a good plan:
+- Give every goal its own id comment right under the heading, unique within plan.md.
+  CompleteGoal finds the goal by that id, so a goal without one cannot be signed off.
+- Keep goals small enough that done_when is checkable in one sitting.
+- done_when must be falsifiable. "Works well" is not a criterion; "p95 < 50ms on bench-X,
+  else timeouts in load-test.log" is.
+- failure_modes are a pre-mortem: the cheap, specific ways a later "done" could be wrong.
+  This is the highest-value part — it shapes what evidence you'll collect.
+- Prefer a verify command. A green deterministic check is worth more than a paragraph of
+  description, and it's the first thing checked at sign-off.
+
+When the plan is drafted, present it and stop for review. Do not begin execution.`;
+
+/* ─────────────────────────────────────────────────────────────────────────
+ * 2. planInjection  —  EXEC, injected at each agent start (and after compaction)
+ *
+ * Renders the per-turn plan summary from already-parsed values. The output is a
+ * pure function of the arguments — fixed field order, nothing volatile — so it
+ * is byte-identical whenever the plan has not changed, which keeps the prefix
+ * cache valid. Carries only the active goal, its open subtasks and the last log
+ * line, not the whole file. With no active goal it returns a two-line summary.
+ * ──────────────────────────────────────────────────────────────────────── */
+export function planInjection(p: {
+  objective: string;
+  activeGoal: { subject: string; done_when: string; openSubtasks: string[] } | null;
+  lastLogLine: string | null;
+  counts: { done: number; open: number };
+}): string {
+  const { objective, activeGoal, lastLogLine, counts } = p;
+  const heading = `Plan (plan.md): ${objective}`;
+  if (!activeGoal) {
+    const idle = `No active goal. ${counts.open} open, ${counts.done} done. Pick the next goal or run /plan.`;
+    return [heading, idle].join("\n");
+  }
+
+  const pending = activeGoal.openSubtasks;
+  const subtaskLines = pending.length > 0 ? pending.map((s) => `  - [ ] ${s}`) : ["  (no open subtasks)"];
+  return [
+    heading,
+    `Active goal: ${activeGoal.subject}`,
+    `done_when: ${activeGoal.done_when}`,
+    "Open subtasks:",
+    ...subtaskLines,
+    `Last log: ${lastLogLine ?? "(none yet)"}`,
+    `Progress: ${counts.done} done, ${counts.open} open.`,
+  ].join("\n");
+}
+
+/* ─────────────────────────────────────────────────────────────────────────
+ * 3. reminder  —  EXEC, periodic system-reminder
+ *
+ * The typed nudge. This is both the housekeeping and the autonomy engine — it is
+ * what makes the process get followed without a hard gate. As wired in
+ * index.ts, it is appended to the plan injection when there is an active (or
+ * next open) goal and plan.md is unchanged since the previous injection — there
+ * is no N-turn counter.
+ * Keep the wording stable so it doesn't thrash the cache.
+ * ──────────────────────────────────────────────────────────────────────── */
+export const reminder = `\
+<system-reminder>
+Keep plan.md current as you work:
+- tasks: tick the subtasks you've finished; add any new ones you've discovered.
+- log: append ONE short line to ## Log (append — don't rewrite earlier lines).
+- goal: if the active goal's evidence is in, sign it off by calling CompleteGoal with that
+  evidence. Don't edit status to done by hand — CompleteGoal runs the check and records it.
+- otherwise: keep working toward the active goal. Don't stop to ask unless you're genuinely
+  blocked; if blocked, say what's blocking and why.
+</system-reminder>`;
+
+/* ─────────────────────────────────────────────────────────────────────────
+ * 4. continuation  —  EXEC, the loop's "keep going" turn
+ *
+ * Hermes-style. A plain user-role message appended when the loop judge (5) says
+ * continue. Does not mutate the system prompt, so the cache holds.
+ *
+ * NOTE(review): index.ts in this commit does not import this export — it
+ * appears to be reserved for a continue/pause loop that is not wired up yet.
+ * ──────────────────────────────────────────────────────────────────────── */
+export const continuation = `\
+Continue toward the active goal in plan.md. If it now meets its done_when, call CompleteGoal
+with your evidence (point to durable artifacts — saved logs, committed diffs, files — not just
+claims). If you're blocked, state what's blocking it.`;
+
+/* ─────────────────────────────────────────────────────────────────────────
+ * 5. loopJudge  —  EXEC, runs after each turn to decide continue / pause
+ *
+ * Cheap, conservative, fail-open. Reads only the agent's last response, so it CAN
+ * be fooled by an asserted "done" — that's acceptable: its worst case is a
+ * premature pause, caught by you or the iteration budget. It does NOT sign goals
+ * off; that's the evidence judge's job. Return strict JSON, no prose.
+ *
+ * NOTE(review): index.ts in this commit imports neither loopJudgeSystem nor
+ * loopJudgeUser — the loop judge appears not to be wired up yet.
+ * ──────────────────────────────────────────────────────────────────────── */
+export const loopJudgeSystem = `\
+You decide whether an autonomous coding agent should keep working or pause for the human.
+Be conservative: only pause when the work is plainly finished or plainly blocked. When in
+doubt, continue. You are not verifying correctness — a later read-only judge does that.
+Reply with ONLY a JSON object, no other text: {"done": boolean, "reason": "<one sentence>"}.
+Set done=true only if the agent's last message shows the active goal's done_when is met, or
+the agent says it is blocked and needs the human.`;
+
+/**
+ * User message for the loop judge: the active goal's done_when, the agent's last message
+ * fenced in triple quotes, and a JSON skeleton for the judge to fill in.
+ */
+export function loopJudgeUser(p: { activeGoalDoneWhen: string; lastResponse: string }): string {
+  const fence = '"""';
+  const parts = [
+    `Active goal done_when: ${p.activeGoalDoneWhen}`,
+    "",
+    "Agent's last message:",
+    fence,
+    p.lastResponse,
+    fence,
+    "",
+    '{"done": ?, "reason": ?}',
+  ];
+  return parts.join("\n");
+}
+
+/* ─────────────────────────────────────────────────────────────────────────
+ * 6. evidenceJudge  —  SIGN-OFF, the one rigorous check
+ *
+ * Runs inside CompleteGoal, on the read-only oracle subprocess (fresh context,
+ * strongest reasoning on the chosen provider; override to a different vendor for
+ * high-stakes goals). It re-derives from the repo rather than trusting the
+ * agent's transcription, and it judges whether a verify command actually tests
+ * the criterion or could pass while a named failure mode holds (gaming).
+ *
+ * The transport gives it read/grep/find/ls. The prompt below imposes the verdict
+ * contract — the oracle returns prose by default, so parse the VERDICT line.
+ * NOTE(review): the tool list index.ts passes to the subprocess also contains
+ * "bash", which this comment and the prompt below do not mention — confirm which
+ * is intended.
+ * ──────────────────────────────────────────────────────────────────────── */
+export const evidenceJudgeSystem = `\
+You are a read-only reviewer signing off a coding goal. Do not trust claims — verify.
+Use read/grep/find/ls to inspect the repository and the cited artifacts yourself. Re-read the
+files, logs, and diffs the evidence points to; if something it asserts isn't on disk, you can't
+confirm it. If a verify command was run, judge whether it genuinely tests the criterion or
+could pass while one of the listed failure modes still holds — a tautological or skipped test
+is a reject. Check each failure mode is actually ruled out, not just unmentioned.
+
+Finish with exactly these two lines and nothing after:
+VERDICT: accept | reject
+missing: <empty if accept; otherwise a short list of what's needed before this can be accepted>`;
+
+/**
+ * User message for the evidence judge: the goal, its done_when and failure modes, the verify
+ * command and its result (or a note that there is none), the agent's evidence, and the
+ * artifacts to inspect. An empty failure-mode or artifact list is spelled out explicitly rather
+ * than left as a blank section, so the judge can tell "none given" from a formatting glitch.
+ *
+ * @param p.verify The goal's verify command, or null when it has none.
+ * @param p.verifyResult Result of running `verify`; its exit code and output tail are quoted.
+ * @returns The full prompt text, ending with the request for a VERDICT.
+ */
+export function evidenceJudgeUser(p: {
+  subject: string;
+  done_when: string;
+  verify: string | null;
+  verifyResult: { command: string; exitCode: number; outputTail: string } | null;
+  failure_modes: string[];
+  evidence: string;
+  paths: string[];
+}): string {
+  const verifyBlock = p.verify
+    ? `verify command: ${p.verify}\nverify result: exit ${p.verifyResult?.exitCode ?? "n/a"}\n${p.verifyResult?.outputTail ?? ""}`
+    : "verify command: none (no deterministic check for this goal)";
+  return `\
+Goal: ${p.subject}
+done_when: ${p.done_when}
+failure_modes:
+${p.failure_modes.map((f) => `  - ${f}`).join("\n") || "  (none listed — note this)"}
+
+${verifyBlock}
+
+Agent's evidence:
+${p.evidence}
+
+Artifacts it points to (inspect these):
+${p.paths.map((x) => `  - ${x}`).join("\n") || "  (none listed — note this)"}
+
+Verify the goal against its done_when. Then give your VERDICT.`;
+}