From 927a482d79faabd45922cef5943c4cf1e398d9dd Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sun, 14 Jun 2026 10:00:50 +0800 Subject: [PATCH] rename to pi-proof-tasks and simplify proof log --- README.md | 145 +++---- package-lock.json | 4 +- package.json | 14 +- src/index.ts | 648 ++++++++++++++++--------------- src/review-badges.ts | 65 ++-- src/robot-review.ts | 35 +- src/task-store.ts | 25 +- src/types.ts | 2 +- src/ui/task-widget.ts | 5 +- test/auto-clear.test.ts | 27 +- test/review-badges.test.ts | 61 ++- test/robot-review-runner.test.ts | 6 +- test/robot-review.test.ts | 134 ++++++- test/task-store.test.ts | 75 ++-- test/task-widget.test.ts | 5 - 15 files changed, 677 insertions(+), 574 deletions(-) diff --git a/README.md b/README.md index e925647..71af3e7 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# @wassname/pi-lgtm +# @wassname2/pi-proof-tasks Original ask: > I would like a task list where @@ -7,18 +7,18 @@ Original ask: > 3) A submit_proof form where a subagent provides independant sanity check of on the evidence before completing > 4) - wassname -Help your agent track goals and aim for human sign off. +Hermes-style evidence + judge task list for Pi. -A [pi](https://pi.dev) extension that adds structured human sign-off to task tracking. Fork of [@tintinweb/pi-tasks](https://github.com/tintinweb/pi-tasks) with a minimal LGTM layer. +A [pi](https://pi.dev) extension that adds proof-gated top-level tasks to task tracking. Fork of [@tintinweb/pi-tasks](https://github.com/tintinweb/pi-tasks) with an evidence/review layer inspired by `/until-done`. -The core idea: agents cannot mark tasks complete themselves. They must call `lgtm_ask` with auditable evidence and explicit failure-mode analysis, then a human signs off via `/lgtm `. +The core idea: subtasks are normal checklist items, but top-level tasks are goals. Agents cannot mark top-level tasks complete directly. 
They must call `TaskClaimDone` with auditable evidence, UAT hints, and explicit failure-mode analysis. A fresh judge then accepts or rejects the claim. Accepted review completes the task; rejected review leaves it open with suggestions. -Tasks can also carry a separate fresh-perspective robot review from a subagent or other model family. Robot reviews can iterate: if the latest review says the evidence is incomplete or unconvincing, human sign-off is held back until the agent strengthens the evidence and reruns review. +Humans can use `/lgtm` to view the proof log and sanity-check the reviewer notes later. `/lgtm` is intentionally thin: proof viewing lives there, task management stays in `/tasks`. ## Install ```bash -pi install npm:@wassname2/pi-lgtm +pi install npm:@wassname2/pi-proof-tasks ``` Or for development: @@ -32,11 +32,11 @@ pi -e ./src/index.ts ## What is different from pi-tasks -| pi-tasks | pi-lgtm | +| pi-tasks | pi-proof-tasks | |---|---| -| Agent calls `TaskUpdate { status: "completed" }` | Blocked -- throws error | -| No evidence required | `lgtm_ask` requires evidence, 2 failure modes, falsification test | -| Tasks complete immediately | Agent sets `pending_approval`, human runs `/lgtm ` | +| Agent calls `TaskUpdate { status: "completed" }` on any task | Allowed only for subtasks; top-level tasks reject direct completion | +| No evidence required | `TaskClaimDone` requires evidence, likely/subtle/unknown failures, falsification test, and uncertainty | +| Tasks complete immediately | Top-level tasks complete only after the automatic proof review accepts, or fails open because the reviewer could not run | | No done criterion | `done_criterion` required on create: falsifiable observation | Stripped: `TaskExecute`, `TaskOutput`, `TaskStop`, `process-tracker.ts`, subagent RPC, settings menu. 
@@ -47,85 +47,71 @@ Stripped: `TaskExecute`, `TaskOutput`, `TaskStop`, `process-tracker.ts`, subagen ● 3 tasks (1 done, 1 in progress, 1 open) ✔ #1 Design schema ✳ #2 Implementing cache layer… (2m 49s · ↑ 4.1k ↓ 1.2k) - ◻ #3 Load test 🛠 🤖 👀 + ◻ #3 Load test ``` -Badges: - -- `🛠` tool evidence attached via `lgtm_ask` -- `🤖` one or more robot review iterations attached -- `👀` pending human sign-off via `/lgtm` +Collapsed rows stay simple. Proof details live in `TaskGet` and `/lgtm`, not in the widget row itself. ## Tools ### `TaskCreate` ``` -subject, description, done_criterion (required), progress_label (optional) +subject, description, done_criterion (required), progress_label (optional), parentId (optional) ``` -`done_criterion` must be a falsifiable observation: what you expect to see AND what you would see if it is wrong. Example: `"All 92 tests pass. If wrong: type errors in build or failures in task-store.test.ts."` +Omit `parentId` for a proof-gated top-level goal. Set `parentId` for a directly tickable subtask. + +`done_criterion` must be a falsifiable observation: what you expect to see AND what you would see if it is wrong. Example: `"All 125 tests pass. If wrong: type errors in build or failures in task-store.test.ts."` ### `TaskList` -Lists all tasks. `👀` indicates pending sign-off. +Lists all tasks in the same compact one-line style as the widget. Proof details live in `TaskGet` and `/lgtm`. ### `TaskGet` -Full task details including `done_criterion`, approval state, `completion mode`, `review state`, a one-line gate status such as `ready for human sign-off via /lgtm 5` or `blocked: automatic robot review failed: ...`, and evidence-iteration history. +Full task details including `done_criterion`, task kind, `completion mode`, `review state`, gate status, evidence packet, review iterations, and evidence history. ### `TaskUpdate` -Update status (`pending | in_progress | deleted`), subject, description, done_criterion, dependencies. 
Cannot set `completed` -- use `/lgtm`. +Update status (`pending | in_progress | completed | deleted`), subject, description, done_criterion, dependencies, metadata, or `parentId`. -### `lgtm_ask` +`status=completed` is allowed for subtasks only. Top-level tasks reject with a message telling the agent to use `TaskClaimDone`. -The epistemic gate. Required fields: +### `TaskClaimDone` + +The epistemic gate for proof and UAT. Required fields: | Field | Description | |---|---| -| `taskId` | Task to submit | -| `evidence` | Exact command run + output, commit hash, config/seeds, file paths. "I ran X and got Y" not "I wrote X". | +| `taskId` | Top-level task to claim done | +| `evidence` | Exact command output, commit hash, config/seeds, file paths. Verbatim proof, not a summary | | `failure_likely` | Most likely way this is wrong despite evidence | -| `failure_sneaky` | Perverse/silent failure that looks like success superficially | -| `falsification_test` | What you ran and what you got, so both you and the human can sanity-check it. Why that result could not occur if a failure mode were real. | -| `verification_hints` | Where to look and what to check. These still force the agent to think, but weak hints are advisory rather than a hard block when the verbatim evidence already proves the claim. Core evidence still has to pass on its own. | +| `failure_sneaky` | Subtle/silent failure that looks like success superficially | +| `failure_unknown` | Unknown or untested failure class that could remain | +| `falsification_test` | What you ran and what you got, with literal output | +| `evidence_reasoning` | Why this evidence cheaply distinguishes success from the named failures | +| `verification_hints` | Where to look and what to check, with specific content quoted | | `remaining_uncertainty` | What is NOT tested, deferred edge cases, known limitations | | `commands` | Optional structured command records: `{ cmd, exit_code, stdout_path?, stderr_path? 
}` | | `evidence_paths` / `falsification_paths` | Optional local artifact paths. Stored as absolute path + sha256 + byte size | | `supersede_reason` | Optional reason when this replaces older evidence on the same task | -After calling this, the task shows `👀` and is only completable via `/lgtm `. Evidence is stored on the task so the human can review it hours later without scrolling back. Re-submitting evidence archives the prior package into superseded history instead of silently overwriting it. +The tool stores a compact canonical proof packet. The automatic reviewer sees that exact packet. Humans later see the same packet via `/lgtm`. -The tool result includes a non-blocking self-check prompt asking whether the evidence directly addresses the `done_criterion` and whether a skeptical reviewer would find it convincing. - -`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A robot review that rejects the evidence clears `pending_approval` until the evidence is strengthened and reviewed again. Weak verification hints are advisory if the core verbatim evidence already proves the done criterion. A reviewer crash, auth failure, timeout, or malformed output is recorded as a warning and leaves human sign-off open. +If the reviewer accepts, the task is completed. If it rejects, the task remains open with missing evidence and suggestions. If the reviewer fails to run, the task still completes and the failure note is stored in the proof log for later inspection. ### `lgtm_supersede` Explicitly retire the current evidence package without completing the task. -Use this when the claim changed or the prior evidence is stale. The tool archives the current evidence, current robot reviews, and reviewer-failure context into history with your reason, then closes the human gate until new evidence is submitted. +Use this when the claim changed or the prior evidence is stale. 
The tool archives the current evidence, current robot reviews, and reviewer-failure context into history with your reason. Submit a fresh `TaskClaimDone` claim to complete the task. ### `robot_review_ask` Attach a fresh-perspective robot review to a task. -Required fields: - -| Field | Description | -|---|---| -| `taskId` | Task to annotate | -| `reviewer` | Model/provider/family/class used for the review | -| `scope` | What the reviewer inspected | -| `observations` | Concrete observations only. No advice, verdicts, or editorial | -| `blind_spots` | What the reviewer did not inspect or could not verify | -| `accepted` | Overall accept/reject decision for whether the task is ready to advance | -| `evidence_complete` | Whether the supplied evidence actually covers the done criterion | -| `evidence_convincing` | Whether the supplied evidence would convince a skeptical reviewer | -| `missing_evidence` | Concrete missing checks or artifacts needed before human sign-off | - -Use this from a separate subagent or other model when possible. Reviews append as iterations; the latest one is what gates human sign-off. If stored LGTM evidence already exists, an accepted manual review reopens the human sign-off gate. +Use this from a separate model/subagent when possible. Reviews append as iterations and are advisory. They do not complete tasks; the automatic gate runs through `TaskClaimDone` or `robot_review_run`. ### `robot_review_run` @@ -137,29 +123,35 @@ Default reviewer stage: pi --mode json -p --no-session --no-tools --no-extensions --model ``` -This appends a new robot-review iteration. The reviewer returns an explicit `accepted` boolean as well as detailed observations, blind spots, and missing evidence. If the latest robot review rejects the evidence, `/lgtm` is blocked until stronger evidence is submitted and reviewed again. If the reviewer process fails to run or returns malformed output, the failure is recorded but human sign-off stays open. 
+The reviewer deliberately reuses the active session model in a fresh Pi process. That keeps model selection simple and avoids choosing a registry-listed judge model that exists but does not have working auth. + +The reviewer returns an explicit `accepted` boolean plus observations, concerns, suggestions, blind spots, missing evidence, and rubric reasons. Rejection keeps the task open. Reviewer infrastructure failure is fail-open: autonomy continues and the failure note is stored in the proof log. ## Commands -### `/lgtm ` +### `/lgtm` -Human-only sign-off. Shows stored evidence, falsification output, failure modes, review status, and remaining uncertainty in structured sections for review, then asks for confirmation. Without ``, shows a list of pending-approval tasks. +Proof-log viewer. Use `/lgtm` to pick a task, `/lgtm ` to open specific proof logs, and `/lgtm *` to open all open proof logs. It does not complete, delete, or clear tasks. ### `/tasks` -Interactive menu: view tasks, create task, clear completed/all. +Interactive task-management menu: view tasks, create task, delete a selected task, clear completed, or clear all. 
## Task lifecycle -``` -pending -> in_progress -> (lgtm_ask) - -> current evidence iteration N - -> robot review iteration(s) on current evidence 🤖 - -> pending_approval 👀 if latest robot review passes, or reviewer infra fails - -> reviewer_rejected - -> lgtm_supersede or newer lgtm_ask -> superseded history + fresh current evidence - -> (/lgtm) -> completed +```text +Top-level task: +pending -> in_progress -> TaskClaimDone + -> current evidence iteration N 🛠 + -> robot review iteration(s) 🤖 + -> completed ✓ if latest robot review accepts + -> remains open if reviewer rejects + -> completed if reviewer infrastructure fails (fail-open, note logged) + -> lgtm_supersede or newer TaskClaimDone -> superseded history + fresh current evidence -> deleted + +Subtask: +pending -> in_progress -> TaskUpdate(status=completed) -> completed ``` ## Storage @@ -183,26 +175,37 @@ PI_TASKS_DEBUG=1 # trace to stderr ## Architecture -``` +```text src/ -├── index.ts # 8 tools + /tasks + /lgtm commands + widget + event handlers -├── review-badges.ts # Review badge helpers for tool/robot/human lanes -├── robot-review.ts # Robot review iteration storage + compatibility helpers -├── types.ts # Task, TaskStatus types -├── task-store.ts # File-backed store with CRUD, locking, complete() method -├── auto-clear.ts # Turn-based auto-clearing of completed tasks -├── tasks-config.ts # Config persistence -> .pi/tasks-config.json +├── index.ts # tools + /tasks + /lgtm evidence viewer + widget + event handlers +├── review-badges.ts # Review badge helpers for evidence/review/completion lanes +├── robot-review.ts # Robot review iteration storage + compatibility helpers +├── types.ts # Task, TaskStatus types +├── task-store.ts # File-backed store with CRUD, locking, complete() method +├── auto-clear.ts # Turn-based auto-clearing of completed tasks +├── tasks-config.ts # Config persistence -> .pi/tasks-config.json └── ui/ - └── task-widget.ts # Widget with status icons, spinner, 👀 indicator + └── 
task-widget.ts # Widget with status icons and spinner ``` -## Development +## UI split + +- `/tasks` is the management surface. +- `/lgtm` is the proof-log viewer. +- `TaskClaimDone` is the completion gate for top-level tasks. + +That split is deliberate. It keeps proof inspection separate from task mutation and stays closer to the simpler pre-fork task UI. + +## UAT and development + +The intended proof mix is one end-to-end functional test, one live UAT in the extension itself, and only a few targeted unit tests for invariants. ```bash npm install npm run typecheck -npm test # 92 tests +npm test npm run build +npm run lint ``` ## License diff --git a/package-lock.json b/package-lock.json index d2cb107..79fe7b1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,11 +1,11 @@ { - "name": "@wassname/pi-lgtm", + "name": "@wassname2/pi-proof-tasks", "version": "0.4.2", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "@wassname/pi-lgtm", + "name": "@wassname2/pi-proof-tasks", "version": "0.4.2", "license": "MIT", "dependencies": { diff --git a/package.json b/package.json index 26c7940..5ce7ed7 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { - "name": "@wassname2/pi-lgtm", + "name": "@wassname2/pi-proof-tasks", "version": "0.4.2", - "description": "A pi extension providing goal tracking with structural sign-off and LGTM workflow.", + "description": "Hermes-style evidence + judge task list for Pi, with proof-gated top-level completion and UAT logs.", "author": "wassname", "license": "MIT", "repository": { @@ -16,9 +16,13 @@ "pi-package", "pi", "pi-extension", - "lgtm", - "sign-off", - "goal-tracking" + "proof", + "judge", + "uat", + "evidence", + "task-list", + "failure-modes", + "hermes-style" ], "dependencies": { "@mariozechner/pi-coding-agent": "^0.62.0", diff --git a/src/index.ts b/src/index.ts index 76a8be6..df2d82a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,25 +1,26 @@ /** - * pi-lgtm — Task tracking with 
structured human sign-off for pi coding agent. + * pi-proof-tasks — Hermes-style evidence + judge task list for pi coding agent. * * Two-tier model: - * - Tasks: agent self-manages. Trivial bookkeeping completes via TaskUpdate. - * - LGTMs: significant claims. lgtm_ask submits evidence, robot review gates, - * human /lgtm completes. + * - Subtasks: agent self-manages. Checklist work completes via TaskUpdate. + * - Top-level tasks: goals. TaskClaimDone submits a compact proof/UAT packet, + * a fresh judge gives an independent perspective, and explicit rejection keeps + * the task open for a stronger retry. * * Tools: * TaskCreate — Create a task with done_criterion * TaskList — List tasks grouped by status * TaskGet — Get full task details - * TaskUpdate — Update task fields/status (gated for tasks with lgtm evidence) - * lgtm_ask — Present evidence + failure modes for human sign-off + * TaskUpdate — Update task fields/status (gated for top-level proof goals) + * TaskClaimDone — Present evidence + failure modes for proof review * robot_review_ask — Attach observational review from a fresh-perspective agent * robot_review_run — Re-run the automatic robot reviewer * * Commands: * /tasks — Interactive task management menu - * /lgtm — Human signs off on one or more tasks (override allowed even without lgtm_ask) - * /lgtm * — Sign off ALL open tasks (READY/ACTIVE/PENDING) after a grouped confirm - * /lgtm — Pick from any open task (with [READY]/[ACTIVE]/[PENDING] tags) + * /lgtm — View the proof log for one or more tasks + * /lgtm * — View all open task proof logs + * /lgtm — Pick from open tasks to inspect proof logs */ import { spawn } from "node:child_process"; @@ -44,10 +45,9 @@ import { appendRobotReviewMetadata, getLatestRobotReview, getRobotReviews, - latestRobotReviewPasses, type RobotReviewRecord, relaxAdvisoryVerificationHints, - shouldOpenHumanSignoffGate, + shouldCompleteAfterAcceptedReview, } from "./robot-review.js"; import { TaskStore } from 
"./task-store.js"; import { loadTasksConfig } from "./tasks-config.js"; @@ -58,7 +58,7 @@ function textResult(msg: string) { return { content: [{ type: "text" as const, text: msg }], details: undefined as any }; } -const TASK_TOOL_NAMES = new Set(["TaskCreate", "TaskList", "TaskGet", "TaskUpdate", "lgtm_ask", "lgtm_supersede", "robot_review_ask", "robot_review_run"]); +const TASK_TOOL_NAMES = new Set(["TaskCreate", "TaskList", "TaskGet", "TaskUpdate", "TaskClaimDone", "lgtm_supersede", "robot_review_ask", "robot_review_run"]); const REMINDER_INTERVAL = 4; const AUTO_CLEAR_DELAY = 4; export const DEFAULT_ROBOT_REVIEW_TIMEOUT_MS = 120_000; @@ -69,12 +69,12 @@ export function getPiInvocation( args: string[], env: NodeJS.ProcessEnv = process.env, ): { command: string; args: string[] } { - const configured = env.PI_LGTM_PI_BIN?.trim(); + const configured = env.PI_PROOF_TASKS_PI_BIN?.trim(); return { command: configured || "pi", args }; } export function getRobotReviewTimeoutMs(env: NodeJS.ProcessEnv = process.env): number { - const configured = Number.parseInt(env.PI_LGTM_ROBOT_REVIEW_TIMEOUT_MS ?? "", 10); + const configured = Number.parseInt(env.PI_PROOF_TASKS_ROBOT_REVIEW_TIMEOUT_MS ?? "", 10); return Number.isFinite(configured) && configured > 0 ? 
configured : DEFAULT_ROBOT_REVIEW_TIMEOUT_MS; } @@ -246,7 +246,9 @@ interface EvidenceIterationRecord { evidence: string; failure_likely: string; failure_sneaky: string; + failure_unknown: string; falsification_test: string; + evidence_reasoning: string; verification_hints: string[]; remaining_uncertainty: string; commands: EvidenceCommandRecord[]; @@ -267,6 +269,8 @@ const ROBOT_REVIEW_KEYS = [ "robot_review_reviewer", "robot_review_scope", "robot_review_observations", + "robot_review_concerns", + "robot_review_suggestions", "robot_review_blind_spots", "robot_review_accepted", "robot_review_evidence_complete", @@ -283,7 +287,9 @@ const CURRENT_EVIDENCE_KEYS = [ "lgtm_evidence", "lgtm_failure_likely", "lgtm_failure_sneaky", + "lgtm_failure_unknown", "lgtm_falsification_test", + "lgtm_evidence_reasoning", "lgtm_verification_hints", "lgtm_remaining_uncertainty", "lgtm_submitted_at", @@ -292,6 +298,26 @@ const CURRENT_EVIDENCE_KEYS = [ "lgtm_falsification_artifacts", ] as const; +const RESERVED_METADATA_PREFIXES = ["lgtm_", "robot_review"]; + +function assertNoReservedMetadata(metadata: Record | undefined): string | null { + if (!metadata) return null; + for (const key of Object.keys(metadata)) { + if (RESERVED_METADATA_PREFIXES.some(prefix => key.startsWith(prefix))) { + return `Metadata key ${key} is reserved for proof/review internals. 
Use TaskClaimDone or robot_review_run instead.`; + } + } + return null; +} + +function requiredTextError(fields: Record, names: string[]): string | null { + for (const name of names) { + const value = fields[name]; + if (typeof value !== "string" || value.trim().length === 0) return `${name} is required and cannot be blank.`; + } + return null; +} + function nullRecord(keys: readonly string[]): Record { return Object.fromEntries(keys.map((key) => [key, null])); } @@ -370,7 +396,9 @@ export function getCurrentEvidenceIteration(task: Task): EvidenceIterationRecord evidence: metadata.lgtm_evidence, failure_likely: typeof metadata.lgtm_failure_likely === "string" ? metadata.lgtm_failure_likely : "", failure_sneaky: typeof metadata.lgtm_failure_sneaky === "string" ? metadata.lgtm_failure_sneaky : "", + failure_unknown: typeof metadata.lgtm_failure_unknown === "string" ? metadata.lgtm_failure_unknown : "", falsification_test: typeof metadata.lgtm_falsification_test === "string" ? metadata.lgtm_falsification_test : "", + evidence_reasoning: typeof metadata.lgtm_evidence_reasoning === "string" ? metadata.lgtm_evidence_reasoning : "", verification_hints: Array.isArray(metadata.lgtm_verification_hints) ? metadata.lgtm_verification_hints.filter((hint: unknown): hint is string => typeof hint === "string") : [], remaining_uncertainty: typeof metadata.lgtm_remaining_uncertainty === "string" ? metadata.lgtm_remaining_uncertainty : "", commands: normalizeCommandRecords(metadata.lgtm_commands), @@ -409,52 +437,131 @@ function formatReviewTextBlock(title: string, body: string): string { return `### ${title}\n\n\`\`\`text\n${body}\n\`\`\``; } +function presentOrMissing(value: string | undefined): string { + return value && value.trim().length > 0 ? value : "(missing)"; +} + +function formatBulletList(title: string, items: string[], empty = "(none)"): string { + return `### ${title}\n${items.length > 0 ? 
items.map((item) => `- ${item}`).join("\n") : `- ${empty}`}`; +} + function formatCommandRecords(commands: EvidenceCommandRecord[]): string | undefined { if (commands.length === 0) return undefined; - return `### Commands\n${commands.map((command, index) => { - const parts = [ - `${index + 1}. \`${command.cmd}\``, - `exit=${command.exit_code}`, - ]; - if (command.stdout_path) parts.push(`stdout=${command.stdout_path}`); - if (command.stderr_path) parts.push(`stderr=${command.stderr_path}`); - return `- ${parts.join(" | ")}`; - }).join("\n")}`; + return `### Commands\n${commands.map((command) => `- \`${command.cmd}\` (exit ${command.exit_code})${command.stdout_path ? ` stdout: ${command.stdout_path}` : ""}${command.stderr_path ? ` stderr: ${command.stderr_path}` : ""}`).join("\n")}`; } function formatArtifactRecords(title: string, artifacts: EvidenceArtifactRecord[]): string | undefined { if (artifacts.length === 0) return undefined; - return `### ${title}\n${artifacts.map((artifact) => `- ${artifact.path} | sha256=${artifact.sha256} | bytes=${artifact.bytes}`).join("\n")}`; + return `### ${title}\n${artifacts.map((artifact) => `- ${artifact.path} (${artifact.bytes} bytes, sha256 ${artifact.sha256})`).join("\n")}`; } -function formatEvidencePackage(task: Task): string[] { +function renderPlannedEvidence(entry: EvidenceIterationRecord): string { + return [ + "## Planned evidence / UAT", + formatBulletList("Verification hints", entry.verification_hints, "(missing)"), + formatReviewTextBlock("Falsification test", presentOrMissing(entry.falsification_test)), + ].join("\n\n"); +} + +function summarizeJudgement(entry: EvidenceIterationRecord): { title: string; body: string; suggestions: string[] } { + const latestReview = entry.robot_reviews[entry.robot_reviews.length - 1]; + if (latestReview) { + const judgement = latestReview.accepted ? 
"Accepted" : "Refused"; + const concerns = [ + ...latestReview.observations, + ...latestReview.concerns, + ...latestReview.missing_evidence.map((item) => `Missing evidence: ${item}`), + ]; + const suggestions = latestReview.suggestions.length > 0 + ? latestReview.suggestions + : latestReview.accepted + ? [] + : latestReview.missing_evidence.map((item) => `Strengthen the proof for: ${item}`); + return { + title: judgement, + body: `${judgement} by ${latestReview.reviewer} on ${latestReview.submitted_at}.`, + suggestions: [...concerns, ...suggestions], + }; + } + if (entry.automatic_review_failure) { + return { + title: "Reviewer unavailable", + body: entry.automatic_review_failure.message, + suggestions: [ + "Autonomy continued without blocking completion.", + "Inspect the reviewer failure note if you want a fresh external perspective later.", + ], + }; + } + return { + title: "Pending review", + body: "No judge result recorded yet.", + suggestions: [], + }; +} + +function renderAttempt(entry: EvidenceIterationRecord): string { + const judgement = summarizeJudgement(entry); + return [ + `## Attempt ${entry.iteration}`, + formatReviewTextBlock("Submitted evidence", presentOrMissing(entry.evidence)), + `### Judgement\n${judgement.title}\n\n${judgement.body}`, + formatBulletList("Suggestions / concerns", judgement.suggestions, "(none)"), + ].join("\n\n"); +} + +export function renderEvidencePacket(task: Task): string { const current = getCurrentEvidenceIteration(task); - const sections: string[] = []; + if (!current) return "(No current proof claim. 
The agent never called TaskClaimDone, or the prior claim was superseded.)"; + + return [ + "## Goal", + `Task #${task.id}: ${task.subject}`, + presentOrMissing(task.done_criterion), + renderPlannedEvidence(current), + renderAttempt(current), + formatBulletList("Failure modes", [ + `Likely: ${presentOrMissing(current.failure_likely)}`, + `Sneaky: ${presentOrMissing(current.failure_sneaky)}`, + `Unknown: ${presentOrMissing(current.failure_unknown)}`, + ]), + formatReviewTextBlock("Why this proves success", presentOrMissing(current.evidence_reasoning)), + formatReviewTextBlock("Remaining uncertainty", presentOrMissing(current.remaining_uncertainty)), + formatCommandRecords(current.commands), + formatArtifactRecords("Evidence artifacts", current.evidence_artifacts), + formatArtifactRecords("Falsification artifacts", current.falsification_artifacts), + ].filter((section): section is string => typeof section === "string" && section.length > 0).join("\n\n"); +} + +function renderAutomaticReviewFailure(task: Task): string | undefined { + if (typeof task.metadata?.robot_review_last_error !== "string") return undefined; + const sections = [`### Automatic robot review failure\n${task.metadata.robot_review_last_error}`]; + if (typeof task.metadata?.robot_review_last_error_output === "string" && task.metadata.robot_review_last_error_output.trim()) { + sections.push(formatReviewTextBlock("Reviewer raw output", task.metadata.robot_review_last_error_output)); + } + return sections.join("\n\n"); +} + +export function renderProofLog(task: Task): string { + const history = getEvidenceHistory(task); + const attempts = history.map(renderAttempt); + const current = getCurrentEvidenceIteration(task); + const lines = [ + `# Task #${task.id}: ${task.subject}`, + `Status: ${task.status}`, + `Gate status: ${getGateStatus(task)}`, + "", + "## Goal", + presentOrMissing(task.done_criterion), + ]; if (current) { - sections.push(`Evidence iteration: ${current.iteration} of 
${getEvidenceIterationCount(task)}`); - sections.push(formatReviewTextBlock("Evidence", current.evidence)); - if (current.failure_likely) sections.push(`### Failure (likely)\n${current.failure_likely}`); - if (current.failure_sneaky) sections.push(`### Failure (sneaky)\n${current.failure_sneaky}`); - if (current.falsification_test) sections.push(formatReviewTextBlock("Falsification test", current.falsification_test)); - const commands = formatCommandRecords(current.commands); - if (commands) sections.push(commands); - const evidenceArtifacts = formatArtifactRecords("Evidence artifacts", current.evidence_artifacts); - if (evidenceArtifacts) sections.push(evidenceArtifacts); - const falsificationArtifacts = formatArtifactRecords("Falsification artifacts", current.falsification_artifacts); - if (falsificationArtifacts) sections.push(falsificationArtifacts); - if (current.verification_hints.length > 0) { - sections.push(`### Verification hints\n${current.verification_hints.map((hint) => `- ${hint}`).join("\n")}`); - } - if (current.remaining_uncertainty) sections.push(`### Remaining uncertainty\n${current.remaining_uncertainty}`); - sections.push(`Submitted: ${current.submitted_at}`); + lines.push("", renderPlannedEvidence(current), "", ...attempts, renderAttempt(current)); + } else if (attempts.length > 0) { + lines.push("", ...attempts); + } else { + lines.push("", "(No current proof claim.)"); } - if (typeof task.metadata?.robot_review_last_error === "string") { - sections.push(`### Automatic robot review failure\n${task.metadata.robot_review_last_error}`); - if (typeof task.metadata?.robot_review_last_error_output === "string" && task.metadata.robot_review_last_error_output.trim()) { - sections.push(formatReviewTextBlock("Reviewer raw output", task.metadata.robot_review_last_error_output)); - } - } - return sections; + return lines.join("\n"); } function getNonReviewMetadata(task: Task): Record { @@ -510,66 +617,56 @@ function formatRobotReview(review: 
RobotReviewRecord): string { parts.push(`Rubric:\n${rubricLines.join("\n")}`); } parts.push( - `**Accepted: ${review.accepted ? "yes" : "no"}**`, - `**Evidence complete: ${review.evidence_complete ? "yes" : "no"}**`, - `**Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}**`, + `Accepted: ${review.accepted ? "yes" : "no"}`, + `Evidence complete: ${review.evidence_complete ? "yes" : "no"}`, + `Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}`, `Observations:\n- ${review.observations.join("\n- ")}`, ); + if (review.concerns.length > 0) parts.push(`Concerns:\n- ${review.concerns.join("\n- ")}`); + if (review.suggestions.length > 0) parts.push(`Suggestions:\n- ${review.suggestions.join("\n- ")}`); if (review.missing_evidence.length > 0) parts.push(`Missing evidence:\n- ${review.missing_evidence.join("\n- ")}`); if (review.blind_spots) parts.push(`Blind spots: ${review.blind_spots}`); return parts.join("\n"); } -function buildRobotReviewPrompt(task: any): string { - const priorReviews = getRobotReviews(task); - const priorSection = priorReviews.length > 0 - ? `\nPrevious robot reviews:\n${priorReviews.map(formatRobotReview).join("\n\n")}\n` - : "\nPrevious robot reviews:\n(none)\n"; +export function buildRobotReviewPrompt(task: Task): string { return [ - "You are a VALIDATION reviewer, not a flaw-finder. Your job is to sanity-check that the evidence addresses the done criterion.", - "Your role: validate and sanity-check. Comment and suggest, but the gate is only the rubric below.", + "You are a fresh validation judge for a Hermes-style proof log.", + "Question: in retrospect, does this evidence prove success for the stated goal?", + "If not, say no and explain what the agent should do next. 
Suggestions are advisory guidance, not a separate gate.", "", "## Critical: Evidence must be verbatim", "", - "Evidence should contain literal output — verbatim command output, exact log lines, markdown block quotes, table rows, URLs — not summaries or interpretations. If the evidence only says 'it worked' or 'returned 5 results' without showing the actual output, flag it under evidence_covers_done_criterion or falsification_test_runnable, not verification_hints_actionable.", - "A human must be able to verify the claim from the evidence alone, without re-running anything. Summaries are not evidence. Literal output is evidence.", + "Evidence should contain literal output, exact log lines, markdown block quotes, table rows, and URLs, not summaries or interpretations.", + "A human must be able to inspect the evidence alone without re-running anything.", "", "## Rubric (rate each item pass/fail)", "", - "1. evidence_covers_done_criterion: Does the evidence directly address the stated done criterion? Evidence must be verbatim (literal output, not 'it worked').", - "2. falsification_test_runnable: Is the falsification test concrete enough that someone could run it and get a yes/no result? Must include actual output, not just 'ran X and it worked'.", - "3. failure_modes_addressed: Are the failure_likely and failure_sneaky plausibly the top failure modes? (Not: are there OTHER failure modes?)", - "4. verification_hints_actionable: Can a human follow the verification hints to check the claim without re-running experiments? Hints should reference specific content (line ranges, output snippets, URLs), not bare paths or counts.", + "1. evidence_covers_done_criterion: Does the evidence directly address the stated done criterion?", + "2. falsification_test_runnable: Is the falsification test concrete enough that someone could run it and get a yes/no result?", + "3. 
failure_modes_addressed: Are the likely, sneaky, and unknown failure modes plausible enough to guide evidence choice?", + "4. evidence_distinguishes_success: Does the agent explain why the evidence distinguishes success from those failure modes?", + "5. verification_hints_actionable: Can a human follow the verification hints to inspect the claim without re-running experiments?", "", "Set evidence_complete=true only if items 1 and 2 pass.", - "Set evidence_convincing=true only if items 1 and 2 pass. Item 4 is advisory unless it reveals that items 1 or 2 were overstated.", - "Set accepted=true only if items 1, 2, and 3 pass. Do not reject solely because verification hints are weak if the verbatim evidence already proves the done criterion.", + "Set evidence_convincing=true only if items 1 and 2 pass. Item 5 is advisory unless it reveals that items 1 or 2 were overstated.", + "Set accepted=true only if items 1, 2, 3, and 4 pass. Do not reject solely because verification hints are weak if the verbatim evidence already proves the done criterion.", "", - "Observations: report what you see, not what might be missing. Comments and suggestions go in observations.", + "observations: what you saw in the packet.", + "concerns: concise reasons the current evidence may not prove success yet.", + "suggestions: what the agent should do next if the evidence is not yet enough. Nonblocking guidance only.", "missing_evidence: ONLY items from the rubric that failed. 
Do NOT add new dimensions.", "", "Return exactly one JSON object between the markers ROBOT_REVIEW_JSON_START and ROBOT_REVIEW_JSON_END.", - "JSON schema (reasoning before booleans — think first, then judge):", - '{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"observations":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', + "JSON schema:", + '{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"evidence_distinguishes_success":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"observations":["string"],"concerns":["string"],"suggestions":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', "", - `Task #${task.id}: ${task.subject}`, - `Done criterion: ${task.done_criterion}`, - `Description: ${task.description}`, + "You are reviewing exactly the same proof packet shown by TaskGet and /lgtm. Do not assume hidden context beyond this packet.", "", - "Evidence package:", - `Evidence: ${task.metadata?.lgtm_evidence ?? "(missing)"}`, - `Failure likely: ${task.metadata?.lgtm_failure_likely ?? "(missing)"}`, - `Failure sneaky: ${task.metadata?.lgtm_failure_sneaky ?? "(missing)"}`, - `Falsification test: ${task.metadata?.lgtm_falsification_test ?? "(missing)"}`, - `Verification hints: ${Array.isArray(task.metadata?.lgtm_verification_hints) ? 
task.metadata.lgtm_verification_hints.join(" | ") : "(missing)"}`, - `Remaining uncertainty: ${task.metadata?.lgtm_remaining_uncertainty ?? "(missing)"}`, - `Commands: ${JSON.stringify(normalizeCommandRecords(task.metadata?.lgtm_commands))}`, - `Evidence artifacts: ${JSON.stringify(normalizeArtifactRecords(task.metadata?.lgtm_evidence_artifacts))}`, - `Falsification artifacts: ${JSON.stringify(normalizeArtifactRecords(task.metadata?.lgtm_falsification_artifacts))}`, - priorSection, + renderEvidencePacket(task), "Output format:", "ROBOT_REVIEW_JSON_START", - '{"reviewer":"...","scope":"...","rubric":{...},"observations":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', + '{"reviewer":"...","scope":"...","rubric":{...},"observations":["..."],"concerns":["..."],"suggestions":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', "ROBOT_REVIEW_JSON_END", ].join("\n"); } @@ -583,6 +680,8 @@ async function runAutomaticRobotReview( throw new Error("Automatic robot review requires an active current session model."); } const prompt = buildRobotReviewPrompt(task); + // Keep reviewer model selection simple: reuse the active session model in a fresh Pi process. + // This avoids picking a registry-listed judge model that exists but lacks working auth. const args = ["--mode", "json", "-p", "--no-session", "--no-tools", "--no-extensions", "--model", currentModelRef]; args.push(prompt); const invocation = getPiInvocation(args); @@ -608,6 +707,8 @@ async function runAutomaticRobotReview( error.rawOutput = result.stdout.trim(); throw error; } + const concerns = Array.isArray(parsed.concerns) ? parsed.concerns.filter((item): item is string => typeof item === "string") : []; + const suggestions = Array.isArray(parsed.suggestions) ? 
parsed.suggestions.filter((item): item is string => typeof item === "string") : []; const rawMissing: string[] = Array.isArray(parsed.missing_evidence) ? parsed.missing_evidence.filter((item): item is string => typeof item === "string") : []; @@ -628,6 +729,8 @@ async function runAutomaticRobotReview( reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : commandLabel, scope: typeof parsed.scope === "string" ? parsed.scope : "task evidence package", observations, + concerns, + suggestions, blind_spots: typeof parsed.blind_spots === "string" ? parsed.blind_spots : "not stated", accepted: typeof parsed.accepted === "boolean" ? parsed.accepted @@ -649,8 +752,8 @@ async function runAutomaticRobotReview( const SYSTEM_REMINDER = ` Task tools haven't been used recently. Check the task list and keep it accurate: - Mark tasks in_progress when you start them (TaskUpdate status=in_progress). -- Complete trivial subtasks directly: TaskUpdate(status=completed). Drop irrelevant ones with status=deleted. -- For significant claims with uncertainty (a feature, an experiment result, run-until-X), call lgtm_ask with evidence — that triggers robot review and a human /lgtm gate. +- Complete subtasks directly: TaskUpdate(status=completed). Drop irrelevant ones with status=deleted. +- Complete top-level tasks with TaskClaimDone: include verbatim evidence, likely/subtle/unknown failure modes, falsification test, and remaining uncertainty. Explicit rejection keeps the task open; reviewer infrastructure failures are logged but do not block autonomy. A stale list is worse than no list. Ignore this reminder if not applicable. Never mention it to the user. `; @@ -753,13 +856,13 @@ export default function (pi: ExtensionAPI) { const missing = latest.missing_evidence.length > 0 ? 
` Missing evidence: ${latest.missing_evidence.join("; ")}.` : ""; - return `- Task #${task.id} ${task.subject}: latest robot review rejected the evidence.${missing} Strengthen the evidence, call lgtm_ask again, then rerun robot_review_run before asking for human sign-off.`; + return `- Task #${task.id} ${task.subject}: latest proof review rejected the evidence.${missing} Strengthen the evidence and call TaskClaimDone again.`; }).join("\n"); return { systemPrompt: event.systemPrompt + - `\n\n\nLatest robot review follow-up required:\n${reminder}\nDo not ask for human sign-off until the latest robot review accepts the evidence.\n\n`, + `\n\n\nLatest proof review follow-up required:\n${reminder}\nDo not complete the top-level task until the latest proof review accepts the evidence.\n\n`, }; }); @@ -788,18 +891,19 @@ export default function (pi: ExtensionAPI) { ## Two tiers -- **Tasks**: agent-managed. Trivial bookkeeping (e.g. "monitor pueue 30") can be completed directly via TaskUpdate(status=completed). Subtasks lead up to verification. -- **LGTMs**: for significant claims with uncertainty (implement a feature, run-until-X). Call lgtm_ask with evidence — that triggers robot review and routes completion through /lgtm. +- **Top-level tasks**: goals with proof. They cannot be completed directly; call TaskClaimDone with evidence and failure modes. +- **Subtasks**: agent-managed checklist items under a top-level task. They can be completed directly via TaskUpdate(status=completed). ## Task Fields - **subject**: Brief actionable title - **description**: Detailed description with context - **done_criterion**: REQUIRED. Falsifiable observation that distinguishes done from fail/null/incomplete/silent-fail. State expected AND wrong-case observations (e.g., "All 92 tests pass. 
If wrong: type errors in build or test failures in task-store.test.ts") -- **progress_label** (optional): What the agent is currently doing, shown during in-progress tasks`, +- **progress_label** (optional): What the agent is currently doing, shown during in-progress tasks +- **parentId** (optional): Set this to make a directly tickable subtask. Omit it for a proof-gated top-level goal.`, promptGuidelines: [ - "Use TaskCreate for complex tasks. Include a specific done_criterion.", - "Mark tasks in_progress before starting. Complete trivial tasks via TaskUpdate; call lgtm_ask for significant claims, then human /lgtm.", + "Use TaskCreate for complex top-level goals. Include a specific done_criterion.", + "Mark tasks in_progress before starting. Complete subtasks via TaskUpdate; complete top-level tasks via TaskClaimDone with proof evidence.", ], parameters: Type.Object({ subject: Type.String({ description: "Brief task title" }), @@ -807,11 +911,19 @@ export default function (pi: ExtensionAPI) { done_criterion: Type.String({ description: "Falsifiable observation that distinguishes DONE from fail, null result, incomplete, or silent failure. State what you expect to see AND what you'd see if it's wrong." }), progress_label: Type.Optional(Type.String({ description: "What the agent is currently doing, shown during in-progress tasks" })), metadata: Type.Optional(Type.Record(Type.String(), Type.Any())), + parentId: Type.Optional(Type.String({ description: "Parent task ID. If set, this task is a directly tickable subtask; if omitted, this is a proof-gated top-level goal." 
})), }), execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const metadataError = assertNoReservedMetadata(params.metadata); + if (metadataError) return Promise.resolve(textResult(metadataError)); autoClear.resetBatchCountdown(); - const task = store.create(params.subject, params.description, params.done_criterion, params.progress_label, params.metadata); + let task: Task; + try { + task = store.create(params.subject, params.description, params.done_criterion, params.progress_label, params.metadata, params.parentId); + } catch (err: any) { + return Promise.resolve(textResult(err.message)); + } widget.update(); return Promise.resolve(textResult(`Task #${task.id} created: ${task.subject}\nDone criterion: ${task.done_criterion}`)); }, @@ -824,7 +936,7 @@ export default function (pi: ExtensionAPI) { pi.registerTool({ name: "TaskList", label: "TaskList", - description: `List all tasks grouped by status. State tag: [READY] (signoff-ready) [ACTIVE] [PENDING] [DONE]. Pipeline stages: [🛠🤖👀] = evidence→review→signoff (·=pending).`, + description: `List all tasks grouped by status. State tag: [ACTIVE] [PENDING] [DONE]. Pipeline stages: [🛠🤖✓] = evidence→review→completed (·=pending).`, parameters: Type.Object({}), execute(_toolCallId, _params, _signal, _onUpdate, _ctx) { @@ -832,7 +944,8 @@ export default function (pi: ExtensionAPI) { if (tasks.length === 0) return Promise.resolve(textResult("No tasks found")); const renderTask = (task: typeof tasks[number]) => { - let line = ` [${getStateTag(task).padEnd(7)}] #${task.id} ${task.subject} ${getReviewBadges(task)}`; + const parent = task.parentId ? 
` [subtask of #${task.parentId}]` : ""; + let line = ` [${getStateTag(task).padEnd(7)}] #${task.id} ${task.subject}${parent} ${getReviewBadges(task)}`; if (task.blockedBy.length > 0) { const openBlockers = task.blockedBy.filter(bid => { const blocker = store.get(bid); @@ -845,7 +958,6 @@ export default function (pi: ExtensionAPI) { const buckets: { label: string; status: DisplayStatus }[] = [ { label: "Active", status: "in_progress" }, - { label: "Awaiting sign-off", status: "awaiting_signoff" }, { label: "Pending", status: "pending" }, { label: "Completed", status: "completed" }, ]; @@ -870,7 +982,7 @@ export default function (pi: ExtensionAPI) { pi.registerTool({ name: "TaskGet", label: "TaskGet", - description: `Get full LGTM sign-off task details including done_criterion and approval state.`, + description: `Get full proof-gated task details including done_criterion, evidence packet, and reviewer state.`, parameters: Type.Object({ taskId: Type.String({ description: "Task ID to retrieve" }), }), @@ -887,7 +999,7 @@ export default function (pi: ExtensionAPI) { const history = getEvidenceHistory(task); const lines: string[] = [ `Task #${task.id}: ${task.subject}`, - `Status: ${task.status} ${getReviewBadges(task)}${task.pending_approval && task.status !== "completed" ? " (pending human sign-off)" : ""}`, + `Status: ${task.status} ${getReviewBadges(task)}`, `Completion mode: ${completionMode}`, `Review state: ${reviewState}`, `Gate status: ${getGateStatus(task)}`, @@ -895,13 +1007,17 @@ export default function (pi: ExtensionAPI) { `Description: ${desc}`, ]; lines.push(`Evidence iterations: total=${getEvidenceIterationCount(task)}, current=${currentEvidence ? currentEvidence.iteration : 0}, superseded=${history.length}`); - lines.push(`Human sign-off pending: ${task.pending_approval ? "yes" : "no"}`); + lines.push(`Task kind: ${task.parentId ? 
`subtask of #${task.parentId}` : "top-level proof goal"}`); if (robotReviews.length > 0) { const latest = robotReviews[robotReviews.length - 1]; lines.push(`Robot reviews on current evidence: ${robotReviews.length} (latest: accepted=${latest.accepted ? "yes" : "no"}, complete=${latest.evidence_complete ? "yes" : "no"}, convincing=${latest.evidence_convincing ? "yes" : "no"})`); } - const evidenceSections = formatEvidencePackage(task); - if (evidenceSections.length > 0) lines.push(...evidenceSections); + lines.push(renderEvidencePacket(task)); + const automaticReviewFailure = renderAutomaticReviewFailure(task); + if (automaticReviewFailure) lines.push(automaticReviewFailure); + if (robotReviews.length > 0) { + lines.push(`### Robot reviews\n${robotReviews.map(formatRobotReview).join("\n\n")}`); + } const historySummary = formatHistorySummary(task); if (historySummary) lines.push(historySummary); if (task.blockedBy.length > 0) { @@ -929,8 +1045,8 @@ export default function (pi: ExtensionAPI) { description: `Update task fields or status. Two-tier model: -- Trivial bookkeeping tasks (e.g. "monitor pueue 30") can be marked completed directly here. -- Tasks that called lgtm_ask are gated: completion requires /lgtm . Strengthen evidence and re-run lgtm_ask if the robot review rejected it.`, +- Subtasks can be marked completed directly here. +- Top-level tasks are proof goals: TaskUpdate(status=completed) is rejected. Use TaskClaimDone so the failure-mode/evidence form and automatic reviewer run.`, parameters: Type.Object({ taskId: Type.String({ description: "Task ID to update" }), status: Type.Optional(Type.Unsafe<"pending" | "in_progress" | "completed" | "deleted">({ @@ -938,7 +1054,7 @@ Two-tier model: { type: "string", enum: ["pending", "in_progress", "completed"] }, { type: "string", const: "deleted" }, ], - description: "New status. Setting completed is allowed for trivial tasks; tasks with lgtm evidence must complete via /lgtm.", + description: "New status. 
Setting completed is allowed for subtasks only; top-level tasks must complete via TaskClaimDone.", })), subject: Type.Optional(Type.String({ description: "Brief task title" })), description: Type.Optional(Type.String({ description: "Detailed description" })), @@ -950,6 +1066,9 @@ Two-tier model: }), execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const metadataError = assertNoReservedMetadata(params.metadata); + if (metadataError) return Promise.resolve(textResult(metadataError)); + const { taskId, ...fields } = params; let task: any, changedFields: string[], warnings: string[]; try { @@ -972,7 +1091,7 @@ Two-tier model: autoClear.trackCompletion(taskId, currentTurn); } else if (fields.status === "deleted") { widget.setActiveTask(taskId, false); - warnings.push("Task deleted via agent tool. Use /tasks to confirm or undo. Deleting tasks without human sign-off is discouraged — tasks should be completed via /lgtm or explicitly dismissed by the user."); + warnings.push("Task deleted via agent tool. Use /tasks to confirm or undo. Deleting tasks should be reserved for dismissed or irrelevant work."); } widget.update(); @@ -983,16 +1102,16 @@ Two-tier model: }); // ────────────────────────────────────────────────── - // Tool 5: lgtm_ask + // Tool 5: TaskClaimDone // ────────────────────────────────────────────────── pi.registerTool({ - name: "lgtm_ask", - label: "lgtm_ask", - description: `Present evidence that a task meets its done_criterion and request human sign-off. + name: "TaskClaimDone", + label: "TaskClaimDone", + description: `Claim that a top-level task meets its done_criterion. -Forces structured thinking about failure modes. All text fields required. -After this, task enters pending sign-off state — only completable via /lgtm . +Forces structured thinking about failure modes and cheap evidence. All text fields required. +Accepted automatic review completes the task. Rejected review leaves it open with guidance. 
Reviewer infrastructure failure is logged but does not block autonomy. ## CRITICAL: Evidence must be verbatim @@ -1002,19 +1121,23 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma - **evidence**: Verbatim auditable proof — literal output, not summaries - **failure_likely**: Most likely way this could be wrong despite evidence -- **failure_sneaky**: Most perverse or sneaky failure -- one that looks like success superficially, corrupts silently, or only breaks under specific conditions (scale, time, edge case). E.g. feature active but wrong mechanism, works in tests but degrades in prod, correct output for wrong reason. +- **failure_sneaky**: Subtle/sneaky failure -- one that looks like success superficially, corrupts silently, or only breaks under specific conditions (scale, time, edge case). E.g. feature active but wrong mechanism, works in tests but degrades in prod, correct output for wrong reason. +- **failure_unknown**: What class of unknown/untested failure could remain even if the evidence is true - **falsification_test**: What you ran and the literal output you got, with reasoning why that output disproves the failure mode +- **evidence_reasoning**: Why this evidence cheaply distinguishes done-criterion success from the likely/subtle/unknown failures - **verification_hints**: Where to look and what to check, with specific content quoted (not bare paths or counts) - **remaining_uncertainty**: What's NOT tested, known limitations, deferred edge cases - **commands**: Optional first-class command records for the evidence package - **evidence_paths / falsification_paths**: Optional local artifact paths. The tool stores absolute path, sha256, and byte size for auditability. 
- **supersede_reason**: Optional reason when this submission replaces an older one on the same task`, parameters: Type.Object({ - taskId: Type.String({ description: "Task ID to submit for sign-off" }), + taskId: Type.String({ description: "Top-level task ID to claim done" }), evidence: Type.String({ description: "Verbatim auditable proof: literal command output, exact log lines, markdown block quotes, table rows, URLs. NOT summaries or interpretations. 'I ran X and got Y' is not evidence -- paste the actual output of X. A human must verify from this alone without re-running. (One short paragraph is fine; verbatim matters more than length.)" }), failure_likely: Type.String({ description: "Most likely way this could be wrong despite evidence. One short sentence preferred — pick the top one, not a list." }), - failure_sneaky: Type.String({ description: "Most perverse failure: looks like success superficially, corrupts silently, or only breaks at scale/time/edge case. One short sentence preferred." }), + failure_sneaky: Type.String({ description: "Subtle/sneaky failure: looks like success superficially, corrupts silently, or only breaks at scale/time/edge case. One short sentence preferred." }), + failure_unknown: Type.String({ description: "What unknown or untested failure class could remain even if this evidence is true. One short sentence preferred." }), falsification_test: Type.String({ description: "What you ran and the literal output you got. Include verbatim command + output, not 'it worked'. State why that output could not occur if a failure mode were real. Brevity is fine; the verbatim output is what counts." }), + evidence_reasoning: Type.String({ description: "Why this evidence cheaply distinguishes done-criterion success from the likely/subtle/unknown failures." }), verification_hints: Type.Array(Type.String(), { description: "Where to look, with specific content quoted (not bare paths or counts). E.g. 'src/loss.py:45-60 shows grad_norm=0.001'. 
One or two short hints is enough." }), remaining_uncertainty: Type.String({ description: "What's NOT tested, known limitations, deferred edges. One short sentence preferred. If you can't articulate uncertainty, you haven't thought hard enough." }), commands: Type.Optional(Type.Array(Type.Object({ @@ -1035,16 +1158,24 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma // verification_hints are descriptions, not validated file paths + if (task.parentId) return Promise.resolve(textResult(`Task #${params.taskId} is a subtask. Use TaskUpdate(status=completed) for subtasks; TaskClaimDone is for top-level proof goals.`)); + const blankField = requiredTextError(params, ["evidence", "failure_likely", "failure_sneaky", "failure_unknown", "falsification_test", "evidence_reasoning", "remaining_uncertainty"]); + if (blankField) return Promise.resolve(textResult(blankField)); + if (!params.verification_hints.some((hint: string) => hint.trim().length > 0)) { + return Promise.resolve(textResult("verification_hints must include at least one non-blank hint.")); + } + store.update(params.taskId, { - pending_approval: true, metadata: { - ...archiveCurrentEvidence(task, params.supersede_reason ?? "replaced by newer lgtm submission"), + ...archiveCurrentEvidence(task, params.supersede_reason ?? "replaced by newer proof claim"), ...clearCurrentEvidenceMetadata(), ...clearRobotReviewMetadata(), lgtm_evidence: params.evidence, lgtm_failure_likely: params.failure_likely, lgtm_failure_sneaky: params.failure_sneaky, + lgtm_failure_unknown: params.failure_unknown, lgtm_falsification_test: params.falsification_test, + lgtm_evidence_reasoning: params.evidence_reasoning, lgtm_verification_hints: params.verification_hints, lgtm_remaining_uncertainty: params.remaining_uncertainty, lgtm_submitted_at: new Date().toISOString(), @@ -1060,37 +1191,38 @@ Do NOT summarize or interpret. 
Paste literal command output, exact log lines, ma try { const { review, command } = await runAutomaticRobotReview(refreshedTask, signal, getCurrentModelRef(ctx.model)); store.update(params.taskId, { - pending_approval: shouldOpenHumanSignoffGate(refreshedTask, review.accepted), metadata: { ...appendRobotReviewMetadata(refreshedTask, review), ...clearAutomaticReviewFailureMetadata(), }, }); + if (shouldCompleteAfterAcceptedReview(store.get(params.taskId) ?? refreshedTask, review.accepted)) { + store.complete(params.taskId); + autoClear.trackCompletion(params.taskId, currentTurn); + widget.setActiveTask(params.taskId, false); + } + const storedReview = getLatestRobotReview(store.get(params.taskId) ?? refreshedTask); robotReviewNote = `\n\n### Automatic robot review\n` + - `Reviewer: ${command}\n` + - `Accepted: ${review.accepted ? "yes" : "no"}\n` + - `Evidence complete: ${review.evidence_complete ? "yes" : "no"}\n` + - `Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}\n` + - (review.rubric - ? `Rubric:\n${Object.entries(review.rubric).map(([k, v]) => `- ${v.pass ? "PASS" : "FAIL"} ${k}: ${v.reason}`).join("\n")}\n` - : "") + - `${review.observations.map(o => `- ${o}`).join("\n")}`; - if (review.missing_evidence.length > 0) { - robotReviewNote += `\nMissing evidence:\n${review.missing_evidence.map(item => `- ${item}`).join("\n")}`; - } + `Reviewer command: ${command}\n\n` + + `${storedReview ? 
formatRobotReview(storedReview) : formatRobotReview({ ...review, iteration: 1 })}`; if (!review.accepted) { - robotReviewNote += `\nResult: human sign-off has been held back until the evidence is strengthened and reviewed again.`; + robotReviewNote += `\n\nResult: task remains open until the evidence is strengthened and reviewed again.`; } } catch (err: any) { store.update(params.taskId, { - pending_approval: refreshedTask.pending_approval, metadata: getAutomaticReviewFailureMetadata(err.message, err.rawOutput), }); + const taskAfterFailure = store.get(params.taskId) ?? refreshedTask; + if (!taskAfterFailure.parentId) { + store.complete(params.taskId); + autoClear.trackCompletion(params.taskId, currentTurn); + widget.setActiveTask(params.taskId, false); + } robotReviewNote = `\n\n### Automatic robot review\n` + - `Reviewer failed: ${err.message}\n` + - `Human sign-off is still allowed because reviewer failures are warnings, not evidence rejections.` + + `Reviewer unavailable: ${err.message}\n` + + `Autonomy continued without blocking completion.` + (typeof err.rawOutput === "string" && err.rawOutput.trim() ? `\n\n${formatReviewTextBlock("Reviewer raw output", err.rawOutput.trim())}` : ""); @@ -1099,16 +1231,11 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma const updatedTask = store.get(task.id) ?? task; const result = - `## Task #${task.id}: ${task.subject}\n` + - `Done criterion: ${task.done_criterion}\n\n` + - `${formatEvidencePackage(updatedTask).join("\n\n")}` + + `${renderProofLog(updatedTask)}` + robotReviewNote + `\n\n---\n` + `Gate status: ${getGateStatus(updatedTask)}\n\n` + - `**Self-check (non-blocking):** Look at this as the human will see it. ` + - `Does the evidence directly address the done_criterion "${task.done_criterion}"? ` + - `Would a skeptical reviewer find this convincing, or would they immediately ask ` + - `"but what about..."? 
If evidence feels thin, call lgtm_ask again with stronger evidence.`; + `Self-check: if a skeptical reviewer would still ask "but what about...", call TaskClaimDone again with stronger proof.`; return textResult(result); }, @@ -1117,9 +1244,9 @@ Do NOT summarize or interpret. Paste literal command output, exact log lines, ma pi.registerTool({ name: "lgtm_supersede", label: "lgtm_supersede", - description: `Mark the current LGTM evidence package as superseded without completing the task. + description: `Mark the current proof package as superseded without completing the task. -Use this when a prior claim is stale or wrong and reviewers should stop treating it as the current evidence. The current evidence, robot reviews, and reviewer-failure context are archived into history with your reason. Human /lgtm remains the only completion path.`, +Use this when a prior claim is stale or wrong and reviewers should stop treating it as the current evidence. The current evidence, robot reviews, and reviewer-failure context are archived into history with your reason. Submit a fresh TaskClaimDone claim to complete the task.`, parameters: Type.Object({ taskId: Type.String({ description: "Task ID whose current evidence should be superseded" }), reason: Type.String({ description: "Why the current evidence is stale or replaced" }), @@ -1133,7 +1260,6 @@ Use this when a prior claim is stale or wrong and reviewers should stop treating } store.update(params.taskId, { - pending_approval: false, metadata: { ...archiveCurrentEvidence(task, params.reason), ...clearCurrentEvidenceMetadata(), @@ -1160,23 +1286,24 @@ Use this when a prior claim is stale or wrong and reviewers should stop treating description: `Attach fresh-perspective robot review observations to a task. Use this from a separate subagent or model when possible, ideally from a different model family/class than the implementation agent. -Your role is VALIDATION, not flaw-finding. 
Sanity-check that the evidence addresses the done criterion. Comment and suggest, but the gate is only the rubric items. -Observations only: report what you saw, not advice or editorial. Structured gate fields record whether the evidence is complete and convincing enough to advance. +Your role is VALIDATION, not flaw-finding. Sanity-check that the evidence addresses the done criterion. Observations, concerns, and suggestions are welcome, but the gate is only the rubric items. -This does not complete the task. Human /lgtm remains the only completion path.`, +This records an independent review but does not itself complete the task. Use TaskClaimDone or robot_review_run for the automatic completion gate.`, parameters: Type.Object({ taskId: Type.String({ description: "Task ID to attach robot review to" }), reviewer: Type.String({ description: "Reviewer identity, model family, or class" }), scope: Type.String({ description: "What the reviewer examined" }), observations: Type.Array(Type.String(), { minItems: 1, - description: "Observations only. Concrete things noticed in the artifacts. No recommendations, interpretation, or editorial.", + description: "Concrete things noticed in the artifacts.", }), + concerns: Type.Optional(Type.Array(Type.String(), { description: "Why the current evidence may not yet prove success." })), + suggestions: Type.Optional(Type.Array(Type.String(), { description: "What the agent should do next if the evidence is not yet enough." })), blind_spots: Type.String({ description: "What the reviewer did not inspect or could not verify" }), evidence_complete: Type.Boolean({ description: "Whether the supplied evidence covers the claimed done criterion." }), evidence_convincing: Type.Boolean({ description: "Whether the supplied evidence would convince a skeptical reviewer." }), accepted: Type.Optional(Type.Boolean({ description: "Overall review decision. Defaults to evidence_complete && evidence_convincing." 
})), - missing_evidence: Type.Optional(Type.Array(Type.String(), { description: "Concrete missing checks, artifacts, or observations needed before human sign-off." })), + missing_evidence: Type.Optional(Type.Array(Type.String(), { description: "Concrete missing checks, artifacts, or observations needed before completion." })), }), execute(_toolCallId, params, _signal, _onUpdate, _ctx) { @@ -1186,12 +1313,13 @@ This does not complete the task. Human /lgtm remains the only completion path.`, const accepted = params.accepted ?? (params.evidence_complete && params.evidence_convincing); store.update(params.taskId, { - pending_approval: shouldOpenHumanSignoffGate(task, accepted), metadata: { ...appendRobotReviewMetadata(task, { reviewer: params.reviewer, scope: params.scope, observations: params.observations, + concerns: params.concerns ?? [], + suggestions: params.suggestions ?? [], blind_spots: params.blind_spots, accepted, evidence_complete: params.evidence_complete, @@ -1214,10 +1342,12 @@ This does not complete the task. Human /lgtm remains the only completion path.`, `Evidence complete: ${params.evidence_complete ? "yes" : "no"}\n` + `Evidence convincing: ${params.evidence_convincing ? "yes" : "no"}\n\n` + `### Observations\n${params.observations.map(o => `- ${o}`).join("\n")}\n\n` + + `${(params.concerns?.length ?? 0) > 0 ? `### Concerns\n${(params.concerns ?? []).map(item => `- ${item}`).join("\n")}\n\n` : ""}` + + `${(params.suggestions?.length ?? 0) > 0 ? `### Suggestions\n${(params.suggestions ?? []).map(item => `- ${item}`).join("\n")}\n\n` : ""}` + `${(params.missing_evidence?.length ?? 0) > 0 ? `### Missing evidence\n${(params.missing_evidence ?? []).map(item => `- ${item}`).join("\n")}\n\n` : ""}` + `### Blind spots\n${params.blind_spots}\n\n` + `Gate status: ${getGateStatus(store.get(params.taskId) ?? task)}\n\n` + - `🤖 Robot review stored. Human sign-off still requires \`/lgtm ${task.id}\`.`; + `🤖 Robot review stored. 
Manual reviews are advisory; the automatic proof gate runs through TaskClaimDone or robot_review_run.`; return Promise.resolve(textResult(result)); }, @@ -1228,9 +1358,9 @@ This does not complete the task. Human /lgtm remains the only completion path.`, label: "robot_review_run", description: `Run the automatic robot reviewer against the current task evidence using the current session model. -Runs the same Pi-native reviewer stage used automatically by \`lgtm_ask\`. +Runs the same Pi-native reviewer stage used automatically by \`TaskClaimDone\`. -This appends a new robot-review iteration. If the reviewer marks evidence incomplete or unconvincing, pending human sign-off is cleared until stronger evidence is submitted and reviewed again.`, +This appends a new robot-review iteration. If accepted for a top-level proof task, the task completes. If rejected, the task stays open. Reviewer infrastructure failure is logged but does not block autonomy.`, parameters: Type.Object({ taskId: Type.String({ description: "Task ID to review" }), }), @@ -1239,46 +1369,52 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp const task = store.get(params.taskId); if (!task) return textResult(`Task #${params.taskId} not found`); if (!task.metadata?.lgtm_evidence) { - return textResult(`Task #${params.taskId} has no stored evidence yet. Call lgtm_ask first.`); + return textResult(`Task #${params.taskId} has no stored evidence yet. Call TaskClaimDone first.`); } try { const { review, command } = await runAutomaticRobotReview(task, signal, getCurrentModelRef(_ctx.model)); store.update(params.taskId, { - pending_approval: shouldOpenHumanSignoffGate(task, review.accepted), metadata: { ...appendRobotReviewMetadata(task, review), ...clearAutomaticReviewFailureMetadata(), }, }); + const reviewedTask = store.get(params.taskId) ?? 
task; + if (!reviewedTask.parentId && shouldCompleteAfterAcceptedReview(reviewedTask, review.accepted)) { + store.complete(params.taskId); + autoClear.trackCompletion(params.taskId, currentTurn); + widget.setActiveTask(params.taskId, false); + } widget.update(); + const updatedTask = store.get(params.taskId) ?? task; + const storedReview = getLatestRobotReview(updatedTask); return textResult( - `## Automatic robot review for task #${task.id}: ${task.subject}\n` + - `Reviewer command: ${command}\n` + - `Iteration: ${getRobotReviews(store.get(params.taskId)!).length}\n` + - `Accepted: ${review.accepted ? "yes" : "no"}\n` + - `Evidence complete: ${review.evidence_complete ? "yes" : "no"}\n` + - `Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}\n\n` + - (review.rubric - ? `### Rubric\n${Object.entries(review.rubric).map(([k, v]) => `- ${v.pass ? "PASS" : "FAIL"} ${k}: ${v.reason}`).join("\n")}\n\n` - : "") + - `### Observations\n${review.observations.map(o => `- ${o}`).join("\n")}\n\n` + - `${review.missing_evidence.length > 0 ? `### Missing evidence\n${review.missing_evidence.map(item => `- ${item}`).join("\n")}\n\n` : ""}` + - `### Blind spots\n${review.blind_spots}\n\n` + - `Gate status: ${getGateStatus(store.get(params.taskId) ?? task)}`, + `${renderProofLog(updatedTask)}\n\n` + + `### Automatic robot review\n` + + `Reviewer command: ${command}\n\n` + + `${storedReview ? formatRobotReview(storedReview) : formatRobotReview({ ...review, iteration: 1 })}\n\n` + + `Gate status: ${getGateStatus(updatedTask)}`, ); } catch (err: any) { store.update(params.taskId, { - pending_approval: task.pending_approval, metadata: getAutomaticReviewFailureMetadata(err.message, err.rawOutput), }); + const failedTask = store.get(params.taskId) ?? 
task; + if (!failedTask.parentId && failedTask.status !== "completed") { + store.complete(params.taskId); + autoClear.trackCompletion(params.taskId, currentTurn); + widget.setActiveTask(params.taskId, false); + } widget.update(); + const updatedTask = store.get(params.taskId) ?? task; return textResult( - `## Automatic robot review for task #${task.id}: ${task.subject}\n` + - `Reviewer failed: ${err.message}\n\n` + - `Automatic review failures are warnings, not evidence rejections, so human sign-off is still allowed.\n\n` + - `Gate status: ${getGateStatus(store.get(params.taskId) ?? task)}` + + `${renderProofLog(updatedTask)}\n\n` + + `### Automatic robot review\n` + + `Reviewer unavailable: ${err.message}\n\n` + + `Autonomy continued without blocking completion.\n\n` + + `Gate status: ${getGateStatus(updatedTask)}` + (typeof err.rawOutput === "string" && err.rawOutput.trim() ? `\n\n${formatReviewTextBlock("Reviewer raw output", err.rawOutput.trim())}` : ""), @@ -1355,23 +1491,19 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp const actions: string[] = []; if (task.status === "pending") actions.push("▸ Start (in_progress)"); - if (task.pending_approval && task.status !== "completed") { - actions.push(`(type /lgtm ${taskId} to sign off)`); + if (task.metadata.lgtm_evidence) { + actions.push(`(type /lgtm ${taskId} to view proof evidence)`); } actions.push("✗ Delete"); actions.push("← Back"); - const pendingNote = task.pending_approval && task.status !== "completed" ? `\n👀 Pending /lgtm sign-off` : ""; + const pendingNote = task.metadata.lgtm_evidence && task.status !== "completed" ? `\nProof review: ${getGateStatus(task)}` : ""; const em = task.metadata; let evidenceNote = ""; if (em.lgtm_evidence) { - const parts = [`\n\nEvidence (${em.lgtm_submitted_at ?? 
"?"}):\n${em.lgtm_evidence}`]; - parts.push(`Failure (likely): ${em.lgtm_failure_likely}`); - parts.push(`Failure (sneaky): ${em.lgtm_failure_sneaky}`); - if (em.lgtm_falsification_test) parts.push(`Falsification test: ${em.lgtm_falsification_test}`); - if (em.lgtm_remaining_uncertainty) parts.push(`Uncertainty: ${em.lgtm_remaining_uncertainty}`); - if (em.lgtm_verification_hints?.length) parts.push(`Hints: ${em.lgtm_verification_hints.join(", ")}`); - evidenceNote = parts.join("\n"); + evidenceNote = `\n\n${renderEvidencePacket(task)}`; + const automaticReviewFailure = renderAutomaticReviewFailure(task); + if (automaticReviewFailure) evidenceNote += `\n\n${automaticReviewFailure}`; } let robotNote = ""; const robotReviews = getRobotReviews(task); @@ -1416,173 +1548,55 @@ This appends a new robot-review iteration. If the reviewer marks evidence incomp }); // ────────────────────────────────────────────────── - // /lgtm command — human sign-off only + // /lgtm command — proof log viewer // ────────────────────────────────────────────────── - async function signOff(taskId: string, ctx: ExtensionCommandContext): Promise { + function renderTaskEvidenceForHuman(task: Task): string { + return renderProofLog(task); + } + + async function viewEvidence(taskId: string, ctx: ExtensionCommandContext): Promise { const task = store.get(taskId); if (!task) { ctx.ui.notify(`Task #${taskId} not found`, "error"); return; } - if (task.status === "completed") { ctx.ui.notify(`Task #${taskId} already completed`, "info"); return; } - - // Build human-visible state summary (the human is the final gate; we just surface friction). 
- const m = task.metadata; - const robotReviews = getRobotReviews(task); - const noEvidence = !task.pending_approval && !m.lgtm_evidence; - const robotRejected = robotReviews.length > 0 && !latestRobotReviewPasses(task); - const reviewerFailed = typeof m.robot_review_last_error === "string"; - - // Print evidence to the conversation so the user can review it there - const evidenceParts: string[] = [`Gate status: ${getGateStatus(task)}`]; - if (m.lgtm_evidence) { - evidenceParts.push(...formatEvidencePackage(task)); - } else { - evidenceParts.push(`(No current agent-submitted evidence — agent never called lgtm_ask, or the prior evidence was superseded.)`); - } - const historySummary = formatHistorySummary(task); - if (historySummary) evidenceParts.push(historySummary); - if (robotReviews.length > 0) { - evidenceParts.push( - `Robot reviews (${robotReviews.length} total):\n${robotReviews.map(formatRobotReview).join("\n\n")}`, - ); - if (robotRejected) { - evidenceParts.push("⚠ Latest robot review says the evidence is not yet complete/convincing."); - } - } - if (evidenceParts.length > 0) { - ctx.ui.notify(evidenceParts.join("\n\n"), "info"); - } - - let title = `Sign off #${taskId}: ${task.subject}\nDone: ${task.done_criterion}`; - let signLabel = "✓ LGTM — sign off"; - if (noEvidence) { - title = `⚠ Task #${taskId} has no agent-submitted evidence.\nSign off anyway?\nDone: ${task.done_criterion}`; - signLabel = "✓ Override — sign off without evidence"; - } else if (robotRejected) { - title = `⚠ Task #${taskId} robot review rejected the evidence.\nSign off anyway?\nDone: ${task.done_criterion}`; - signLabel = "✓ Override — sign off despite rejected review"; - } else if (reviewerFailed) { - if (task.pending_approval) { - title = `⚠ Task #${taskId} automatic robot review failed, but human sign-off is still allowed.\nContinue?\nDone: ${task.done_criterion}`; - signLabel = "✓ LGTM — sign off despite reviewer warning"; - } else { - title = `⚠ Task #${taskId} automatic 
robot review failed.\nSign off anyway?\nDone: ${task.done_criterion}`; - signLabel = "✓ Override — sign off despite reviewer failure"; - } - } - const confirm = await ctx.ui.select(title, [signLabel, "✗ Cancel"]); - if (confirm !== signLabel) return; - - try { - store.complete(taskId); - } catch (err: any) { - ctx.ui.notify(err.message, "error"); - return; - } - autoClear.trackCompletion(taskId, currentTurn); - widget.setActiveTask(taskId, false); - widget.update(); - ctx.ui.notify(`Task #${taskId} signed off. ✓`, "info"); + ctx.ui.notify(renderTaskEvidenceForHuman(task), "info"); } pi.registerCommand("lgtm", { description: - "Sign off on tasks. /lgtm [...] signs specific tasks; /lgtm * signs ALL open tasks (READY + ACTIVE + PENDING) after confirmation. Human override allowed even when the agent never called lgtm_ask.", + "View the proof log and judge notes. /lgtm [...] shows specific tasks; /lgtm * shows all open tasks. It does not complete tasks.", handler: async (args: string, ctx: ExtensionCommandContext) => { const trimmed = args.trim(); if (trimmed === "*") { - // Sign off all open (non-completed) tasks at once. Human is the final gate. 
const open = store.list().filter(t => t.status !== "completed"); if (open.length === 0) { - ctx.ui.notify("No open tasks to sign off.", "info"); + ctx.ui.notify("No open tasks to inspect.", "info"); return; } - const groups: Record = { - awaiting_signoff: [], - in_progress: [], - pending: [], - completed: [], - }; - for (const t of open) groups[getDisplayStatus(t)].push(t); - const groupLabel: Record = { - awaiting_signoff: "READY (human sign-off open)", - in_progress: "ACTIVE (no /lgtm evidence)", - pending: "PENDING (not started)", - completed: "DONE", - }; - const lines: string[] = []; - for (const status of ["awaiting_signoff", "in_progress", "pending"] as DisplayStatus[]) { - const inBucket = groups[status]; - if (inBucket.length === 0) continue; - lines.push(` ${groupLabel[status]}:`); - for (const t of inBucket) { - const warn = status === "awaiting_signoff" - ? (typeof t.metadata?.robot_review_last_error === "string" - ? " ⚠ reviewer warning" - : (!latestRobotReviewPasses(t) ? " ⚠ robot rejected" : "")) - : ""; - lines.push(` #${t.id} ${t.subject}${warn}`); - } - } - ctx.ui.notify(`About to sign off ALL ${open.length} open tasks:\n${lines.join("\n")}`, "info"); - const choice = await ctx.ui.select( - `Sign off ALL ${open.length} open tasks?`, - [`✓ Sign off all ${open.length}`, "← Cancel"], - ); - if (!choice || choice === "← Cancel") return; - let signed = 0; - for (const t of open) { - try { - store.complete(t.id); - autoClear.trackCompletion(t.id, currentTurn); - widget.setActiveTask(t.id, false); - signed++; - } catch (err: any) { - ctx.ui.notify(`Failed to sign off #${t.id}: ${err.message}`, "error"); - } - } - widget.update(); - ctx.ui.notify(`Signed off ${signed}/${open.length} tasks. 
✓`, "info"); + ctx.ui.notify(open.map(renderTaskEvidenceForHuman).join("\n\n---\n\n"), "info"); return; } if (!trimmed) { - const open = store.list().filter(t => t.status !== "completed"); + const open = store.list(); if (open.length === 0) { - ctx.ui.notify("No open tasks. Use /lgtm * to confirm-clear everything, or /lgtm .", "info"); + ctx.ui.notify("No tasks to inspect.", "info"); return; } const tag = (t: typeof open[number]) => { - const s = getDisplayStatus(t); - if (s === "awaiting_signoff") return "[READY] "; - if (s === "in_progress") return "[ACTIVE] "; + if (t.status === "completed") return "[DONE] "; + if (t.status === "in_progress") return "[ACTIVE] "; return "[PENDING] "; }; const choice = await ctx.ui.select( - "Sign off on (any open task — human override allowed):", + "View proof log:", open.map(t => `${tag(t)}#${t.id} ${t.subject}`).concat(["← Cancel"]), ); if (!choice || choice === "← Cancel") return; const match = choice.match(/#(\d+)/); - if (match) signOff(match[1], ctx); + if (match) await viewEvidence(match[1], ctx); return; } - // Accept one or more whitespace-separated IDs (also tolerate `#1` and commas). 
const ids = trimmed.split(/[\s,]+/).map(t => t.replace(/^#/, "")).filter(Boolean); - if (ids.length === 1) { - await signOff(ids[0], ctx); - return; - } - const results: string[] = []; - for (const id of ids) { - const before = store.get(id); - await signOff(id, ctx); - const after = store.get(id); - if (after?.status === "completed" && before?.status !== "completed") { - results.push(`✓ #${id}`); - } else { - results.push(`✗ #${id}`); - } - } - ctx.ui.notify(`Batch sign-off: ${results.join(", ")}`, "info"); + for (const id of ids) await viewEvidence(id, ctx); }, }); } diff --git a/src/review-badges.ts b/src/review-badges.ts index 7d313f9..871df8e 100644 --- a/src/review-badges.ts +++ b/src/review-badges.ts @@ -1,7 +1,7 @@ import { getLatestRobotReview, getRobotReviews } from "./robot-review.js"; import type { Task } from "./types.js"; -const STAGES = ["🛠", "🤖", "👀"] as const; +const STAGES = ["🛠", "🤖", "✓"] as const; function hasCurrentEvidence(task: Task): boolean { return typeof task.metadata?.lgtm_evidence === "string" && task.metadata.lgtm_evidence.length > 0; @@ -11,12 +11,12 @@ function hasEvidenceHistory(task: Task): boolean { return Array.isArray(task.metadata?.lgtm_history) && task.metadata.lgtm_history.length > 0; } -/** Pipeline stages: `[🛠·🤖·👀]` fills left-to-right as evidence→review→signoff progresses. */ +/** Pipeline stages: `[🛠·🤖·✓]` fills left-to-right as evidence→review→completed progresses. */ export function getReviewBadges(task: Task): string { const filled = [ !!task.metadata?.lgtm_evidence, getRobotReviews(task).length > 0, - task.pending_approval && task.status !== "completed", + task.status === "completed", ]; const slots = STAGES.map((emoji, i) => filled[i] ? 
emoji : "·"); return `[${slots.join("")}]`; @@ -25,77 +25,74 @@ export function getReviewBadges(task: Task): string { export const REVIEW_BADGES = { evidence: STAGES[0], robot: STAGES[1], - human: STAGES[2], + complete: STAGES[2], pipeline: STAGES, }; -export type DisplayStatus = "awaiting_signoff" | "in_progress" | "pending" | "completed"; +export type DisplayStatus = "in_progress" | "pending" | "completed"; -/** Derived display bucket. `awaiting_signoff` is pending_approval && !completed. */ export function getDisplayStatus(task: Task): DisplayStatus { - if (task.status === "completed") return "completed"; - if (task.pending_approval) return "awaiting_signoff"; return task.status; } -export type CompletionMode = "direct" | "lgtm"; +export type CompletionMode = "direct" | "proof"; export type ReviewState = - | "no_evidence" - | "evidence_submitted" + | "no_claim" + | "claim_submitted" | "reviewer_failed_to_run" | "reviewer_rejected" - | "ready_for_human" + | "reviewer_accepted" | "superseded" - | "human_signed_off"; -export type StateTag = "READY" | "ACTIVE" | "PENDING" | "DONE"; + | "completed"; +export type StateTag = "ACTIVE" | "PENDING" | "DONE"; export function getCompletionMode(task: Task): CompletionMode { - return hasCurrentEvidence(task) || hasEvidenceHistory(task) || getRobotReviews(task).length > 0 || task.pending_approval - ? "lgtm" - : "direct"; + return task.parentId ? 
"direct" : "proof"; } export function getReviewState(task: Task): ReviewState { - if (task.status === "completed") return "human_signed_off"; + if (task.status === "completed") return "completed"; const latest = getLatestRobotReview(task); if (latest && !latest.accepted) return "reviewer_rejected"; - if (task.pending_approval && hasCurrentEvidence(task)) return "ready_for_human"; + if (latest?.accepted) return "reviewer_accepted"; if (typeof task.metadata?.robot_review_last_error === "string") return "reviewer_failed_to_run"; - if (hasCurrentEvidence(task)) return "evidence_submitted"; + if (hasCurrentEvidence(task)) return "claim_submitted"; if (hasEvidenceHistory(task)) return "superseded"; - return "no_evidence"; + return "no_claim"; } export function getGateStatus(task: Task): string { const state = getReviewState(task); - if (state === "human_signed_off") return "human signed off"; - if (state === "no_evidence") return "no lgtm evidence submitted"; - if (state === "ready_for_human") { + if (task.parentId) { + return task.status === "completed" ? 
"completed directly as subtask" : "subtask: direct completion allowed"; + } + if (task.status === "completed") { if (typeof task.metadata?.robot_review_last_error === "string") { - return `warning: automatic robot review failed, human sign-off still allowed via /lgtm ${task.id}: ${task.metadata.robot_review_last_error}`; + return `completed with reviewer unavailable: ${task.metadata.robot_review_last_error}`; } - return `ready for human sign-off via /lgtm ${task.id}`; + if (getLatestRobotReview(task)?.accepted) return "completed after accepted proof review"; + return "completed"; } + if (state === "no_claim") return "top-level task requires TaskClaimDone evidence before completion"; + if (state === "reviewer_accepted") return "review accepted; task should be completed"; if (state === "reviewer_failed_to_run") { - return `blocked: automatic robot review failed: ${task.metadata.robot_review_last_error}`; + return `review unavailable; autonomy continues: ${task.metadata.robot_review_last_error}`; } - if (state === "reviewer_rejected") return "blocked: latest robot review rejected the evidence"; - if (state === "superseded") return "current evidence superseded, waiting for a new lgtm submission"; - return "blocked: evidence submitted, robot review still required"; + if (state === "reviewer_rejected") return "latest proof review rejected the evidence; strengthen the proof and try again"; + if (state === "superseded") return "current evidence superseded, waiting for a new proof claim"; + return "proof claim submitted, automatic review still required"; } -/** Short uppercase tag for the human ("can I /lgtm this?" at a glance). */ +/** Short uppercase tag for compact task-list display. 
*/ export function getStateTag(task: Task): StateTag { const s = getDisplayStatus(task); if (s === "completed") return "DONE"; - if (s === "awaiting_signoff") return "READY"; if (s === "in_progress") return "ACTIVE"; return "PENDING"; } /** Theme colour key for each state tag (only theme colours present in pi-tui are used). */ -export function getStateTagColor(tag: StateTag): "success" | "accent" | "dim" | undefined { - if (tag === "READY") return "success"; +export function getStateTagColor(tag: StateTag): "accent" | "dim" | undefined { if (tag === "ACTIVE") return "accent"; if (tag === "DONE") return "dim"; return undefined; // PENDING — default fg diff --git a/src/robot-review.ts b/src/robot-review.ts index dca3513..9846873 100644 --- a/src/robot-review.ts +++ b/src/robot-review.ts @@ -7,6 +7,8 @@ export interface RobotReviewRecord { reviewer: string; scope: string; observations: string[]; + concerns: string[]; + suggestions: string[]; blind_spots: string; accepted: boolean; evidence_complete: boolean; @@ -46,6 +48,8 @@ function normalizeReview(value: unknown, index: number): RobotReviewRecord | und reviewer, scope, observations, + concerns: toStringArray(review.concerns), + suggestions: toStringArray(review.suggestions), blind_spots: typeof review.blind_spots === "string" ? review.blind_spots : "not recorded", accepted: typeof review.accepted === "boolean" ? review.accepted @@ -69,6 +73,8 @@ function getLegacyRobotReview(task: Task): RobotReviewRecord | undefined { reviewer: typeof task.metadata?.robot_review_reviewer === "string" ? task.metadata.robot_review_reviewer : "unknown", scope: typeof task.metadata?.robot_review_scope === "string" ? task.metadata.robot_review_scope : "unknown", observations, + concerns: toStringArray(task.metadata?.robot_review_concerns), + suggestions: toStringArray(task.metadata?.robot_review_suggestions), blind_spots: typeof task.metadata?.robot_review_blind_spots === "string" ? 
task.metadata.robot_review_blind_spots : "not recorded", accepted: typeof task.metadata?.robot_review_accepted === "boolean" ? task.metadata.robot_review_accepted @@ -101,14 +107,33 @@ export function getLatestRobotReview(task: Task): RobotReviewRecord | undefined return reviews.length > 0 ? reviews[reviews.length - 1] : undefined; } -export function shouldOpenHumanSignoffGate(task: Task, reviewAccepted: boolean): boolean { - return reviewAccepted && typeof task.metadata?.lgtm_evidence === "string" && task.metadata.lgtm_evidence.length > 0; +function hasNonEmptyString(value: unknown): boolean { + return typeof value === "string" && value.trim().length > 0; +} + +export function hasCompleteProofClaim(task: Task): boolean { + const metadata = task.metadata ?? {}; + return [ + metadata.lgtm_evidence, + metadata.lgtm_failure_likely, + metadata.lgtm_failure_sneaky, + metadata.lgtm_failure_unknown, + metadata.lgtm_falsification_test, + metadata.lgtm_evidence_reasoning, + metadata.lgtm_remaining_uncertainty, + ].every(hasNonEmptyString) + && Array.isArray(metadata.lgtm_verification_hints) + && metadata.lgtm_verification_hints.some(hasNonEmptyString); +} + +export function shouldCompleteAfterAcceptedReview(task: Task, reviewAccepted: boolean): boolean { + return reviewAccepted && hasCompleteProofClaim(task); } export function relaxAdvisoryVerificationHints(review: Omit): Omit { const rubric = review.rubric; if (!rubric || review.evidence_complete !== true) return review; - const requiredCoreKeys = ["evidence_covers_done_criterion", "falsification_test_runnable", "failure_modes_addressed"]; + const requiredCoreKeys = ["evidence_covers_done_criterion", "falsification_test_runnable", "failure_modes_addressed", "evidence_distinguishes_success"]; if (!requiredCoreKeys.every((key) => rubric[key]?.pass === true)) return review; const failedKeys = Object.entries(rubric) .filter(([, item]) => item.pass !== true) @@ -122,6 +147,8 @@ export function 
relaxAdvisoryVerificationHints(review: Omit item !== "verification_hints_actionable" && !/verification hint/i.test(item)), }; } @@ -138,6 +165,8 @@ export function appendRobotReviewMetadata(task: Task, review: Omit): Task { + create(subject: string, description: string, done_criterion: string, progress_label?: string, metadata?: Record, parentId?: string): Task { return this.withLock(() => { + if (parentId && !this.tasks.has(parentId)) throw new Error(`Parent task #${parentId} not found`); const now = Date.now(); const task: Task = { id: String(this.nextId++), subject, description, done_criterion, - pending_approval: false, + parentId, status: "pending", progress_label, metadata: metadata ?? {}, @@ -116,9 +117,9 @@ export class TaskStore { subject?: string; description?: string; done_criterion?: string; - pending_approval?: boolean; progress_label?: string; metadata?: Record; + parentId?: string | null; add_blocks?: string[]; add_blocked_by?: string[]; }): { task: Task | undefined; changedFields: string[]; warnings: string[] } { @@ -129,13 +130,10 @@ export class TaskStore { const changedFields: string[] = []; const warnings: string[] = []; - // Self-completion is allowed for trivial tasks that never escalated to lgtm_ask. - // Once a task has stored lgtm evidence, completion must go through /lgtm so the - // human gate + robot review can't be skipped. - if (fields.status === "completed") { - if (task.pending_approval || task.metadata?.lgtm_evidence || (Array.isArray(task.metadata?.lgtm_history) && task.metadata.lgtm_history.length > 0)) { - throw new Error(`Use /lgtm ${id} to complete this task — completion_mode=lgtm because evidence was submitted.`); - } + // Subtasks are normal checklist items. Top-level tasks are goals and need a proof + // claim plus automatic review; TaskClaimDone is the only agent path that completes them. + if (fields.status === "completed" && !task.parentId) { + throw new Error(`Top-level task #${id} requires proof. 
Use TaskClaimDone with evidence and failure modes; subtasks can be completed directly.`); } if (fields.status === "deleted") { @@ -151,7 +149,6 @@ export class TaskStore { if (fields.subject !== undefined) { task.subject = fields.subject; changedFields.push("subject"); } if (fields.description !== undefined) { task.description = fields.description; changedFields.push("description"); } if (fields.done_criterion !== undefined) { task.done_criterion = fields.done_criterion; changedFields.push("done_criterion"); } - if (fields.pending_approval !== undefined) { task.pending_approval = fields.pending_approval; changedFields.push("pending_approval"); } if (fields.progress_label !== undefined) { task.progress_label = fields.progress_label; changedFields.push("progress_label"); } if (fields.metadata !== undefined) { @@ -162,6 +159,10 @@ export class TaskStore { changedFields.push("metadata"); } + if (fields.parentId !== undefined) { + throw new Error("parentId is creation-only. Create subtasks with TaskCreate(parentId); do not downgrade top-level proof goals."); + } + if (fields.add_blocks?.length) { for (const targetId of fields.add_blocks) { if (!task.blocks.includes(targetId)) task.blocks.push(targetId); @@ -191,7 +192,7 @@ export class TaskStore { }); } - /** Complete a task. Called only by /lgtm. The human-confirm gate lives in the command layer. */ + /** Complete a task. Called by accepted proof review or direct subtask completion paths. 
*/ complete(id: string): Task { return this.withLock(() => { const task = this.tasks.get(id); diff --git a/src/types.ts b/src/types.ts index a64c0e1..28326be 100644 --- a/src/types.ts +++ b/src/types.ts @@ -9,7 +9,7 @@ export interface Task { subject: string; description: string; done_criterion: string; // required: what "done" looks like - pending_approval: boolean; // set by lgtm_ask, required before /lgtm + parentId?: string; // no parent = top-level goal, requires proof claim to complete status: TaskStatus; progress_label?: string; metadata: Record; diff --git a/src/ui/task-widget.ts b/src/ui/task-widget.ts index 4ca1b63..29b3855 100644 --- a/src/ui/task-widget.ts +++ b/src/ui/task-widget.ts @@ -125,12 +125,11 @@ export class TaskWidget { if (tasks.length === 0) return []; - const counts = { completed: 0, awaiting_signoff: 0, in_progress: 0, pending: 0 }; + const counts = { completed: 0, in_progress: 0, pending: 0 }; for (const t of tasks) counts[getDisplayStatus(t)]++; const parts: string[] = []; if (counts.completed > 0) parts.push(`${counts.completed} done`); - if (counts.awaiting_signoff > 0) parts.push(`${counts.awaiting_signoff} awaiting sign-off`); if (counts.in_progress > 0) parts.push(`${counts.in_progress} in progress`); if (counts.pending > 0) parts.push(`${counts.pending} open`); const statusText = `${tasks.length} tasks (${parts.join(", ")})`; @@ -144,7 +143,7 @@ export class TaskWidget { const isActive = this.activeTaskIds.has(task.id) && task.status === "in_progress"; const reviewSuffix = ` ${getReviewBadges(task)}`; const tag = getStateTag(task); - // [READY ] [ACTIVE ] [PENDING] [DONE ] — pad so columns line up. + // [ACTIVE ] [PENDING] [DONE ] — pad so columns line up. const tagColour = getStateTagColor(tag); const tagBox = `[${tag.padEnd(7)}]`; const tagPrefix = (tagColour ? 
theme.fg(tagColour, tagBox) : tagBox) + " "; diff --git a/test/auto-clear.test.ts b/test/auto-clear.test.ts index 3716b80..5d165b7 100644 --- a/test/auto-clear.test.ts +++ b/test/auto-clear.test.ts @@ -14,7 +14,6 @@ describe("auto-clear: on_task_complete mode", () => { it("does not clear completed task before REMINDER_INTERVAL turns", () => { store.create("Task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -28,7 +27,6 @@ describe("auto-clear: on_task_complete mode", () => { it("clears completed task after REMINDER_INTERVAL turns", () => { store.create("Task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -42,11 +40,9 @@ describe("auto-clear: on_task_complete mode", () => { store.create("Task A", "Desc", "done"); store.create("Task B", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); - store.update("2", { pending_approval: true }); store.complete("2"); manager.trackCompletion("2", 3); @@ -65,7 +61,6 @@ describe("auto-clear: on_task_complete mode", () => { store.create("In Progress", "Desc", "done"); store.create("Completed", "Desc", "done"); store.update("2", { status: "in_progress" }); - store.update("3", { pending_approval: true }); store.complete("3"); manager.trackCompletion("3", 1); @@ -78,8 +73,7 @@ describe("auto-clear: on_task_complete mode", () => { it("cleans up dependency edges when auto-clearing", () => { store.create("Blocker", "Desc", "done"); store.create("Blocked", "Desc", "done"); - store.update("1", { addBlocks: ["2"] }); - store.update("1", { pending_approval: true }); + store.update("1", { add_blocks: ["2"] }); store.complete("1"); manager.trackCompletion("1", 1); @@ -90,7 +84,6 @@ describe("auto-clear: on_task_complete mode", () => { it("returns true when tasks are cleared", () => { store.create("Task", "Desc", "done"); - 
store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -111,7 +104,6 @@ describe("auto-clear: on_list_complete mode", () => { it("does not clear when some tasks are still pending", () => { store.create("Done", "Desc", "done"); store.create("Pending", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -125,9 +117,7 @@ describe("auto-clear: on_list_complete mode", () => { it("does not clear immediately when all tasks complete", () => { store.create("A", "Desc", "done"); store.create("B", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); - store.update("2", { pending_approval: true }); store.complete("2"); manager.trackCompletion("2", 1); @@ -141,9 +131,7 @@ describe("auto-clear: on_list_complete mode", () => { it("clears all completed tasks after REMINDER_INTERVAL turns when all are completed", () => { store.create("A", "Desc", "done"); store.create("B", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); - store.update("2", { pending_approval: true }); store.complete("2"); manager.trackCompletion("2", 1); @@ -153,7 +141,6 @@ describe("auto-clear: on_list_complete mode", () => { it("resets countdown when a new task is created before REMINDER_INTERVAL", () => { store.create("A", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -170,9 +157,7 @@ describe("auto-clear: on_list_complete mode", () => { it("resets countdown when a task goes back to in_progress", () => { store.create("A", "Desc", "done"); store.create("B", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); - store.update("2", { pending_approval: true }); store.complete("2"); manager.trackCompletion("2", 1); @@ -188,7 +173,6 @@ describe("auto-clear: on_list_complete mode", () => { it("returns true when tasks are 
cleared", () => { store.create("Task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -209,9 +193,7 @@ describe("auto-clear: never mode", () => { it("never clears completed tasks regardless of turns", () => { store.create("A", "Desc", "done"); store.create("B", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); - store.update("2", { pending_approval: true }); store.complete("2"); manager.trackCompletion("1", 1); manager.trackCompletion("2", 1); @@ -224,7 +206,6 @@ describe("auto-clear: never mode", () => { it("trackCompletion is a no-op", () => { store.create("Task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -240,7 +221,6 @@ describe("auto-clear: dynamic mode switching", () => { const manager = new AutoClearManager(() => store, () => mode); store.create("Task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); // Track in never mode — no-op @@ -262,7 +242,6 @@ describe("auto-clear: store getter (session switch)", () => { const manager = new AutoClearManager(() => store, () => "on_task_complete"); store.create("Old task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -284,7 +263,6 @@ describe("auto-clear: store getter (session switch)", () => { // Swap to new store with a completed task store = new TaskStore(); store.create("Task in new store", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -299,7 +277,6 @@ describe("auto-clear: reset (new session)", () => { const manager = new AutoClearManager(() => store, () => "on_task_complete"); store.create("Task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -316,7 +293,6 
@@ describe("auto-clear: reset (new session)", () => { const manager = new AutoClearManager(() => store, () => "on_list_complete"); store.create("Task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); @@ -333,7 +309,6 @@ describe("auto-clear: reset (new session)", () => { const manager = new AutoClearManager(() => store, () => "on_task_complete"); store.create("Task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); manager.trackCompletion("1", 1); manager.reset(); diff --git a/test/review-badges.test.ts b/test/review-badges.test.ts index 4e1989a..d6397d1 100644 --- a/test/review-badges.test.ts +++ b/test/review-badges.test.ts @@ -8,7 +8,6 @@ function makeTask(overrides: Partial = {}): Task { subject: "Test", description: "Desc", done_criterion: "done", - pending_approval: false, status: "pending", progress_label: undefined, metadata: {}, @@ -25,9 +24,8 @@ describe("getReviewBadges", () => { expect(getReviewBadges(makeTask())).toBe("[···]"); }); - it("fills tool/robot/human slots independently", () => { + it("fills evidence/review/completed slots independently", () => { const task = makeTask({ - pending_approval: true, metadata: { lgtm_evidence: "npm test", robot_reviews: [{ @@ -35,6 +33,8 @@ describe("getReviewBadges", () => { reviewer: "opencode", scope: "task evidence", observations: ["Observed one unchecked edge case"], + concerns: ["Evidence does not cover prod traffic."], + suggestions: ["Inspect one prod traffic sample."], blind_spots: "Did not inspect prod traffic", accepted: false, evidence_complete: false, @@ -46,27 +46,26 @@ describe("getReviewBadges", () => { }, }); - expect(getReviewBadges(task)).toBe("[🛠🤖👀]"); + expect(getReviewBadges(task)).toBe("[🛠🤖·]"); }); - it("hides the human badge once the task is completed", () => { + it("fills the completed badge once the task is completed", () => { const task = makeTask({ - pending_approval: true, 
status: "completed", metadata: { lgtm_evidence: "ok" }, }); - expect(getReviewBadges(task)).toBe("[🛠··]"); + expect(getReviewBadges(task)).toBe("[🛠·✓]"); }); }); describe("review state helpers", () => { - it("reports completion mode as direct before any lgtm evidence", () => { - expect(getCompletionMode(makeTask())).toBe("direct"); + it("reports completion mode as proof for top-level tasks", () => { + expect(getCompletionMode(makeTask())).toBe("proof"); }); - it("reports completion mode as lgtm after evidence history exists", () => { - expect(getCompletionMode(makeTask({ metadata: { lgtm_history: [{ iteration: 1 }] } }))).toBe("lgtm"); + it("reports completion mode as direct for subtasks", () => { + expect(getCompletionMode(makeTask({ parentId: "1" }))).toBe("direct"); }); it("reports superseded when only history remains", () => { @@ -75,30 +74,17 @@ describe("review state helpers", () => { }); describe("getGateStatus", () => { - it("reports ready when human sign-off is open", () => { - expect(getGateStatus(makeTask({ - pending_approval: true, - metadata: { lgtm_evidence: "ok" }, - }))).toBe("ready for human sign-off via /lgtm 1"); + it("reports top-level proof requirement before evidence", () => { + expect(getGateStatus(makeTask())).toBe("top-level task requires TaskClaimDone evidence before completion"); }); - it("reports blocking reviewer failure when human sign-off is closed", () => { + it("reports non-blocking reviewer failure", () => { expect(getGateStatus(makeTask({ metadata: { lgtm_evidence: "ok", robot_review_last_error: "Unexpected token 'a'", }, - }))).toContain("blocked: automatic robot review failed"); - }); - - it("reports reviewer failure as a warning when human sign-off stays open", () => { - expect(getGateStatus(makeTask({ - pending_approval: true, - metadata: { - lgtm_evidence: "ok", - robot_review_last_error: "Unexpected token 'a'", - }, - }))).toContain("warning: automatic robot review failed"); + }))).toContain("review unavailable; autonomy 
continues"); }); it("reports rejected robot review when latest review does not accept", () => { @@ -110,6 +96,8 @@ describe("getGateStatus", () => { reviewer: "opencode", scope: "task evidence", observations: ["Observed missing output"], + concerns: ["The current evidence is summary-only."], + suggestions: ["Paste the literal output."], blind_spots: "none", accepted: false, evidence_complete: false, @@ -119,12 +107,11 @@ describe("getGateStatus", () => { mode: "manual", }], }, - }))).toBe("blocked: latest robot review rejected the evidence"); + }))).toBe("latest proof review rejected the evidence; strengthen the proof and try again"); }); it("keeps rejection higher priority than a later reviewer warning", () => { expect(getGateStatus(makeTask({ - pending_approval: true, metadata: { lgtm_evidence: "ok", robot_review_last_error: "timeout", @@ -133,6 +120,8 @@ describe("getGateStatus", () => { reviewer: "opencode", scope: "task evidence", observations: ["Observed missing output"], + concerns: ["The current evidence is summary-only."], + suggestions: ["Paste the literal output."], blind_spots: "none", accepted: false, evidence_complete: false, @@ -142,7 +131,7 @@ describe("getGateStatus", () => { mode: "manual", }], }, - }))).toBe("blocked: latest robot review rejected the evidence"); + }))).toBe("latest proof review rejected the evidence; strengthen the proof and try again"); }); }); @@ -155,13 +144,7 @@ describe("getDisplayStatus", () => { expect(getDisplayStatus(makeTask({ status: "in_progress" }))).toBe("in_progress"); }); - it("returns awaiting_signoff when pending_approval is set", () => { - expect(getDisplayStatus(makeTask({ status: "in_progress", pending_approval: true }))) - .toBe("awaiting_signoff"); - }); - - it("returns completed regardless of pending_approval flag", () => { - expect(getDisplayStatus(makeTask({ status: "completed", pending_approval: true }))) - .toBe("completed"); + it("returns completed for completed tasks", () => { + 
expect(getDisplayStatus(makeTask({ status: "completed" }))).toBe("completed"); }); }); diff --git a/test/robot-review-runner.test.ts b/test/robot-review-runner.test.ts index b8a760c..dcb59fe 100644 --- a/test/robot-review-runner.test.ts +++ b/test/robot-review-runner.test.ts @@ -15,7 +15,7 @@ describe("robot review runner helpers", () => { command: "pi", args: ["--mode", "json"], }); - expect(getPiInvocation(["-p"], { PI_LGTM_PI_BIN: "/custom/pi" } as NodeJS.ProcessEnv)).toEqual({ + expect(getPiInvocation(["-p"], { PI_PROOF_TASKS_PI_BIN: "/custom/pi" } as NodeJS.ProcessEnv)).toEqual({ command: "/custom/pi", args: ["-p"], }); @@ -46,8 +46,8 @@ describe("robot review runner helpers", () => { }); it("uses configured timeout or falls back to default", () => { - expect(getRobotReviewTimeoutMs({ PI_LGTM_ROBOT_REVIEW_TIMEOUT_MS: "2500" } as NodeJS.ProcessEnv)).toBe(2500); - expect(getRobotReviewTimeoutMs({ PI_LGTM_ROBOT_REVIEW_TIMEOUT_MS: "bad" } as NodeJS.ProcessEnv)).toBe(DEFAULT_ROBOT_REVIEW_TIMEOUT_MS); + expect(getRobotReviewTimeoutMs({ PI_PROOF_TASKS_ROBOT_REVIEW_TIMEOUT_MS: "2500" } as NodeJS.ProcessEnv)).toBe(2500); + expect(getRobotReviewTimeoutMs({ PI_PROOF_TASKS_ROBOT_REVIEW_TIMEOUT_MS: "bad" } as NodeJS.ProcessEnv)).toBe(DEFAULT_ROBOT_REVIEW_TIMEOUT_MS); }); it("formats the current model as the reviewer model ref", () => { diff --git a/test/robot-review.test.ts b/test/robot-review.test.ts index 4a89047..741f766 100644 --- a/test/robot-review.test.ts +++ b/test/robot-review.test.ts @@ -2,8 +2,8 @@ import { mkdtempSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { describe, expect, it } from "vitest"; -import { archiveCurrentEvidence, buildArtifactRecords, getCurrentEvidenceIteration, getEvidenceHistory } from "../src/index.js"; -import { appendRobotReviewMetadata, getLatestRobotReview, getRobotReviews, relaxAdvisoryVerificationHints, shouldOpenHumanSignoffGate } from "../src/robot-review.js"; 
+import { archiveCurrentEvidence, buildArtifactRecords, buildRobotReviewPrompt, getCurrentEvidenceIteration, getEvidenceHistory, renderEvidencePacket, renderProofLog } from "../src/index.js"; +import { appendRobotReviewMetadata, getLatestRobotReview, getRobotReviews, hasCompleteProofClaim, relaxAdvisoryVerificationHints, shouldCompleteAfterAcceptedReview } from "../src/robot-review.js"; import type { Task } from "../src/types.js"; function makeTask(overrides: Partial = {}): Task { @@ -12,7 +12,6 @@ function makeTask(overrides: Partial = {}): Task { subject: "Test", description: "Desc", done_criterion: "done", - pending_approval: false, status: "pending", progress_label: undefined, metadata: {}, @@ -25,10 +24,23 @@ function makeTask(overrides: Partial = {}): Task { } describe("robot review helpers", () => { - it("reopens the human gate when accepted review exists for stored evidence", () => { - expect(shouldOpenHumanSignoffGate(makeTask({ metadata: { lgtm_evidence: "literal output" } }), true)).toBe(true); - expect(shouldOpenHumanSignoffGate(makeTask({ metadata: { lgtm_evidence: "literal output" } }), false)).toBe(false); - expect(shouldOpenHumanSignoffGate(makeTask(), true)).toBe(false); + it("completes only after accepted review and complete proof claim", () => { + const task = makeTask({ + metadata: { + lgtm_evidence: "literal output", + lgtm_failure_likely: "wrong command", + lgtm_failure_sneaky: "right output for wrong reason", + lgtm_failure_unknown: "untested platform", + lgtm_falsification_test: "npm test\npass", + lgtm_evidence_reasoning: "the test output rules out the named failures for this scope", + lgtm_verification_hints: ["test/robot-review.test.ts shows the expectation"], + lgtm_remaining_uncertainty: "does not test prod install", + }, + }); + expect(hasCompleteProofClaim(task)).toBe(true); + expect(shouldCompleteAfterAcceptedReview(task, true)).toBe(true); + expect(shouldCompleteAfterAcceptedReview(task, false)).toBe(false); + 
expect(shouldCompleteAfterAcceptedReview(makeTask({ metadata: { lgtm_evidence: "literal output" } }), true)).toBe(false); }); it("reads legacy single-review metadata", () => { @@ -50,7 +62,7 @@ describe("robot review helpers", () => { }); it("builds artifact records with absolute path and sha256", () => { - const dir = mkdtempSync(join(tmpdir(), "pi-lgtm-")); + const dir = mkdtempSync(join(tmpdir(), "pi-proof-tasks-")); const path = join(dir, "evidence.log"); writeFileSync(path, "hello\n"); @@ -66,7 +78,9 @@ describe("robot review helpers", () => { lgtm_evidence: "literal output", lgtm_failure_likely: "wrong seed", lgtm_failure_sneaky: "wrong threshold", + lgtm_failure_unknown: "untested environment", lgtm_falsification_test: "pytest -k check", + lgtm_evidence_reasoning: "pytest output distinguishes the expected passing path from the named failures", lgtm_verification_hints: ["see line 5"], lgtm_remaining_uncertainty: "not load tested", lgtm_submitted_at: "2026-06-07T00:00:00.000Z", @@ -86,6 +100,8 @@ describe("robot review helpers", () => { reviewer: "auto", scope: "task evidence", observations: ["Observed commit, push, and test logs"], + concerns: [], + suggestions: [], blind_spots: "Did not inspect interactive UI", accepted: false, evidence_complete: true, @@ -97,6 +113,7 @@ describe("robot review helpers", () => { evidence_covers_done_criterion: { reason: "verbatim logs match", pass: true }, falsification_test_runnable: { reason: "command and output shown", pass: true }, failure_modes_addressed: { reason: "plausible top risks named", pass: true }, + evidence_distinguishes_success: { reason: "evidence rules out named failures", pass: true }, verification_hints_actionable: { reason: "paths are vague", pass: false }, }, }); @@ -112,6 +129,8 @@ describe("robot review helpers", () => { reviewer: "auto", scope: "task evidence", observations: ["Observed vague summary only"], + concerns: [], + suggestions: [], blind_spots: "Did not rerun tests", accepted: false, 
evidence_complete: true, @@ -123,6 +142,7 @@ describe("robot review helpers", () => { evidence_covers_done_criterion: { reason: "summary only", pass: false }, falsification_test_runnable: { reason: "command and output shown", pass: true }, failure_modes_addressed: { reason: "plausible top risks named", pass: true }, + evidence_distinguishes_success: { reason: "evidence does not rule out summary-only failure", pass: false }, verification_hints_actionable: { reason: "paths are vague", pass: false }, }, }); @@ -131,12 +151,40 @@ describe("robot review helpers", () => { expect(review.evidence_convincing).toBe(false); }); + it("renders one compact evidence packet for both human and robot review", () => { + const task = makeTask({ + metadata: { + lgtm_evidence: "literal output", + lgtm_failure_likely: "wrong seed", + lgtm_failure_sneaky: "wrong threshold", + lgtm_failure_unknown: "does not test UI rendering", + lgtm_falsification_test: "pytest -k check\nPASSED", + lgtm_evidence_reasoning: "The passing pytest transcript distinguishes success from wrong-threshold and wrong-seed failures for this test scope.", + lgtm_verification_hints: ["test/robot-review.test.ts contains the new guard test"], + lgtm_remaining_uncertainty: "not load tested", + lgtm_submitted_at: "2026-06-14T00:00:00.000Z", + lgtm_commands: [{ cmd: "npm test", exit_code: 0, stdout_path: "/tmp/test.log" }], + lgtm_evidence_artifacts: [{ path: "/tmp/test.log", sha256: "abc", bytes: 123 }], + }, + }); + + const packet = renderEvidencePacket(task); + const prompt = buildRobotReviewPrompt(task); + expect(packet).toContain("## Goal"); + expect(packet).toContain("## Planned evidence / UAT"); + expect(packet).toContain("## Attempt 1"); + expect(prompt).toContain(packet); + expect(prompt).toContain("does this evidence prove success for the stated goal"); + }); + it("appends robot reviews as iterations", () => { const task = makeTask(); const metadata1 = appendRobotReviewMetadata(task, { reviewer: "opencode", scope: 
"task evidence", observations: ["Observed missing benchmark output"], + concerns: ["The current evidence does not show the claimed speedup."], + suggestions: ["Add the benchmark transcript for the claimed speedup."], blind_spots: "Did not inspect prod config", accepted: false, evidence_complete: false, @@ -150,6 +198,8 @@ describe("robot review helpers", () => { reviewer: "opencode", scope: "updated task evidence", observations: ["Observed benchmark output and test transcript"], + concerns: [], + suggestions: [], blind_spots: "Did not inspect long-run stability", accepted: true, evidence_complete: true, @@ -167,5 +217,73 @@ describe("robot review helpers", () => { expect(getLatestRobotReview(task2)?.evidence_convincing).toBe(true); expect(task2.metadata.robot_review_iteration_count).toBe(2); }); + + it("renders a simple proof log with judgement and suggestions", () => { + const taskWithEvidence = makeTask({ + metadata: { + lgtm_evidence: "npm test\n125 passed", + lgtm_failure_likely: "old package name still in README", + lgtm_failure_sneaky: "top-level direct completion still slips through", + lgtm_failure_unknown: "fresh judge command fails in a real session", + lgtm_falsification_test: "npm test\n125 passed", + lgtm_evidence_reasoning: "The test transcript and grep distinguish the intended behavior from stale workflow regressions.", + lgtm_verification_hints: ["README.md install block shows pi-proof-tasks"], + lgtm_remaining_uncertainty: "Did not exercise every model provider.", + lgtm_submitted_at: "2026-06-14T00:00:00.000Z", + }, + }); + const task = makeTask({ + metadata: { + ...taskWithEvidence.metadata, + ...appendRobotReviewMetadata(taskWithEvidence, { + reviewer: "auto", + scope: "proof log", + observations: ["Observed the test transcript and renamed package."], + concerns: ["The live Pi session path is still untested."], + suggestions: ["Run one self-hosted TaskClaimDone UAT."], + blind_spots: "Did not inspect external auth state", + accepted: false, + 
evidence_complete: true, + evidence_convincing: false, + missing_evidence: ["self-hosted TaskClaimDone UAT"], + submitted_at: "2026-06-14T00:01:00.000Z", + mode: "auto", + }), + }, + }); + + const log = renderProofLog(task); + expect(log).toContain("# Task #1: Test"); + expect(log).toContain("## Goal"); + expect(log).toContain("## Planned evidence / UAT"); + expect(log).toContain("## Attempt 1"); + expect(log).toContain("### Submitted evidence"); + expect(log).toContain("### Judgement"); + expect(log).toContain("Refused by auto"); + expect(log).toContain("Run one self-hosted TaskClaimDone UAT."); + }); + + it("renders reviewer-unavailable proof logs for fail-open completion notes", () => { + const task = makeTask({ + status: "completed", + metadata: { + lgtm_evidence: "npm test\n125 passed", + lgtm_failure_likely: "old package name still in README", + lgtm_failure_sneaky: "top-level direct completion still slips through", + lgtm_failure_unknown: "fresh judge command fails in a real session", + lgtm_falsification_test: "npm test\n125 passed", + lgtm_evidence_reasoning: "The test transcript and grep distinguish the intended behavior from stale workflow regressions.", + lgtm_verification_hints: ["README.md install block shows pi-proof-tasks"], + lgtm_remaining_uncertainty: "Did not exercise every model provider.", + robot_review_last_error: "judge auth failed", + }, + }); + + const log = renderProofLog(task); + expect(log).toContain("completed with reviewer unavailable"); + expect(log).toContain("### Judgement"); + expect(log).toContain("judge auth failed"); + expect(log).toContain("Autonomy continued without blocking completion."); + }); }); diff --git a/test/task-store.test.ts b/test/task-store.test.ts index 3367750..dfac154 100644 --- a/test/task-store.test.ts +++ b/test/task-store.test.ts @@ -4,11 +4,10 @@ import { join } from "node:path"; import { afterEach, beforeEach, describe, expect, it } from "vitest"; import { TaskStore } from "../src/task-store.js"; -// 
Helper: create a task and set pending_approval so complete() works -function createAndApprove(store: TaskStore, subject: string) { - const task = store.create(subject, "Desc", "done criterion"); - store.update(task.id, { pending_approval: true }); - return task; +// Helper: create a subtask, which can be ticked off directly. +function createSubtask(store: TaskStore, subject: string) { + const parent = store.create(`${subject} parent`, "Desc", "done criterion"); + return store.create(subject, "Desc", "done criterion", undefined, undefined, parent.id); } describe("TaskStore (in-memory)", () => { @@ -28,7 +27,6 @@ describe("TaskStore (in-memory)", () => { expect(t1.subject).toBe("First task"); expect(t1.description).toBe("Description 1"); expect(t1.done_criterion).toBe("criterion 1"); - expect(t1.pending_approval).toBe(false); }); it("creates tasks with optional fields", () => { @@ -110,7 +108,7 @@ describe("TaskStore (in-memory)", () => { expect(task.metadata).toEqual({ a: 1, c: 3, d: 4 }); }); - it("sets up bidirectional blocks via addBlocks", () => { + it("sets up bidirectional blocks via add_blocks", () => { store.create("Blocker", "Desc", "done"); store.create("Blocked", "Desc", "done"); @@ -122,7 +120,7 @@ describe("TaskStore (in-memory)", () => { expect(t2.blockedBy).toContain("1"); }); - it("sets up bidirectional blocks via addBlockedBy", () => { + it("sets up bidirectional blocks via add_blocked_by", () => { store.create("Blocker", "Desc", "done"); store.create("Blocked", "Desc", "done"); @@ -157,7 +155,7 @@ describe("TaskStore (in-memory)", () => { }); it("clears completed tasks", () => { - createAndApprove(store, "Completed"); + store.create("Completed", "Desc", "done"); store.create("Pending", "Desc", "done"); store.complete("1"); @@ -168,36 +166,28 @@ describe("TaskStore (in-memory)", () => { expect(store.list()[0].id).toBe("2"); }); - it("allows TaskUpdate(status=completed) for trivial tasks (no lgtm evidence)", () => { - store.create("Trivial", "Desc", 
"done"); - const { task, changedFields } = store.update("1", { status: "completed" }); + it("allows TaskUpdate(status=completed) for subtasks", () => { + createSubtask(store, "Checklist item"); + const { task, changedFields } = store.update("2", { status: "completed" }); expect(task!.status).toBe("completed"); expect(changedFields).toContain("status"); }); - it("blocks TaskUpdate(status=completed) when pending_approval=true", () => { - store.create("Significant", "Desc", "done"); - store.update("1", { pending_approval: true }); - expect(() => store.update("1", { status: "completed" })).toThrow("/lgtm"); + it("blocks TaskUpdate(status=completed) for top-level tasks", () => { + store.create("Goal", "Desc", "done"); + expect(() => store.update("1", { status: "completed" })).toThrow("Top-level task #1 requires proof"); }); - it("blocks TaskUpdate(status=completed) when lgtm evidence is stored (even if review rejected)", () => { + it("keeps top-level completion gated even after proof evidence exists", () => { store.create("Escalated", "Desc", "done"); - // lgtm_ask path stores evidence; if robot review rejects, pending_approval becomes false. - // The agent must not be able to bypass the gate by self-completing afterwards. 
- store.update("1", { metadata: { lgtm_evidence: "literal output" }, pending_approval: false }); - expect(() => store.update("1", { status: "completed" })).toThrow("/lgtm"); + store.update("1", { metadata: { lgtm_evidence: "literal output" } }); + expect(() => store.update("1", { status: "completed" })).toThrow("TaskClaimDone"); }); - it("blocks TaskUpdate(status=completed) after evidence was superseded into history", () => { - store.create("Superseded", "Desc", "done"); - store.update("1", { - metadata: { - lgtm_history: [{ iteration: 1, supersede_reason: "threshold changed" }], - }, - pending_approval: false, - }); - expect(() => store.update("1", { status: "completed" })).toThrow("completion_mode=lgtm"); + it("rejects changing parentId after creation", () => { + store.create("Parent", "Desc", "done"); + store.create("Child", "Desc", "done"); + expect(() => store.update("2", { parentId: "1" })).toThrow("parentId is creation-only"); }); it("returns not found for update on non-existent task", () => { @@ -206,16 +196,15 @@ describe("TaskStore (in-memory)", () => { expect(changedFields).toEqual([]); }); - it("complete() works without pending_approval (human override path)", () => { - // The /lgtm command layer is the human gate; complete() itself is permissive. 
+ it("complete() is the internal proof-review completion path", () => { store.create("Test", "Desc", "done"); const task = store.complete("1"); expect(task.status).toBe("completed"); }); - it("complete() works when pending_approval=true", () => { - createAndApprove(store, "Test"); - const task = store.complete("1"); + it("complete() also works for subtasks", () => { + createSubtask(store, "Test"); + const task = store.complete("2"); expect(task.status).toBe("completed"); }); @@ -307,8 +296,7 @@ describe("TaskStore (in-memory)", () => { store.create("Blocker", "Desc", "done"); store.create("Blocked", "Desc", "done"); store.update("1", { add_blocks: ["2"] }); - // Set pending_approval on task 1 so complete() works via /lgtm path - store.update("1", { pending_approval: true }); + // complete() is the internal proof-review completion path. store.complete("1"); store.clearCompleted(); @@ -317,7 +305,7 @@ describe("TaskStore (in-memory)", () => { expect(t2.blockedBy).toEqual([]); }); - it("handles multiple addBlocks in one call", () => { + it("handles multiple add_blocks in one call", () => { store.create("Blocker", "Desc", "done"); store.create("B1", "Desc", "done"); store.create("B2", "Desc", "done"); @@ -329,21 +317,21 @@ describe("TaskStore (in-memory)", () => { expect(store.get("3")!.blockedBy).toContain("1"); }); - it("addBlockedBy warns on self-dependency", () => { + it("add_blocked_by warns on self-dependency", () => { store.create("Self", "Desc", "done"); const { warnings } = store.update("1", { add_blocked_by: ["1"] }); expect(store.get("1")!.blockedBy).toContain("1"); expect(warnings).toContain("#1 blocks itself"); }); - it("addBlockedBy warns on dangling ref", () => { + it("add_blocked_by warns on dangling ref", () => { store.create("Real", "Desc", "done"); const { warnings } = store.update("1", { add_blocked_by: ["9999"] }); expect(store.get("1")!.blockedBy).toContain("9999"); expect(warnings).toContain("#9999 does not exist"); }); - it("addBlockedBy warns 
on cycle", () => { + it("add_blocked_by warns on cycle", () => { store.create("A", "Desc", "done"); store.create("B", "Desc", "done"); store.update("1", { add_blocks: ["2"] }); @@ -358,7 +346,7 @@ describe("TaskStore (in-memory)", () => { it("list sorts pending → in_progress → completed with all three present", () => { store.create("Pending task", "Desc", "done"); - createAndApprove(store, "Completed task"); + store.create("Completed task", "Desc", "done"); store.create("In-progress task", "Desc", "done"); store.create("Another pending", "Desc", "done"); @@ -413,7 +401,6 @@ describe("TaskStore (file-backed)", () => { const store1 = new TaskStore(testListId); store1.create("Done task", "Desc", "done"); store1.create("Pending task", "Desc", "done"); - store1.update("1", { pending_approval: true }); store1.complete("1"); const store2 = new TaskStore(testListId); @@ -429,7 +416,6 @@ describe("TaskStore (file-backed)", () => { store1.create("In progress", "Desc", "done"); store1.create("Done", "Desc", "done"); store1.update("2", { status: "in_progress" }); - store1.update("3", { pending_approval: true }); store1.complete("3"); const store2 = new TaskStore(testListId); @@ -473,7 +459,6 @@ describe("TaskStore (absolute path)", () => { const store1 = new TaskStore(absFilePath); store1.create("Pending", "Desc", "done"); store1.create("Completed", "Desc", "done"); - store1.update("2", { pending_approval: true }); store1.complete("2"); const raw = JSON.parse(readFileSync(absFilePath, "utf-8")); diff --git a/test/task-widget.test.ts b/test/task-widget.test.ts index 35d9291..ca0880d 100644 --- a/test/task-widget.test.ts +++ b/test/task-widget.test.ts @@ -92,7 +92,6 @@ describe("TaskWidget", () => { it("renders completed tasks with ✔ icon and strikethrough", () => { store.create("Done task", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); widget.update(); @@ -105,7 +104,6 @@ describe("TaskWidget", () => { store.create("Done task", "Desc", 
"done"); store.update("1", { metadata: { robot_review_observations: ["Observed output drift on seed 2"] }, - pending_approval: true, }); store.complete("1"); widget.update(); @@ -143,7 +141,6 @@ describe("TaskWidget", () => { store.create("Blocker", "Desc", "done"); store.create("Blocked", "Desc", "done"); store.update("2", { add_blocked_by: ["1"] }); - store.update("1", { pending_approval: true }); store.complete("1"); widget.update(); @@ -156,7 +153,6 @@ describe("TaskWidget", () => { store.create("Task A", "Desc", "done"); store.create("Task B", "Desc", "done"); store.create("Task C", "Desc", "done"); - store.update("1", { pending_approval: true }); store.complete("1"); store.update("2", { status: "in_progress" }); widget.update(); @@ -226,7 +222,6 @@ describe("TaskWidget", () => { widget.setActiveTask("1", true); // Complete the task externally - store.update("1", { pending_approval: true }); store.complete("1"); widget.update();