Relax verification hints auto-review gate

This commit is contained in:
wassname
2026-06-14 06:35:00 +08:00
parent 9423d299a3
commit d76ed41655
4 changed files with 96 additions and 23 deletions
+2 -2
View File
@@ -82,7 +82,7 @@ The epistemic gate. Required fields:
| `failure_likely` | Most likely way this is wrong despite evidence |
| `failure_sneaky` | Perverse/silent failure that looks like success superficially |
| `falsification_test` | What you ran and what you got, so both you and the human can sanity-check it, plus why that result could not occur if a failure mode were real. |
| `verification_hints` | Where to look and what to check. Descriptions of evidence locations. |
| `verification_hints` | Where to look and what to check. Writing hints still forces the agent to think, but weak hints are advisory rather than a hard block when the verbatim evidence already proves the claim. Core evidence still has to pass on its own. |
| `remaining_uncertainty` | What is NOT tested, deferred edge cases, known limitations |
| `commands` | Optional structured command records: `{ cmd, exit_code, stdout_path?, stderr_path? }` |
| `evidence_paths` / `falsification_paths` | Optional local artifact paths. Stored as absolute path + sha256 + byte size |
@@ -92,7 +92,7 @@ After calling this, the task shows `👀` and is only completable via `/lgtm <id
The tool result includes a non-blocking self-check prompt asking whether the evidence directly addresses the `done_criterion` and whether a skeptical reviewer would find it convincing.
`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A robot review that rejects the evidence clears `pending_approval` until the evidence is strengthened and reviewed again. A reviewer crash, auth failure, timeout, or malformed output is recorded as a warning and leaves human sign-off open.
`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A robot review that rejects the evidence clears `pending_approval` until the evidence is strengthened and reviewed again. Weak verification hints are advisory if the core verbatim evidence already proves the done criterion. A reviewer crash, auth failure, timeout, or malformed output is recorded as a warning and leaves human sign-off open.
### `lgtm_supersede`
+22 -20
View File
@@ -46,6 +46,7 @@ import {
getRobotReviews,
latestRobotReviewPasses,
type RobotReviewRecord,
relaxAdvisoryVerificationHints,
shouldOpenHumanSignoffGate,
} from "./robot-review.js";
import { TaskStore } from "./task-store.js";
@@ -530,7 +531,7 @@ function buildRobotReviewPrompt(task: any): string {
"",
"## Critical: Evidence must be verbatim",
"",
"Evidence should contain literal output — verbatim command output, exact log lines, markdown block quotes, table rows, URLs — not summaries or interpretations. If the evidence only says 'it worked' or 'returned 5 results' without showing the actual output, flag it under verification_hints_actionable or evidence_covers_done_criterion.",
"Evidence should contain literal output — verbatim command output, exact log lines, markdown block quotes, table rows, URLs — not summaries or interpretations. If the evidence only says 'it worked' or 'returned 5 results' without showing the actual output, flag it under evidence_covers_done_criterion or falsification_test_runnable, not verification_hints_actionable.",
"A human must be able to verify the claim from the evidence alone, without re-running anything. Summaries are not evidence. Literal output is evidence.",
"",
"## Rubric (rate each item pass/fail)",
@@ -538,11 +539,11 @@ function buildRobotReviewPrompt(task: any): string {
"1. evidence_covers_done_criterion: Does the evidence directly address the stated done criterion? Evidence must be verbatim (literal output, not 'it worked').",
"2. falsification_test_runnable: Is the falsification test concrete enough that someone could run it and get a yes/no result? Must include actual output, not just 'ran X and it worked'.",
"3. failure_modes_addressed: Are the failure_likely and failure_sneaky plausibly the top failure modes? (Not: are there OTHER failure modes?)",
"4. verification_hints_actionable: Can a human follow the verification hints to check the claim without re-running experiments? Hints must reference specific content (line ranges, output snippets, URLs), not bare paths or counts.",
"4. verification_hints_actionable: Can a human follow the verification hints to check the claim without re-running experiments? Hints should reference specific content (line ranges, output snippets, URLs), not bare paths or counts.",
"",
"Set evidence_complete=true only if items 1 and 2 pass.",
"Set evidence_convincing=true only if items 1, 2, AND 4 pass.",
"Set accepted=true only if ALL rubric items pass.",
"Set evidence_convincing=true only if items 1 and 2 pass. Item 4 is advisory unless it reveals that items 1 or 2 were overstated.",
"Set accepted=true only if items 1, 2, and 3 pass. Do not reject solely because verification hints are weak if the verbatim evidence already proves the done criterion.",
"",
"Observations: report what you see, not what might be missing. Comments and suggestions go in observations.",
"missing_evidence: ONLY items from the rubric that failed. Do NOT add new dimensions.",
@@ -623,24 +624,25 @@ async function runAutomaticRobotReview(
}
if (Object.keys(r).length > 0) rubric = r;
}
const review = relaxAdvisoryVerificationHints({
reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : commandLabel,
scope: typeof parsed.scope === "string" ? parsed.scope : "task evidence package",
observations,
blind_spots: typeof parsed.blind_spots === "string" ? parsed.blind_spots : "not stated",
accepted: typeof parsed.accepted === "boolean"
? parsed.accepted
: parsed.evidence_complete === true && parsed.evidence_convincing === true,
evidence_complete: parsed.evidence_complete === true,
evidence_convincing: parsed.evidence_convincing === true,
missing_evidence,
submitted_at: new Date().toISOString(),
mode: "auto",
raw_output: result.stdout.trim(),
rubric,
});
return {
command: commandLabel,
review: {
reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : commandLabel,
scope: typeof parsed.scope === "string" ? parsed.scope : "task evidence package",
observations,
blind_spots: typeof parsed.blind_spots === "string" ? parsed.blind_spots : "not stated",
accepted: typeof parsed.accepted === "boolean"
? parsed.accepted
: parsed.evidence_complete === true && parsed.evidence_convincing === true,
evidence_complete: parsed.evidence_complete === true,
evidence_convincing: parsed.evidence_convincing === true,
missing_evidence,
submitted_at: new Date().toISOString(),
mode: "auto",
raw_output: result.stdout.trim(),
rubric,
},
review,
};
}
+21
View File
@@ -105,6 +105,27 @@ export function shouldOpenHumanSignoffGate(task: Task, reviewAccepted: boolean):
return reviewAccepted && typeof task.metadata?.lgtm_evidence === "string" && task.metadata.lgtm_evidence.length > 0;
}
/**
 * Downgrades a failing `verification_hints_actionable` rubric item from a
 * blocking failure to an advisory note.
 *
 * The review is returned unchanged (same object) unless all of these hold:
 * - a rubric is present and `evidence_complete` is `true`;
 * - `evidence_covers_done_criterion`, `falsification_test_runnable` and
 *   `failure_modes_addressed` each have `pass === true`;
 * - `verification_hints_actionable` is the only rubric entry that did not pass.
 *
 * When they hold, a shallow copy is returned with `accepted` and
 * `evidence_convincing` set to `true`, an explanatory observation appended, and
 * hint-related entries removed from `missing_evidence`. The input object and
 * its arrays are not mutated.
 *
 * @param review - Robot review record without its `iteration` field.
 * @returns The original review when no relaxation applies, otherwise the relaxed copy.
 */
export function relaxAdvisoryVerificationHints(review: Omit<RobotReviewRecord, "iteration">): Omit<RobotReviewRecord, "iteration"> {
  const rubric = review.rubric;
  if (!rubric || review.evidence_complete !== true) return review;

  const requiredCoreKeys = ["evidence_covers_done_criterion", "falsification_test_runnable", "failure_modes_addressed"];
  if (!requiredCoreKeys.every((key) => rubric[key]?.pass === true)) return review;

  // Optional chaining mirrors the core-key check above: a null or missing
  // rubric entry counts as a failure instead of throwing a TypeError.
  const failedKeys = Object.entries(rubric)
    .filter(([, item]) => item?.pass !== true)
    .map(([key]) => key);
  if (failedKeys.length !== 1 || failedKeys[0] !== "verification_hints_actionable") return review;

  // Matches "verification hint(s)", "verification_hints_actionable", and
  // "verification_hints_actionable: <reason>", so the accepted review does not
  // keep listing the advisory item as missing evidence.
  const hintEntry = /verification[\s_-]*hints?/i;

  return {
    ...review,
    accepted: true,
    evidence_convincing: true,
    observations: [
      ...review.observations,
      "Verification hints were weak, but treated as advisory because the verbatim evidence already covered the done criterion.",
    ],
    missing_evidence: review.missing_evidence.filter((item) => !hintEntry.test(item)),
  };
}
export function appendRobotReviewMetadata(task: Task, review: Omit<RobotReviewRecord, "iteration">): Record<string, unknown> {
const robot_reviews = [...getRobotReviews(task), { ...review, iteration: 0 }].map((entry, index) => ({
...entry,
+51 -1
View File
@@ -3,7 +3,7 @@ import { tmpdir } from "node:os";
import { join } from "node:path";
import { describe, expect, it } from "vitest";
import { archiveCurrentEvidence, buildArtifactRecords, getCurrentEvidenceIteration, getEvidenceHistory } from "../src/index.js";
import { appendRobotReviewMetadata, getLatestRobotReview, getRobotReviews, shouldOpenHumanSignoffGate } from "../src/robot-review.js";
import { appendRobotReviewMetadata, getLatestRobotReview, getRobotReviews, relaxAdvisoryVerificationHints, shouldOpenHumanSignoffGate } from "../src/robot-review.js";
import type { Task } from "../src/types.js";
function makeTask(overrides: Partial<Task> = {}): Task {
@@ -81,6 +81,56 @@ describe("robot review helpers", () => {
expect(getEvidenceHistory(taskWithHistory)[0].supersede_reason).toBe("threshold changed");
});
// Happy path: every core rubric item passes and only the advisory hints item
// fails, so the helper should upgrade the review to accepted.
it("treats verification hints as advisory when core evidence already passes", () => {
  const onlyHintsFailing = {
    evidence_covers_done_criterion: { reason: "verbatim logs match", pass: true },
    falsification_test_runnable: { reason: "command and output shown", pass: true },
    failure_modes_addressed: { reason: "plausible top risks named", pass: true },
    verification_hints_actionable: { reason: "paths are vague", pass: false },
  };

  const relaxed = relaxAdvisoryVerificationHints({
    reviewer: "auto",
    scope: "task evidence",
    observations: ["Observed commit, push, and test logs"],
    blind_spots: "Did not inspect interactive UI",
    accepted: false,
    evidence_complete: true,
    evidence_convincing: false,
    missing_evidence: ["verification_hints_actionable"],
    submitted_at: "2026-06-13T00:00:00.000Z",
    mode: "auto",
    rubric: onlyHintsFailing,
  });

  // Both verdict flags flip to true.
  expect(relaxed.accepted).toBe(true);
  expect(relaxed.evidence_convincing).toBe(true);
  // The advisory note is appended last and the hints entry is dropped.
  const lastObservation = relaxed.observations.at(-1);
  expect(lastObservation).toContain("treated as advisory");
  expect(relaxed.missing_evidence).toEqual([]);
});
// Negative path: a core rubric item fails alongside the hints item, so the
// helper must leave the rejecting verdict untouched.
it("does not relax verification hints unless the core rubric passes", () => {
  const coreItemFailing = {
    evidence_covers_done_criterion: { reason: "summary only", pass: false },
    falsification_test_runnable: { reason: "command and output shown", pass: true },
    failure_modes_addressed: { reason: "plausible top risks named", pass: true },
    verification_hints_actionable: { reason: "paths are vague", pass: false },
  };

  const result = relaxAdvisoryVerificationHints({
    reviewer: "auto",
    scope: "task evidence",
    observations: ["Observed vague summary only"],
    blind_spots: "Did not rerun tests",
    accepted: false,
    evidence_complete: true,
    evidence_convincing: false,
    missing_evidence: ["verification_hints_actionable"],
    submitted_at: "2026-06-13T00:00:00.000Z",
    mode: "auto",
    rubric: coreItemFailing,
  });

  // Verdict flags stay as the reviewer reported them.
  expect(result.accepted).toBe(false);
  expect(result.evidence_convincing).toBe(false);
});
it("appends robot reviews as iterations", () => {
const task = makeTask();
const metadata1 = appendRobotReviewMetadata(task, {