Relax verification hints auto-review gate

2026-06-27 17:01:35 +08:00 · 2026-06-14 06:35:00 +08:00
parent 9423d299a3
commit d76ed41655
4 changed files with 96 additions and 23 deletions
@@ -82,7 +82,7 @@ The epistemic gate. Required fields:
 | `failure_likely` | Most likely way this is wrong despite evidence |
 | `failure_sneaky` | Perverse/silent failure that looks like success superficially |
 | `falsification_test` | What you ran and what you got, so both you and the human can sanity-check it. Why that result could not occur if a failure mode were real. |
-| `verification_hints` | Where to look and what to check. Descriptions of evidence locations. |
+| `verification_hints` | Where to look and what to check. Writing them still forces the agent to think about how a human would verify the claim, but weak hints are advisory rather than a hard block when the verbatim evidence already proves the claim. Core evidence still has to pass on its own. |
 | `remaining_uncertainty` | What is NOT tested, deferred edge cases, known limitations |
 | `commands` | Optional structured command records: `{ cmd, exit_code, stdout_path?, stderr_path? }` |
 | `evidence_paths` / `falsification_paths` | Optional local artifact paths. Stored as absolute path + sha256 + byte size |
@@ -92,7 +92,7 @@ After calling this, the task shows `👀` and is only completable via `/lgtm <id

 The tool result includes a non-blocking self-check prompt asking whether the evidence directly addresses the `done_criterion` and whether a skeptical reviewer would find it convincing.

-`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A robot review that rejects the evidence clears `pending_approval` until the evidence is strengthened and reviewed again. A reviewer crash, auth failure, timeout, or malformed output is recorded as a warning and leaves human sign-off open.
+`lgtm_ask` always runs the robot-review stage immediately after storing evidence. A robot review that rejects the evidence clears `pending_approval` until the evidence is strengthened and reviewed again. Weak verification hints alone do not cause a rejection when the core verbatim evidence already proves the done criterion; they are recorded as an advisory observation instead. A reviewer crash, auth failure, timeout, or malformed output is recorded as a warning and leaves human sign-off open.

 ### `lgtm_supersede`

@@ -46,6 +46,7 @@ import {
  getRobotReviews,
  latestRobotReviewPasses,
  type RobotReviewRecord,
+  relaxAdvisoryVerificationHints,
  shouldOpenHumanSignoffGate,
 } from "./robot-review.js";
 import { TaskStore } from "./task-store.js";
@@ -530,7 +531,7 @@ function buildRobotReviewPrompt(task: any): string {
    "",
    "## Critical: Evidence must be verbatim",
    "",
-    "Evidence should contain literal output — verbatim command output, exact log lines, markdown block quotes, table rows, URLs — not summaries or interpretations. If the evidence only says 'it worked' or 'returned 5 results' without showing the actual output, flag it under verification_hints_actionable or evidence_covers_done_criterion.",
+    "Evidence should contain literal output — verbatim command output, exact log lines, markdown block quotes, table rows, URLs — not summaries or interpretations. If the evidence only says 'it worked' or 'returned 5 results' without showing the actual output, flag it under evidence_covers_done_criterion or falsification_test_runnable, not verification_hints_actionable.",
    "A human must be able to verify the claim from the evidence alone, without re-running anything. Summaries are not evidence. Literal output is evidence.",
    "",
    "## Rubric (rate each item pass/fail)",
@@ -538,11 +539,11 @@ function buildRobotReviewPrompt(task: any): string {
    "1. evidence_covers_done_criterion: Does the evidence directly address the stated done criterion? Evidence must be verbatim (literal output, not 'it worked').",
    "2. falsification_test_runnable: Is the falsification test concrete enough that someone could run it and get a yes/no result? Must include actual output, not just 'ran X and it worked'.",
    "3. failure_modes_addressed: Are the failure_likely and failure_sneaky plausibly the top failure modes? (Not: are there OTHER failure modes?)",
-    "4. verification_hints_actionable: Can a human follow the verification hints to check the claim without re-running experiments? Hints must reference specific content (line ranges, output snippets, URLs), not bare paths or counts.",
+    "4. verification_hints_actionable: Can a human follow the verification hints to check the claim without re-running experiments? Hints should reference specific content (line ranges, output snippets, URLs), not bare paths or counts.",
    "",
    "Set evidence_complete=true only if items 1 and 2 pass.",
-    "Set evidence_convincing=true only if items 1, 2, AND 4 pass.",
-    "Set accepted=true only if ALL rubric items pass.",
+    "Set evidence_convincing=true only if items 1 and 2 pass. Item 4 is advisory unless it reveals that items 1 or 2 were overstated.",
+    "Set accepted=true only if items 1, 2, and 3 pass. Do not reject solely because verification hints are weak if the verbatim evidence already proves the done criterion.",
    "",
    "Observations: report what you see, not what might be missing. Comments and suggestions go in observations.",
    "missing_evidence: ONLY items from the rubric that failed. Do NOT add new dimensions.",
@@ -623,24 +624,25 @@ async function runAutomaticRobotReview(
    }
    if (Object.keys(r).length > 0) rubric = r;
  }
+  const review = relaxAdvisoryVerificationHints({
+    reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : commandLabel,
+    scope: typeof parsed.scope === "string" ? parsed.scope : "task evidence package",
+    observations,
+    blind_spots: typeof parsed.blind_spots === "string" ? parsed.blind_spots : "not stated",
+    accepted: typeof parsed.accepted === "boolean"
+      ? parsed.accepted
+      : parsed.evidence_complete === true && parsed.evidence_convincing === true,
+    evidence_complete: parsed.evidence_complete === true,
+    evidence_convincing: parsed.evidence_convincing === true,
+    missing_evidence,
+    submitted_at: new Date().toISOString(),
+    mode: "auto",
+    raw_output: result.stdout.trim(),
+    rubric,
+  });
  return {
    command: commandLabel,
-    review: {
-      reviewer: typeof parsed.reviewer === "string" ? parsed.reviewer : commandLabel,
-      scope: typeof parsed.scope === "string" ? parsed.scope : "task evidence package",
-      observations,
-      blind_spots: typeof parsed.blind_spots === "string" ? parsed.blind_spots : "not stated",
-      accepted: typeof parsed.accepted === "boolean"
-        ? parsed.accepted
-        : parsed.evidence_complete === true && parsed.evidence_convincing === true,
-      evidence_complete: parsed.evidence_complete === true,
-      evidence_convincing: parsed.evidence_convincing === true,
-      missing_evidence,
-      submitted_at: new Date().toISOString(),
-      mode: "auto",
-      raw_output: result.stdout.trim(),
-      rubric,
-    },
+    review,
  };
 }

@@ -105,6 +105,27 @@ export function shouldOpenHumanSignoffGate(task: Task, reviewAccepted: boolean):
  return reviewAccepted && typeof task.metadata?.lgtm_evidence === "string" && task.metadata.lgtm_evidence.length > 0;
 }

+export function relaxAdvisoryVerificationHints(review: Omit<RobotReviewRecord, "iteration">): Omit<RobotReviewRecord, "iteration"> {
+  const rubric = review.rubric;
+  if (!rubric || review.evidence_complete !== true) return review;
+  const requiredCoreKeys = ["evidence_covers_done_criterion", "falsification_test_runnable", "failure_modes_addressed"];
+  if (!requiredCoreKeys.every((key) => rubric[key]?.pass === true)) return review;
+  const failedKeys = Object.entries(rubric)
+    .filter(([, item]) => item.pass !== true)
+    .map(([key]) => key);
+  if (failedKeys.length !== 1 || failedKeys[0] !== "verification_hints_actionable") return review;
+  return {
+    ...review,
+    accepted: true,
+    evidence_convincing: true,
+    observations: [
+      ...review.observations,
+      "Verification hints were weak, but treated as advisory because the verbatim evidence already covered the done criterion.",
+    ],
+    missing_evidence: review.missing_evidence.filter((item) => item !== "verification_hints_actionable" && !/verification hint/i.test(item)),
+  };
+}
+
 export function appendRobotReviewMetadata(task: Task, review: Omit<RobotReviewRecord, "iteration">): Record<string, unknown> {
  const robot_reviews = [...getRobotReviews(task), { ...review, iteration: 0 }].map((entry, index) => ({
    ...entry,
@@ -3,7 +3,7 @@ import { tmpdir } from "node:os";
 import { join } from "node:path";
 import { describe, expect, it } from "vitest";
 import { archiveCurrentEvidence, buildArtifactRecords, getCurrentEvidenceIteration, getEvidenceHistory } from "../src/index.js";
-import { appendRobotReviewMetadata, getLatestRobotReview, getRobotReviews, shouldOpenHumanSignoffGate } from "../src/robot-review.js";
+import { appendRobotReviewMetadata, getLatestRobotReview, getRobotReviews, relaxAdvisoryVerificationHints, shouldOpenHumanSignoffGate } from "../src/robot-review.js";
 import type { Task } from "../src/types.js";

 function makeTask(overrides: Partial<Task> = {}): Task {
@@ -81,6 +81,56 @@ describe("robot review helpers", () => {
    expect(getEvidenceHistory(taskWithHistory)[0].supersede_reason).toBe("threshold changed");
  });

+  it("treats verification hints as advisory when core evidence already passes", () => {
+    const review = relaxAdvisoryVerificationHints({
+      reviewer: "auto",
+      scope: "task evidence",
+      observations: ["Observed commit, push, and test logs"],
+      blind_spots: "Did not inspect interactive UI",
+      accepted: false,
+      evidence_complete: true,
+      evidence_convincing: false,
+      missing_evidence: ["verification_hints_actionable"],
+      submitted_at: "2026-06-13T00:00:00.000Z",
+      mode: "auto",
+      rubric: {
+        evidence_covers_done_criterion: { reason: "verbatim logs match", pass: true },
+        falsification_test_runnable: { reason: "command and output shown", pass: true },
+        failure_modes_addressed: { reason: "plausible top risks named", pass: true },
+        verification_hints_actionable: { reason: "paths are vague", pass: false },
+      },
+    });
+
+    expect(review.accepted).toBe(true);
+    expect(review.evidence_convincing).toBe(true);
+    expect(review.observations.at(-1)).toContain("treated as advisory");
+    expect(review.missing_evidence).toEqual([]);
+  });
+
+  it("does not relax verification hints unless the core rubric passes", () => {
+    const review = relaxAdvisoryVerificationHints({
+      reviewer: "auto",
+      scope: "task evidence",
+      observations: ["Observed vague summary only"],
+      blind_spots: "Did not rerun tests",
+      accepted: false,
+      evidence_complete: true,
+      evidence_convincing: false,
+      missing_evidence: ["verification_hints_actionable"],
+      submitted_at: "2026-06-13T00:00:00.000Z",
+      mode: "auto",
+      rubric: {
+        evidence_covers_done_criterion: { reason: "summary only", pass: false },
+        falsification_test_runnable: { reason: "command and output shown", pass: true },
+        failure_modes_addressed: { reason: "plausible top risks named", pass: true },
+        verification_hints_actionable: { reason: "paths are vague", pass: false },
+      },
+    });
+
+    expect(review.accepted).toBe(false);
+    expect(review.evidence_convincing).toBe(false);
+  });
+
  it("appends robot reviews as iterations", () => {
    const task = makeTask();
    const metadata1 = appendRobotReviewMetadata(task, {