// pi-lgtm/test/robot-review.test.ts

import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { describe, expect, it } from "vitest";
import { archiveCurrentEvidence, buildArtifactRecords, buildRobotReviewPrompt, getCurrentEvidenceIteration, getEvidenceHistory, renderEvidencePacket, renderProofLog } from "../src/index.js";
import { appendRobotReviewMetadata, getLatestRobotReview, getRobotReviews, hasCompleteProofClaim, relaxAdvisoryVerificationHints, shouldCompleteAfterAcceptedReview } from "../src/robot-review.js";
import type { Task } from "../src/types.js";

/**
 * Creates a `Task` fixture for the tests in this file.
 *
 * A fresh baseline object (including new `metadata`, `blocks` and `blockedBy`
 * containers) is built on every call, so fixtures never share mutable state.
 *
 * @param overrides - Fields that replace the baseline values. The merge is
 *   shallow: a supplied `metadata` object replaces the empty default wholesale.
 * @returns The baseline task with `overrides` applied on top.
 */
function makeTask(overrides: Partial<Task> = {}): Task {
  const baseline: Task = {
    id: "1",
    subject: "Test",
    description: "Desc",
    done_criterion: "done",
    status: "pending",
    progress_label: undefined,
    metadata: {},
    blocks: [],
    blockedBy: [],
    createdAt: 0,
    updatedAt: 0,
  };

  // Later spreads win, so any key present in `overrides` shadows the baseline.
  return { ...baseline, ...overrides };
}

describe("robot review helpers", () => {
  // Completion is expected only when the boolean flag is true AND the proof claim is complete.
  it("completes only after accepted review and complete proof claim", () => {
    // Fixture carrying a set of lgtm_* fields that hasCompleteProofClaim is expected to accept.
    const fullyClaimed = makeTask({
      metadata: {
        lgtm_evidence: "literal output",
        lgtm_failure_likely: "wrong command",
        lgtm_failure_sneaky: "right output for wrong reason",
        lgtm_failure_unknown: "untested platform",
        lgtm_falsification_test: "npm test\npass",
        lgtm_evidence_reasoning: "the test output rules out the named failures for this scope",
        lgtm_verification_hints: ["test/robot-review.test.ts shows the expectation"],
        lgtm_remaining_uncertainty: "does not test prod install",
      },
    });
    // Fixture that carries nothing but the evidence field.
    const evidenceOnly = makeTask({ metadata: { lgtm_evidence: "literal output" } });

    const claimIsComplete = hasCompleteProofClaim(fullyClaimed);
    expect(claimIsComplete).toBe(true);

    // NOTE(review): the second argument is presumably the review's accepted flag — confirm in src/robot-review.
    const outcomeWithTrue = shouldCompleteAfterAcceptedReview(fullyClaimed, true);
    expect(outcomeWithTrue).toBe(true);

    const outcomeWithFalse = shouldCompleteAfterAcceptedReview(fullyClaimed, false);
    expect(outcomeWithFalse).toBe(false);

    // Even with the flag set, a task holding only lgtm_evidence must not complete.
    const outcomeForPartialClaim = shouldCompleteAfterAcceptedReview(evidenceOnly, true);
    expect(outcomeForPartialClaim).toBe(false);
  });

  // Flat robot_review_* keys (the legacy single-review shape) must still be readable as one review.
  it("reads legacy single-review metadata", () => {
    const legacyTask = makeTask({
      metadata: {
        robot_review_reviewer: "opencode",
        robot_review_scope: "task evidence",
        robot_review_observations: ["Observed no command output for the core claim"],
        robot_review_blind_spots: "Did not rerun tests",
        robot_review_submitted_at: "2026-04-17T00:00:00.000Z",
      },
    });

    const parsed = getRobotReviews(legacyTask);
    expect(parsed).toHaveLength(1);

    const soleReview = parsed[0];
    expect(soleReview.reviewer).toBe("opencode");
    expect(soleReview.iteration).toBe(1);
    // The fixture sets no acceptance key, yet the parsed review is expected to report accepted.
    expect(soleReview.accepted).toBe(true);
  });

  // Writes a six-byte file into a scratch directory and checks the artifact record built for it.
  it("builds artifact records with absolute path and sha256", () => {
    // mkdtempSync creates a uniquely named directory under the OS temp root.
    const dir = mkdtempSync(join(tmpdir(), "pi-proof-tasks-"));
    try {
      const path = join(dir, "evidence.log");
      // "hello\n" is exactly six bytes, which the `bytes` assertion below relies on.
      writeFileSync(path, "hello\n");

      const [artifact] = buildArtifactRecords([path]);
      expect(artifact.path).toBe(path);
      expect(artifact.bytes).toBe(6);
      expect(artifact.sha256).toHaveLength(64);
    } finally {
      // Remove the scratch directory even when an assertion throws, so repeated
      // test runs do not accumulate leftover temp directories.
      rmSync(dir, { recursive: true, force: true });
    }
  });

  // Archiving the current evidence should yield a one-entry history that records the supplied reason.
  it("archives current evidence with reason", () => {
    const original = makeTask({
      metadata: {
        lgtm_evidence: "literal output",
        lgtm_failure_likely: "wrong seed",
        lgtm_failure_sneaky: "wrong threshold",
        lgtm_failure_unknown: "untested environment",
        lgtm_falsification_test: "pytest -k check",
        lgtm_evidence_reasoning: "pytest output distinguishes the expected passing path from the named failures",
        lgtm_verification_hints: ["see line 5"],
        lgtm_remaining_uncertainty: "not load tested",
        lgtm_submitted_at: "2026-06-07T00:00:00.000Z",
        lgtm_commands: [{ cmd: "pytest", exit_code: 0 }],
      },
    });

    // The archive call returns metadata, which is wrapped in a second fixture for the history checks.
    const archivedMetadata = archiveCurrentEvidence(original, "threshold changed");
    const superseded = makeTask({ metadata: archivedMetadata });

    // Checked against the pre-archive task object: its current evidence is iteration 1.
    const currentIteration = getCurrentEvidenceIteration(original)?.iteration;
    expect(currentIteration).toBe(1);

    expect(getEvidenceHistory(superseded)).toHaveLength(1);
    const firstEntry = getEvidenceHistory(superseded)[0];
    expect(firstEntry.supersede_reason).toBe("threshold changed");
  });

  // Only the verification_hints_actionable rubric entry fails here; the other four pass,
  // so the rejected review is expected to come back accepted.
  it("treats verification hints as advisory when core evidence already passes", () => {
    const relaxed = relaxAdvisoryVerificationHints({
      reviewer: "auto",
      scope: "task evidence",
      observations: ["Observed commit, push, and test logs"],
      concerns: [],
      suggestions: [],
      blind_spots: "Did not inspect interactive UI",
      accepted: false,
      evidence_complete: true,
      evidence_convincing: false,
      missing_evidence: ["verification_hints_actionable"],
      submitted_at: "2026-06-13T00:00:00.000Z",
      mode: "auto",
      rubric: {
        evidence_covers_done_criterion: { reason: "verbatim logs match", pass: true },
        falsification_test_runnable: { reason: "command and output shown", pass: true },
        failure_modes_addressed: { reason: "plausible top risks named", pass: true },
        evidence_distinguishes_success: { reason: "evidence rules out named failures", pass: true },
        verification_hints_actionable: { reason: "paths are vague", pass: false },
      },
    });

    // Both verdict flags flip from false to true.
    expect(relaxed.accepted).toBe(true);
    expect(relaxed.evidence_convincing).toBe(true);

    // The final observation is expected to mention the advisory treatment.
    const lastObservation = relaxed.observations.at(-1);
    expect(lastObservation).toContain("treated as advisory");

    // The hint entry is expected to be gone from the missing-evidence list.
    expect(relaxed.missing_evidence).toEqual([]);
  });

  // Two non-hint rubric entries fail in this fixture, so the rejection must be left untouched.
  it("does not relax verification hints unless the core rubric passes", () => {
    const untouched = relaxAdvisoryVerificationHints({
      reviewer: "auto",
      scope: "task evidence",
      observations: ["Observed vague summary only"],
      concerns: [],
      suggestions: [],
      blind_spots: "Did not rerun tests",
      accepted: false,
      evidence_complete: true,
      evidence_convincing: false,
      missing_evidence: ["verification_hints_actionable"],
      submitted_at: "2026-06-13T00:00:00.000Z",
      mode: "auto",
      rubric: {
        evidence_covers_done_criterion: { reason: "summary only", pass: false },
        falsification_test_runnable: { reason: "command and output shown", pass: true },
        failure_modes_addressed: { reason: "plausible top risks named", pass: true },
        evidence_distinguishes_success: { reason: "evidence does not rule out summary-only failure", pass: false },
        verification_hints_actionable: { reason: "paths are vague", pass: false },
      },
    });

    const stillAccepted = untouched.accepted;
    const stillConvincing = untouched.evidence_convincing;
    expect(stillAccepted).toBe(false);
    expect(stillConvincing).toBe(false);
  });

  // The rendered packet must contain the expected section headings, and the robot prompt must embed it verbatim.
  it("renders one compact evidence packet for both human and robot review", () => {
    const submitted = makeTask({
      metadata: {
        lgtm_evidence: "literal output",
        lgtm_failure_likely: "wrong seed",
        lgtm_failure_sneaky: "wrong threshold",
        lgtm_failure_unknown: "does not test UI rendering",
        lgtm_falsification_test: "pytest -k check\nPASSED",
        lgtm_evidence_reasoning: "The passing pytest transcript distinguishes success from wrong-threshold and wrong-seed failures for this test scope.",
        lgtm_verification_hints: ["test/robot-review.test.ts contains the new guard test"],
        lgtm_remaining_uncertainty: "not load tested",
        lgtm_submitted_at: "2026-06-14T00:00:00.000Z",
        lgtm_commands: [{ cmd: "npm test", exit_code: 0, stdout_path: "/tmp/test.log" }],
        lgtm_evidence_artifacts: [{ path: "/tmp/test.log", sha256: "abc", bytes: 123 }],
      },
    });

    const evidencePacket = renderEvidencePacket(submitted);
    const reviewPrompt = buildRobotReviewPrompt(submitted);

    // Section headings the packet is required to include, checked in this order.
    for (const heading of ["## Goal", "## Planned evidence / UAT", "## Attempt 1"]) {
      expect(evidencePacket).toContain(heading);
    }

    // The prompt wraps the same packet text and adds the judging question.
    expect(reviewPrompt).toContain(evidencePacket);
    expect(reviewPrompt).toContain("does this evidence prove success for the stated goal");
  });

  // Two successive appends should produce reviews numbered 1 and 2, with the second one reported as latest.
  it("appends robot reviews as iterations", () => {
    const blank = makeTask();

    // First review: rejected, with missing benchmark evidence.
    const metadataAfterFirst = appendRobotReviewMetadata(blank, {
      reviewer: "opencode",
      scope: "task evidence",
      observations: ["Observed missing benchmark output"],
      concerns: ["The current evidence does not show the claimed speedup."],
      suggestions: ["Add the benchmark transcript for the claimed speedup."],
      blind_spots: "Did not inspect prod config",
      accepted: false,
      evidence_complete: false,
      evidence_convincing: false,
      missing_evidence: ["Benchmark output for the claimed speedup"],
      submitted_at: "2026-04-17T00:00:00.000Z",
      mode: "auto",
    });
    const onceReviewed = makeTask({ metadata: metadataAfterFirst });

    // Second review: accepted, appended on top of the metadata returned by the first append.
    const metadataAfterSecond = appendRobotReviewMetadata(onceReviewed, {
      reviewer: "opencode",
      scope: "updated task evidence",
      observations: ["Observed benchmark output and test transcript"],
      concerns: [],
      suggestions: [],
      blind_spots: "Did not inspect long-run stability",
      accepted: true,
      evidence_complete: true,
      evidence_convincing: true,
      missing_evidence: [],
      submitted_at: "2026-04-17T01:00:00.000Z",
      mode: "auto",
    });
    const twiceReviewed = makeTask({ metadata: metadataAfterSecond });

    const allReviews = getRobotReviews(twiceReviewed);
    expect(allReviews).toHaveLength(2);
    expect(allReviews[0].iteration).toBe(1);
    expect(allReviews[1].iteration).toBe(2);

    const latest = getLatestRobotReview(twiceReviewed);
    expect(latest?.evidence_convincing).toBe(true);

    // The running counter stored in metadata is expected to match the number of appends.
    const storedCount = twiceReviewed.metadata.robot_review_iteration_count;
    expect(storedCount).toBe(2);
  });

  // A task with submitted evidence plus one rejected review should render a log showing both.
  it("renders a simple proof log with judgement and suggestions", () => {
    const evidenceOnlyTask = makeTask({
      metadata: {
        lgtm_evidence: "npm test\n125 passed",
        lgtm_failure_likely: "old package name still in README",
        lgtm_failure_sneaky: "top-level direct completion still slips through",
        lgtm_failure_unknown: "fresh judge command fails in a real session",
        lgtm_falsification_test: "npm test\n125 passed",
        lgtm_evidence_reasoning: "The test transcript and grep distinguish the intended behavior from stale workflow regressions.",
        lgtm_verification_hints: ["README.md install block shows pi-proof-tasks"],
        lgtm_remaining_uncertainty: "Did not exercise every model provider.",
        lgtm_submitted_at: "2026-06-14T00:00:00.000Z",
      },
    });

    // Evidence metadata first, then whatever the review append returns layered on top of it.
    const reviewedTask = makeTask({
      metadata: {
        ...evidenceOnlyTask.metadata,
        ...appendRobotReviewMetadata(evidenceOnlyTask, {
          reviewer: "auto",
          scope: "proof log",
          observations: ["Observed the test transcript and renamed package."],
          concerns: ["The live Pi session path is still untested."],
          suggestions: ["Run one self-hosted TaskClaimDone UAT."],
          blind_spots: "Did not inspect external auth state",
          accepted: false,
          evidence_complete: true,
          evidence_convincing: false,
          missing_evidence: ["self-hosted TaskClaimDone UAT"],
          submitted_at: "2026-06-14T00:01:00.000Z",
          mode: "auto",
        }),
      },
    });

    const proofLog = renderProofLog(reviewedTask);

    // Title, section headings, refusal line and the reviewer's suggestion, checked in this order.
    const expectedFragments = [
      "# Task #1: Test",
      "## Goal",
      "## Planned evidence / UAT",
      "## Attempt 1",
      "### Submitted evidence",
      "### Judgement",
      "Refused by auto",
      "Run one self-hosted TaskClaimDone UAT.",
    ];
    for (const fragment of expectedFragments) {
      expect(proofLog).toContain(fragment);
    }
  });

  // A completed task whose metadata holds robot_review_last_error (and no review) should render
  // a judgement section that reports the reviewer as unavailable.
  it("renders reviewer-unavailable proof logs for fail-open completion notes", () => {
    const completedWithoutReview = makeTask({
      status: "completed",
      metadata: {
        lgtm_evidence: "npm test\n125 passed",
        lgtm_failure_likely: "old package name still in README",
        lgtm_failure_sneaky: "top-level direct completion still slips through",
        lgtm_failure_unknown: "fresh judge command fails in a real session",
        lgtm_falsification_test: "npm test\n125 passed",
        lgtm_evidence_reasoning: "The test transcript and grep distinguish the intended behavior from stale workflow regressions.",
        lgtm_verification_hints: ["README.md install block shows pi-proof-tasks"],
        lgtm_remaining_uncertainty: "Did not exercise every model provider.",
        robot_review_last_error: "judge auth failed",
      },
    });

    const proofLog = renderProofLog(completedWithoutReview);

    // Status note, judgement heading, the recorded error text and the fail-open note, in this order.
    const expectedFragments = [
      "completed with reviewer unavailable",
      "### Judgement",
      "judge auth failed",
      "Autonomy continued without blocking completion.",
    ];
    for (const fragment of expectedFragments) {
      expect(proofLog).toContain(fragment);
    }
  });
});