// pi-lgtm/test/task-claim-done-flow.test.ts

import { chmodSync, mkdtempSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import proofTasksExtension from "../src/index.js";

/**
 * The slice of a tool registration that this test file relies on: the name it
 * is stored under and the callback used to invoke it.
 */
interface RegisteredTool {
	/** Key under which the harness files the tool. */
	name: string;
	/** Invocation entry point; arguments and result are left untyped here. */
	execute: (...args: any[]) => Promise<any>;
}

/**
 * Loads the extension against a mocked host object and exposes a helper for
 * calling any tool the extension registered.
 *
 * The mock records tools passed to `registerTool` by name; `on`,
 * `registerCommand` and `sendMessage` are plain spies.
 *
 * @returns An object with `execTool(name, params, ctx?)`, which rejects with
 *   an Error when no tool of that name was registered.
 */
function makeHarness() {
	const registry = new Map<string, RegisteredTool>();
	const pi = {
		on: vi.fn(),
		registerTool: vi.fn((tool: RegisteredTool) =>
			registry.set(tool.name, tool),
		),
		registerCommand: vi.fn(),
		sendMessage: vi.fn(),
	};

	// The mock only implements the members used here, hence the cast.
	proofTasksExtension(pi as any);

	const execTool = async (
		name: string,
		params: Record<string, unknown>,
		ctx: Record<string, unknown> = {},
	) => {
		const registered = registry.get(name);
		if (registered === undefined) {
			throw new Error(`Tool ${name} not registered`);
		}
		// Fixed call id; the third and fourth positional slots are left unset.
		return registered.execute("tool-call", params, undefined, undefined, ctx);
	};

	return { execTool };
}

/**
 * Writes `source` to a new `reviewer.js` inside a freshly created temporary
 * directory, prefixed with a node shebang line and marked executable.
 *
 * @param source JavaScript placed after the shebang line.
 * @returns Absolute path of the script that was written.
 */
function writeReviewerScript(source: string): string {
	const scriptPath = join(
		mkdtempSync(join(tmpdir(), "pi-proof-reviewer-")),
		"reviewer.js",
	);
	// Shebang, then the body, then a trailing newline.
	const contents = ["#!/usr/bin/env node", source, ""].join("\n");
	writeFileSync(scriptPath, contents);
	chmodSync(scriptPath, 0o755);
	return scriptPath;
}

// Value of PI_PROOF_TASKS_PI_BIN captured at module load. The tests in this
// file overwrite the variable, so it is put back after every test.
const ORIGINAL_PI_BIN = process.env.PI_PROOF_TASKS_PI_BIN;
afterEach(() => {
	if (ORIGINAL_PI_BIN !== undefined) {
		process.env.PI_PROOF_TASKS_PI_BIN = ORIGINAL_PI_BIN;
		return;
	}
	// It was unset originally: remove it rather than assigning undefined,
	// which process.env would store as the string "undefined".
	delete process.env.PI_PROOF_TASKS_PI_BIN;
});

/**
 * Drives TaskCreate -> TaskClaimDone -> TaskGet through the registered tools.
 *
 * Each case points PI_PROOF_TASKS_PI_BIN at a throwaway script whose stdout
 * plays the role of the reviewer's output (presumably the extension runs that
 * path as the reviewer; confirm against src), then asserts on the text the
 * tools return.
 */
describe("TaskClaimDone end-to-end proof flow", () => {
	it("keeps the task open on rejected review and /lgtm-style TaskGet shows truncated evidence", async () => {
		// Reviewer stub: prints a rejecting verdict between the JSON markers.
		const reviewer = writeReviewerScript(`
const review = {
  reviewer: "fake-judge",
  scope: "task evidence",
  rubric: {
    evidence_covers_done_criterion: { reason: "missing one artifact", pass: false },
    falsification_test_runnable: { reason: "ok", pass: true },
    failure_modes_addressed: { reason: "ok", pass: true },
    evidence_distinguishes_success: { reason: "not enough", pass: false },
    verification_hints_actionable: { reason: "ok", pass: true }
  },
  observations: ["Observed truncated proof packet"],
  concerns: ["Need stronger evidence"],
  suggestions: ["Add one more artifact"],
  blind_spots: "Did not inspect live TUI",
  missing_evidence: ["evidence_covers_done_criterion", "evidence_distinguishes_success"],
  evidence_complete: false,
  evidence_convincing: false,
  accepted: false
};
console.log("ROBOT_REVIEW_JSON_START");
console.log(JSON.stringify(review));
console.log("ROBOT_REVIEW_JSON_END");
`);
		process.env.PI_PROOF_TASKS_PI_BIN = reviewer;

		const harness = makeHarness();
		await harness.execTool("TaskCreate", {
			subject: "Proof task",
			description: "Desc",
			done_criterion: "done",
		});

		// Use a per-run directory for the artifact: a fixed filename directly
		// under the shared tmpdir can collide with concurrent runs or fail on a
		// stale file left behind by another user.
		const artifactDir = mkdtempSync(join(tmpdir(), "pi-proof-evidence-"));
		const artifactPath = join(artifactDir, "proof-long-evidence.log");
		// 35 numbered lines, long enough to trigger the truncation asserted below.
		const longEvidence = Array.from(
			{ length: 35 },
			(_, i) => `line ${i + 1}`,
		).join("\n");
		writeFileSync(artifactPath, longEvidence);

		const claim = await harness.execTool(
			"TaskClaimDone",
			{
				taskId: "1",
				evidence: longEvidence,
				failure_likely: "missing artifact",
				failure_sneaky: "right shape for wrong reason",
				failure_unknown: "untested provider path",
				falsification_test: "npm test\npass",
				evidence_reasoning:
					"The packet distinguishes the named failures for this test scope.",
				verification_hints: ["look at the proof log"],
				remaining_uncertainty: "Did not inspect live TUI",
				evidence_paths: [artifactPath],
			},
			{ model: { provider: "openai", id: "gpt-5" } },
		);

		const claimText = claim.content[0].text;

		const taskGet = await harness.execTool("TaskGet", { taskId: "1" });
		const text = taskGet.content[0].text;

		// The claim response reports one proof iteration and one robot review.
		expect(claimText).toContain("## TaskClaimDone -> Task #1: Proof task");
		expect(claimText).toContain("### Metadata");
		expect(claimText).toContain("- Proof iterations: 1");
		expect(claimText).toContain("- Robot reviews: 1");
		// A rejected review must leave the task pending.
		expect(text).toContain("Status: pending");
		expect(text).toContain(
			"Gate status: latest proof review rejected the evidence; strengthen the proof and try again",
		);
		// Evidence is shown as the first 8 and last 8 lines of 35, so 19 are
		// omitted. "line 9" is the first omitted line, and no retained line
		// contains it as a substring.
		expect(text).toContain("line 1");
		expect(text).toContain("line 8");
		expect(text).toContain("line 35");
		expect(text).not.toContain("line 9");
		expect(text).toContain("[... 19 middle lines omitted ...]");
		expect(text).toContain(
			`[truncated at 16 lines from 35; showing first 8 and last 8; full text: ${artifactPath}]`,
		);
		// The reviewer's verdict sections are surfaced in the TaskGet output.
		expect(text).toContain("### Judgement");
		expect(text).toContain("Refused");
		expect(text).toContain("### Missing evidence");
		expect(text).toContain("### Suggestions");
		expect(text).toContain("Add one more artifact");
	});

	it("completes the task fail-open on parse failure and preserves the failure note", async () => {
		// Reviewer stub: both markers appear on one line with no JSON between
		// them, which this test treats as an unparseable review.
		const reviewer = writeReviewerScript(`
console.log("ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END");
`);
		process.env.PI_PROOF_TASKS_PI_BIN = reviewer;

		const harness = makeHarness();
		await harness.execTool("TaskCreate", {
			subject: "Proof task",
			description: "Desc",
			done_criterion: "done",
		});

		const claim = await harness.execTool(
			"TaskClaimDone",
			{
				taskId: "1",
				evidence: "short evidence",
				failure_likely: "missing artifact",
				failure_sneaky: "right shape for wrong reason",
				failure_unknown: "untested provider path",
				falsification_test: "npm test\npass",
				evidence_reasoning:
					"The packet distinguishes the named failures for this test scope.",
				verification_hints: ["look at the proof log"],
				remaining_uncertainty: "Did not inspect live TUI",
			},
			{ model: { provider: "openai", id: "gpt-5" } },
		);

		const claimText = claim.content[0].text;

		const taskGet = await harness.execTool("TaskGet", { taskId: "1" });
		const text = taskGet.content[0].text;

		expect(claimText).toContain("## TaskClaimDone -> Task #1: Proof task");
		expect(claimText).toContain("### Metadata");
		expect(claimText).toContain(
			"- Gate status: completed with reviewer unavailable",
		);
		// Fail-open: the task completes even though no review could be parsed.
		expect(text).toContain("Status: completed");
		expect(text).toContain("completed with reviewer unavailable");
		expect(text).toContain("Raw output:");
		expect(text).toContain("### Suggestions");
		// Empty review sections must not be rendered as "(none)" placeholders.
		expect(text).not.toContain("### Missing evidence\n- (none)");
		expect(text).not.toContain("### Observations\n- (none)");
		expect(text).not.toContain("### Concerns\n- (none)");
		// The reviewer's raw stdout is kept in the output for diagnosis.
		expect(text).toContain(
			"ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END",
		);
		expect(text).toContain("Autonomy continued without blocking completion.");
	});
});