mirror of
https://github.com/wassname/pi-lgtm.git
synced 2026-06-27 16:46:17 +08:00
191 lines
6.4 KiB
TypeScript
import { chmodSync, mkdtempSync, writeFileSync } from "node:fs";
|
|
import { tmpdir } from "node:os";
|
|
import { join } from "node:path";
|
|
import { afterEach, describe, expect, it, vi } from "vitest";
|
|
import proofTasksExtension from "../src/index.js";
|
|
|
|
/**
 * Minimal view of a tool as this test harness uses it: only the fields the
 * harness reads after the extension hands the tool to `registerTool`.
 */
type RegisteredTool = {
  /** Key under which the harness stores and later looks up the tool. */
  name: string;
  /**
   * Tool entry point. Left loosely typed on purpose: the tests read fields
   * such as `content[0].text` straight off the resolved value.
   */
  execute: (...args: any[]) => Promise<any>;
};
|
|
|
|
/**
 * Builds a stand-in `pi` host, loads the extension into it, and exposes a
 * helper for invoking whatever tools the extension registered.
 *
 * The stand-in records registered tools in a local map keyed by tool name;
 * `on`, `registerCommand` and `sendMessage` are inert mocks.
 *
 * @returns An object whose `execTool(name, params, ctx)` runs a registered
 *   tool and resolves with that tool's result. It rejects with an `Error`
 *   when no tool of that name was registered.
 */
function makeHarness() {
  const registry = new Map<string, RegisteredTool>();

  // Capture every tool the extension registers so it can be called by name.
  const registerTool = vi.fn((tool: RegisteredTool) =>
    registry.set(tool.name, tool),
  );

  const pi = {
    on: vi.fn(),
    registerTool,
    registerCommand: vi.fn(),
    sendMessage: vi.fn(),
  };

  // The mock only covers the host methods listed above, hence the cast.
  proofTasksExtension(pi as any);

  const execTool = async (
    name: string,
    params: Record<string, unknown>,
    ctx: Record<string, unknown> = {},
  ) => {
    const registered = registry.get(name);
    if (!registered) {
      throw new Error(`Tool ${name} not registered`);
    }
    // Positional call: a fixed first argument (presumably the tool-call id),
    // the params, two slots left undefined, then the context object.
    return registered.execute("tool-call", params, undefined, undefined, ctx);
  };

  return { execTool };
}
|
|
|
|
/**
 * Writes `source` to an executable Node script named `reviewer.js` inside a
 * freshly created temporary directory.
 *
 * The file starts with a `#!/usr/bin/env node` shebang line, followed by
 * `source` and a trailing newline, and its mode is set to 0o755.
 *
 * @param source JavaScript placed after the shebang line.
 * @returns Path of the script that was written.
 */
function writeReviewerScript(source: string): string {
  const scriptDir = mkdtempSync(join(tmpdir(), "pi-proof-reviewer-"));
  const scriptPath = join(scriptDir, "reviewer.js");
  const contents = `#!/usr/bin/env node\n${source}\n`;

  writeFileSync(scriptPath, contents);
  // Explicit chmod so the executable bits do not depend on the process umask.
  chmodSync(scriptPath, 0o755);

  return scriptPath;
}
|
|
|
|
// Value of PI_PROOF_TASKS_PI_BIN when this module was loaded (may be unset).
const ORIGINAL_PI_BIN = process.env.PI_PROOF_TASKS_PI_BIN;

// Each test overrides PI_PROOF_TASKS_PI_BIN; put the environment back the way
// it was afterwards, including the "variable was never set" case.
afterEach(() => {
  if (ORIGINAL_PI_BIN !== undefined) {
    process.env.PI_PROOF_TASKS_PI_BIN = ORIGINAL_PI_BIN;
    return;
  }
  delete process.env.PI_PROOF_TASKS_PI_BIN;
});
|
|
|
|
/**
 * End-to-end scenarios for the TaskClaimDone tool, driven through the harness
 * with a fake reviewer script. Each test points PI_PROOF_TASKS_PI_BIN at a
 * script written by `writeReviewerScript`, creates one task, claims it done,
 * and then inspects both the claim output and a follow-up TaskGet.
 */
describe("TaskClaimDone end-to-end proof flow", () => {
  /** Context object passed to every TaskClaimDone call; fresh per call. */
  const modelCtx = () => ({ model: { provider: "openai", id: "gpt-5" } });

  /**
   * Proof-packet fields that are identical in both scenarios. Returned as a
   * fresh object (and fresh array) each time so tests never share state.
   */
  const sharedProofFields = () => ({
    failure_likely: "missing artifact",
    failure_sneaky: "right shape for wrong reason",
    failure_unknown: "untested provider path",
    falsification_test: "npm test\npass",
    evidence_reasoning:
      "The packet distinguishes the named failures for this test scope.",
    verification_hints: ["look at the proof log"],
    remaining_uncertainty: "Did not inspect live TUI",
  });

  /** Creates the task that the scenarios subsequently address as taskId "1". */
  const createProofTask = (harness: ReturnType<typeof makeHarness>) =>
    harness.execTool("TaskCreate", {
      subject: "Proof task",
      description: "Desc",
      done_criterion: "done",
    });

  it("keeps the task open on rejected review and /lgtm-style TaskGet shows truncated evidence", async () => {
    // Fake reviewer: prints a well-formed review, wrapped in the START/END
    // markers, that rejects the evidence (accepted: false).
    const reviewer = writeReviewerScript(`
      const review = {
        reviewer: "fake-judge",
        scope: "task evidence",
        rubric: {
          evidence_covers_done_criterion: { reason: "missing one artifact", pass: false },
          falsification_test_runnable: { reason: "ok", pass: true },
          failure_modes_addressed: { reason: "ok", pass: true },
          evidence_distinguishes_success: { reason: "not enough", pass: false },
          verification_hints_actionable: { reason: "ok", pass: true }
        },
        observations: ["Observed truncated proof packet"],
        concerns: ["Need stronger evidence"],
        suggestions: ["Add one more artifact"],
        blind_spots: "Did not inspect live TUI",
        missing_evidence: ["evidence_covers_done_criterion", "evidence_distinguishes_success"],
        evidence_complete: false,
        evidence_convincing: false,
        accepted: false
      };
      console.log("ROBOT_REVIEW_JSON_START");
      console.log(JSON.stringify(review));
      console.log("ROBOT_REVIEW_JSON_END");
    `);
    process.env.PI_PROOF_TASKS_PI_BIN = reviewer;

    const harness = makeHarness();
    await createProofTask(harness);

    // Write the artifact into its own temp directory rather than a fixed name
    // directly under tmpdir(): a shared path can be clobbered by another run
    // of this suite on the same machine.
    const artifactDir = mkdtempSync(join(tmpdir(), "pi-proof-evidence-"));
    const artifactPath = join(artifactDir, "proof-long-evidence.log");
    // 35 lines: long enough that the rendered evidence is expected to be cut
    // down to the first 8 and last 8 lines (see assertions below).
    const longEvidence = Array.from(
      { length: 35 },
      (_, i) => `line ${i + 1}`,
    ).join("\n");
    writeFileSync(artifactPath, longEvidence);

    const claim = await harness.execTool(
      "TaskClaimDone",
      {
        taskId: "1",
        evidence: longEvidence,
        ...sharedProofFields(),
        evidence_paths: [artifactPath],
      },
      modelCtx(),
    );

    const claimText = claim.content[0].text;

    const taskGet = await harness.execTool("TaskGet", { taskId: "1" });
    const text = taskGet.content[0].text;

    // Claim output: header plus metadata counters.
    expect(claimText).toContain("## TaskClaimDone -> Task #1: Proof task");
    expect(claimText).toContain("### Metadata");
    expect(claimText).toContain("- Proof iterations: 1");
    expect(claimText).toContain("- Robot reviews: 1");

    // The rejected review must leave the task open.
    expect(text).toContain("Status: pending");
    expect(text).toContain(
      "Gate status: latest proof review rejected the evidence; strengthen the proof and try again",
    );

    // Evidence is truncated: head and tail kept, middle replaced by a marker
    // that points at the artifact holding the full text.
    expect(text).toContain("line 1");
    expect(text).toContain("line 8");
    expect(text).toContain("line 35");
    expect(text).not.toContain("line 9");
    expect(text).toContain("[... 19 middle lines omitted ...]");
    expect(text).toContain(
      `[truncated at 16 lines from 35; showing first 8 and last 8; full text: ${artifactPath}]`,
    );

    // Reviewer verdict sections are rendered.
    expect(text).toContain("### Judgement");
    expect(text).toContain("Refused");
    expect(text).toContain("### Missing evidence");
    expect(text).toContain("### Suggestions");
    expect(text).toContain("Add one more artifact");
  });

  it("completes the task fail-open on parse failure and preserves the failure note", async () => {
    // Fake reviewer: prints the markers around text that is not JSON, so the
    // review cannot be parsed.
    const reviewer = writeReviewerScript(`
      console.log("ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END");
    `);
    process.env.PI_PROOF_TASKS_PI_BIN = reviewer;

    const harness = makeHarness();
    await createProofTask(harness);

    const claim = await harness.execTool(
      "TaskClaimDone",
      {
        taskId: "1",
        evidence: "short evidence",
        ...sharedProofFields(),
      },
      modelCtx(),
    );

    const claimText = claim.content[0].text;

    const taskGet = await harness.execTool("TaskGet", { taskId: "1" });
    const text = taskGet.content[0].text;

    expect(claimText).toContain("## TaskClaimDone -> Task #1: Proof task");
    expect(claimText).toContain("### Metadata");
    expect(claimText).toContain(
      "- Gate status: completed with reviewer unavailable",
    );

    // Fail-open: the task is completed even though no review was parsed.
    expect(text).toContain("Status: completed");
    expect(text).toContain("completed with reviewer unavailable");
    expect(text).toContain("Raw output:");
    expect(text).toContain("### Suggestions");

    // Empty review sections must not be rendered as "(none)" placeholders.
    expect(text).not.toContain("### Missing evidence\n- (none)");
    expect(text).not.toContain("### Observations\n- (none)");
    expect(text).not.toContain("### Concerns\n- (none)");

    // The unparseable reviewer output and the failure note are preserved.
    expect(text).toContain(
      "ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END",
    );
    expect(text).toContain("Autonomy continued without blocking completion.");
  });
});
|