mirror of
https://github.com/wassname/pi-lgtm.git
synced 2026-06-27 14:16:01 +08:00
short judge output: verdict + reason + blind spots only
This commit is contained in:
+30
-45
@@ -346,6 +346,7 @@ const ROBOT_REVIEW_KEYS = [
|
||||
"robot_review_submitted_at",
|
||||
"robot_review_mode",
|
||||
"robot_review_raw_output",
|
||||
"robot_review_reason",
|
||||
"robot_review_requires_followup",
|
||||
"robot_review_iteration_count",
|
||||
] as const;
|
||||
@@ -855,29 +856,21 @@ function renderTaskUpdateSummary(
|
||||
}
|
||||
|
||||
function renderCompactRobotReview(review: RobotReviewRecord): string {
|
||||
return [
|
||||
`### Judge`,
|
||||
`${review.accepted ? "Accepted" : "Refused"} by ${review.reviewer} on ${review.submitted_at}.`,
|
||||
`Evidence complete: ${review.evidence_complete ? "yes" : "no"}`,
|
||||
`Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}`,
|
||||
review.observations.length > 0
|
||||
? formatBulletList("Observations", summarizeList(review.observations))
|
||||
: "",
|
||||
review.concerns.length > 0
|
||||
? formatBulletList("Concerns", summarizeList(review.concerns))
|
||||
: "",
|
||||
review.missing_evidence.length > 0
|
||||
? formatBulletList(
|
||||
"Missing evidence",
|
||||
summarizeList(review.missing_evidence),
|
||||
)
|
||||
: "",
|
||||
review.suggestions.length > 0
|
||||
? formatBulletList("Suggestions", summarizeList(review.suggestions))
|
||||
: "",
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join("\n\n");
|
||||
const verdict = review.accepted ? "Accepted" : "Refused";
|
||||
const lines = [`${verdict} by ${review.reviewer}.`];
|
||||
if (review.reason) {
|
||||
lines.push(review.reason);
|
||||
} else if (review.observations.length > 0) {
|
||||
lines.push(review.observations[0]);
|
||||
}
|
||||
if (review.blind_spots) lines.push(`Blind spots: ${review.blind_spots}`);
|
||||
if (!review.accepted && review.missing_evidence.length > 0) {
|
||||
lines.push(`Needs: ${review.missing_evidence.join("; ")}`);
|
||||
}
|
||||
if (!review.accepted && review.suggestions.length > 0) {
|
||||
lines.push(`Next: ${review.suggestions.join("; ")}`);
|
||||
}
|
||||
return lines.join(" ");
|
||||
}
|
||||
|
||||
function renderCurrentProofSummary(task: Task): string {
|
||||
@@ -984,22 +977,12 @@ function renderAttempt(
|
||||
`- unknown left: ${presentOrMissing(entry.failure_unknown)}`,
|
||||
`- why this counts: ${presentOrMissing(entry.evidence_reasoning)}`,
|
||||
`- remaining uncertainty: ${presentOrMissing(entry.remaining_uncertainty)}`,
|
||||
`### Judgement\n${judgement.title}\n\n${judgement.body}`,
|
||||
judgement.observations.length > 0
|
||||
? formatBulletList("Observations", summarizeList(judgement.observations))
|
||||
: "",
|
||||
judgement.concerns.length > 0
|
||||
? formatBulletList("Concerns", summarizeList(judgement.concerns))
|
||||
: "",
|
||||
judgement.missingEvidence.length > 0
|
||||
? formatBulletList(
|
||||
"Missing evidence",
|
||||
summarizeList(judgement.missingEvidence),
|
||||
)
|
||||
: "",
|
||||
judgement.suggestions.length > 0
|
||||
? formatBulletList("Suggestions", summarizeList(judgement.suggestions))
|
||||
: "",
|
||||
`### Judgement\n${judgement.title}`,
|
||||
judgement.body,
|
||||
judgement.observations.length > 0 ? judgement.observations[0] : "",
|
||||
judgement.concerns.length > 0 ? `Concerns: ${judgement.concerns.join("; ")}` : "",
|
||||
judgement.missingEvidence.length > 0 ? `Needs: ${judgement.missingEvidence.join("; ")}` : "",
|
||||
judgement.suggestions.length > 0 ? `Next: ${judgement.suggestions.join("; ")}` : "",
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join("\n\n");
|
||||
@@ -1149,22 +1132,24 @@ export function buildRobotReviewPrompt(task: Task): string {
|
||||
"Set evidence_convincing=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet.",
|
||||
"Set accepted=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet. Do not reject solely because items 3, 4, or 5 are weak if the verbatim evidence already proves the done criterion.",
|
||||
"",
|
||||
"observations: what you literally saw in the packet.",
|
||||
"reason: 1-3 sentence summary of why you accepted or rejected. Be concrete: cite specific test counts, file paths, or output lines you checked. Example: 'Pass, I checked the evidence it shows all 142 tests pass and HEAD=origin/main.'",
|
||||
"observations: kept for audit only. One line max, not a repeat of the evidence.",
|
||||
"When rejecting, prefer missing outputs like 'nll_val never decreases in the quoted log' over process complaints like 'too much text'.",
|
||||
"concerns: concise reasons the current evidence may not prove success yet.",
|
||||
"suggestions: what the agent should do next if the evidence is not yet enough. Keep this short, ideally 1-3 bullets.",
|
||||
"missing_evidence: concrete missing artifacts, command outputs, written-file checks, or observations that block acceptance. Prefer phrases like 'literal pytest output' or 'contents of output.json', not abstract rubric labels.",
|
||||
"concerns: kept for audit only. One line max when rejecting, empty when accepting.",
|
||||
"suggestions: what the agent should do next if rejected. 1-3 bullets max.",
|
||||
"missing_evidence: concrete missing artifacts or outputs that block acceptance. Only when rejecting.",
|
||||
"blind_spots: what you could not check. Always include this. Example: 'only reviewed the verbatim packet, did not inspect the actual artifact files.'",
|
||||
"",
|
||||
"Return exactly one JSON object between the markers ROBOT_REVIEW_JSON_START and ROBOT_REVIEW_JSON_END.",
|
||||
"JSON schema:",
|
||||
'{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"evidence_distinguishes_success":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"observations":["string"],"concerns":["string"],"suggestions":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
|
||||
'{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"evidence_distinguishes_success":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"reason":"1-3 sentence summary of why you accepted or rejected","observations":["string"],"concerns":["string"],"suggestions":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
|
||||
"",
|
||||
"You are reviewing exactly the same proof packet shown by TaskGet and /lgtm. Do not assume hidden context beyond this packet.",
|
||||
"",
|
||||
renderEvidencePacket(task, { truncateEvidence: false }),
|
||||
"Output format:",
|
||||
"ROBOT_REVIEW_JSON_START",
|
||||
'{"reviewer":"...","scope":"...","rubric":{...},"observations":["..."],"concerns":["..."],"suggestions":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
|
||||
'{"reviewer":"...","scope":"...","rubric":{...},"reason":"Pass, I checked the evidence it shows all 142 tests pass and HEAD=origin/main.","observations":["..."],"concerns":["..."],"suggestions":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
|
||||
"ROBOT_REVIEW_JSON_END",
|
||||
].join("\n");
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ export interface RobotReviewRecord {
|
||||
iteration: number;
|
||||
reviewer: string;
|
||||
scope: string;
|
||||
reason?: string;
|
||||
observations: string[];
|
||||
concerns: string[];
|
||||
suggestions: string[];
|
||||
@@ -64,6 +65,8 @@ function normalizeReview(
|
||||
typeof review.iteration === "number" ? review.iteration : index + 1,
|
||||
reviewer,
|
||||
scope,
|
||||
reason:
|
||||
typeof review.reason === "string" ? review.reason : undefined,
|
||||
observations,
|
||||
concerns: toStringArray(review.concerns),
|
||||
suggestions: toStringArray(review.suggestions),
|
||||
@@ -113,6 +116,10 @@ function getLegacyRobotReview(task: Task): RobotReviewRecord | undefined {
|
||||
typeof task.metadata?.robot_review_scope === "string"
|
||||
? task.metadata.robot_review_scope
|
||||
: "unknown",
|
||||
reason:
|
||||
typeof task.metadata?.robot_review_reason === "string"
|
||||
? task.metadata.robot_review_reason
|
||||
: undefined,
|
||||
observations,
|
||||
concerns: toStringArray(task.metadata?.robot_review_concerns),
|
||||
suggestions: toStringArray(task.metadata?.robot_review_suggestions),
|
||||
@@ -289,6 +296,7 @@ export function appendRobotReviewMetadata(
|
||||
robot_review_missing_evidence: latest.missing_evidence,
|
||||
robot_review_submitted_at: latest.submitted_at,
|
||||
robot_review_mode: latest.mode,
|
||||
robot_review_reason: latest.reason ?? null,
|
||||
robot_review_raw_output: latest.raw_output ?? null,
|
||||
robot_review_requires_followup: !(
|
||||
latest.evidence_complete && latest.evidence_convincing
|
||||
|
||||
@@ -249,7 +249,7 @@ describe("robot review helpers", () => {
|
||||
"Do not reject solely because items 3, 4, or 5 are weak",
|
||||
);
|
||||
expect(prompt).toContain(
|
||||
"concrete missing artifacts, command outputs, written-file checks",
|
||||
"concrete missing artifacts or outputs that block acceptance",
|
||||
);
|
||||
});
|
||||
|
||||
@@ -373,10 +373,8 @@ describe("robot review helpers", () => {
|
||||
expect(log).toContain("### Verify");
|
||||
expect(log).toContain("### Judgement");
|
||||
expect(log).toContain("Refused by auto");
|
||||
expect(log).toContain("### Observations");
|
||||
expect(log).toContain("### Concerns");
|
||||
expect(log).toContain("### Missing evidence");
|
||||
expect(log).toContain("### Suggestions");
|
||||
expect(log).toContain("Needs:");
|
||||
expect(log).toContain("Next:");
|
||||
expect(log).toContain("Run one self-hosted TaskClaimDone UAT.");
|
||||
});
|
||||
|
||||
@@ -438,10 +436,7 @@ describe("robot review helpers", () => {
|
||||
expect(log).toContain("completed with reviewer unavailable");
|
||||
expect(log).toContain("### Judgement");
|
||||
expect(log).toContain("judge auth failed");
|
||||
expect(log).toContain("### Suggestions");
|
||||
expect(log).not.toContain("### Missing evidence");
|
||||
expect(log).not.toContain("### Observations");
|
||||
expect(log).not.toContain("### Concerns");
|
||||
expect(log).toContain("Autonomy continued without blocking completion.");
|
||||
expect(log).not.toContain("Needs:");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -130,8 +130,7 @@ console.log("ROBOT_REVIEW_JSON_END");
|
||||
);
|
||||
expect(text).toContain("### Judgement");
|
||||
expect(text).toContain("Refused");
|
||||
expect(text).toContain("### Missing evidence");
|
||||
expect(text).toContain("### Suggestions");
|
||||
expect(text).toContain("Needs:");
|
||||
expect(text).toContain("Add one more artifact");
|
||||
});
|
||||
|
||||
@@ -178,10 +177,8 @@ console.log("ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END");
|
||||
expect(text).toContain("Status: completed");
|
||||
expect(text).toContain("completed with reviewer unavailable");
|
||||
expect(text).toContain("Raw output:");
|
||||
expect(text).toContain("### Suggestions");
|
||||
expect(text).not.toContain("### Missing evidence\n- (none)");
|
||||
expect(text).not.toContain("### Observations\n- (none)");
|
||||
expect(text).not.toContain("### Concerns\n- (none)");
|
||||
expect(text).toContain("Autonomy continued without blocking completion.");
|
||||
expect(text).not.toContain("Needs:");
|
||||
expect(text).toContain(
|
||||
"ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END",
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user