short judge output: verdict + reason + blind spots only

This commit is contained in:
wassname
2026-06-14 21:40:48 +08:00
parent c0ceb95ea4
commit 6442d01ade
4 changed files with 45 additions and 60 deletions
+30 -45
View File
@@ -346,6 +346,7 @@ const ROBOT_REVIEW_KEYS = [
"robot_review_submitted_at",
"robot_review_mode",
"robot_review_raw_output",
"robot_review_reason",
"robot_review_requires_followup",
"robot_review_iteration_count",
] as const;
@@ -855,29 +856,21 @@ function renderTaskUpdateSummary(
}
/**
 * Render a robot review as one short, single-line summary.
 *
 * Segments are emitted in this order and joined with a single space:
 *   1. verdict and reviewer ("Accepted by X." / "Refused by X.")
 *   2. the judge's `reason`, or the first observation when no reason is set
 *   3. "Blind spots: ..." when the review recorded any
 *   4. "Needs: ..." (missing evidence) — refusals only
 *   5. "Next: ..." (suggestions) — refusals only
 *
 * @param review - The review record to summarize.
 * @returns The summary line; always contains at least the verdict segment.
 */
function renderCompactRobotReview(review: RobotReviewRecord): string {
  const verdict = review.accepted ? "Accepted" : "Refused";
  const parts = [`${verdict} by ${review.reviewer}.`];

  // `reason` is optional on the record, so fall back to the first observation.
  // Destructuring keeps the fallback safe when the list is empty, and the
  // truthiness check avoids pushing an empty string (which would leave a
  // doubled space in the joined output).
  const [firstObservation] = review.observations;
  if (review.reason) {
    parts.push(review.reason);
  } else if (firstObservation) {
    parts.push(firstObservation);
  }

  if (review.blind_spots) {
    parts.push(`Blind spots: ${review.blind_spots}`);
  }

  // Follow-up details are only shown for refusals.
  if (!review.accepted) {
    if (review.missing_evidence.length > 0) {
      parts.push(`Needs: ${review.missing_evidence.join("; ")}`);
    }
    if (review.suggestions.length > 0) {
      parts.push(`Next: ${review.suggestions.join("; ")}`);
    }
  }

  return parts.join(" ");
}
function renderCurrentProofSummary(task: Task): string {
@@ -984,22 +977,12 @@ function renderAttempt(
`- unknown left: ${presentOrMissing(entry.failure_unknown)}`,
`- why this counts: ${presentOrMissing(entry.evidence_reasoning)}`,
`- remaining uncertainty: ${presentOrMissing(entry.remaining_uncertainty)}`,
`### Judgement\n${judgement.title}\n\n${judgement.body}`,
judgement.observations.length > 0
? formatBulletList("Observations", summarizeList(judgement.observations))
: "",
judgement.concerns.length > 0
? formatBulletList("Concerns", summarizeList(judgement.concerns))
: "",
judgement.missingEvidence.length > 0
? formatBulletList(
"Missing evidence",
summarizeList(judgement.missingEvidence),
)
: "",
judgement.suggestions.length > 0
? formatBulletList("Suggestions", summarizeList(judgement.suggestions))
: "",
`### Judgement\n${judgement.title}`,
judgement.body,
judgement.observations.length > 0 ? judgement.observations[0] : "",
judgement.concerns.length > 0 ? `Concerns: ${judgement.concerns.join("; ")}` : "",
judgement.missingEvidence.length > 0 ? `Needs: ${judgement.missingEvidence.join("; ")}` : "",
judgement.suggestions.length > 0 ? `Next: ${judgement.suggestions.join("; ")}` : "",
]
.filter(Boolean)
.join("\n\n");
@@ -1149,22 +1132,24 @@ export function buildRobotReviewPrompt(task: Task): string {
"Set evidence_convincing=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet.",
"Set accepted=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet. Do not reject solely because items 3, 4, or 5 are weak if the verbatim evidence already proves the done criterion.",
"",
"observations: what you literally saw in the packet.",
"reason: 1-3 sentence summary of why you accepted or rejected. Be concrete: cite specific test counts, file paths, or output lines you checked. Example: 'Pass, I checked the evidence it shows all 142 tests pass and HEAD=origin/main.'",
"observations: kept for audit only. One line max, not a repeat of the evidence.",
"When rejecting, prefer missing outputs like 'nll_val never decreases in the quoted log' over process complaints like 'too much text'.",
"concerns: concise reasons the current evidence may not prove success yet.",
"suggestions: what the agent should do next if the evidence is not yet enough. Keep this short, ideally 1-3 bullets.",
"missing_evidence: concrete missing artifacts, command outputs, written-file checks, or observations that block acceptance. Prefer phrases like 'literal pytest output' or 'contents of output.json', not abstract rubric labels.",
"concerns: kept for audit only. One line max when rejecting, empty when accepting.",
"suggestions: what the agent should do next if rejected. 1-3 bullets max.",
"missing_evidence: concrete missing artifacts or outputs that block acceptance. Only when rejecting.",
"blind_spots: what you could not check. Always include this. Example: 'only reviewed the verbatim packet, did not inspect the actual artifact files.'",
"",
"Return exactly one JSON object between the markers ROBOT_REVIEW_JSON_START and ROBOT_REVIEW_JSON_END.",
"JSON schema:",
'{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"evidence_distinguishes_success":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"observations":["string"],"concerns":["string"],"suggestions":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
'{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"evidence_distinguishes_success":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"reason":"1-3 sentence summary of why you accepted or rejected","observations":["string"],"concerns":["string"],"suggestions":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
"",
"You are reviewing exactly the same proof packet shown by TaskGet and /lgtm. Do not assume hidden context beyond this packet.",
"",
renderEvidencePacket(task, { truncateEvidence: false }),
"Output format:",
"ROBOT_REVIEW_JSON_START",
'{"reviewer":"...","scope":"...","rubric":{...},"observations":["..."],"concerns":["..."],"suggestions":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
'{"reviewer":"...","scope":"...","rubric":{...},"reason":"Pass, I checked the evidence it shows all 142 tests pass and HEAD=origin/main.","observations":["..."],"concerns":["..."],"suggestions":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
"ROBOT_REVIEW_JSON_END",
].join("\n");
}
+8
View File
@@ -6,6 +6,7 @@ export interface RobotReviewRecord {
iteration: number;
reviewer: string;
scope: string;
reason?: string;
observations: string[];
concerns: string[];
suggestions: string[];
@@ -64,6 +65,8 @@ function normalizeReview(
typeof review.iteration === "number" ? review.iteration : index + 1,
reviewer,
scope,
reason:
typeof review.reason === "string" ? review.reason : undefined,
observations,
concerns: toStringArray(review.concerns),
suggestions: toStringArray(review.suggestions),
@@ -113,6 +116,10 @@ function getLegacyRobotReview(task: Task): RobotReviewRecord | undefined {
typeof task.metadata?.robot_review_scope === "string"
? task.metadata.robot_review_scope
: "unknown",
reason:
typeof task.metadata?.robot_review_reason === "string"
? task.metadata.robot_review_reason
: undefined,
observations,
concerns: toStringArray(task.metadata?.robot_review_concerns),
suggestions: toStringArray(task.metadata?.robot_review_suggestions),
@@ -289,6 +296,7 @@ export function appendRobotReviewMetadata(
robot_review_missing_evidence: latest.missing_evidence,
robot_review_submitted_at: latest.submitted_at,
robot_review_mode: latest.mode,
robot_review_reason: latest.reason ?? null,
robot_review_raw_output: latest.raw_output ?? null,
robot_review_requires_followup: !(
latest.evidence_complete && latest.evidence_convincing
+4 -9
View File
@@ -249,7 +249,7 @@ describe("robot review helpers", () => {
"Do not reject solely because items 3, 4, or 5 are weak",
);
expect(prompt).toContain(
"concrete missing artifacts, command outputs, written-file checks",
"concrete missing artifacts or outputs that block acceptance",
);
});
@@ -373,10 +373,8 @@ describe("robot review helpers", () => {
expect(log).toContain("### Verify");
expect(log).toContain("### Judgement");
expect(log).toContain("Refused by auto");
expect(log).toContain("### Observations");
expect(log).toContain("### Concerns");
expect(log).toContain("### Missing evidence");
expect(log).toContain("### Suggestions");
expect(log).toContain("Needs:");
expect(log).toContain("Next:");
expect(log).toContain("Run one self-hosted TaskClaimDone UAT.");
});
@@ -438,10 +436,7 @@ describe("robot review helpers", () => {
expect(log).toContain("completed with reviewer unavailable");
expect(log).toContain("### Judgement");
expect(log).toContain("judge auth failed");
expect(log).toContain("### Suggestions");
expect(log).not.toContain("### Missing evidence");
expect(log).not.toContain("### Observations");
expect(log).not.toContain("### Concerns");
expect(log).toContain("Autonomy continued without blocking completion.");
expect(log).not.toContain("Needs:");
});
});
+3 -6
View File
@@ -130,8 +130,7 @@ console.log("ROBOT_REVIEW_JSON_END");
);
expect(text).toContain("### Judgement");
expect(text).toContain("Refused");
expect(text).toContain("### Missing evidence");
expect(text).toContain("### Suggestions");
expect(text).toContain("Needs:");
expect(text).toContain("Add one more artifact");
});
@@ -178,10 +177,8 @@ console.log("ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END");
expect(text).toContain("Status: completed");
expect(text).toContain("completed with reviewer unavailable");
expect(text).toContain("Raw output:");
expect(text).toContain("### Suggestions");
expect(text).not.toContain("### Missing evidence\n- (none)");
expect(text).not.toContain("### Observations\n- (none)");
expect(text).not.toContain("### Concerns\n- (none)");
expect(text).toContain("Autonomy continued without blocking completion.");
expect(text).not.toContain("Needs:");
expect(text).toContain(
"ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END",
);