short judge output: verdict + reason + blind spots only

This commit is contained in:
wassname
2026-06-14 21:40:48 +08:00
parent c0ceb95ea4
commit 6442d01ade
4 changed files with 45 additions and 60 deletions
+30 -45
View File
@@ -346,6 +346,7 @@ const ROBOT_REVIEW_KEYS = [
"robot_review_submitted_at", "robot_review_submitted_at",
"robot_review_mode", "robot_review_mode",
"robot_review_raw_output", "robot_review_raw_output",
"robot_review_reason",
"robot_review_requires_followup", "robot_review_requires_followup",
"robot_review_iteration_count", "robot_review_iteration_count",
] as const; ] as const;
@@ -855,29 +856,21 @@ function renderTaskUpdateSummary(
} }
function renderCompactRobotReview(review: RobotReviewRecord): string { function renderCompactRobotReview(review: RobotReviewRecord): string {
return [ const verdict = review.accepted ? "Accepted" : "Refused";
`### Judge`, const lines = [`${verdict} by ${review.reviewer}.`];
`${review.accepted ? "Accepted" : "Refused"} by ${review.reviewer} on ${review.submitted_at}.`, if (review.reason) {
`Evidence complete: ${review.evidence_complete ? "yes" : "no"}`, lines.push(review.reason);
`Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}`, } else if (review.observations.length > 0) {
review.observations.length > 0 lines.push(review.observations[0]);
? formatBulletList("Observations", summarizeList(review.observations)) }
: "", if (review.blind_spots) lines.push(`Blind spots: ${review.blind_spots}`);
review.concerns.length > 0 if (!review.accepted && review.missing_evidence.length > 0) {
? formatBulletList("Concerns", summarizeList(review.concerns)) lines.push(`Needs: ${review.missing_evidence.join("; ")}`);
: "", }
review.missing_evidence.length > 0 if (!review.accepted && review.suggestions.length > 0) {
? formatBulletList( lines.push(`Next: ${review.suggestions.join("; ")}`);
"Missing evidence", }
summarizeList(review.missing_evidence), return lines.join(" ");
)
: "",
review.suggestions.length > 0
? formatBulletList("Suggestions", summarizeList(review.suggestions))
: "",
]
.filter(Boolean)
.join("\n\n");
} }
function renderCurrentProofSummary(task: Task): string { function renderCurrentProofSummary(task: Task): string {
@@ -984,22 +977,12 @@ function renderAttempt(
`- unknown left: ${presentOrMissing(entry.failure_unknown)}`, `- unknown left: ${presentOrMissing(entry.failure_unknown)}`,
`- why this counts: ${presentOrMissing(entry.evidence_reasoning)}`, `- why this counts: ${presentOrMissing(entry.evidence_reasoning)}`,
`- remaining uncertainty: ${presentOrMissing(entry.remaining_uncertainty)}`, `- remaining uncertainty: ${presentOrMissing(entry.remaining_uncertainty)}`,
`### Judgement\n${judgement.title}\n\n${judgement.body}`, `### Judgement\n${judgement.title}`,
judgement.observations.length > 0 judgement.body,
? formatBulletList("Observations", summarizeList(judgement.observations)) judgement.observations.length > 0 ? judgement.observations[0] : "",
: "", judgement.concerns.length > 0 ? `Concerns: ${judgement.concerns.join("; ")}` : "",
judgement.concerns.length > 0 judgement.missingEvidence.length > 0 ? `Needs: ${judgement.missingEvidence.join("; ")}` : "",
? formatBulletList("Concerns", summarizeList(judgement.concerns)) judgement.suggestions.length > 0 ? `Next: ${judgement.suggestions.join("; ")}` : "",
: "",
judgement.missingEvidence.length > 0
? formatBulletList(
"Missing evidence",
summarizeList(judgement.missingEvidence),
)
: "",
judgement.suggestions.length > 0
? formatBulletList("Suggestions", summarizeList(judgement.suggestions))
: "",
] ]
.filter(Boolean) .filter(Boolean)
.join("\n\n"); .join("\n\n");
@@ -1149,22 +1132,24 @@ export function buildRobotReviewPrompt(task: Task): string {
"Set evidence_convincing=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet.", "Set evidence_convincing=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet.",
"Set accepted=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet. Do not reject solely because items 3, 4, or 5 are weak if the verbatim evidence already proves the done criterion.", "Set accepted=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet. Do not reject solely because items 3, 4, or 5 are weak if the verbatim evidence already proves the done criterion.",
"", "",
"observations: what you literally saw in the packet.", "reason: 1-3 sentence summary of why you accepted or rejected. Be concrete: cite specific test counts, file paths, or output lines you checked. Example: 'Pass, I checked the evidence it shows all 142 tests pass and HEAD=origin/main.'",
"observations: kept for audit only. One line max, not a repeat of the evidence.",
"When rejecting, prefer missing outputs like 'nll_val never decreases in the quoted log' over process complaints like 'too much text'.", "When rejecting, prefer missing outputs like 'nll_val never decreases in the quoted log' over process complaints like 'too much text'.",
"concerns: concise reasons the current evidence may not prove success yet.", "concerns: kept for audit only. One line max when rejecting, empty when accepting.",
"suggestions: what the agent should do next if the evidence is not yet enough. Keep this short, ideally 1-3 bullets.", "suggestions: what the agent should do next if rejected. 1-3 bullets max.",
"missing_evidence: concrete missing artifacts, command outputs, written-file checks, or observations that block acceptance. Prefer phrases like 'literal pytest output' or 'contents of output.json', not abstract rubric labels.", "missing_evidence: concrete missing artifacts or outputs that block acceptance. Only when rejecting.",
"blind_spots: what you could not check. Always include this. Example: 'only reviewed the verbatim packet, did not inspect the actual artifact files.'",
"", "",
"Return exactly one JSON object between the markers ROBOT_REVIEW_JSON_START and ROBOT_REVIEW_JSON_END.", "Return exactly one JSON object between the markers ROBOT_REVIEW_JSON_START and ROBOT_REVIEW_JSON_END.",
"JSON schema:", "JSON schema:",
'{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"evidence_distinguishes_success":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"observations":["string"],"concerns":["string"],"suggestions":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', '{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"evidence_distinguishes_success":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"reason":"1-3 sentence summary of why you accepted or rejected","observations":["string"],"concerns":["string"],"suggestions":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
"", "",
"You are reviewing exactly the same proof packet shown by TaskGet and /lgtm. Do not assume hidden context beyond this packet.", "You are reviewing exactly the same proof packet shown by TaskGet and /lgtm. Do not assume hidden context beyond this packet.",
"", "",
renderEvidencePacket(task, { truncateEvidence: false }), renderEvidencePacket(task, { truncateEvidence: false }),
"Output format:", "Output format:",
"ROBOT_REVIEW_JSON_START", "ROBOT_REVIEW_JSON_START",
'{"reviewer":"...","scope":"...","rubric":{...},"observations":["..."],"concerns":["..."],"suggestions":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', '{"reviewer":"...","scope":"...","rubric":{...},"reason":"Pass, I checked the evidence it shows all 142 tests pass and HEAD=origin/main.","observations":["..."],"concerns":["..."],"suggestions":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}',
"ROBOT_REVIEW_JSON_END", "ROBOT_REVIEW_JSON_END",
].join("\n"); ].join("\n");
} }
+8
View File
@@ -6,6 +6,7 @@ export interface RobotReviewRecord {
iteration: number; iteration: number;
reviewer: string; reviewer: string;
scope: string; scope: string;
reason?: string;
observations: string[]; observations: string[];
concerns: string[]; concerns: string[];
suggestions: string[]; suggestions: string[];
@@ -64,6 +65,8 @@ function normalizeReview(
typeof review.iteration === "number" ? review.iteration : index + 1, typeof review.iteration === "number" ? review.iteration : index + 1,
reviewer, reviewer,
scope, scope,
reason:
typeof review.reason === "string" ? review.reason : undefined,
observations, observations,
concerns: toStringArray(review.concerns), concerns: toStringArray(review.concerns),
suggestions: toStringArray(review.suggestions), suggestions: toStringArray(review.suggestions),
@@ -113,6 +116,10 @@ function getLegacyRobotReview(task: Task): RobotReviewRecord | undefined {
typeof task.metadata?.robot_review_scope === "string" typeof task.metadata?.robot_review_scope === "string"
? task.metadata.robot_review_scope ? task.metadata.robot_review_scope
: "unknown", : "unknown",
reason:
typeof task.metadata?.robot_review_reason === "string"
? task.metadata.robot_review_reason
: undefined,
observations, observations,
concerns: toStringArray(task.metadata?.robot_review_concerns), concerns: toStringArray(task.metadata?.robot_review_concerns),
suggestions: toStringArray(task.metadata?.robot_review_suggestions), suggestions: toStringArray(task.metadata?.robot_review_suggestions),
@@ -289,6 +296,7 @@ export function appendRobotReviewMetadata(
robot_review_missing_evidence: latest.missing_evidence, robot_review_missing_evidence: latest.missing_evidence,
robot_review_submitted_at: latest.submitted_at, robot_review_submitted_at: latest.submitted_at,
robot_review_mode: latest.mode, robot_review_mode: latest.mode,
robot_review_reason: latest.reason ?? null,
robot_review_raw_output: latest.raw_output ?? null, robot_review_raw_output: latest.raw_output ?? null,
robot_review_requires_followup: !( robot_review_requires_followup: !(
latest.evidence_complete && latest.evidence_convincing latest.evidence_complete && latest.evidence_convincing
+4 -9
View File
@@ -249,7 +249,7 @@ describe("robot review helpers", () => {
"Do not reject solely because items 3, 4, or 5 are weak", "Do not reject solely because items 3, 4, or 5 are weak",
); );
expect(prompt).toContain( expect(prompt).toContain(
"concrete missing artifacts, command outputs, written-file checks", "concrete missing artifacts or outputs that block acceptance",
); );
}); });
@@ -373,10 +373,8 @@ describe("robot review helpers", () => {
expect(log).toContain("### Verify"); expect(log).toContain("### Verify");
expect(log).toContain("### Judgement"); expect(log).toContain("### Judgement");
expect(log).toContain("Refused by auto"); expect(log).toContain("Refused by auto");
expect(log).toContain("### Observations"); expect(log).toContain("Needs:");
expect(log).toContain("### Concerns"); expect(log).toContain("Next:");
expect(log).toContain("### Missing evidence");
expect(log).toContain("### Suggestions");
expect(log).toContain("Run one self-hosted TaskClaimDone UAT."); expect(log).toContain("Run one self-hosted TaskClaimDone UAT.");
}); });
@@ -438,10 +436,7 @@ describe("robot review helpers", () => {
expect(log).toContain("completed with reviewer unavailable"); expect(log).toContain("completed with reviewer unavailable");
expect(log).toContain("### Judgement"); expect(log).toContain("### Judgement");
expect(log).toContain("judge auth failed"); expect(log).toContain("judge auth failed");
expect(log).toContain("### Suggestions");
expect(log).not.toContain("### Missing evidence");
expect(log).not.toContain("### Observations");
expect(log).not.toContain("### Concerns");
expect(log).toContain("Autonomy continued without blocking completion."); expect(log).toContain("Autonomy continued without blocking completion.");
expect(log).not.toContain("Needs:");
}); });
}); });
+3 -6
View File
@@ -130,8 +130,7 @@ console.log("ROBOT_REVIEW_JSON_END");
); );
expect(text).toContain("### Judgement"); expect(text).toContain("### Judgement");
expect(text).toContain("Refused"); expect(text).toContain("Refused");
expect(text).toContain("### Missing evidence"); expect(text).toContain("Needs:");
expect(text).toContain("### Suggestions");
expect(text).toContain("Add one more artifact"); expect(text).toContain("Add one more artifact");
}); });
@@ -178,10 +177,8 @@ console.log("ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END");
expect(text).toContain("Status: completed"); expect(text).toContain("Status: completed");
expect(text).toContain("completed with reviewer unavailable"); expect(text).toContain("completed with reviewer unavailable");
expect(text).toContain("Raw output:"); expect(text).toContain("Raw output:");
expect(text).toContain("### Suggestions"); expect(text).toContain("Autonomy continued without blocking completion.");
expect(text).not.toContain("### Missing evidence\n- (none)"); expect(text).not.toContain("Needs:");
expect(text).not.toContain("### Observations\n- (none)");
expect(text).not.toContain("### Concerns\n- (none)");
expect(text).toContain( expect(text).toContain(
"ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END", "ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END",
); );