From 6442d01adea73698ea1ae2491ac1981b592ea3b8 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sun, 14 Jun 2026 21:40:48 +0800 Subject: [PATCH] short judge output: verdict + reason + blind spots only --- src/index.ts | 75 +++++++++++++------------------ src/robot-review.ts | 8 ++++ test/robot-review.test.ts | 13 ++---- test/task-claim-done-flow.test.ts | 9 ++-- 4 files changed, 45 insertions(+), 60 deletions(-) diff --git a/src/index.ts b/src/index.ts index 44693fa..2f0fd29 100644 --- a/src/index.ts +++ b/src/index.ts @@ -346,6 +346,7 @@ const ROBOT_REVIEW_KEYS = [ "robot_review_submitted_at", "robot_review_mode", "robot_review_raw_output", + "robot_review_reason", "robot_review_requires_followup", "robot_review_iteration_count", ] as const; @@ -855,29 +856,21 @@ function renderTaskUpdateSummary( } function renderCompactRobotReview(review: RobotReviewRecord): string { - return [ - `### Judge`, - `${review.accepted ? "Accepted" : "Refused"} by ${review.reviewer} on ${review.submitted_at}.`, - `Evidence complete: ${review.evidence_complete ? "yes" : "no"}`, - `Evidence convincing: ${review.evidence_convincing ? "yes" : "no"}`, - review.observations.length > 0 - ? formatBulletList("Observations", summarizeList(review.observations)) - : "", - review.concerns.length > 0 - ? formatBulletList("Concerns", summarizeList(review.concerns)) - : "", - review.missing_evidence.length > 0 - ? formatBulletList( - "Missing evidence", - summarizeList(review.missing_evidence), - ) - : "", - review.suggestions.length > 0 - ? formatBulletList("Suggestions", summarizeList(review.suggestions)) - : "", - ] - .filter(Boolean) - .join("\n\n"); + const verdict = review.accepted ? "Accepted" : "Refused"; + const lines = [`${verdict} by ${review.reviewer}.`]; + if (review.reason) { + lines.push(review.reason); + } else if (review.observations.length > 0) { + lines.push(review.observations[0]); + } + if (review.blind_spots) lines.push(`Blind spots: ${review.blind_spots}`); + if (!review.accepted && review.missing_evidence.length > 0) { + lines.push(`Needs: ${review.missing_evidence.join("; ")}`); + } + if (!review.accepted && review.suggestions.length > 0) { + lines.push(`Next: ${review.suggestions.join("; ")}`); + } + return lines.join(" "); } function renderCurrentProofSummary(task: Task): string { @@ -984,22 +977,12 @@ function renderAttempt( `- unknown left: ${presentOrMissing(entry.failure_unknown)}`, `- why this counts: ${presentOrMissing(entry.evidence_reasoning)}`, `- remaining uncertainty: ${presentOrMissing(entry.remaining_uncertainty)}`, - `### Judgement\n${judgement.title}\n\n${judgement.body}`, - judgement.observations.length > 0 - ? formatBulletList("Observations", summarizeList(judgement.observations)) - : "", - judgement.concerns.length > 0 - ? formatBulletList("Concerns", summarizeList(judgement.concerns)) - : "", - judgement.missingEvidence.length > 0 - ? formatBulletList( - "Missing evidence", - summarizeList(judgement.missingEvidence), - ) - : "", - judgement.suggestions.length > 0 - ? formatBulletList("Suggestions", summarizeList(judgement.suggestions)) - : "", + `### Judgement\n${judgement.title}`, + judgement.body, + judgement.observations.length > 0 ? judgement.observations[0] : "", + judgement.concerns.length > 0 ? `Concerns: ${judgement.concerns.join("; ")}` : "", + judgement.missingEvidence.length > 0 ? `Needs: ${judgement.missingEvidence.join("; ")}` : "", + judgement.suggestions.length > 0 ? `Next: ${judgement.suggestions.join("; ")}` : "", ] .filter(Boolean) .join("\n\n"); @@ -1149,22 +1132,24 @@ export function buildRobotReviewPrompt(task: Task): string { "Set evidence_convincing=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet.", "Set accepted=true if items 1 and 2 pass and you do not see a concrete contradiction in the packet. Do not reject solely because items 3, 4, or 5 are weak if the verbatim evidence already proves the done criterion.", "", - "observations: what you literally saw in the packet.", + "reason: 1-3 sentence summary of why you accepted or rejected. Be concrete: cite specific test counts, file paths, or output lines you checked. Example: 'Pass, I checked the evidence it shows all 142 tests pass and HEAD=origin/main.'", + "observations: kept for audit only. One line max, not a repeat of the evidence.", "When rejecting, prefer missing outputs like 'nll_val never decreases in the quoted log' over process complaints like 'too much text'.", - "concerns: concise reasons the current evidence may not prove success yet.", - "suggestions: what the agent should do next if the evidence is not yet enough. Keep this short, ideally 1-3 bullets.", - "missing_evidence: concrete missing artifacts, command outputs, written-file checks, or observations that block acceptance. Prefer phrases like 'literal pytest output' or 'contents of output.json', not abstract rubric labels.", + "concerns: kept for audit only. One line max when rejecting, empty when accepting.", + "suggestions: what the agent should do next if rejected. 1-3 bullets max.", + "missing_evidence: concrete missing artifacts or outputs that block acceptance. Only when rejecting.", + "blind_spots: what you could not check. Always include this. Example: 'only reviewed the verbatim packet, did not inspect the actual artifact files.'", "", "Return exactly one JSON object between the markers ROBOT_REVIEW_JSON_START and ROBOT_REVIEW_JSON_END.", "JSON schema:", - '{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"evidence_distinguishes_success":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"observations":["string"],"concerns":["string"],"suggestions":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', + '{"reviewer":"string","scope":"string","rubric":{"evidence_covers_done_criterion":{"reason":"...","pass":true},"falsification_test_runnable":{"reason":"...","pass":true},"failure_modes_addressed":{"reason":"...","pass":true},"evidence_distinguishes_success":{"reason":"...","pass":true},"verification_hints_actionable":{"reason":"...","pass":true}},"reason":"1-3 sentence summary of why you accepted or rejected","observations":["string"],"concerns":["string"],"suggestions":["string"],"blind_spots":"string","missing_evidence":["string"],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', "", "You are reviewing exactly the same proof packet shown by TaskGet and /lgtm. Do not assume hidden context beyond this packet.", "", renderEvidencePacket(task, { truncateEvidence: false }), "Output format:", "ROBOT_REVIEW_JSON_START", - '{"reviewer":"...","scope":"...","rubric":{...},"observations":["..."],"concerns":["..."],"suggestions":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', + '{"reviewer":"...","scope":"...","rubric":{...},"reason":"Pass, I checked the evidence it shows all 142 tests pass and HEAD=origin/main.","observations":["..."],"concerns":["..."],"suggestions":["..."],"blind_spots":"...","missing_evidence":["..."],"evidence_complete":true,"evidence_convincing":true,"accepted":true}', "ROBOT_REVIEW_JSON_END", ].join("\n"); } diff --git a/src/robot-review.ts b/src/robot-review.ts index 118c492..9c2a43c 100644 --- a/src/robot-review.ts +++ b/src/robot-review.ts @@ -6,6 +6,7 @@ export interface RobotReviewRecord { iteration: number; reviewer: string; scope: string; + reason?: string; observations: string[]; concerns: string[]; suggestions: string[]; @@ -64,6 +65,8 @@ function normalizeReview( typeof review.iteration === "number" ? review.iteration : index + 1, reviewer, scope, + reason: + typeof review.reason === "string" ? review.reason : undefined, observations, concerns: toStringArray(review.concerns), suggestions: toStringArray(review.suggestions), @@ -113,6 +116,10 @@ function getLegacyRobotReview(task: Task): RobotReviewRecord | undefined { typeof task.metadata?.robot_review_scope === "string" ? task.metadata.robot_review_scope : "unknown", + reason: + typeof task.metadata?.robot_review_reason === "string" + ? task.metadata.robot_review_reason + : undefined, observations, concerns: toStringArray(task.metadata?.robot_review_concerns), suggestions: toStringArray(task.metadata?.robot_review_suggestions), @@ -289,6 +296,7 @@ export function appendRobotReviewMetadata( robot_review_missing_evidence: latest.missing_evidence, robot_review_submitted_at: latest.submitted_at, robot_review_mode: latest.mode, + robot_review_reason: latest.reason ?? null, robot_review_raw_output: latest.raw_output ?? null, robot_review_requires_followup: !( latest.evidence_complete && latest.evidence_convincing diff --git a/test/robot-review.test.ts b/test/robot-review.test.ts index 72672d7..362281a 100644 --- a/test/robot-review.test.ts +++ b/test/robot-review.test.ts @@ -249,7 +249,7 @@ describe("robot review helpers", () => { "Do not reject solely because items 3, 4, or 5 are weak", ); expect(prompt).toContain( - "concrete missing artifacts, command outputs, written-file checks", + "concrete missing artifacts or outputs that block acceptance", ); }); @@ -373,10 +373,8 @@ describe("robot review helpers", () => { expect(log).toContain("### Verify"); expect(log).toContain("### Judgement"); expect(log).toContain("Refused by auto"); - expect(log).toContain("### Observations"); - expect(log).toContain("### Concerns"); - expect(log).toContain("### Missing evidence"); - expect(log).toContain("### Suggestions"); + expect(log).toContain("Needs:"); + expect(log).toContain("Next:"); expect(log).toContain("Run one self-hosted TaskClaimDone UAT."); }); @@ -438,10 +436,7 @@ describe("robot review helpers", () => { expect(log).toContain("completed with reviewer unavailable"); expect(log).toContain("### Judgement"); expect(log).toContain("judge auth failed"); - expect(log).toContain("### Suggestions"); - expect(log).not.toContain("### Missing evidence"); - expect(log).not.toContain("### Observations"); - expect(log).not.toContain("### Concerns"); expect(log).toContain("Autonomy continued without blocking completion."); + expect(log).not.toContain("Needs:"); }); }); diff --git a/test/task-claim-done-flow.test.ts b/test/task-claim-done-flow.test.ts index 2665bb8..16b5417 100644 --- a/test/task-claim-done-flow.test.ts +++ b/test/task-claim-done-flow.test.ts @@ -130,8 +130,7 @@ console.log("ROBOT_REVIEW_JSON_END"); ); expect(text).toContain("### Judgement"); expect(text).toContain("Refused"); - expect(text).toContain("### Missing evidence"); - expect(text).toContain("### Suggestions"); + expect(text).toContain("Needs:"); expect(text).toContain("Add one more artifact"); }); @@ -178,10 +177,8 @@ console.log("ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END"); expect(text).toContain("Status: completed"); expect(text).toContain("completed with reviewer unavailable"); expect(text).toContain("Raw output:"); - expect(text).toContain("### Suggestions"); - expect(text).not.toContain("### Missing evidence\n- (none)"); - expect(text).not.toContain("### Observations\n- (none)"); - expect(text).not.toContain("### Concerns\n- (none)"); + expect(text).toContain("Autonomy continued without blocking completion."); + expect(text).not.toContain("Needs:"); expect(text).toContain( "ROBOT_REVIEW_JSON_START and nope ROBOT_REVIEW_JSON_END", );