Coverage for mcp/mission_judge/shape.py: 97%
42 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
1"""Building blocks shared by the semantic-progress judge tool.
3The judge either produces one finite-float progress score wrapped in a
4canonical result shape, or it fails and produces a structured error
5envelope. This module owns the small, pure pieces both outcomes are built
6from:
8* :class:`ErrorCode` — the frozen set of stable, machine-readable failure
9 codes the judge can surface.
10* :class:`JudgeError` — the exception the judge raises internally; the tool
11 wrapper translates it into an error envelope.
12* :func:`validate_output_name` — guards the caller-supplied metric name so
13 the resulting key is a single, well-formed path segment.
14* :func:`is_finite_float` — the single source of truth for "is this a real,
15 finite number that a threshold comparison can read?".
16* :func:`metrics_result` — assembles the canonical
17 ``{"metrics": {key: value}, ...}`` success shape with provenance placed
18 beside ``metrics`` rather than inside it.
19* :func:`error_envelope` — assembles the ``{"code", "details"}`` failure
20 shape that never carries a top-level ``metrics`` key.
22Everything here is pure: no I/O, no clocks, no environment lookups. That
23keeps the pieces trivial to test in isolation and safe to call from both
24async tool handlers and synchronous code.
25"""
27from __future__ import annotations
29import math
30from typing import Any
# Upper bound, in characters, on a caller-supplied output name. Names longer
# than this are rejected by validate_output_name with reason "too_long".
_MAX_OUTPUT_NAME_LEN = 128
class ErrorCode:
    """Namespace of the failure codes the judge reports.

    Every attribute is a short string meant to be branched on directly by an
    operator or an automated caller, so nobody has to parse a human-readable
    message. The spellings are part of the contract and are frozen: callers
    and tests may compare against the exact values.
    """

    # The requested output name was rejected: it was empty, exceeded the
    # length limit, or contained a separator or whitespace character.
    INVALID_OUTPUT_NAME = "invalid_output_name"

    # No usable directive was given: it was absent, empty, or consisted of
    # whitespace only.
    MISSING_DIRECTIVE = "missing_directive"

    # There was no sampling backend that could produce a score.
    NO_SAMPLING_BACKEND = "no_sampling_backend"

    # The backend's sample call failed with a transport, throttling,
    # credentials, or timeout error.
    SAMPLING_TRANSPORT_ERROR = "sampling_transport_error"

    # The model's output did not parse into a finite real numeric score.
    INVALID_MODEL_SCORE = "invalid_model_score"
58class JudgeError(Exception):
59 """Raised internally when the judge cannot produce a score.
61 Carries a stable short ``code`` (one of :class:`ErrorCode`) and an
62 optional structured ``details`` dict the tool wrapper renders into an
63 error envelope. When ``details`` is omitted it defaults to an empty
64 dict, and the exception's string form falls back to ``code`` so logs
65 always show something meaningful.
66 """
68 def __init__(self, code: str, details: dict[str, Any] | None = None) -> None:
69 self.code: str = code
70 self.details: dict[str, Any] = details or {}
71 super().__init__(code)
def validate_output_name(name: str) -> str:
    """Check that ``name`` is one well-formed path segment and return it as is.

    To be accepted, ``name`` must be a non-empty string no longer than 128
    characters that contains no ``.`` separator and no whitespace character,
    so the metric path it produces is exactly ``metrics.<name>``.

    The checks run in a fixed order — type, emptiness, length, separator,
    whitespace — and the first one that fails decides the reported reason.

    Raises:
        JudgeError: with code :attr:`ErrorCode.INVALID_OUTPUT_NAME`. The
            ``details`` payload names the specific ``reason`` and echoes the
            rejected value under ``supplied``; the ``too_long`` case also
            reports ``max_length`` and ``actual_length``.
    """
    if not isinstance(name, str):
        problem = {"reason": "not_a_string", "supplied": name}
    elif not name:
        problem = {"reason": "empty", "supplied": name}
    elif len(name) > _MAX_OUTPUT_NAME_LEN:
        problem = {
            "reason": "too_long",
            "supplied": name,
            "max_length": _MAX_OUTPUT_NAME_LEN,
            "actual_length": len(name),
        }
    elif "." in name:
        problem = {"reason": "contains_separator", "supplied": name}
    elif any(map(str.isspace, name)):
        problem = {"reason": "contains_whitespace", "supplied": name}
    else:
        # Every check passed: hand the name back untouched.
        return name
    # Single exit point for all rejection reasons.
    raise JudgeError(ErrorCode.INVALID_OUTPUT_NAME, problem)
def is_finite_float(x: object) -> bool:
    """Tell whether ``x`` is a real, finite number.

    Integers are accepted, and floats are accepted when finite. Booleans are
    refused even though ``bool`` subclasses ``int``; NaN and both infinities
    are refused; so is any other kind of object (strings, ``None``,
    containers, ...). This is the one gate the emitted progress score has to
    clear before a threshold comparison may read it as a metric.
    """
    # Screen out booleans and every non-numeric type in one step.
    if isinstance(x, bool) or not isinstance(x, (int, float)):
        return False
    # Any remaining int qualifies; a float qualifies only when
    # math.isfinite accepts it (it is False for NaN and +/-inf).
    return isinstance(x, int) or math.isfinite(x)
def metrics_result(
    output_name: str,
    score: float,
    *,
    rationale: str,
    source: str,
    backend_name: str,
    model_id: str,
    rubric_version: str,
    raw_score: float,
) -> dict[str, Any]:
    """Build the canonical success shape ``{"metrics": {output_name: score}, ...}``.

    The lone progress score sits inside the top-level ``metrics`` object.
    Each provenance field — rationale, source identifier, resolved backend
    name and model id, rubric version, and the pre-clamp raw score — is a
    sibling of ``metrics`` at the top level and never nested inside it, so a
    merged view of ``metrics`` holds nothing but the numeric value.

    ``score`` is expected to be a real, finite number already; parsing and
    clamping are the caller's job and happen before this builder is reached.
    """
    assert is_finite_float(score), "metrics_result requires a finite numeric score"
    # One literal, provenance keys first and ``metrics`` last, so the
    # top-level ``metrics`` key is always the canonical numeric map.
    return {
        "rationale": rationale,
        "source": source,
        "backend_name": backend_name,
        "model_id": model_id,
        "rubric_version": rubric_version,
        "raw_score": raw_score,
        "metrics": {output_name: score},
    }
def error_envelope(code: str, **details: object) -> dict[str, Any]:
    """Build the structured failure shape ``{"code": code, "details": {...}}``.

    The result has exactly two top-level keys and never a ``metrics`` key.
    A consumer that merges only ``metrics``-shaped results therefore skips
    it, leaving the corresponding check undecided instead of acting on bad
    data. All diagnostic context passed as keyword arguments is nested under
    ``details``.
    """
    envelope: dict[str, Any] = {"code": code}
    # Copy the keyword arguments into their own dict for the payload.
    envelope["details"] = {**details}
    return envelope