Coverage for mcp/mission_judge/score.py: 100%
50 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
"""Parse and bound the model's raw response into a usable progress score.

The judge asks a model for a single progress score and gets back a string.
Turning that string into a number a threshold comparison can read is split
into two deliberately separate steps:

* :func:`parse_score` owns the only failure path. It decodes the raw text,
  validates that it carries a real, finite numeric ``score`` field, and
  returns that score together with the model's rationale. Anything it cannot
  trust — non-JSON text, a missing or non-numeric field, a boolean, or a
  NaN/infinite value — becomes a :class:`JudgeError`.
* :func:`clamp_score` owns no failure path. It is a total function on finite
  floats that folds an out-of-range value onto the nearest bound of the
  closed interval ``[0.0, 1.0]`` and returns an in-range value untouched.

Keeping the two apart means the caller can record the unmodified parsed value
as provenance and emit the clamped value as the metric, without either step
second-guessing the other.
"""
21from __future__ import annotations
23import json
24import math
25from typing import Any
27from .shape import ErrorCode, JudgeError
# Key of the JSON field the model is instructed to populate with its numeric
# score; parse_score looks the score up under this name.
SCORE_FIELD = "score"
# Key of the JSON field the model is instructed to populate with its free-text
# rationale; parse_score reads it with an empty-string default when absent.
RATIONALE_FIELD = "rationale"

# The inclusive lower and upper bounds of the progress-score interval that
# clamp_score folds out-of-range values onto.
_LOWER_BOUND = 0.0
_UPPER_BOUND = 1.0
38# The Markdown code-fence delimiter chat models commonly wrap a JSON answer in.
39_CODE_FENCE = "```"
42def _strip_code_fence(text: str) -> str:
43 """Return the body of a whole-response Markdown code fence, else the text.
45 A fenced answer opens with ``` (optionally tagged, e.g. ```json) on its
46 own line and closes with ``` on a later line. When ``text`` both opens and
47 closes with a fence, the opening line — language tag and all — and the
48 trailing fence are dropped and the body between them is returned; anything
49 not fully fenced is returned unchanged. Pure and deterministic.
50 """
51 if not (text.startswith(_CODE_FENCE) and text.endswith(_CODE_FENCE)):
52 return text
53 # Drop the opening fence line (``` plus any language tag), then everything
54 # from the final fence onward, leaving just the fenced body.
55 _, _, after_open = text.partition("\n")
56 body, _, _ = after_open.rpartition(_CODE_FENCE)
57 return body.strip()
def _extract_json_payload(raw_text: str) -> str:
    """Return the most likely JSON substring of a model response.

    Models often wrap the requested JSON object in a Markdown code fence or
    surround it with a sentence of prose. This strips surrounding whitespace,
    peels a whole-response code fence when present, and then:

    1. returns the remaining text as-is if it already decodes as JSON;
    2. otherwise returns the substring spanning the first ``{`` through the
       last ``}``, when such a span exists;
    3. otherwise returns the remaining text unchanged.

    The result is only a *candidate* — it is not guaranteed to be valid JSON,
    so the caller must still decode it and validate its shape. The
    transformation is pure and deterministic.

    Args:
        raw_text: The model's raw response text.

    Returns:
        The candidate JSON text to hand to ``json.loads``.
    """
    text = _strip_code_fence(raw_text.strip())

    # If what remains already decodes, use it as-is. The exception tuple is
    # parenthesized so the module also parses on interpreters that predate
    # the unparenthesized ``except A, B:`` form.
    try:
        json.loads(text)
    except (ValueError, TypeError):
        pass
    else:
        return text

    # Last resort: carve out the span from the first "{" to the last "}".
    # This rescues a JSON object embedded in leading/trailing prose without
    # trying to be a full parser — the carved span is still decoded by the
    # caller, so a span that is not real JSON is rejected there.
    start = text.find("{")
    end = text.rfind("}")
    if 0 <= start < end:
        return text[start : end + 1]

    return text
def parse_score(raw_text: str) -> tuple[float, str]:
    """Decode the model's raw response into ``(raw_score, rationale)``.

    The raw text must carry a JSON object with a real, finite numeric
    ``score`` field. A whole-response Markdown code fence (```` ```json ... ``` ````)
    or a sentence of surrounding prose is tolerated — :func:`_extract_json_payload`
    peels it before decoding — because chat models routinely wrap a JSON
    answer that way despite being asked for raw JSON. The returned
    ``raw_score`` is that value coerced to a float and is **not** yet clamped,
    so the caller can record it verbatim as provenance. The returned
    ``rationale`` is the model's ``rationale`` field coerced to a string,
    defaulting to an empty string when absent.

    Args:
        raw_text: The model's raw response text.

    Returns:
        A ``(raw_score, rationale)`` tuple: the unclamped finite score as a
        float, and the rationale as a string.

    Raises:
        JudgeError: With code :attr:`ErrorCode.INVALID_MODEL_SCORE` and a
            ``reason`` in ``details`` identifying the specific failure:

            * ``non_json`` — no JSON object could be decoded from the text.
            * ``missing_score_field`` — the JSON is not an object, or has no
              ``score`` field.
            * ``non_numeric`` — the ``score`` is a boolean, string, null, or
              other non-numeric value.
            * ``non_finite`` — the ``score`` is NaN, positive/negative
              infinity, or an integer too large to be represented as a float.
    """
    payload = _extract_json_payload(raw_text)
    try:
        parsed = json.loads(payload)
    except (ValueError, TypeError) as err:
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_json"},
        ) from err

    if not isinstance(parsed, dict) or SCORE_FIELD not in parsed:
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "missing_score_field"},
        )

    raw_value: Any = parsed[SCORE_FIELD]
    # bool is a subclass of int, so reject it explicitly before the int/float
    # check below would otherwise accept True/False as 1/0.
    if isinstance(raw_value, bool) or not isinstance(raw_value, int | float):
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_numeric"},
        )

    # JSON integers decode to arbitrary-precision Python ints, and float()
    # raises OverflowError for one beyond the float range. Such a value has no
    # finite float representation, so report it as non_finite instead of
    # letting a bare OverflowError escape the documented failure path.
    try:
        raw_score = float(raw_value)
    except OverflowError as err:
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_finite"},
        ) from err

    # json.loads accepts the NaN/Infinity/-Infinity literals and overflows an
    # out-of-range float literal to infinity; none of these is a usable score.
    if not math.isfinite(raw_score):
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_finite"},
        )

    rationale = str(parsed.get(RATIONALE_FIELD, ""))
    return raw_score, rationale
def clamp_score(value: float) -> float:
    """Pin a finite float to the closed interval ``[0.0, 1.0]``.

    Anything above ``1.0`` collapses to ``1.0`` and anything below ``0.0``
    collapses to ``0.0``. A value already inside the interval comes back
    exactly as given — no rounding, no rescaling. The caller is expected to
    pass a finite number (:func:`parse_score` rejects NaN and the infinities
    before a score reaches this point), so no parsing or validation happens
    here.
    """
    # The two out-of-range cases are mutually exclusive, so the order in
    # which they are tested does not matter.
    if value > _UPPER_BOUND:
        return _UPPER_BOUND
    return _LOWER_BOUND if value < _LOWER_BOUND else value