Coverage for mcp/mission_judge/score.py: 100%

1"""Parse and bound the model's raw response into a usable progress score.

3The judge asks a model for a single progress score and gets back a string.

4Turning that string into a number a threshold comparison can read is split

5into two deliberately separate steps:

7* :func:`parse_score` owns the only failure path. It decodes the raw text,

8 validates that it carries a real, finite numeric ``score`` field, and

9 returns that score together with the model's rationale. Anything it cannot

10 trust — non-JSON text, a missing or non-numeric field, a boolean, or a

11 NaN/infinite value — becomes a :class:`JudgeError`.

12* :func:`clamp_score` owns no failure path. It is a total function on finite

13 floats that folds an out-of-range value onto the nearest bound of the

14 closed interval ``[0.0, 1.0]`` and returns an in-range value untouched.

16Keeping the two apart means the caller can record the unmodified parsed value

17as provenance and emit the clamped value as the metric, without either step

18second-guessing the other.

19"""

21from __future__ import annotations

23import json

24import math

25from typing import Any

27from .shape import ErrorCode, JudgeError

# JSON key under which the model is instructed to report its numeric score.
# parse_score() requires this key to be present in the decoded object.
SCORE_FIELD: str = "score"
# JSON key under which the model is instructed to report its free-text
# rationale. parse_score() treats this key as optional.
RATIONALE_FIELD: str = "rationale"

# Inclusive lower and upper bounds of the progress-score interval; these are
# the values clamp_score() folds out-of-range scores onto.
_LOWER_BOUND: float = 0.0
_UPPER_BOUND: float = 1.0

38# The Markdown code-fence delimiter chat models commonly wrap a JSON answer in.

39_CODE_FENCE = "```"

42def _strip_code_fence(text: str) -> str:

43 """Return the body of a whole-response Markdown code fence, else the text.

45 A fenced answer opens with ``` (optionally tagged, e.g. ```json) on its

46 own line and closes with ``` on a later line. When ``text`` both opens and

47 closes with a fence, the opening line — language tag and all — and the

48 trailing fence are dropped and the body between them is returned; anything

49 not fully fenced is returned unchanged. Pure and deterministic.

50 """

51 if not (text.startswith(_CODE_FENCE) and text.endswith(_CODE_FENCE)):

52 return text

53 # Drop the opening fence line (``` plus any language tag), then everything

54 # from the final fence onward, leaving just the fenced body.

55 _, _, after_open = text.partition("\n")

56 body, _, _ = after_open.rpartition(_CODE_FENCE)

57 return body.strip()

def _extract_json_payload(raw_text: str) -> str:
    """Return the most likely JSON substring of a model response.

    Models often wrap the requested JSON object in a Markdown code fence or
    surround it with a sentence of prose. This strips surrounding whitespace,
    peels a whole-response code fence when present, and then:

    1. returns the remaining text as-is if it already decodes as JSON;
    2. otherwise returns the substring spanning the first ``{`` and the last
       ``}``, when both exist in that order;
    3. otherwise returns the remaining text unchanged.

    The result is only a *candidate* — the caller still decodes it and
    validates the shape, so a candidate that is not real JSON is rejected
    downstream like any other untrustworthy output.

    Args:
        raw_text: The raw model response.

    Returns:
        The candidate JSON text. Pure and deterministic; decoding failures
        are absorbed here and never propagate to the caller.
    """
    text = _strip_code_fence(raw_text.strip())

    # Probe-decode only; the caller performs the authoritative decode. The
    # exception tuple is parenthesized so the module parses on interpreters
    # older than Python 3.14 as well.
    try:
        json.loads(text)
    except (ValueError, TypeError):
        pass
    else:
        return text

    # Last resort: carve out the span from the first "{" to the last "}". This
    # rescues a JSON object embedded in leading/trailing prose without trying
    # to be a full parser — the carved span is still decoded by the caller.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start : end + 1]

    return text

def parse_score(raw_text: str) -> tuple[float, str]:
    """Decode the model's raw response into ``(raw_score, rationale)``.

    The raw text must carry a JSON object with a real, finite numeric
    ``score`` field. A whole-response Markdown code fence (```` ```json ... ``` ````)
    or a sentence of surrounding prose is tolerated — :func:`_extract_json_payload`
    peels it before decoding — because chat models routinely wrap a JSON
    answer that way despite being asked for raw JSON.

    Args:
        raw_text: The raw model response.

    Returns:
        A ``(raw_score, rationale)`` pair. ``raw_score`` is the ``score``
        value coerced to a float and is **not** yet clamped, so the caller
        can record it verbatim as provenance. ``rationale`` is the model's
        ``rationale`` field coerced with ``str()``, defaulting to an empty
        string when the field is absent.

    Raises:
        JudgeError: With code :attr:`ErrorCode.INVALID_MODEL_SCORE` and a
            ``reason`` in ``details`` identifying the specific failure:

            * ``non_json`` — no JSON could be decoded from the text.
            * ``missing_score_field`` — the JSON is not an object, or has no
              ``score`` field.
            * ``non_numeric`` — the ``score`` is a boolean, string, null, or
              other non-numeric value.
            * ``non_finite`` — the ``score`` is NaN, positive/negative
              infinity, or an integer too large to be represented as a
              float.
    """
    payload = _extract_json_payload(raw_text)
    try:
        parsed = json.loads(payload)
    except (ValueError, TypeError) as err:
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_json"},
        ) from err

    if not isinstance(parsed, dict) or SCORE_FIELD not in parsed:
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "missing_score_field"},
        )

    raw_value: Any = parsed[SCORE_FIELD]
    # bool is a subclass of int, so reject it explicitly before the int/float
    # check below would otherwise accept True/False as 1/0.
    if isinstance(raw_value, bool) or not isinstance(raw_value, int | float):
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_numeric"},
        )

    # JSON integers decode to arbitrary-precision ints; one beyond the float
    # range makes float() raise OverflowError. Report it through the same
    # JudgeError channel as an infinite value instead of letting it escape.
    try:
        raw_score = float(raw_value)
    except OverflowError as err:
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_finite"},
        ) from err

    # json.loads accepts the NaN/Infinity literals, so they must be screened
    # out here.
    if not math.isfinite(raw_score):
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_finite"},
        )

    rationale = str(parsed.get(RATIONALE_FIELD, ""))
    return raw_score, rationale

148

149

def clamp_score(value: float) -> float:
    """Pin ``value`` to the inclusive progress-score range ``[0.0, 1.0]``.

    Anything under the lower bound comes back as the lower bound, anything
    over the upper bound comes back as the upper bound, and a value already
    inside the range is handed back as-is — no rounding, no rescaling.

    The caller is expected to pass a finite number (:func:`parse_score`
    filters out NaN and the infinities beforehand); nothing is parsed or
    validated here.
    """
    # max() lifts values below the floor, min() caps values above the ceiling.
    # Both builtins return their first argument on ties, so an in-range value
    # (including one exactly on a bound) is returned untouched.
    return min(max(value, _LOWER_BOUND), _UPPER_BOUND)

163 return value