Coverage for mcp/mission_judge/score.py: 100%

50 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1"""Parse and bound the model's raw response into a usable progress score. 

2 

3The judge asks a model for a single progress score and gets back a string. 

4Turning that string into a number a threshold comparison can read is split 

5into two deliberately separate steps: 

6 

7* :func:`parse_score` owns the only failure path. It decodes the raw text, 

8 validates that it carries a real, finite numeric ``score`` field, and 

9 returns that score together with the model's rationale. Anything it cannot 

10 trust — non-JSON text, a missing or non-numeric field, a boolean, or a 

11 NaN/infinite value — becomes a :class:`JudgeError`. 

12* :func:`clamp_score` owns no failure path. It is a total function on finite 

13 floats that folds an out-of-range value onto the nearest bound of the 

14 closed interval ``[0.0, 1.0]`` and returns an in-range value untouched. 

15 

16Keeping the two apart means the caller can record the unmodified parsed value 

17as provenance and emit the clamped value as the metric, without either step 

18second-guessing the other. 

19""" 

20 

21from __future__ import annotations 

22 

23import json 

24import math 

25from typing import Any 

26 

27from .shape import ErrorCode, JudgeError 

28 

29# The field the model is instructed to populate with its numeric score. 

30SCORE_FIELD = "score" 

31# The field the model is instructed to populate with its free-text rationale. 

32RATIONALE_FIELD = "rationale" 

33 

34# The inclusive bounds of the progress-score interval. 

35_LOWER_BOUND = 0.0 

36_UPPER_BOUND = 1.0 

37 

38# The Markdown code-fence delimiter chat models commonly wrap a JSON answer in. 

39_CODE_FENCE = "```" 

40 

41 

def _strip_code_fence(text: str) -> str:
    """Return the body of a whole-response Markdown code fence, else the text.

    A fenced answer opens with ``` (optionally tagged, e.g. ```json) on its
    own line and closes with ``` on a later line. When ``text`` has that
    shape, the opening line — language tag and all — and the trailing fence
    are dropped and the body between them is returned with its surrounding
    whitespace removed.

    Anything else is returned unchanged. That includes text that starts and
    ends with a fence but has no line break after the opening fence (a
    one-line ```` ```{"score": 1}``` ````, or a bare ```` ``` ````): there
    is no separate opening line to drop, so the text is handed back intact
    rather than being reduced to an empty string. Pure and deterministic.

    Args:
        text: The response text. It is not trimmed here before the fence
            check, so leading or trailing whitespace defeats the match.

    Returns:
        The stripped fenced body, or ``text`` itself when it is not a
        multi-line, whole-response fence.
    """
    if not (text.startswith(_CODE_FENCE) and text.endswith(_CODE_FENCE)):
        return text
    # Split off the opening fence line (``` plus any language tag). Without a
    # newline there is no opening line at all, so there is nothing to peel.
    _, newline, after_open = text.partition("\n")
    if not newline:
        return text
    # Drop everything from the final fence onward, leaving the fenced body.
    body, _, _ = after_open.rpartition(_CODE_FENCE)
    return body.strip()

58 

59 

def _extract_json_payload(raw_text: str) -> str:
    """Return the most likely JSON substring of a model response.

    Models often wrap the requested JSON object in a Markdown code fence or
    surround it with a sentence of prose. This peels a whole-response code
    fence when present, then, failing a direct decode, falls back to the
    substring spanning the first ``{`` and the last ``}``. The result is only
    a *candidate* — the caller still decodes it and validates the shape, so a
    candidate that is not real JSON is rejected downstream like any other
    untrustworthy output. The transformation is pure and deterministic.

    Args:
        raw_text: The model's response exactly as received.

    Returns:
        The stripped (and unfenced) text when it already decodes as JSON;
        otherwise the ``{`` … ``}`` span when one exists; otherwise the
        stripped (and unfenced) text as-is.
    """
    text = _strip_code_fence(raw_text.strip())

    # If what remains already decodes, use it as-is. The exception tuple is
    # parenthesized so the module also parses on interpreters that predate
    # the bare ``except A, B:`` form, matching the handler in parse_score.
    try:
        json.loads(text)
    except (ValueError, TypeError):
        pass
    else:
        return text

    # Last resort: carve out the span from the first "{" to the last "}".
    # This rescues a JSON object embedded in leading/trailing prose without
    # trying to be a full parser — the carved span is still decoded by the
    # caller. ``end > start`` with a found ``start`` already implies that a
    # closing brace was found.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start : end + 1]

    return text

89 

90 

def parse_score(raw_text: str) -> tuple[float, str]:
    """Decode the model's raw response into ``(raw_score, rationale)``.

    The raw text must carry a JSON object with a real, finite numeric
    ``score`` field. A whole-response Markdown code fence (```` ```json ... ``` ````)
    or a sentence of surrounding prose is tolerated — :func:`_extract_json_payload`
    peels it before decoding — because chat models routinely wrap a JSON
    answer that way despite being asked for raw JSON. The returned
    ``raw_score`` is that value coerced to a float and is **not** yet clamped,
    so the caller can record it verbatim as provenance. The returned
    ``rationale`` is the model's ``rationale`` field coerced to a string,
    defaulting to an empty string when absent.

    Every way the response can fail to yield a trustworthy number raises
    :class:`JudgeError` with code :attr:`ErrorCode.INVALID_MODEL_SCORE` and a
    ``reason`` in ``details`` identifying the specific failure:

    * ``non_json`` — no JSON object could be decoded from the text.
    * ``missing_score_field`` — the JSON is not an object, or has no
      ``score`` field.
    * ``non_numeric`` — the ``score`` is a boolean, string, null, or other
      non-numeric value.
    * ``non_finite`` — the ``score`` is NaN or positive/negative infinity,
      or an integer too large to be represented as a float.

    Args:
        raw_text: The model's response exactly as received.

    Returns:
        A ``(raw_score, rationale)`` pair: the unclamped finite score and
        the rationale text.

    Raises:
        JudgeError: For any of the failure reasons listed above.
    """
    payload = _extract_json_payload(raw_text)
    try:
        parsed = json.loads(payload)
    except (ValueError, TypeError) as err:
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_json"},
        ) from err

    if not isinstance(parsed, dict) or SCORE_FIELD not in parsed:
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "missing_score_field"},
        )

    raw_value: Any = parsed[SCORE_FIELD]
    # bool is a subclass of int, so reject it explicitly before the int/float
    # check below would otherwise accept True/False as 1/0.
    if isinstance(raw_value, bool) or not isinstance(raw_value, int | float):
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_numeric"},
        )

    # JSON integers decode to arbitrary-precision ints, and float() raises
    # OverflowError for one beyond the double range. Such a score has no
    # finite float value, so report it through the same failure path instead
    # of letting a non-JudgeError escape.
    try:
        raw_score = float(raw_value)
    except OverflowError as err:
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_finite"},
        ) from err
    if not math.isfinite(raw_score):
        raise JudgeError(
            ErrorCode.INVALID_MODEL_SCORE,
            {"reason": "non_finite"},
        )

    rationale = str(parsed.get(RATIONALE_FIELD, ""))
    return raw_score, rationale

148 

149 

def clamp_score(value: float) -> float:
    """Pin ``value`` to the closed progress interval ``[0.0, 1.0]``.

    Anything above the upper bound collapses to ``1.0`` and anything below
    the lower bound collapses to ``0.0``. A value already inside the interval
    comes back exactly as given — no rounding, no rescaling. The caller is
    expected to pass a finite float (:func:`parse_score` rejects NaN and the
    infinities beforehand), so nothing is parsed here and nothing is raised.
    """
    # The two out-of-range conditions are mutually exclusive, so the order in
    # which they are tested does not affect the result.
    if value > _UPPER_BOUND:
        return _UPPER_BOUND
    return _LOWER_BOUND if value < _LOWER_BOUND else value