Coverage for mcp/mission_judge/shape.py: 97%

42 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1"""Building blocks shared by the semantic-progress judge tool. 

2 

3The judge either produces one finite-float progress score wrapped in a 

4canonical result shape, or it fails and produces a structured error 

5envelope. This module owns the small, pure pieces both outcomes are built 

6from: 

7 

8* :class:`ErrorCode` — the frozen set of stable, machine-readable failure 

9 codes the judge can surface. 

10* :class:`JudgeError` — the exception the judge raises internally; the tool 

11 wrapper translates it into an error envelope. 

12* :func:`validate_output_name` — guards the caller-supplied metric name so 

13 the resulting key is a single, well-formed path segment. 

14* :func:`is_finite_float` — the single source of truth for "is this a real, 

15 finite number that a threshold comparison can read?". 

16* :func:`metrics_result` — assembles the canonical 

17 ``{"metrics": {key: value}, ...}`` success shape with provenance placed 

18 beside ``metrics`` rather than inside it. 

19* :func:`error_envelope` — assembles the ``{"code", "details"}`` failure 

20 shape that never carries a top-level ``metrics`` key. 

21 

22Everything here is pure: no I/O, no clocks, no environment lookups. That 

23keeps the pieces trivial to test in isolation and safe to call from both 

24async tool handlers and synchronous code. 

25""" 

26 

27from __future__ import annotations 

28 

29import math 

30from typing import Any 

31 

# Upper bound, in characters, on the length of a caller-supplied output
# name; validate_output_name rejects anything longer.
_MAX_OUTPUT_NAME_LEN = 128

34 

35 

class ErrorCode:
    """Namespace of the short failure-code strings the judge reports.

    Operators and automated callers branch on these values instead of
    parsing a human-readable message, so the exact strings are part of the
    contract: callers and tests may depend on them and they must not change.
    """

    # Output name rejected: it was empty, exceeded the length limit, or
    # contained a separator or whitespace character.
    INVALID_OUTPUT_NAME = "invalid_output_name"

    # Directive input rejected: it was absent, empty, or whitespace-only.
    MISSING_DIRECTIVE = "missing_directive"

    # There was no sampling backend available to produce a score.
    NO_SAMPLING_BACKEND = "no_sampling_backend"

    # The backend's sample call raised a transport, throttling,
    # credentials, or timeout error.
    SAMPLING_TRANSPORT_ERROR = "sampling_transport_error"

    # The model output could not be parsed into a finite real numeric
    # score.
    INVALID_MODEL_SCORE = "invalid_model_score"

56 

57 

58class JudgeError(Exception): 

59 """Raised internally when the judge cannot produce a score. 

60 

61 Carries a stable short ``code`` (one of :class:`ErrorCode`) and an 

62 optional structured ``details`` dict the tool wrapper renders into an 

63 error envelope. When ``details`` is omitted it defaults to an empty 

64 dict, and the exception's string form falls back to ``code`` so logs 

65 always show something meaningful. 

66 """ 

67 

68 def __init__(self, code: str, details: dict[str, Any] | None = None) -> None: 

69 self.code: str = code 

70 self.details: dict[str, Any] = details or {} 

71 super().__init__(code) 

72 

73 

def validate_output_name(name: str) -> str:
    """Check that ``name`` is one well-formed path segment and return it.

    A name passes when it is a non-empty ``str`` of at most
    ``_MAX_OUTPUT_NAME_LEN`` (128) characters containing no ``.`` separator
    and no whitespace character, so the resulting metric path is exactly
    ``metrics.<name>``. A passing name is returned unchanged.

    Raises:
        JudgeError: with code :attr:`ErrorCode.INVALID_OUTPUT_NAME` when the
            name fails a check. ``details`` always holds ``"reason"`` and
            ``"supplied"`` (the value as given). The checks run in this
            order and the first failure wins: ``"not_a_string"``,
            ``"empty"``, ``"too_long"`` (which additionally reports
            ``"max_length"`` and ``"actual_length"``),
            ``"contains_separator"``, ``"contains_whitespace"``.
    """
    # Classify the first failing check; fall through to the return when
    # every check passes.
    extra: dict[str, Any] = {}
    if not isinstance(name, str):
        reason = "not_a_string"
    elif not name:
        reason = "empty"
    elif len(name) > _MAX_OUTPUT_NAME_LEN:
        reason = "too_long"
        extra = {
            "max_length": _MAX_OUTPUT_NAME_LEN,
            "actual_length": len(name),
        }
    elif "." in name:
        reason = "contains_separator"
    elif any(map(str.isspace, name)):
        reason = "contains_whitespace"
    else:
        return name

    # Single raise site: the reason and the echoed value come first,
    # followed by any reason-specific fields.
    raise JudgeError(
        ErrorCode.INVALID_OUTPUT_NAME,
        {"reason": reason, "supplied": name, **extra},
    )

116 

117 

def is_finite_float(x: object) -> bool:
    """Tell whether ``x`` is a real, finite number.

    Integers always qualify and floats qualify when they are finite. NaN,
    both infinities, booleans, and every non-numeric value (strings,
    ``None``, containers, ...) are rejected. This is the single gate the
    emitted progress score must pass before it can stand in for a metric a
    threshold comparison will read.
    """
    # ``bool`` subclasses ``int``, so it has to be excluded explicitly
    # before the numeric type check.
    if isinstance(x, bool) or not isinstance(x, (int, float)):
        return False
    # Only ints and floats remain: an int is accepted outright, a float
    # only when math.isfinite rules out NaN and +/-inf.
    return isinstance(x, int) or math.isfinite(x)

136 

137 

def metrics_result(
    output_name: str,
    score: float,
    *,
    rationale: str,
    source: str,
    backend_name: str,
    model_id: str,
    rubric_version: str,
    raw_score: float,
) -> dict[str, Any]:
    """Assemble the canonical success shape: ``{"metrics": {output_name: score}, ...}``.

    The single progress score lives under the top-level ``metrics`` object;
    every provenance field is placed beside ``metrics`` at the top level,
    never inside it, so the merged view contains only the numeric value.

    Args:
        output_name: Key under which the score is stored in ``metrics``.
        score: The progress score. It must already be a real, finite
            number — callers parse and clamp before reaching this builder.
        rationale: Provenance: the rationale accompanying the score.
        source: Provenance: the source identifier.
        backend_name: Provenance: the resolved backend name.
        model_id: Provenance: the resolved model id.
        rubric_version: Provenance: the rubric version.
        raw_score: Provenance: the pre-clamp raw score.

    Returns:
        A dict holding the six provenance keys followed by ``"metrics"``,
        whose value is ``{output_name: score}``.

    Raises:
        AssertionError: if ``score`` does not pass :func:`is_finite_float`.
    """
    # Raised explicitly instead of via an ``assert`` statement: asserts are
    # removed when Python runs with -O, which would let a NaN or infinite
    # score slip into ``metrics``. The exception type and message are the
    # ones the assert statement produced.
    if not is_finite_float(score):
        raise AssertionError("metrics_result requires a finite numeric score")
    # Lay down provenance first, then the metrics object, so the top-level
    # ``metrics`` key is always the canonical numeric map.
    result: dict[str, Any] = {
        "rationale": rationale,
        "source": source,
        "backend_name": backend_name,
        "model_id": model_id,
        "rubric_version": rubric_version,
        "raw_score": raw_score,
    }
    result["metrics"] = {output_name: score}
    return result

171 

172 

def error_envelope(code: str, **details: object) -> dict[str, Any]:
    """Build the structured failure shape: ``{"code": code, "details": {...}}``.

    Args:
        code: The failure code stored under ``"code"``.
        **details: Diagnostic context; every keyword argument is nested
            under ``"details"``.

    Returns:
        A dict with exactly the keys ``"code"`` and ``"details"``. It never
        carries a top-level ``metrics`` key, so a consumer that merges only
        ``metrics``-shaped results skips it and leaves the corresponding
        check undecided rather than acting on bad data.
    """
    envelope: dict[str, Any] = {"code": code}
    # Copy the keyword arguments into a new dict for the nested payload.
    envelope["details"] = {**details}
    return envelope