Coverage for mcp/mission_judge/shape.py: 97%

1"""Building blocks shared by the semantic-progress judge tool.

The judge either produces one finite-float progress score wrapped in a
canonical result shape, or it fails and produces a structured error
envelope. This module owns the small, pure pieces both outcomes are built
from:

* :class:`ErrorCode` — the frozen set of stable, machine-readable failure
  codes the judge can surface.
* :class:`JudgeError` — the exception the judge raises internally; the tool
  wrapper translates it into an error envelope.
* :func:`validate_output_name` — guards the caller-supplied metric name so
  the resulting key is a single, well-formed path segment.
* :func:`is_finite_float` — the single source of truth for "is this a real,
  finite number that a threshold comparison can read?".
* :func:`metrics_result` — assembles the canonical
  ``{"metrics": {key: value}, ...}`` success shape with provenance placed
  beside ``metrics`` rather than inside it.
* :func:`error_envelope` — assembles the ``{"code", "details"}`` failure
  shape that never carries a top-level ``metrics`` key.

Everything here is pure: no I/O, no clocks, no environment lookups. That
keeps the pieces trivial to test in isolation and safe to call from both
async tool handlers and synchronous code.

25"""

27from __future__ import annotations

29import math

30from typing import Any

# Upper bound, in characters, on the length of a caller-supplied output name.
_MAX_OUTPUT_NAME_LEN = 128

class ErrorCode:
    """Namespace of the short failure-code strings the judge reports.

    Operators and automated callers branch on these values instead of
    parsing a human-readable message. Callers and tests may rely on the
    exact spellings, so the strings are treated as frozen.
    """

    # The requested output name was empty, over-long, or contained a
    # separator or whitespace character.
    INVALID_OUTPUT_NAME = "invalid_output_name"
    # No directive was given, or it was empty / whitespace-only.
    MISSING_DIRECTIVE = "missing_directive"
    # There was no sampling backend available to produce a score.
    NO_SAMPLING_BACKEND = "no_sampling_backend"
    # The backend's sample call failed with a transport, throttling,
    # credentials, or timeout error.
    SAMPLING_TRANSPORT_ERROR = "sampling_transport_error"
    # The model's output did not parse into a finite real numeric score.
    INVALID_MODEL_SCORE = "invalid_model_score"

class JudgeError(Exception):
    """Internal exception signalling that the judge could not produce a score.

    ``code`` is a stable short string (one of the :class:`ErrorCode`
    values) and ``details`` is an optional structured dict that the tool
    wrapper renders into an error envelope. A missing (or empty)
    ``details`` becomes a fresh empty dict. The exception message is the
    ``code`` itself, so ``str(err)`` is always meaningful in logs.
    """

    def __init__(self, code: str, details: dict[str, Any] | None = None) -> None:
        # The code doubles as the exception message.
        super().__init__(code)
        # Any falsy ``details`` (None or an empty dict) is swapped for a new
        # empty dict; a non-empty dict is stored as-is, not copied.
        payload: dict[str, Any] = details if details else {}
        self.code: str = code
        self.details: dict[str, Any] = payload

def validate_output_name(name: str) -> str:
    """Check that ``name`` is one well-formed path segment and return it as-is.

    Accepted names are non-empty strings no longer than
    ``_MAX_OUTPUT_NAME_LEN`` characters that contain no ``.`` separator and
    no whitespace character, so the metric path is exactly
    ``metrics.<name>``.

    Raises:
        JudgeError: with code :attr:`ErrorCode.INVALID_OUTPUT_NAME` when the
            name is rejected. ``details`` carries ``reason`` (one of
            ``not_a_string``, ``empty``, ``too_long``,
            ``contains_separator``, ``contains_whitespace``) and echoes the
            value under ``supplied``; the ``too_long`` case also reports
            ``max_length`` and ``actual_length``.
    """
    # Checks run in a fixed order and the first failing one picks the reason.
    extra: dict[str, Any] = {}
    if not isinstance(name, str):
        reason = "not_a_string"
    elif not name:
        reason = "empty"
    elif len(name) > _MAX_OUTPUT_NAME_LEN:
        reason = "too_long"
        extra = {
            "max_length": _MAX_OUTPUT_NAME_LEN,
            "actual_length": len(name),
        }
    elif "." in name:
        reason = "contains_separator"
    elif any(map(str.isspace, name)):
        reason = "contains_whitespace"
    else:
        # Nothing objected: hand the name back unchanged.
        return name

    raise JudgeError(
        ErrorCode.INVALID_OUTPUT_NAME,
        {"reason": reason, "supplied": name, **extra},
    )

116

117

def is_finite_float(x: object) -> bool:
    """Tell whether ``x`` is a real, finite number.

    Integers qualify, and floats qualify when they are finite. Booleans are
    refused even though ``bool`` subclasses ``int``; NaN and both infinities
    are refused; so is every other kind of object (strings, ``None``,
    containers, ...). The emitted progress score has to pass this gate
    before it can serve as a metric that a threshold comparison reads.
    """
    # Screen out bool first: True/False would otherwise pass the int test.
    if isinstance(x, bool):
        return False
    if isinstance(x, float):
        # math.isfinite rejects NaN as well as +/-inf.
        return math.isfinite(x)
    # Of what remains, only integers count as numbers here.
    return isinstance(x, int)

136

137

def metrics_result(
    output_name: str,
    score: float,
    *,
    rationale: str,
    source: str,
    backend_name: str,
    model_id: str,
    rubric_version: str,
    raw_score: float,
) -> dict[str, Any]:
    """Assemble the canonical success shape: ``{"metrics": {output_name: score}, ...}``.

    The single progress score lives under the top-level ``metrics`` object.
    Every provenance field (rationale, source identifier, resolved backend
    name and model id, rubric version, and the pre-clamp raw score) sits
    beside ``metrics`` at the top level, never inside it, so the merged view
    contains only the numeric value.

    Args:
        output_name: Key under which ``score`` is stored inside ``metrics``.
        score: The progress score; must already be a real, finite number
            (callers parse and clamp before reaching this builder).
        rationale: Stored under the top-level ``rationale`` key.
        source: Stored under the top-level ``source`` key.
        backend_name: Stored under the top-level ``backend_name`` key.
        model_id: Stored under the top-level ``model_id`` key.
        rubric_version: Stored under the top-level ``rubric_version`` key.
        raw_score: Pre-clamp score, stored under ``raw_score`` as given.

    Returns:
        A new dict holding the six provenance keys followed by ``metrics``.

    Raises:
        AssertionError: if ``score`` does not pass :func:`is_finite_float`.
    """
    # Raise explicitly rather than use an ``assert`` statement: asserts are
    # compiled away under ``python -O``, which would let a NaN, infinite, or
    # non-numeric score reach ``metrics`` unchecked. AssertionError and the
    # message are kept so existing callers and tests see the same failure.
    if not is_finite_float(score):
        raise AssertionError("metrics_result requires a finite numeric score")

    # Provenance first, then the metrics object, matching the established
    # key order of the result.
    return {
        "rationale": rationale,
        "source": source,
        "backend_name": backend_name,
        "model_id": model_id,
        "rubric_version": rubric_version,
        "raw_score": raw_score,
        "metrics": {output_name: score},
    }

171

172

def error_envelope(code: str, **details: object) -> dict[str, Any]:
    """Build the structured failure shape ``{"code": code, "details": {...}}``.

    The result has no top-level ``metrics`` key, so a consumer that merges
    only ``metrics``-shaped results skips it and leaves the corresponding
    check undecided instead of acting on bad data. All diagnostic context
    passed as keyword arguments is nested under ``details``.
    """
    envelope: dict[str, Any] = {"code": code}
    # Copy the keyword arguments into a fresh dict for the nested payload.
    envelope["details"] = {**details}
    return envelope