Coverage for mcp/tools/semantic_progress.py: 96%

43 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1"""Read-only LLM-as-judge tool that scores Mission progress. 

2 

3The single ``metrics_semantic_progress`` tool scores how close a Mission is 

4to satisfying its directive and returns that score in the canonical 

5``{"metrics": {"progress_score": <number>}}`` shape the Observe_Phase merges, 

6so a plain ``metric_threshold`` or ``metric_trend`` criterion can read it by 

7dot-path with no special handling. 

8 

9The whole tool registration is gated by ``GCO_ENABLE_SEMANTIC_PROGRESS`` so the 

10``@mcp.tool`` decorator only fires when the flag (or the umbrella 

11``GCO_ENABLE_ALL_TOOLS``) is enabled. With the flag unset this module imports 

12cleanly and FastMCP never sees the tool. Each invocation incurs one LLM call 

13via the existing sampling seam, which is why the tool is default-off. 

14 

15[gated by GCO_ENABLE_SEMANTIC_PROGRESS] 

16""" 

17 

18from __future__ import annotations 

19 

20import sys 

21from pathlib import Path 

22from typing import Any 

23 

24from audit import audit_logged 

25from feature_flags import is_enabled 

26from server import mcp 

27 

# The pure judge package and the sampling seam live under ``mcp/``; the
# path-injection pattern matches the rest of the MCP module surface so
# ``import mission_judge.*`` and ``import mission.*`` resolve without making
# the ``mcp`` directory a package.
#
# ``Path(__file__).resolve().parent.parent`` is the directory two levels up
# from this file (the parent of the directory containing this module). It is
# inserted at index 0 so it is searched before every other ``sys.path`` entry,
# and it must run before the ``mission`` / ``mission_judge`` imports below.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

33 

34# The sampling seam — reused, not reconstructed. 

35from mission.sampling import ( # noqa: E402 

36 SamplingTransportError, 

37 select_sampling_backend, 

38) 

39from mission_judge import prompt as judge_prompt # noqa: E402 

40from mission_judge import rubric as judge_rubric # noqa: E402 

41from mission_judge import score as judge_score # noqa: E402 

42from mission_judge.shape import ( # noqa: E402 

43 ErrorCode, 

44 JudgeError, 

45 error_envelope, 

46 metrics_result, 

47 validate_output_name, 

48) 

49 

50 

51def _try_get_context() -> Any | None: 

52 """Return the active FastMCP Context if inside a request, else ``None``. 

53 

54 Mirrors :func:`mcp.tools.mission._try_get_context`: wraps the optional 

55 ``fastmcp.server.dependencies.get_context`` import so the helper works on 

56 the CLI path and in unit tests that don't go through an MCP request — 

57 those raise ``RuntimeError`` from ``get_context()``, which we swallow so 

58 ``select_sampling_backend`` falls back to the Bedrock path. 

59 """ 

60 try: 

61 from fastmcp.server.dependencies import get_context 

62 

63 return get_context() 

64 except Exception: 

65 return None 

66 

67 

68# Registration is entirely gated by the feature flag. When the flag is unset, 

69# the decorator below never fires and FastMCP never sees the tool, so it does 

70# not appear in ``mcp.list_tools()``. The gate is evaluated only through 

71# ``feature_flags.is_enabled`` — never by reading ``os.environ`` here. 

if is_enabled("GCO_ENABLE_SEMANTIC_PROGRESS"):

    @mcp.tool(tags={"safe", "metrics"})
    @audit_logged
    async def metrics_semantic_progress(
        directive: str,
        recent_context: str | None = None,
        output_name: str | None = None,
        model_id: str | None = None,
    ) -> dict[str, Any]:
        """[gated by GCO_ENABLE_SEMANTIC_PROGRESS] [read-only] Score Mission progress.

        Scores how close a Mission is to satisfying ``directive`` against a
        fixed, versioned rubric via the existing sampling backend, and returns
        the canonical ``{"metrics": {"progress_score": <float 0.0-1.0>}}`` shape
        consumable by a ``metric_threshold`` (e.g. ``progress_score >= 0.8``) or
        ``metric_trend`` (e.g. ``progress_score`` increasing) criterion. Incurs
        one LLM call per invocation. Mutates nothing — it only reads its inputs
        and asks the model for a score. Provenance (rationale, source,
        backend_name, model_id, rubric_version, raw_score) is returned outside
        the ``metrics`` object.

        Args:
            directive: The natural-language objective the Mission is pursuing.
                Must be non-empty and not whitespace-only.
            recent_context: Optional recent progress context (recent
                observations and/or metric-history series the caller selects).
                Truncated keep-newest to a fixed character budget; omit it to
                score from the directive alone.
            output_name: Optional metric key under ``metrics`` (default
                ``"progress_score"``). Must be a single path segment of 1..128
                characters with no ``.`` separator and no whitespace.
            model_id: Optional concrete model identifier forwarded to the
                sampling seam; ``None`` uses the seam's resolved default.

        Returns the canonical metrics shape on success, or a structured
        ``{"code", "details"}`` error envelope (never carrying a top-level
        ``metrics`` key) on any failure, so the Mission loop keeps running.
        """
        try:
            # A falsy ``output_name`` (``None`` or ``""``) selects the default
            # key; only a non-empty name is passed through validation.
            if output_name:
                metric_key = validate_output_name(output_name)
            else:
                metric_key = "progress_score"

            # Reject a missing, empty, or whitespace-only directive.
            if not (directive and directive.strip()):
                raise JudgeError(ErrorCode.MISSING_DIRECTIVE)

            rubric_version = judge_rubric.RUBRIC_VERSION
            judge_input = judge_prompt.build_prompt(
                directive, recent_context, rubric_version
            )

            # The context is ``None`` outside an MCP request (CLI path).
            sampler = select_sampling_backend(_try_get_context(), model_id, None)
            if sampler is None:
                raise JudgeError(ErrorCode.NO_SAMPLING_BACKEND)

            try:
                # Single model call and the sole source of non-determinism;
                # deliberately not retried. The judge prompt is duck-typed:
                # the shipped backends only call ``prompt.assemble()``, the
                # same shim pattern as the sampling module's ``_PreRendered``.
                completion = await sampler.sample(judge_input)  # type: ignore[arg-type]
            except SamplingTransportError as transport_err:
                failure_details = {
                    "transport_code": transport_err.code,
                    "backend_name": sampler.backend_name,
                    "model_id": sampler.model_id,
                }
                raise JudgeError(
                    ErrorCode.SAMPLING_TRANSPORT_ERROR, failure_details
                ) from transport_err

            # ``parse_score`` raises INVALID_MODEL_SCORE on unusable output.
            raw_score, rationale = judge_score.parse_score(completion)
            clamped_score = judge_score.clamp_score(raw_score)

            # Provenance travels alongside, not inside, the metrics object.
            provenance = {
                "rationale": rationale[: judge_prompt.MAX_RATIONALE_CHARS],
                "source": f"{sampler.backend_name}:{sampler.model_id}",
                "backend_name": sampler.backend_name,
                "model_id": sampler.model_id,
                "rubric_version": rubric_version,
                "raw_score": raw_score,
            }
            return metrics_result(metric_key, clamped_score, **provenance)
        except JudgeError as judge_err:
            # Expected failures become a structured envelope, never a raise.
            return error_envelope(judge_err.code, **judge_err.details)
        except Exception as unexpected:  # noqa: BLE001 - defensive: nothing escapes the tool
            return error_envelope(
                ErrorCode.SAMPLING_TRANSPORT_ERROR,
                reason="unexpected",
                detail=str(unexpected),
            )

159 )