Coverage for mcp/tools/semantic

1"""Read-only LLM-as-judge tool that scores Mission progress.

The single ``metrics_semantic_progress`` tool scores how close a Mission is
to satisfying its directive and returns that score in the canonical
``{"metrics": {"progress_score": <number>}}`` shape the Observe_Phase merges,
so a plain ``metric_threshold`` or ``metric_trend`` criterion can read it by
dot-path with no special handling.

The whole tool registration is gated by ``GCO_ENABLE_SEMANTIC_PROGRESS`` so the
``@mcp.tool`` decorator only fires when the flag (or the umbrella
``GCO_ENABLE_ALL_TOOLS``) is enabled. With the flag unset this module imports
cleanly and FastMCP never sees the tool. Each invocation incurs one LLM call
via the existing sampling seam, which is why the tool is default-off.

[gated by GCO_ENABLE_SEMANTIC_PROGRESS]

16"""

18from __future__ import annotations

20import sys

21from pathlib import Path

22from typing import Any

24from audit import audit_logged

25from feature_flags import is_enabled

26from server import mcp

# The pure judge package and the sampling seam live under ``mcp/``; the
# path-injection pattern matches the rest of the MCP module surface so
# ``import mission_judge.*`` and ``import mission.*`` resolve without making
# the ``mcp`` directory a package.

32sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

# The sampling seam — reused, not reconstructed.

35from mission.sampling import ( # noqa: E402

36 SamplingTransportError,

37 select_sampling_backend,

38)

39from mission_judge import prompt as judge_prompt # noqa: E402

40from mission_judge import rubric as judge_rubric # noqa: E402

41from mission_judge import score as judge_score # noqa: E402

42from mission_judge.shape import ( # noqa: E402

43 ErrorCode,

44 JudgeError,

45 error_envelope,

46 metrics_result,

47 validate_output_name,

48)

51def _try_get_context() -> Any | None:

52 """Return the active FastMCP Context if inside a request, else ``None``.

54 Mirrors :func:`mcp.tools.mission._try_get_context`: wraps the optional

55 ``fastmcp.server.dependencies.get_context`` import so the helper works on

56 the CLI path and in unit tests that don't go through an MCP request —

57 those raise ``RuntimeError`` from ``get_context()``, which we swallow so

58 ``select_sampling_backend`` falls back to the Bedrock path.

59 """

60 try:

61 from fastmcp.server.dependencies import get_context

63 return get_context()

64 except Exception:

65 return None

# Feature-flag gate: the tool below is defined — and therefore registered with
# FastMCP — only while the flag is on. With the flag off the decorator never
# runs, so the tool is absent from ``mcp.list_tools()``. The flag is consulted
# exclusively through ``feature_flags.is_enabled``; this module never reads
# ``os.environ`` itself.
if is_enabled("GCO_ENABLE_SEMANTIC_PROGRESS"):

    @mcp.tool(tags={"safe", "metrics"})
    @audit_logged
    async def metrics_semantic_progress(
        directive: str,
        recent_context: str | None = None,
        output_name: str | None = None,
        model_id: str | None = None,
    ) -> dict[str, Any]:
        """[gated by GCO_ENABLE_SEMANTIC_PROGRESS] [read-only] Score Mission progress.

        Scores how close a Mission is to satisfying ``directive`` against a
        fixed, versioned rubric via the existing sampling backend, and returns
        the canonical ``{"metrics": {"progress_score": <float 0.0-1.0>}}`` shape
        consumable by a ``metric_threshold`` (e.g. ``progress_score >= 0.8``) or
        ``metric_trend`` (e.g. ``progress_score`` increasing) criterion. Incurs
        one LLM call per invocation. Mutates nothing — it only reads its inputs
        and asks the model for a score. Provenance (rationale, source,
        backend_name, model_id, rubric_version, raw_score) is returned outside
        the ``metrics`` object.

        Args:
            directive: The natural-language objective the Mission is pursuing.
                Must be non-empty and not whitespace-only.
            recent_context: Optional recent progress context (recent
                observations and/or metric-history series the caller selects).
                Truncated keep-newest to a fixed character budget; omit it to
                score from the directive alone.
            output_name: Optional metric key under ``metrics`` (default
                ``"progress_score"``). Must be a single path segment of 1..128
                characters with no ``.`` separator and no whitespace.
            model_id: Optional concrete model identifier forwarded to the
                sampling seam; ``None`` uses the seam's resolved default.

        Returns the canonical metrics shape on success, or a structured
        ``{"code", "details"}`` error envelope (never carrying a top-level
        ``metrics`` key) on any failure, so the Mission loop keeps running.
        """
        try:
            # The metric key is settled before anything else: a falsy
            # ``output_name`` (``None`` or ``""``) keeps the default, anything
            # else must pass the validator.
            metric_key = "progress_score"
            if output_name:
                metric_key = validate_output_name(output_name)

            # Empty and whitespace-only directives are rejected alike.
            if not (directive and directive.strip()):
                raise JudgeError(ErrorCode.MISSING_DIRECTIVE)

            rubric_version = judge_rubric.RUBRIC_VERSION
            judge_request = judge_prompt.build_prompt(
                directive, recent_context, rubric_version
            )

            # ``_try_get_context`` yields the active FastMCP Context, or
            # ``None`` on the CLI path.
            sampler = select_sampling_backend(_try_get_context(), model_id, None)
            if sampler is None:
                raise JudgeError(ErrorCode.NO_SAMPLING_BACKEND)

            try:
                # Single model call, deliberately not retried — this is the
                # only non-deterministic step. Both shipped backends call only
                # ``prompt.assemble()``, so the duck-typed JudgePrompt works
                # with either (same shim idea as the sampling module's own
                # ``_PreRendered`` look-alike).
                completion = await sampler.sample(judge_request)  # type: ignore[arg-type]
            except SamplingTransportError as transport_err:
                failure_details = {
                    "transport_code": transport_err.code,
                    "backend_name": sampler.backend_name,
                    "model_id": sampler.model_id,
                }
                raise JudgeError(
                    ErrorCode.SAMPLING_TRANSPORT_ERROR, failure_details
                ) from transport_err

            # ``parse_score`` raises INVALID_MODEL_SCORE on unusable output.
            raw_score, rationale = judge_score.parse_score(completion)

            return metrics_result(
                metric_key,
                judge_score.clamp_score(raw_score),
                rationale=rationale[: judge_prompt.MAX_RATIONALE_CHARS],
                source=f"{sampler.backend_name}:{sampler.model_id}",
                backend_name=sampler.backend_name,
                model_id=sampler.model_id,
                rubric_version=rubric_version,
                raw_score=raw_score,
            )
        except JudgeError as judge_err:
            # Expected failures already carry their own code and details.
            return error_envelope(judge_err.code, **judge_err.details)
        except Exception as unexpected:  # noqa: BLE001 - defensive: nothing escapes the tool
            return error_envelope(
                ErrorCode.SAMPLING_TRANSPORT_ERROR,
                reason="unexpected",
                detail=str(unexpected),
            )

Coverage for mcp/tools/semantic_progress.py: 96%

43 statements