Coverage for mcp/tools/semantic_progress.py: 96%
43 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
1"""Read-only LLM-as-judge tool that scores Mission progress.
3The single ``metrics_semantic_progress`` tool scores how close a Mission is
4to satisfying its directive and returns that score in the canonical
5``{"metrics": {"progress_score": <number>}}`` shape the Observe_Phase merges,
6so a plain ``metric_threshold`` or ``metric_trend`` criterion can read it by
7dot-path with no special handling.
9The whole tool registration is gated by ``GCO_ENABLE_SEMANTIC_PROGRESS`` so the
10``@mcp.tool`` decorator only fires when the flag (or the umbrella
11``GCO_ENABLE_ALL_TOOLS``) is enabled. With the flag unset this module imports
12cleanly and FastMCP never sees the tool. Each invocation incurs one LLM call
13via the existing sampling seam, which is why the tool is default-off.
15[gated by GCO_ENABLE_SEMANTIC_PROGRESS]
16"""
18from __future__ import annotations
20import sys
21from pathlib import Path
22from typing import Any
24from audit import audit_logged
25from feature_flags import is_enabled
26from server import mcp
28# The pure judge package and the sampling seam live under ``mcp/``; the
29# path-injection pattern matches the rest of the MCP module surface so
30# ``import mission_judge.*`` and ``import mission.*`` resolve without making
31# the ``mcp`` directory a package.
32sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
34# The sampling seam — reused, not reconstructed.
35from mission.sampling import ( # noqa: E402
36 SamplingTransportError,
37 select_sampling_backend,
38)
39from mission_judge import prompt as judge_prompt # noqa: E402
40from mission_judge import rubric as judge_rubric # noqa: E402
41from mission_judge import score as judge_score # noqa: E402
42from mission_judge.shape import ( # noqa: E402
43 ErrorCode,
44 JudgeError,
45 error_envelope,
46 metrics_result,
47 validate_output_name,
48)
51def _try_get_context() -> Any | None:
52 """Return the active FastMCP Context if inside a request, else ``None``.
54 Mirrors :func:`mcp.tools.mission._try_get_context`: wraps the optional
55 ``fastmcp.server.dependencies.get_context`` import so the helper works on
56 the CLI path and in unit tests that don't go through an MCP request —
57 those raise ``RuntimeError`` from ``get_context()``, which we swallow so
58 ``select_sampling_backend`` falls back to the Bedrock path.
59 """
60 try:
61 from fastmcp.server.dependencies import get_context
63 return get_context()
64 except Exception:
65 return None
68# Registration is entirely gated by the feature flag. When the flag is unset,
69# the decorator below never fires and FastMCP never sees the tool, so it does
70# not appear in ``mcp.list_tools()``. The gate is evaluated only through
71# ``feature_flags.is_enabled`` — never by reading ``os.environ`` here.
if is_enabled("GCO_ENABLE_SEMANTIC_PROGRESS"):

    @mcp.tool(tags={"safe", "metrics"})
    @audit_logged
    async def metrics_semantic_progress(
        directive: str,
        recent_context: str | None = None,
        output_name: str | None = None,
        model_id: str | None = None,
    ) -> dict[str, Any]:
        """[gated by GCO_ENABLE_SEMANTIC_PROGRESS] [read-only] Score Mission progress.

        Scores how close a Mission is to satisfying ``directive`` against a
        fixed, versioned rubric via the existing sampling backend, and returns
        the canonical ``{"metrics": {"progress_score": <float 0.0-1.0>}}`` shape
        consumable by a ``metric_threshold`` (e.g. ``progress_score >= 0.8``) or
        ``metric_trend`` (e.g. ``progress_score`` increasing) criterion. Incurs
        one LLM call per invocation. Mutates nothing — it only reads its inputs
        and asks the model for a score. Provenance (rationale, source,
        backend_name, model_id, rubric_version, raw_score) is returned outside
        the ``metrics`` object.

        Args:
            directive: The natural-language objective the Mission is pursuing.
                Must be non-empty and not whitespace-only.
            recent_context: Optional recent progress context (recent
                observations and/or metric-history series the caller selects).
                Truncated keep-newest to a fixed character budget; omit it to
                score from the directive alone.
            output_name: Optional metric key under ``metrics`` (default
                ``"progress_score"``). Must be a single path segment of 1..128
                characters with no ``.`` separator and no whitespace.
            model_id: Optional concrete model identifier forwarded to the
                sampling seam; ``None`` uses the seam's resolved default.

        Returns the canonical metrics shape on success, or a structured
        ``{"code", "details"}`` error envelope (never carrying a top-level
        ``metrics`` key) on any failure, so the Mission loop keeps running.
        """
        try:
            # Step 1 - inputs. A falsy ``output_name`` (``None`` or ``""``)
            # selects the default key; any other value goes through the
            # validator, which is checked before the directive itself.
            if output_name:
                metric_key = validate_output_name(output_name)
            else:
                metric_key = "progress_score"

            if not (directive and directive.strip()):
                raise JudgeError(ErrorCode.MISSING_DIRECTIVE)

            # Step 2 - build the judge prompt against the current rubric.
            judge_request = judge_prompt.build_prompt(
                directive, recent_context, judge_rubric.RUBRIC_VERSION
            )

            # Step 3 - pick a sampling backend. The context is the active
            # FastMCP Context, or ``None`` when there is no request (CLI path).
            sampler = select_sampling_backend(_try_get_context(), model_id, None)
            if sampler is None:
                raise JudgeError(ErrorCode.NO_SAMPLING_BACKEND)

            # Step 4 - the single model call; it is not retried. The prompt
            # object is duck-typed for the backend (hence the type-ignore),
            # following the sampling module's own pre-rendered shim pattern.
            try:
                completion = await sampler.sample(judge_request)  # type: ignore[arg-type]
            except SamplingTransportError as transport_err:
                failure_details = {
                    "transport_code": transport_err.code,
                    "backend_name": sampler.backend_name,
                    "model_id": sampler.model_id,
                }
                raise JudgeError(
                    ErrorCode.SAMPLING_TRANSPORT_ERROR, failure_details
                ) from transport_err

            # Step 5 - parse, then clamp. ``parse_score`` is what rejects a
            # malformed model reply (INVALID_MODEL_SCORE per the original note).
            parsed_score, rationale = judge_score.parse_score(completion)
            bounded_score = judge_score.clamp_score(parsed_score)

            # Step 6 - provenance travels alongside, not inside, ``metrics``.
            provenance: dict[str, Any] = {
                "rationale": rationale[: judge_prompt.MAX_RATIONALE_CHARS],
                "source": f"{sampler.backend_name}:{sampler.model_id}",
                "backend_name": sampler.backend_name,
                "model_id": sampler.model_id,
                "rubric_version": judge_rubric.RUBRIC_VERSION,
                "raw_score": parsed_score,
            }
            return metrics_result(metric_key, bounded_score, **provenance)
        except JudgeError as judge_err:
            # Expected failures carry their own code and details.
            return error_envelope(judge_err.code, **judge_err.details)
        except Exception as unexpected:  # noqa: BLE001 - defensive: nothing escapes the tool
            return error_envelope(
                ErrorCode.SAMPLING_TRANSPORT_ERROR,
                reason="unexpected",
                detail=str(unexpected),
            )