Coverage for mcp/mission_judge/prompt.py: 95%
39 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
1"""Assemble the deterministic prompt the judge sends to the model.
3The judge scores progress from a directive plus optional recent progress
4context, embedding the fixed rubric. The single non-deterministic step in
5the whole flow is the model's answer; everything that builds the request is
6pure and reproducible. This module owns that pure half:
8* :func:`truncate_context` — bounds arbitrarily large progress context to a
9 fixed character budget by keeping the most recent content and discarding
10 the oldest, so a long metric history can never produce an unbounded prompt.
11* :class:`JudgePrompt` — a frozen bundle of the directive, the (already
12 truncated) context, and the rubric, with an :meth:`~JudgePrompt.assemble`
13 method that renders them into a fixed-layout string.
14* :func:`build_prompt` — the entry point the tool wrapper calls: truncate the
15 context, bind the rubric, and return a ready-to-assemble prompt.
17The rendered prompt is a pure function of its inputs — no clock, no random
18value, no ambient state — so two identical inputs always produce a
19byte-identical string. The model is instructed to answer with a single JSON
20object carrying exactly one numeric ``score`` field and one ``rationale``
21field, which keeps the downstream parse step deterministic.
23The duck-typed :class:`JudgePrompt` exposes only ``assemble() -> str``, which
24is the entire surface a sampling backend touches, so it drives either backend
25without importing the sampling module.
26"""
28from __future__ import annotations
30from dataclasses import dataclass
32from .rubric import RUBRIC
# Upper bound, in characters, on the recent progress context embedded in the
# prompt. Anything beyond this budget is dropped oldest-first, so no metric
# history or observation set, however large, can grow the prompt unboundedly.
MAX_CONTEXT_CHARS: int = 8_000

# Upper bound, in characters, on the model rationale kept in provenance. It is
# defined next to the context budget so both size knobs sit in one place.
MAX_RATIONALE_CHARS: int = 2_000

# Prefix placed in front of the retained tail whenever context is clipped, so
# a reader can tell immediately that older content was discarded.
TRUNCATION_MARKER: str = "...[older context truncated]"
def truncate_context(context: str, limit: int = MAX_CONTEXT_CHARS) -> str:
    """Clip ``context`` to at most ``limit`` characters, favouring the newest text.

    A ``context`` that already fits within ``limit`` is handed back unchanged.
    An oversized one loses its head (the oldest characters) and keeps its tail
    (the most recent characters), with :data:`TRUNCATION_MARKER` placed in
    front so the result visibly reads as a clipped view.

    The marker counts against the budget: the kept tail is
    ``limit - len(TRUNCATION_MARKER)`` characters long, so marker plus tail is
    exactly ``limit`` characters. If ``limit`` cannot accommodate the marker
    and at least one character of context
    (``limit <= len(TRUNCATION_MARKER)``), the marker is left out and only the
    newest ``limit`` characters are returned. Consequently
    ``len(result) <= limit`` for every input and every non-negative ``limit``.

    The function depends only on its arguments, so identical oversized inputs
    always yield a byte-identical result.
    """
    # Number of characters by which ``context`` exceeds the budget.
    overflow = len(context) - limit
    if overflow <= 0:
        return context

    marker_size = len(TRUNCATION_MARKER)
    if limit <= marker_size:
        # No room for the marker plus any context: drop the marker and keep
        # just the newest ``limit`` characters so the length bound holds.
        return context[overflow:]

    # Skip the overflow plus the space the marker will occupy, then prepend
    # the marker; the combined length is exactly ``limit``.
    return TRUNCATION_MARKER + context[overflow + marker_size :]
@dataclass(frozen=True)
class JudgePrompt:
    """An immutable, deterministic prompt for the progress judge.

    Because the dataclass is ``frozen``, its fields cannot be reassigned
    between construction and :meth:`assemble`, so the rendered string is a
    stable function of the bound fields. ``context`` is expected to arrive
    already truncated, and to be an empty string when no context was supplied.
    """

    directive: str
    context: str
    rubric: str
    rubric_version: str

    def assemble(self) -> str:
        """Render the prompt as a fixed-layout, byte-deterministic string.

        The output is a newline-joined sequence of an introductory
        instruction followed by four sections, each introduced by a
        ``=== <name> ===`` header and separated by a blank line, always in
        this order: the scoring rubric (with its version in the header), the
        mission directive, the recent progress context, and the output-format
        instruction. Nothing in the body comes from a clock, a random source,
        or ambient state, so equal ``(directive, context, rubric,
        rubric_version)`` values always render to the same bytes.

        The output-format section tells the model to reply with a single JSON
        object holding exactly a numeric ``score`` (closed range ``0.0`` to
        ``1.0``) and a free-text ``rationale``, which keeps the downstream
        score-extraction step deterministic.
        """
        preamble = (
            "You are scoring how close a goal-directed automation run is to "
            "satisfying its stated objective. Read the scoring rubric, the "
            "objective, and the recent progress context, then return your "
            "judgement in the required output format."
        )
        output_format = (
            "Respond with a single JSON object and no prose outside it. The "
            "object must contain exactly two fields: a numeric `score` field "
            "holding your progress score as a number in the closed range 0.0 "
            "to 1.0, and a `rationale` field holding a brief free-text "
            "explanation of that score."
        )
        # The whole layout in one place: each entry is one line of the prompt,
        # and the empty strings are the blank separators between sections.
        lines = (
            preamble,
            "",
            f"=== Scoring rubric (version {self.rubric_version}) ===",
            self.rubric,
            "",
            "=== Mission directive ===",
            self.directive,
            "",
            "=== Recent progress context ===",
            self.context,
            "",
            "=== Output format ===",
            output_format,
        )
        return "\n".join(lines)
def build_prompt(directive: str, recent_context: str | None, rubric_version: str) -> JudgePrompt:
    """Create the :class:`JudgePrompt` for a directive and optional context.

    A non-empty ``recent_context`` is passed through :func:`truncate_context`
    with its default budget, which keeps the most recent tail. When
    ``recent_context`` is ``None`` or an empty string, the prompt's context is
    the empty string and the judgement rests on the directive alone. The
    module-level ``RUBRIC`` is bound unchanged and paired with
    ``rubric_version``. The result depends only on the three arguments.
    """
    if recent_context:
        bounded_context = truncate_context(recent_context)
    else:
        # ``None`` and "" both mean "no context supplied".
        bounded_context = ""

    return JudgePrompt(
        directive=directive,
        context=bounded_context,
        rubric=RUBRIC,
        rubric_version=rubric_version,
    )