Coverage for mcp/mission_judge/prompt.py: 95%

1"""Assemble the deterministic prompt the judge sends to the model.

3The judge scores progress from a directive plus optional recent progress

4context, embedding the fixed rubric. The single non-deterministic step in

5the whole flow is the model's answer; everything that builds the request is

6pure and reproducible. This module owns that pure half:

8* :func:`truncate_context` — bounds arbitrarily large progress context to a

9 fixed character budget by keeping the most recent content and discarding

10 the oldest, so a long metric history can never produce an unbounded prompt.

11* :class:`JudgePrompt` — a frozen bundle of the directive, the (already

12 truncated) context, and the rubric, with an :meth:`~JudgePrompt.assemble`

13 method that renders them into a fixed-layout string.

14* :func:`build_prompt` — the entry point the tool wrapper calls: truncate the

15 context, bind the rubric, and return a ready-to-assemble prompt.

17The rendered prompt is a pure function of its inputs — no clock, no random

18value, no ambient state — so two identical inputs always produce a

19byte-identical string. The model is instructed to answer with a single JSON

20object carrying exactly one numeric ``score`` field and one ``rationale``

21field, which keeps the downstream parse step deterministic.

23The duck-typed :class:`JudgePrompt` exposes only ``assemble() -> str``, which

24is the entire surface a sampling backend touches, so it drives either backend

25without importing the sampling module.

26"""

28from __future__ import annotations

30from dataclasses import dataclass

32from .rubric import RUBRIC

# Upper bound, in characters, on the recent progress context embedded in the
# prompt. Context longer than this is clipped oldest-first, so no metric
# history or observation set, however large, can make the prompt grow without
# limit.
MAX_CONTEXT_CHARS: int = 8_000

# Upper bound, in characters, on the model rationale retained in provenance.
# Defined alongside the context budget so both size limits are found in one
# place.
MAX_RATIONALE_CHARS: int = 2_000

# Placed in front of the retained tail whenever context is clipped, making it
# obvious to a reader that older content was dropped.
TRUNCATION_MARKER: str = "...[older context truncated]"

def truncate_context(context: str, limit: int = MAX_CONTEXT_CHARS) -> str:
    """Clip ``context`` to at most ``limit`` characters, favouring the newest text.

    Input that already fits (``len(context) <= limit``) comes back untouched.
    Oversized input loses its head (the oldest characters); the tail (the most
    recent characters) survives and is prefixed with
    :data:`TRUNCATION_MARKER` so the result is recognisably a clipped view.

    The marker counts against the budget: the surviving tail is
    ``limit - len(TRUNCATION_MARKER)`` characters long, so marker plus tail is
    exactly ``limit`` characters. If ``limit`` cannot accommodate the marker
    and at least one character of tail
    (``limit <= len(TRUNCATION_MARKER)``), the marker is left out and the last
    ``limit`` characters are returned instead. Either way
    ``len(result) <= limit`` for any input and any non-negative ``limit``.

    No clock, randomness, or ambient state is consulted, so equal inputs give
    byte-identical results.
    """
    total = len(context)
    if total <= limit:
        return context

    room_for_tail = limit - len(TRUNCATION_MARKER)
    if room_for_tail > 0:
        # Normal case: marker followed by the newest ``room_for_tail`` chars.
        newest = context[total - room_for_tail :]
        return TRUNCATION_MARKER + newest

    # Degenerate budget: the marker alone would fill or overflow it. Slice
    # from an explicit start index (not ``-limit``) so ``limit == 0`` yields
    # an empty string rather than the whole input.
    return context[total - limit :]

@dataclass(frozen=True)
class JudgePrompt:
    """Immutable inputs for the progress-judge prompt, plus their renderer.

    Instances are ``frozen``: the four fields cannot be reassigned after
    construction, so :meth:`assemble` always renders from exactly the values
    the instance was built with. ``context`` is taken as-is — callers are
    expected to have truncated it already, passing an empty string when no
    context is available.
    """

    directive: str
    context: str
    rubric: str
    rubric_version: str

    def assemble(self) -> str:
        """Return the prompt text, laid out in a fixed section order.

        The result is the newline-joined sequence of: an introductory
        instruction, the rubric (under a header that names
        ``rubric_version``), the directive, the recent progress context, and
        the output-format instruction. Each section after the introduction is
        introduced by an ``=== <name> ===`` header and separated from the
        previous one by a blank line.

        Only the four bound fields and fixed literals contribute to the
        output, so equal ``(directive, context, rubric, rubric_version)``
        values always render to the same string.

        The closing section tells the model to reply with a single JSON
        object holding exactly a numeric ``score`` (closed range ``0.0`` to
        ``1.0``) and a free-text ``rationale``.
        """
        preamble = (
            "You are scoring how close a goal-directed automation run is to "
            "satisfying its stated objective. Read the scoring rubric, the "
            "objective, and the recent progress context, then return your "
            "judgement in the required output format."
        )
        answer_contract = (
            "Respond with a single JSON object and no prose outside it. The "
            "object must contain exactly two fields: a numeric `score` field "
            "holding your progress score as a number in the closed range 0.0 "
            "to 1.0, and a `rationale` field holding a brief free-text "
            "explanation of that score."
        )
        # One entry per output line; the empty strings are the blank
        # separator lines between sections.
        layout = (
            preamble,
            "",
            f"=== Scoring rubric (version {self.rubric_version}) ===",
            self.rubric,
            "",
            "=== Mission directive ===",
            self.directive,
            "",
            "=== Recent progress context ===",
            self.context,
            "",
            "=== Output format ===",
            answer_contract,
        )
        return "\n".join(layout)

136

137

def build_prompt(directive: str, recent_context: str | None, rubric_version: str) -> JudgePrompt:
    """Create the :class:`JudgePrompt` for one judgement request.

    A non-empty ``recent_context`` is passed through
    :func:`truncate_context` with its default limit. ``None`` or an empty
    string becomes an empty context, in which case the prompt carries the
    directive with nothing under the context header. The module-level
    ``RUBRIC`` is attached unchanged, labelled with the caller-supplied
    ``rubric_version``.
    """
    if not recent_context:
        # Covers both ``None`` and ``""``: there is nothing to truncate.
        bounded_context = ""
    else:
        bounded_context = truncate_context(recent_context)

    return JudgePrompt(
        directive=directive,
        context=bounded_context,
        rubric=RUBRIC,
        rubric_version=rubric_version,
    )

153 )