Coverage for mcp/mission_judge/prompt.py: 95%

39 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1"""Assemble the deterministic prompt the judge sends to the model. 

2 

3The judge scores progress from a directive plus optional recent progress 

4context, embedding the fixed rubric. The single non-deterministic step in 

5the whole flow is the model's answer; everything that builds the request is 

6pure and reproducible. This module owns that pure half: 

7 

8* :func:`truncate_context` — bounds arbitrarily large progress context to a 

9 fixed character budget by keeping the most recent content and discarding 

10 the oldest, so a long metric history can never produce an unbounded prompt. 

11* :class:`JudgePrompt` — a frozen bundle of the directive, the (already 

12 truncated) context, and the rubric, with an :meth:`~JudgePrompt.assemble` 

13 method that renders them into a fixed-layout string. 

14* :func:`build_prompt` — the entry point the tool wrapper calls: truncate the 

15 context, bind the rubric, and return a ready-to-assemble prompt. 

16 

17The rendered prompt is a pure function of its inputs — no clock, no random 

18value, no ambient state — so two identical inputs always produce a 

19byte-identical string. The model is instructed to answer with a single JSON 

20object carrying exactly one numeric ``score`` field and one ``rationale`` 

21field, which keeps the downstream parse step deterministic. 

22 

23The duck-typed :class:`JudgePrompt` exposes only ``assemble() -> str``, which 

24is the entire surface a sampling backend touches, so it drives either backend 

25without importing the sampling module. 

26""" 

27 

28from __future__ import annotations 

29 

30from dataclasses import dataclass 

31 

32from .rubric import RUBRIC 

33 

# Budget, in characters, for the recent progress context placed in the
# prompt. Content beyond this budget is dropped oldest-first, so a large
# metric history or observation set cannot grow the prompt without bound.
MAX_CONTEXT_CHARS: int = 8000

# Budget, in characters, for the model rationale retained in provenance.
# Defined next to the context budget so both size knobs sit together.
MAX_RATIONALE_CHARS: int = 2000

# Prefix added to the surviving tail whenever context is clipped, making it
# obvious to a reader that older content was removed.
TRUNCATION_MARKER: str = "...[older context truncated]"


def truncate_context(context: str, limit: int = MAX_CONTEXT_CHARS) -> str:
    """Clip ``context`` to at most ``limit`` characters, favouring the newest text.

    A ``context`` whose length does not exceed ``limit`` is handed back
    untouched. Otherwise the head (oldest characters) is dropped and the tail
    (most recent characters) survives, prefixed with
    :data:`TRUNCATION_MARKER` so the result is visibly a clipped view.

    The tail is cut to ``limit - len(TRUNCATION_MARKER)`` characters, so the
    marker plus the tail is exactly ``limit`` characters long. If ``limit``
    leaves no room for any tail after the marker
    (``limit <= len(TRUNCATION_MARKER)``), the marker is left out and only
    the last ``limit`` characters are returned; ``len(result) <= limit``
    therefore holds for every non-negative ``limit``.

    The function reads nothing but its arguments and the module-level marker,
    so equal inputs always give an equal result.
    """
    total = len(context)
    if total <= limit:
        return context

    room_for_tail = limit - len(TRUNCATION_MARKER)
    if room_for_tail > 0:
        return TRUNCATION_MARKER + context[total - room_for_tail :]

    # No room for the marker plus at least one character of tail: return the
    # bare tail. Slicing from ``total - limit`` (rather than ``-limit``)
    # keeps ``limit == 0`` yielding an empty string.
    return context[total - limit :]

78 

79 

@dataclass(frozen=True)
class JudgePrompt:
    """Immutable bundle of everything rendered into the judge's prompt.

    Because the dataclass is ``frozen``, its fields cannot be reassigned
    after construction, so :meth:`assemble` always renders from the values
    the instance was built with. ``context`` is expected to arrive already
    truncated, and to be an empty string when no context was supplied.
    """

    directive: str
    context: str
    rubric: str
    rubric_version: str

    def assemble(self) -> str:
        """Return the prompt text, laid out in a fixed section order.

        The layout is: an introductory instruction, then four sections each
        opened by a ``=== <name> ===`` header — the scoring rubric (its
        header carries ``rubric_version``), the mission directive, the recent
        progress context, and the output-format instruction. Sections are
        separated by one empty line and all lines are joined with ``"\\n"``.

        Only the four bound fields and fixed literals feed the result, so
        equal ``(directive, context, rubric, rubric_version)`` values always
        render to the same string.

        The final section tells the model to reply with a single JSON object
        holding a numeric ``score`` in the closed range ``0.0`` to ``1.0``
        and a free-text ``rationale``.
        """
        preamble = (
            "You are scoring how close a goal-directed automation run is to "
            "satisfying its stated objective. Read the scoring rubric, the "
            "objective, and the recent progress context, then return your "
            "judgement in the required output format."
        )
        output_instruction = (
            "Respond with a single JSON object and no prose outside it. The "
            "object must contain exactly two fields: a numeric `score` field "
            "holding your progress score as a number in the closed range 0.0 "
            "to 1.0, and a `rationale` field holding a brief free-text "
            "explanation of that score."
        )
        # One entry per output line; the empty strings are the blank
        # separator lines between sections.
        layout = (
            preamble,
            "",
            f"=== Scoring rubric (version {self.rubric_version}) ===",
            self.rubric,
            "",
            "=== Mission directive ===",
            self.directive,
            "",
            "=== Recent progress context ===",
            self.context,
            "",
            "=== Output format ===",
            output_instruction,
        )
        return "\n".join(layout)

136 

137 

def build_prompt(directive: str, recent_context: str | None, rubric_version: str) -> JudgePrompt:
    """Create the :class:`JudgePrompt` for a directive and its optional context.

    A non-empty ``recent_context`` is passed through :func:`truncate_context`
    with its default limit. ``None`` or an empty string skips truncation and
    sets the prompt's context to ``""``, so the prompt carries no progress
    context. The module-level ``RUBRIC`` is bound unchanged alongside the
    caller's ``rubric_version``.
    """
    if recent_context:
        bounded_context = truncate_context(recent_context)
    else:
        # Covers both ``None`` and the empty string.
        bounded_context = ""

    return JudgePrompt(
        directive=directive,
        context=bounded_context,
        rubric=RUBRIC,
        rubric_version=rubric_version,
    )