Coverage for mcp/mission/types.py: 100%

115 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1"""Domain types for the Mission goal-directed iteration loop. 

2 

3All structured types live in one module so the engine, validators, sampler, 

4and tool wrappers share the same shape. ``TypedDict`` (not ``dataclass``) so 

5``json.dumps`` / ``json.loads`` round-trip without any custom serialization. 

6The ``version`` field on :class:`SessionState` is checked on every load 

7against :data:`mcp.mission.SCHEMA_VERSION` (re-exported here as 

8:data:`SCHEMA_VERSION` for callers that import only this module). 

9 

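Optional keys use :data:`typing.NotRequired` so that ``mypy --strict`` accepts
absence on dict literals while still rejecting an unknown key. The one
exception is :class:`Strategy`, which is declared ``total=False`` because every
one of its keys is optional in isolation.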

12""" 

13 

14from __future__ import annotations 

15 

16from typing import Any, Literal, NotRequired, TypedDict 

17 

18# Re-exported here so callers that import only ``mcp.mission.types`` can read 

19# the schema version without an extra import. The canonical value lives on 

20# the package ``__init__``. 

21from . import SCHEMA_VERSION as SCHEMA_VERSION 

22 

23# --------------------------------------------------------------------------- 

24# Literal type aliases 

25# --------------------------------------------------------------------------- 

26 

VerdictLabel = Literal[
    "continue",
    "adjust",
    "complete",
    "terminate",
]
"""Outputs the Decide_Phase can emit (four in total).

* ``continue`` -- keep the loop running with the current strategy.
* ``adjust`` -- run another iteration with a Strategy_Revision.
* ``complete`` -- end the session as success.
* ``terminate`` -- end the session as give-up.
"""

VerdictReason = Literal[
    "in_progress", "cadence_skip", "criteria_met",
    "forced_complete", "heuristic_unproductive", "max_iterations",
    "max_wall_clock", "no_progress", "user_abort",
]
"""Every reason that may accompany a :data:`VerdictLabel` (exhaustive)."""

StatusLabel = Literal[
    "pending",
    "running",
    "paused",
    "completed",
    "terminated",
    "failed",
]
"""Lifecycle states a :class:`SessionState` can be in."""

CriterionKind = Literal[
    "metric_threshold",
    "event",
    "predicate",
    "tool_call_succeeded",
    "metric_trend",
]
"""Evaluator kinds a Criterion may declare (five in total).

``metric_threshold`` compares a single point-in-time value to a fixed target.
``metric_trend`` is the history-aware counterpart: it evaluates the direction
of a metric across iterations, using the cumulative metric history the engine
accumulates in :meth:`MissionEngine._build_cumulative_observation`.
"""

MetricTrendDirection = Literal[
    "decreasing",
    "increasing",
    "non_increasing",
    "non_decreasing",
]
"""Trend directions a ``metric_trend`` criterion can require (four in total).

Strict directions need a net change across the window: ``decreasing`` means
last < first and ``increasing`` means last > first. The ``non_*`` directions
also accept a flat series: ``non_increasing`` means last <= first and
``non_decreasing`` means last >= first.
"""

SamplingStatus = Literal[
    "used",
    "rejected",
    "fallback",
    "unavailable",
    "disabled",
]
"""Terminal status of a single sampling attempt on an iteration."""

CadenceKind = Literal[
    "every_iteration",
    "every_n_iterations",
    "every_t_seconds",
    "on_event",
]
"""Supported Checkpoint_Cadence kinds (four in total)."""

75 

76 

77# --------------------------------------------------------------------------- 

78# Terminal-state sets 

79# --------------------------------------------------------------------------- 

80 

TERMINAL_STATES: frozenset[StatusLabel] = frozenset(
    {
        "completed",
        "terminated",
        "failed",
    }
)
"""Statuses (:data:`StatusLabel`) out of which a session cannot transition.

Once a session is in one of these states, further ``mission_iterate`` calls
are refused with ``session_terminal``. The engine checks this set at the
start of every iteration so it can short-circuit before doing any work.
"""

TERMINAL_VERDICTS: frozenset[VerdictLabel] = frozenset(
    {
        "complete",
        "terminate",
    }
)
"""Verdicts (:data:`VerdictLabel`) that end a session.

A Decide_Phase verdict from this set makes the engine write a Final_Report
and move the session to ``completed`` or ``terminated``, matching the
verdict.
"""

96 

97 

98# --------------------------------------------------------------------------- 

99# Criterion and CriterionResult 

100# --------------------------------------------------------------------------- 

101 

102 

class Criterion(TypedDict):
    """A single machine-checkable success condition.

    Only ``criterion_id``, ``kind`` and ``required`` are required keys. The
    kind-specific keys (``metric``/``op``/``target`` for
    ``metric_threshold``, ``event_name`` for ``event``, ``expression`` for
    ``predicate``, ``tool_name``/``min_count`` for ``tool_call_succeeded``,
    ``metric``/``direction``/``window``/``min_points`` for ``metric_trend``)
    are declared as :data:`typing.NotRequired` rather than as required keys
    because they are mutually exclusive per ``kind``; the type alone cannot
    express which of them must accompany a given ``kind``. Validators in
    ``mcp.mission.validation`` verify the right keys are present for each
    ``kind`` and may attach a private cached AST under ``_parsed_ast`` for
    ``predicate`` entries.
    """

    # NOTE(review): this module uses ``from __future__ import annotations``,
    # so the ``NotRequired`` markers below are seen by static type checkers,
    # but runtime ``__required_keys__`` / ``__optional_keys__`` introspection
    # may not reflect them -- confirm nothing relies on those attributes.

    # Keys every criterion carries, regardless of ``kind``.
    criterion_id: str
    kind: CriterionKind
    required: bool
    # Kind-specific keys (validator-enforced):
    # ``metric`` is shared by ``metric_threshold`` and ``metric_trend``.
    metric: NotRequired[str]
    # ``op`` / ``target``: comparison used by ``metric_threshold``.
    op: NotRequired[Literal["<", "<=", ">", ">=", "==", "!="]]
    target: NotRequired[float]
    # ``event_name``: used by the ``event`` kind.
    event_name: NotRequired[str]
    # ``expression``: used by the ``predicate`` kind.
    expression: NotRequired[str]
    # ``tool_name`` / ``min_count``: used by ``tool_call_succeeded``.
    tool_name: NotRequired[str]
    min_count: NotRequired[int]
    # metric_trend keys: ``direction`` is required for the kind; ``window``
    # bounds how many of the most-recent points are considered (default: all
    # available); ``min_points`` is the minimum number of numeric points
    # required before the criterion decides met/unmet rather than inconclusive.
    direction: NotRequired[MetricTrendDirection]
    window: NotRequired[int]
    min_points: NotRequired[int]
    # Cached parsed AST attached by ``validate_criteria`` for predicate entries.
    _parsed_ast: NotRequired[Any]

136 

137 

class CriterionResult(TypedDict):
    """The outcome of evaluating one :class:`Criterion` at a checkpoint.

    All four keys are required.
    """

    # Presumably the ``criterion_id`` of the evaluated :class:`Criterion` --
    # confirm against the evaluator.
    criterion_id: str
    # Three-valued outcome: besides met/unmet, an evaluation may be
    # ``inconclusive``.
    status: Literal["met", "unmet", "inconclusive"]
    # Free-form supporting data; its shape is not constrained by this type.
    evidence: Any
    evaluated_at: str  # ISO 8601 UTC

145 

146 

147# --------------------------------------------------------------------------- 

148# Budget controls and cadence 

149# --------------------------------------------------------------------------- 

150 

151 

class BudgetControls(TypedDict):
    """Loop-control caps every Mission_Session declares at start time.

    These are **loop-control** caps — not financial budgets. Mission
    enforces only the caps the loop has direct visibility into:
    iteration count and wall-clock seconds. Cost guardrails live
    out-of-band; configure AWS Budgets and Cost Anomaly Detection at
    the account level for those.

    Both ``max_iterations`` and ``max_wall_clock_seconds`` accept
    either a strictly-positive integer cap or the explicit sentinel
    ``-1`` to opt out of that axis. The validator rejects every other
    shape (zero, other negatives, non-integer types, missing keys),
    and additionally rejects both caps being ``-1`` simultaneously
    (with ``reason="at_least_one_cap_required"``) since that would
    leave the loop with no axis-driven termination — a runaway-loop
    config error.
    """

    # Iteration-count cap: strictly positive, or ``-1`` to opt out of this axis.
    max_iterations: int
    # Wall-clock cap in seconds: strictly positive, or ``-1`` to opt out of
    # this axis. The two caps may not both be ``-1``.
    max_wall_clock_seconds: int

173 

174 

class Cadence(TypedDict):
    """The Checkpoint_Cadence configuration on a session.

    ``n`` is required for ``every_n_iterations``. ``t`` is required for
    ``every_t_seconds``. ``event_name`` is required for ``on_event``. The
    base ``every_iteration`` requires no extra keys.

    Only ``kind`` is required at the type level; which of the other keys
    must accompany a given ``kind`` is not expressed by this type.
    """

    kind: CadenceKind
    # Used with ``every_n_iterations``.
    n: NotRequired[int]
    # Used with ``every_t_seconds`` (presumably an interval in seconds, going
    # by the kind's name -- confirm).
    t: NotRequired[int]
    # Used with ``on_event``.
    event_name: NotRequired[str]

187 

188 

189# --------------------------------------------------------------------------- 

190# Tool calls and strategy 

191# --------------------------------------------------------------------------- 

192 

193 

class ToolCallRecord(TypedDict):
    """A single tool invocation recorded during an Iteration's Execute_Phase.

    Used both for direct ``tool_calls`` strategies and for in-script calls
    captured by the Mission_Sandbox under ``IterationRecord.script_call_log``.

    Every key except ``error_message`` is required.
    """

    tool_name: str
    args: dict[str, Any]
    # Outcome of the call; ``skipped_not_allowed`` presumably marks a tool
    # outside the session's ``tool_allowlist`` -- confirm against the engine.
    status: Literal["ok", "failed", "skipped_not_allowed"]
    # Free-form summary of the result; its shape is not constrained here.
    result_summary: Any
    # Call duration (milliseconds, going by the key name).
    duration_ms: int
    # Optional; presumably populated only for unsuccessful calls -- confirm.
    error_message: NotRequired[str]

207 

208 

class Strategy(TypedDict, total=False):
    """The Propose_Phase output. Carries one of ``tool_calls`` or ``script``.

    ``total=False`` because every key is optional in isolation; the
    ``validate_strategy`` validator enforces the mutual-exclusivity rule
    (exactly one of ``tool_calls`` or ``script`` must be present and
    non-empty).
    """

    # Exactly one of the next two keys must be present and non-empty
    # (enforced by ``validate_strategy``, not by this type).
    tool_calls: list[dict[str, Any]]
    script: str
    # Optional companions to whichever execution form is chosen.
    expected_observation_keys: list[str]
    rationale: str

222 

223 

224# --------------------------------------------------------------------------- 

225# Observation, Phase, Iteration, Session 

226# --------------------------------------------------------------------------- 

227 

228 

class Observation(TypedDict):
    """The Observe_Phase output — a normalized view of Execute_Phase results.

    ``errors`` and ``metric_history`` are optional; every other key is
    required.
    """

    tool_results: list[Any]
    # On the per-iteration Observation this is strictly point-in-time (see
    # the ``metric_history`` note below).
    metrics: dict[str, Any]
    events: list[dict[str, Any]]
    errors: NotRequired[list[dict[str, Any]]]
    # Cumulative, history-aware view of every numeric metric seen across the
    # session, keyed by metric name and ordered oldest→newest. Present only on
    # the *cumulative* observation the Evaluate_Phase builds (see
    # :meth:`MissionEngine._build_cumulative_observation`); the per-iteration
    # Observation written to ``record["observation"]`` keeps ``metrics``
    # strictly point-in-time and does not carry this key. Consumed by the
    # ``metric_trend`` criterion and available to predicates.
    metric_history: NotRequired[dict[str, list[float]]]
    # Phase timestamps, stored as strings (presumably ISO 8601 UTC like
    # ``CriterionResult.evaluated_at`` -- confirm).
    phase_started_at: str
    phase_ended_at: str

246 

247 

class PhaseRecord(TypedDict):
    """One row in :attr:`IterationRecord.phases`. One per phase regardless of outcome."""

    # Which of the five phases this row describes.
    phase: Literal["propose", "execute", "observe", "evaluate", "decide"]
    status: Literal["succeeded", "failed"]
    # Timestamps stored as strings (presumably ISO 8601 UTC -- confirm).
    started_at: str
    ended_at: str
    # Optional; presumably set only when ``status`` is ``failed`` -- confirm.
    error_message: NotRequired[str]

256 

257 

class IterationRecord(TypedDict):
    """The complete record of one pass through the five-phase cycle.

    Sampling-related fields (``sampling_status``, ``sampling_output``,
    ``sampling_rejection_reason``) are present only when the iteration
    triggered an advisory-path sampling call. ``script_call_log`` is
    present only when the strategy carried a ``script``.
    """

    iteration_index: int
    # Timestamps stored as strings (presumably ISO 8601 UTC -- confirm).
    started_at: str
    ended_at: str
    # One :class:`PhaseRecord` per phase, regardless of outcome.
    phases: list[PhaseRecord]
    strategy: Strategy
    # Per-iteration observation; it does not carry ``metric_history``.
    observation: Observation
    criteria_evaluation: list[CriterionResult]
    verdict: VerdictLabel
    verdict_reason: VerdictReason
    # Optional; presumably accompanies an ``adjust`` verdict -- confirm.
    revision_rationale: NotRequired[str]
    # NOTE(review): presumably False when the cadence skipped the checkpoint
    # on this iteration (cf. the ``cadence_skip`` reason) -- confirm.
    checkpoint_evaluated: bool
    # Present only when the iteration triggered an advisory-path sampling call.
    sampling_status: NotRequired[SamplingStatus]
    sampling_output: NotRequired[str]
    sampling_rejection_reason: NotRequired[str]
    # Present only when the strategy carried a ``script``.
    script_call_log: NotRequired[list[ToolCallRecord]]
    # Set by ``_execute_script`` when the sandbox runner raises
    # :class:`mcp.mission.sandbox.SandboxTerminated`. The Decide_Phase's
    # cascade reads this sentinel before any other branch and emits
    # ``("terminate", <reason>)`` so a sandbox cap propagates up to the
    # budget-cap path rather than failing the iteration as a phase
    # exception. Carries the wall-clock :data:`VerdictReason`
    # ``max_wall_clock`` for duration / memory / runtime caps.
    sandbox_terminated_reason: NotRequired[VerdictReason]

290 

291 

class SessionState(TypedDict):
    """The durable Mission_Session payload persisted by Mission_State_Backend.

    The ``version`` field carries :data:`SCHEMA_VERSION`; loaders compare it
    against the current value and reject mismatches. Optional fields are
    populated as the session progresses (``started_at`` on first iteration,
    ``ended_at`` and ``final_report_path`` on terminal verdict, etc.).
    """

    # Schema version of this payload; compared against SCHEMA_VERSION on load.
    version: int
    session_id: str
    directive_text: str
    # Session configuration.
    criteria: list[Criterion]
    budget: BudgetControls
    tool_allowlist: list[str]
    checkpoint_cadence: Cadence
    # NOTE(review): presumably the ``no_progress_counter`` value at which the
    # ``no_progress`` verdict reason fires -- confirm against the engine.
    stagnation_threshold: int
    use_sampling: bool
    sampling_backend_resolved: NotRequired[Literal["mcp", "bedrock", "none"]]
    bedrock_model_id: NotRequired[str]
    allow_scripted_strategies: bool
    sampling_model_preferences: NotRequired[dict[str, Any]]
    # Lifecycle state; the values in TERMINAL_STATES are final.
    status: StatusLabel
    created_at: str
    # Populated on the first iteration.
    started_at: NotRequired[str]
    # Populated on a terminal verdict.
    ended_at: NotRequired[str]
    iterations: list[IterationRecord]
    no_progress_counter: int
    last_checkpoint_at: NotRequired[str]
    final_verdict: NotRequired[VerdictLabel]
    # Populated on a terminal verdict.
    final_report_path: NotRequired[str]