Coverage for mcp/mission/types.py: 100%

"""Domain types for the Mission goal-directed iteration loop.

All structured types live in one module so the engine, validators, sampler,
and tool wrappers share the same shape. They are ``TypedDict`` classes (not
``dataclass``) so that ``json.dumps`` / ``json.loads`` round-trip without any
custom serialization. The ``version`` field on :class:`SessionState` is
checked on every load against :data:`mcp.mission.SCHEMA_VERSION` (re-exported
here as :data:`SCHEMA_VERSION` for callers that import only this module).

Optional keys use :data:`typing.NotRequired` so that ``mypy --strict`` accepts
absence on dict literals while still rejecting an unknown key. The one
exception is :class:`Strategy`, which is declared with ``total=False``
because every one of its keys is optional.
"""

14from __future__ import annotations

16from typing import Any, Literal, NotRequired, TypedDict

18# Re-exported here so callers that import only ``mcp.mission.types`` can read

19# the schema version without an extra import. The canonical value lives on

20# the package ``__init__``.

21from . import SCHEMA_VERSION as SCHEMA_VERSION

23# ---------------------------------------------------------------------------

24# Literal type aliases

25# ---------------------------------------------------------------------------

27VerdictLabel = Literal["continue", "adjust", "complete", "terminate"]

28"""The four possible Decide_Phase outputs.

30``continue`` keeps the loop running with the current strategy. ``adjust``

31runs another iteration with a Strategy_Revision. ``complete`` ends the

32session as success. ``terminate`` ends the session as give-up.

33"""

35VerdictReason = Literal[

36 "in_progress",

37 "cadence_skip",

38 "criteria_met",

39 "forced_complete",

40 "heuristic_unproductive",

41 "max_iterations",

42 "max_wall_clock",

43 "no_progress",

44 "user_abort",

45]

46"""The exhaustive set of reasons that pair with a :data:`VerdictLabel`."""

48StatusLabel = Literal["pending", "running", "paused", "completed", "terminated", "failed"]

49"""The lifecycle states of a :class:`SessionState`."""

51CriterionKind = Literal[

52 "metric_threshold", "event", "predicate", "tool_call_succeeded", "metric_trend"

53]

54"""The five Criterion evaluator kinds.

56``metric_trend`` is the history-aware kind: rather than comparing a single

57point-in-time value to a fixed target (``metric_threshold``), it evaluates the

58direction of a metric across iterations using the cumulative metric history the

59engine accumulates in :meth:`MissionEngine._build_cumulative_observation`.

60"""

62MetricTrendDirection = Literal["decreasing", "increasing", "non_increasing", "non_decreasing"]

63"""The four trend directions a ``metric_trend`` criterion can require.

65``decreasing`` / ``increasing`` require a strict net change across the window

66(last < first / last > first); ``non_increasing`` / ``non_decreasing`` allow a

67flat series (last <= first / last >= first).

68"""

70SamplingStatus = Literal["used", "rejected", "fallback", "unavailable", "disabled"]

71"""The terminal status of a single sampling attempt on an iteration."""

73CadenceKind = Literal["every_iteration", "every_n_iterations", "every_t_seconds", "on_event"]

74"""The four supported Checkpoint_Cadence kinds."""

77# ---------------------------------------------------------------------------

78# Terminal-state sets

79# ---------------------------------------------------------------------------

81TERMINAL_STATES: frozenset[StatusLabel] = frozenset({"completed", "terminated", "failed"})

82"""The :data:`StatusLabel` values from which a session cannot transition.

84A session in any of these states refuses further ``mission_iterate`` calls

85with ``session_terminal``. The engine consults this set on every iteration

86entry to short-circuit before performing any work.

87"""

89TERMINAL_VERDICTS: frozenset[VerdictLabel] = frozenset({"complete", "terminate"})

90"""The :data:`VerdictLabel` values that end a session.

92When the Decide_Phase emits a verdict in this set, the engine writes a

93Final_Report and transitions the session to ``completed`` or ``terminated``

94(matching the verdict).

95"""

98# ---------------------------------------------------------------------------

99# Criterion and CriterionResult

100# ---------------------------------------------------------------------------

101

102

103class Criterion(TypedDict):

104 """A single machine-checkable success condition.

105

106 The kind-specific keys (``metric``/``op``/``target`` for

107 ``metric_threshold``, ``event_name`` for ``event``, ``expression`` for

108 ``predicate``, ``tool_name``/``min_count`` for ``tool_call_succeeded``,

109 ``metric``/``direction``/``window``/``min_points`` for ``metric_trend``)

110 are not declared on the base ``TypedDict`` because they are mutually

111 exclusive per ``kind``. Validators in ``mcp.mission.validation`` verify

112 the right keys are present for each ``kind`` and may attach a private

113 cached AST under ``_parsed_ast`` for ``predicate`` entries.

114 """

115

116 criterion_id: str

117 kind: CriterionKind

118 required: bool

119 # Kind-specific keys (validator-enforced):

120 metric: NotRequired[str]

121 op: NotRequired[Literal["<", "<=", ">", ">=", "==", "!="]]

122 target: NotRequired[float]

123 event_name: NotRequired[str]

124 expression: NotRequired[str]

125 tool_name: NotRequired[str]

126 min_count: NotRequired[int]

127 # metric_trend keys: ``direction`` is required for the kind; ``window``

128 # bounds how many of the most-recent points are considered (default: all

129 # available); ``min_points`` is the minimum number of numeric points

130 # required before the criterion decides met/unmet rather than inconclusive.

131 direction: NotRequired[MetricTrendDirection]

132 window: NotRequired[int]

133 min_points: NotRequired[int]

134 # Cached parsed AST attached by ``validate_criteria`` for predicate entries.

135 _parsed_ast: NotRequired[Any]

136

137

138class CriterionResult(TypedDict):

139 """The outcome of evaluating one :class:`Criterion` at a checkpoint."""

140

141 criterion_id: str

142 status: Literal["met", "unmet", "inconclusive"]

143 evidence: Any

144 evaluated_at: str # ISO 8601 UTC

145

146

147# ---------------------------------------------------------------------------

148# Budget controls and cadence

149# ---------------------------------------------------------------------------

150

151

152class BudgetControls(TypedDict):

153 """Loop-control caps every Mission_Session declares at start time.

154

155 These are **loop-control** caps — not financial budgets. Mission

156 enforces only the caps the loop has direct visibility into:

157 iteration count and wall-clock seconds. Cost guardrails live

158 out-of-band; configure AWS Budgets and Cost Anomaly Detection at

159 the account level for those.

160

161 Both ``max_iterations`` and ``max_wall_clock_seconds`` accept

162 either a strictly-positive integer cap or the explicit sentinel

163 ``-1`` to opt out of that axis. The validator rejects every other

164 shape (zero, other negatives, non-integer types, missing keys),

165 and additionally rejects both caps being ``-1`` simultaneously

166 (with ``reason="at_least_one_cap_required"``) since that would

167 leave the loop with no axis-driven termination — a runaway-loop

168 config error.

169 """

170

171 max_iterations: int

172 max_wall_clock_seconds: int

173

174

175class Cadence(TypedDict):

176 """The Checkpoint_Cadence configuration on a session.

177

178 ``n`` is required for ``every_n_iterations``. ``t`` is required for

179 ``every_t_seconds``. ``event_name`` is required for ``on_event``. The

180 base ``every_iteration`` requires no extra keys.

181 """

182

183 kind: CadenceKind

184 n: NotRequired[int]

185 t: NotRequired[int]

186 event_name: NotRequired[str]

187

188

189# ---------------------------------------------------------------------------

190# Tool calls and strategy

191# ---------------------------------------------------------------------------

192

193

194class ToolCallRecord(TypedDict):

195 """A single tool invocation recorded during an Iteration's Execute_Phase.

196

197 Used both for direct ``tool_calls`` strategies and for in-script calls

198 captured by the Mission_Sandbox under ``IterationRecord.script_call_log``.

199 """

200

201 tool_name: str

202 args: dict[str, Any]

203 status: Literal["ok", "failed", "skipped_not_allowed"]

204 result_summary: Any

205 duration_ms: int

206 error_message: NotRequired[str]

207

208

209class Strategy(TypedDict, total=False):

210 """The Propose_Phase output. Carries one of ``tool_calls`` or ``script``.

211

212 ``total=False`` because every key is optional in isolation; the

213 ``validate_strategy`` validator enforces the mutual-exclusivity rule

214 (exactly one of ``tool_calls`` or ``script`` must be present and

215 non-empty).

216 """

217

218 tool_calls: list[dict[str, Any]]

219 script: str

220 expected_observation_keys: list[str]

221 rationale: str

222

223

224# ---------------------------------------------------------------------------

225# Observation, Phase, Iteration, Session

226# ---------------------------------------------------------------------------

227

228

229class Observation(TypedDict):

230 """The Observe_Phase output — a normalized view of Execute_Phase results."""

231

232 tool_results: list[Any]

233 metrics: dict[str, Any]

234 events: list[dict[str, Any]]

235 errors: NotRequired[list[dict[str, Any]]]

236 # Cumulative, history-aware view of every numeric metric seen across the

237 # session, keyed by metric name and ordered oldest→newest. Present only on

238 # the *cumulative* observation the Evaluate_Phase builds (see

239 # :meth:`MissionEngine._build_cumulative_observation`); the per-iteration

240 # Observation written to ``record["observation"]`` keeps ``metrics``

241 # strictly point-in-time and does not carry this key. Consumed by the

242 # ``metric_trend`` criterion and available to predicates.

243 metric_history: NotRequired[dict[str, list[float]]]

244 phase_started_at: str

245 phase_ended_at: str

246

247

248class PhaseRecord(TypedDict):

249 """One row in :attr:`IterationRecord.phases`. One per phase regardless of outcome."""

250

251 phase: Literal["propose", "execute", "observe", "evaluate", "decide"]

252 status: Literal["succeeded", "failed"]

253 started_at: str

254 ended_at: str

255 error_message: NotRequired[str]

256

257

258class IterationRecord(TypedDict):

259 """The complete record of one pass through the five-phase cycle.

260

261 Sampling-related fields (``sampling_status``, ``sampling_output``,

262 ``sampling_rejection_reason``) are present only when the iteration

263 triggered an advisory-path sampling call. ``script_call_log`` is

264 present only when the strategy carried a ``script``.

265 """

266

267 iteration_index: int

268 started_at: str

269 ended_at: str

270 phases: list[PhaseRecord]

271 strategy: Strategy

272 observation: Observation

273 criteria_evaluation: list[CriterionResult]

274 verdict: VerdictLabel

275 verdict_reason: VerdictReason

276 revision_rationale: NotRequired[str]

277 checkpoint_evaluated: bool

278 sampling_status: NotRequired[SamplingStatus]

279 sampling_output: NotRequired[str]

280 sampling_rejection_reason: NotRequired[str]

281 script_call_log: NotRequired[list[ToolCallRecord]]

282 # Set by ``_execute_script`` when the sandbox runner raises

283 # :class:`mcp.mission.sandbox.SandboxTerminated`. The Decide_Phase's

284 # cascade reads this sentinel before any other branch and emits

285 # ``("terminate", <reason>)`` so a sandbox cap propagates up to the

286 # budget-cap path rather than failing the iteration as a phase

287 # exception. Carries the wall-clock :data:`VerdictReason`

288 # ``max_wall_clock`` for duration / memory / runtime caps.

289 sandbox_terminated_reason: NotRequired[VerdictReason]

290

291

292class SessionState(TypedDict):

293 """The durable Mission_Session payload persisted by Mission_State_Backend.

294

295 The ``version`` field carries :data:`SCHEMA_VERSION`; loaders compare it

296 against the current value and reject mismatches. Optional fields are

297 populated as the session progresses (``started_at`` on first iteration,

298 ``ended_at`` and ``final_report_path`` on terminal verdict, etc.).

299 """

300

301 version: int

302 session_id: str

303 directive_text: str

304 criteria: list[Criterion]

305 budget: BudgetControls

306 tool_allowlist: list[str]

307 checkpoint_cadence: Cadence

308 stagnation_threshold: int

309 use_sampling: bool

310 sampling_backend_resolved: NotRequired[Literal["mcp", "bedrock", "none"]]

311 bedrock_model_id: NotRequired[str]

312 allow_scripted_strategies: bool

313 sampling_model_preferences: NotRequired[dict[str, Any]]

314 status: StatusLabel

315 created_at: str

316 started_at: NotRequired[str]

317 ended_at: NotRequired[str]

318 iterations: list[IterationRecord]

319 no_progress_counter: int

320 last_checkpoint_at: NotRequired[str]

321 final_verdict: NotRequired[VerdictLabel]

322 final_report_path: NotRequired[str]