Coverage for mcp/resources/docs.py: 93%

1"""Documentation resources (docs:// scheme) for the GCO MCP server."""

3from pathlib import Path

5from server import mcp

# Repository root, found by walking three directory levels up from this file.
PROJECT_ROOT = Path(__file__).parent.parent.parent
# Top-level folders holding the markdown guides and the example manifests.
DOCS_DIR = PROJECT_ROOT.joinpath("docs")
EXAMPLES_DIR = PROJECT_ROOT.joinpath("examples")

# ---------------------------------------------------------------------------
# Example metadata — used by both the index and the per-example resource to
# give the LLM rich context about what each manifest does and how to adapt it.
#
# Every entry carries the same nine fields:
#   category       — display grouping (e.g. "Jobs & Training", "Schedulers").
#   summary        — one- or two-sentence description of the manifest.
#   gpu            — accelerator requirement as free text ("no", "optional",
#                    "NVIDIA", "Trainium", ...).
#   opt_in         — feature that has to be enabled first; "" when none.
#   submission     — command line for running the example, or a parenthetical
#                    note for manifests that are only used by another one.
#   keywords       — short search terms.
#   instance_types — instance type names (e.g. "g5.xlarge"); [] when none
#                    are listed.
#   use_cases      — short task-style phrases.
#   related        — keys of other entries in this dict.
# NOTE(review): keys look like manifest basenames under ``examples/`` (most
# submission strings reference ``examples/<key>.yaml``) — confirm against the
# code that consumes this table.
# ---------------------------------------------------------------------------

EXAMPLE_METADATA: dict[str, dict[str, str | list[str]]] = {
    "simple-job": {
        "category": "Jobs & Training",
        "summary": "Basic Kubernetes Job that runs a command and completes. Start here to verify your cluster.",
        "gpu": "no",
        "opt_in": "",
        "submission": "gco jobs submit-sqs examples/simple-job.yaml --region us-east-1",
        "keywords": ["simple", "hello", "starter", "basic", "smoke test"],
        "instance_types": [],
        "use_cases": [
            "verify cluster setup",
            "smoke test a new region",
            "minimal job example",
        ],
        "related": ["gpu-job", "sqs-job-submission"],
    },
    "gpu-job": {
        "category": "Jobs & Training",
        "summary": "Requests GPU resources and runs on GPU-enabled nodes.",
        "gpu": "NVIDIA",
        "opt_in": "",
        "submission": "gco jobs submit-sqs examples/gpu-job.yaml --region us-east-1",
        "keywords": [
            "gpu",
            "nvidia",
            "cuda",
            "single gpu",
            "nvidia.com/gpu",
            "g5",
            "g6",
            "g4dn",
            "tolerations",
        ],
        "instance_types": ["g5.xlarge", "g6.xlarge", "g4dn.xlarge"],
        "use_cases": [
            "run a single GPU workload",
            "test GPU node provisioning",
            "smoke test CUDA",
        ],
        "related": ["multi-gpu-training", "gpu-timeslicing-job", "simple-job"],
    },
    "gpu-timeslicing-job": {
        "category": "Jobs & Training",
        "summary": "Fractional GPU via NVIDIA time-slicing — multiple pods share one physical GPU.",
        "gpu": "NVIDIA (time-sliced)",
        "opt_in": "NVIDIA device plugin time-slicing ConfigMap",
        "submission": "kubectl apply -f examples/gpu-timeslicing-job.yaml",
        "keywords": ["gpu", "timeslicing", "fractional", "shared gpu", "nvidia"],
        "instance_types": ["g5.xlarge", "g6.xlarge"],
        "use_cases": [
            "share one GPU between multiple pods",
            "lower cost for small inference workloads",
        ],
        "related": ["gpu-job", "multi-gpu-training"],
    },
    "multi-gpu-training": {
        "category": "Jobs & Training",
        "summary": "PyTorch DistributedDataParallel (DDP) across multiple GPUs with indexed pods and headless service.",
        "gpu": "NVIDIA",
        "opt_in": "",
        "submission": "kubectl apply -f examples/multi-gpu-training.yaml",
        "keywords": [
            "ddp",
            "distributed",
            "pytorch",
            "multi gpu",
            "training",
            "torchrun",
            "nccl",
            "indexed pods",
            "headless service",
        ],
        "instance_types": ["g5.12xlarge", "g6.12xlarge", "p4d.24xlarge"],
        "use_cases": [
            "distributed PyTorch DDP training",
            "scale a training job across multiple GPUs",
        ],
        "related": ["gpu-job", "efa-distributed-training", "megatrain-sft-job"],
    },
    "efa-distributed-training": {
        "category": "Jobs & Training",
        "summary": "Elastic Fabric Adapter (EFA) for high-bandwidth inter-node communication (up to 3.2 Tbps on P5, 28.8 Tbps on P6e). For p4d/p5/p5e/p5en/p6-b200/p6-b300/p6e-gb200/trn instances.",
        "gpu": "NVIDIA + EFA",
        "opt_in": "",
        "submission": "gco jobs submit-direct examples/efa-distributed-training.yaml -r us-east-1",
        "keywords": ["efa", "elastic fabric adapter", "distributed", "nccl", "high bandwidth"],
        "instance_types": [
            "p4d.24xlarge",
            "p5.48xlarge",
            "trn1.32xlarge",
            "trn2.48xlarge",
        ],
        "use_cases": [
            "multi-node distributed training over EFA",
            "high-bandwidth NCCL all-reduce",
            "large-scale model pretraining",
        ],
        "related": ["multi-gpu-training", "trainium-job", "megatrain-sft-job"],
    },
    "megatrain-sft-job": {
        "category": "Jobs & Training",
        "summary": "SFT fine-tuning of Qwen2.5-1.5B on a single GPU using MegaTrain. Downloads weights to EFS.",
        "gpu": "NVIDIA",
        "opt_in": "",
        "submission": "gco jobs submit-direct examples/megatrain-sft-job.yaml -r us-east-1",
        "keywords": ["sft", "fine-tuning", "qwen", "megatrain", "llm training"],
        "instance_types": ["g5.xlarge", "g5.12xlarge", "g6.xlarge"],
        "use_cases": [
            "supervised fine-tuning of an LLM",
            "single-GPU SFT on Qwen",
            "fine-tune a small open-source model",
        ],
        "related": ["multi-gpu-training", "model-download-job", "efa-distributed-training"],
    },
    "model-download-job": {
        "category": "Jobs & Training",
        "summary": "Pre-downloads HuggingFace model weights to shared EFS for inference endpoints.",
        "gpu": "no",
        "opt_in": "",
        "submission": "kubectl apply -f examples/model-download-job.yaml",
        "keywords": ["huggingface", "download", "weights", "model cache", "efs"],
        "instance_types": [],
        "use_cases": [
            "stage HuggingFace weights on EFS",
            "warm a model cache before serving",
        ],
        "related": ["megatrain-sft-job", "inference-vllm", "efs-output-job"],
    },
    "sqs-job-submission": {
        "category": "Jobs & Training",
        "summary": "Demonstrates SQS-based submission (recommended). Contains CPU and GPU job examples.",
        "gpu": "optional",
        "opt_in": "",
        "submission": "gco jobs submit-sqs examples/sqs-job-submission.yaml --region us-east-1",
        "keywords": ["sqs", "submission", "queue", "broker"],
        "instance_types": [],
        "use_cases": [
            "submit jobs through the SQS queue",
            "queue-based job submission pattern",
        ],
        "related": ["simple-job", "gpu-job", "keda-scaled-job"],
    },
    "trainium-job": {
        "category": "Accelerator Jobs",
        "summary": "AWS Trainium instance with Neuron SDK. Lower cost than GPU for training.",
        "gpu": "Trainium",
        "opt_in": "",
        "submission": "gco jobs submit examples/trainium-job.yaml --region us-east-1",
        "keywords": ["trainium", "neuron", "trn1", "trn2", "training accelerator"],
        "instance_types": ["trn1.2xlarge", "trn1.32xlarge", "trn2.48xlarge"],
        "use_cases": [
            "lower-cost training on AWS silicon",
            "train with the Neuron SDK",
        ],
        "related": ["inferentia-job", "efa-distributed-training"],
    },
    "inferentia-job": {
        "category": "Accelerator Jobs",
        "summary": "AWS Inferentia2 with Neuron SDK. Optimized for low-cost, high-throughput inference.",
        "gpu": "Inferentia",
        "opt_in": "",
        "submission": "gco jobs submit examples/inferentia-job.yaml --region us-east-1",
        "keywords": ["inferentia", "neuron", "inf2", "inference accelerator"],
        "instance_types": ["inf2.xlarge", "inf2.8xlarge", "inf2.24xlarge", "inf2.48xlarge"],
        "use_cases": [
            "low-cost inference on AWS silicon",
            "high-throughput batch inference",
        ],
        "related": ["trainium-job", "inference-vllm"],
    },
    "inference-vllm": {
        "category": "Inference Serving",
        "summary": "vLLM OpenAI-compatible LLM serving with PagedAttention.",
        "gpu": "NVIDIA",
        "opt_in": "",
        "submission": "gco inference deploy my-llm -i vllm/vllm-openai:v0.22.0 --gpu-count 1",
        "keywords": [
            "vllm",
            "openai",
            "openai-compatible",
            "llm serving",
            "pagedattention",
            "inference",
            "completions",
            "chat completions",
            "v1/chat/completions",
            "model server",
            "llama",
            "qwen",
            "mistral",
        ],
        "instance_types": ["g5.xlarge", "g5.12xlarge", "g6.xlarge"],
        "use_cases": [
            "serve an LLM with an OpenAI-compatible API",
            "high-throughput LLM inference",
            "deploy a chat completions endpoint",
        ],
        "related": ["inference-tgi", "inference-sglang", "inference-triton", "model-download-job"],
    },
    "inference-tgi": {
        "category": "Inference Serving",
        "summary": "HuggingFace Text Generation Inference — optimized transformer serving.",
        "gpu": "NVIDIA",
        "opt_in": "",
        "submission": "gco jobs submit-direct examples/inference-tgi.yaml -r us-east-1",
        "keywords": ["tgi", "huggingface", "text generation", "llm serving"],
        "instance_types": ["g5.xlarge", "g5.12xlarge", "g6.xlarge"],
        "use_cases": [
            "serve HuggingFace LLMs with TGI",
            "transformer text-generation endpoint",
        ],
        "related": ["inference-vllm", "inference-sglang", "inference-torchserve"],
    },
    "inference-triton": {
        "category": "Inference Serving",
        "summary": "NVIDIA Triton Inference Server — multi-framework (PyTorch, TensorFlow, ONNX).",
        "gpu": "NVIDIA",
        "opt_in": "",
        "submission": "gco jobs submit-direct examples/inference-triton.yaml -r us-east-1",
        "keywords": ["triton", "nvidia", "multi-framework", "onnx", "tensorflow", "inference"],
        "instance_types": ["g5.xlarge", "g6.xlarge"],
        "use_cases": [
            "multi-framework inference serving",
            "serve ONNX or TensorFlow models",
        ],
        "related": ["inference-vllm", "inference-torchserve"],
    },
    "inference-torchserve": {
        "category": "Inference Serving",
        "summary": "PyTorch TorchServe model serving.",
        "gpu": "NVIDIA",
        "opt_in": "",
        "submission": "gco jobs submit-direct examples/inference-torchserve.yaml -r us-east-1",
        "keywords": ["torchserve", "pytorch", "model serving"],
        "instance_types": ["g5.xlarge", "g6.xlarge"],
        "use_cases": [
            "serve a PyTorch model with TorchServe",
        ],
        "related": ["inference-triton", "inference-vllm"],
    },
    "inference-sglang": {
        "category": "Inference Serving",
        "summary": "SGLang high-throughput serving with RadixAttention for prefix caching.",
        "gpu": "NVIDIA",
        "opt_in": "",
        "submission": "gco jobs submit-direct examples/inference-sglang.yaml -r us-east-1",
        "keywords": ["sglang", "radixattention", "prefix caching", "llm serving"],
        "instance_types": ["g5.xlarge", "g5.12xlarge"],
        "use_cases": [
            "high-throughput LLM serving with prefix caching",
            "serve LLMs with structured output",
        ],
        "related": ["inference-vllm", "inference-tgi"],
    },
    "efs-output-job": {
        "category": "Storage & Persistence",
        "summary": "Writes output to shared EFS storage. Results persist after pod termination.",
        "gpu": "no",
        "opt_in": "",
        "submission": "gco jobs submit-direct examples/efs-output-job.yaml --region us-east-1 -n gco-jobs",
        "keywords": ["efs", "shared storage", "persistent", "output"],
        "instance_types": [],
        "use_cases": [
            "persist job output to EFS",
            "share data between pods via EFS",
        ],
        "related": ["fsx-lustre-job", "model-download-job", "cluster-shared-bucket-upload-job"],
    },
    "fsx-lustre-job": {
        "category": "Storage & Persistence",
        "summary": "FSx for Lustre high-performance parallel storage (1000+ GB/s throughput).",
        "gpu": "no",
        "opt_in": "FSx (gco stacks fsx enable -y)",
        "submission": "gco jobs submit-direct examples/fsx-lustre-job.yaml --region us-east-1 -n gco-jobs",
        "keywords": ["fsx", "lustre", "parallel storage", "high throughput", "hpc"],
        "instance_types": [],
        "use_cases": [
            "high-throughput parallel storage for training",
            "stream large datasets to GPU nodes",
        ],
        "related": ["efs-output-job", "multi-gpu-training", "efa-distributed-training"],
    },
    "valkey-cache-job": {
        "category": "Caching & Databases",
        "summary": "Valkey Serverless cache for K/V caching, prompt caching, session state, feature stores.",
        "gpu": "no",
        "opt_in": 'Valkey ("valkey": {"enabled": true} in cdk.json)',
        "submission": "gco jobs submit-direct examples/valkey-cache-job.yaml -r us-east-1",
        "keywords": ["valkey", "redis", "cache", "kv store", "session state"],
        "instance_types": [],
        "use_cases": [
            "cache prompts or session state",
            "use Valkey from a job",
            "feature store backed by Valkey",
        ],
        "related": ["aurora-pgvector-job"],
    },
    "aurora-pgvector-job": {
        "category": "Caching & Databases",
        "summary": "Aurora Serverless v2 PostgreSQL with pgvector for RAG and semantic search.",
        "gpu": "no",
        "opt_in": 'Aurora ("aurora_pgvector": {"enabled": true} in cdk.json)',
        "submission": "gco jobs submit-direct examples/aurora-pgvector-job.yaml -r us-east-1",
        "keywords": ["aurora", "pgvector", "postgres", "rag", "vector database", "embeddings"],
        "instance_types": [],
        "use_cases": [
            "RAG with pgvector",
            "semantic search backed by Postgres",
            "store embeddings in Aurora",
        ],
        "related": ["valkey-cache-job", "analytics-database-export-job"],
    },
    "cluster-shared-bucket-upload-job": {
        "category": "Storage & Persistence",
        "summary": "Uploads a file to the always-on Cluster_Shared_Bucket using the gco-cluster-shared-bucket ConfigMap via envFrom. Works with analytics disabled.",
        "gpu": "no",
        "opt_in": "",
        "submission": "gco jobs submit-direct examples/cluster-shared-bucket-upload-job.yaml -r us-east-1",
        "keywords": ["s3", "shared bucket", "upload", "configmap"],
        "instance_types": [],
        "use_cases": [
            "upload artifacts to the shared S3 bucket",
            "share files across regions via S3",
        ],
        "related": ["efs-output-job", "analytics-s3-upload-job"],
    },
    "analytics-s3-upload-job": {
        "category": "Analytics",
        "summary": "Publishes a dataset snapshot plus schema manifest to Cluster_Shared_Bucket under analytics-data/ so a SageMaker Studio notebook can read it.",
        "gpu": "no",
        "opt_in": 'Analytics ("analytics_environment": {"enabled": true} in cdk.json)',
        "submission": "gco jobs submit-direct examples/analytics-s3-upload-job.yaml -r us-east-1",
        "keywords": ["analytics", "s3", "sagemaker", "dataset", "schema"],
        "instance_types": [],
        "use_cases": [
            "publish a dataset for a SageMaker Studio notebook",
            "share an analytics snapshot via S3",
        ],
        "related": ["analytics-database-export-job", "cluster-shared-bucket-upload-job"],
    },
    "analytics-database-export-job": {
        "category": "Analytics",
        "summary": "Exports rows from the regional Aurora pgvector cluster to Cluster_Shared_Bucket as CSV for a SageMaker Studio notebook to analyse.",
        "gpu": "no",
        "opt_in": 'Aurora + Analytics ("aurora_pgvector.enabled" and "analytics_environment.enabled" in cdk.json)',
        "submission": "gco jobs submit-direct examples/analytics-database-export-job.yaml -r us-east-1",
        "keywords": ["analytics", "aurora", "csv", "export", "sagemaker"],
        "instance_types": [],
        "use_cases": [
            "export Aurora rows to S3 as CSV",
            "feed a SageMaker Studio notebook from Postgres",
        ],
        "related": ["aurora-pgvector-job", "analytics-s3-upload-job"],
    },
    "volcano-gang-job": {
        "category": "Schedulers",
        "summary": "Volcano gang scheduling — all pods scheduled together or none. Master + workers topology.",
        "gpu": "no",
        "opt_in": "",
        "submission": "kubectl apply -f examples/volcano-gang-job.yaml",
        "keywords": ["volcano", "gang scheduling", "batch", "scheduler"],
        "instance_types": [],
        "use_cases": [
            "schedule all pods at once or none",
            "MPI-style master + workers topology",
        ],
        "related": ["kueue-job", "yunikorn-job", "slurm-cluster-job"],
    },
    "kueue-job": {
        "category": "Schedulers",
        "summary": "Kueue job queueing with ClusterQueue, LocalQueue, ResourceFlavors, and fair-sharing.",
        "gpu": "optional",
        "opt_in": "",
        "submission": "kubectl apply -f examples/kueue-job.yaml",
        "keywords": ["kueue", "queueing", "fair sharing", "scheduler", "clusterqueue"],
        "instance_types": [],
        "use_cases": [
            "queue jobs with quotas and fair sharing",
            "multi-tenant batch scheduling",
        ],
        "related": ["volcano-gang-job", "yunikorn-job"],
    },
    "yunikorn-job": {
        "category": "Schedulers",
        "summary": "Apache YuniKorn app-aware scheduling with hierarchical queues and gang scheduling.",
        "gpu": "no",
        "opt_in": 'YuniKorn ("helm": {"yunikorn": {"enabled": true}} in cdk.json)',
        "submission": "kubectl apply -f examples/yunikorn-job.yaml",
        "keywords": ["yunikorn", "scheduler", "hierarchical queues", "gang scheduling"],
        "instance_types": [],
        "use_cases": [
            "app-aware scheduling with hierarchical queues",
            "YuniKorn-style gang scheduling",
        ],
        "related": ["kueue-job", "volcano-gang-job"],
    },
    "keda-scaled-job": {
        "category": "Schedulers",
        "summary": "KEDA ScaledJob — custom SQS-triggered autoscaling. Template for custom consumers.",
        "gpu": "no",
        "opt_in": "",
        "submission": "kubectl apply -f examples/keda-scaled-job.yaml",
        "keywords": ["keda", "scaledjob", "autoscaling", "sqs", "event driven"],
        "instance_types": [],
        "use_cases": [
            "scale jobs from SQS queue depth",
            "event-driven job autoscaling",
        ],
        "related": ["sqs-job-submission"],
    },
    "slurm-cluster-job": {
        "category": "Schedulers",
        "summary": "Slinky Slurm Operator — sbatch submission on Kubernetes for HPC workloads.",
        "gpu": "no",
        "opt_in": 'Slurm ("helm": {"slurm": {"enabled": true}} in cdk.json)',
        "submission": "kubectl apply -f examples/slurm-cluster-job.yaml",
        "keywords": ["slurm", "hpc", "sbatch", "slinky"],
        "instance_types": [],
        "use_cases": [
            "submit sbatch jobs on Kubernetes",
            "run HPC workloads with Slurm",
        ],
        "related": ["volcano-gang-job"],
    },
    "ray-cluster": {
        "category": "Distributed Computing",
        "summary": "KubeRay RayCluster for distributed training, tuning, and serving. Auto-scaling workers.",
        "gpu": "no",
        "opt_in": "",
        "submission": "kubectl apply -f examples/ray-cluster.yaml",
        "keywords": ["ray", "kuberay", "distributed", "tune", "serve"],
        "instance_types": [],
        "use_cases": [
            "stand up a Ray cluster on EKS",
            "distributed training and tuning with Ray",
        ],
        "related": ["multi-gpu-training", "pipeline-dag"],
    },
    "pipeline-dag": {
        "category": "DAG Pipelines",
        "summary": "Multi-step pipeline with dependency ordering. Preprocess → Train via shared EFS.",
        "gpu": "no",
        "opt_in": "",
        "submission": "gco dag run examples/pipeline-dag.yaml -r us-east-1",
        "keywords": ["dag", "pipeline", "workflow", "dependencies"],
        "instance_types": [],
        "use_cases": [
            "run a multi-step ML pipeline",
            "chain preprocess and train jobs",
        ],
        "related": ["dag-step-preprocess", "dag-step-train", "ray-cluster"],
    },
    "dag-step-preprocess": {
        "category": "DAG Pipelines",
        "summary": "DAG step 1: generates training data on shared EFS.",
        "gpu": "no",
        "opt_in": "",
        "submission": "(used by pipeline-dag.yaml)",
        "keywords": ["dag", "preprocess", "step", "pipeline"],
        "instance_types": [],
        "use_cases": [
            "preprocessing step of a pipeline",
            "generate training data for a downstream step",
        ],
        "related": ["pipeline-dag", "dag-step-train"],
    },
    "dag-step-train": {
        "category": "DAG Pipelines",
        "summary": "DAG step 2: reads preprocess output, trains model, writes artifacts to EFS.",
        "gpu": "no",
        "opt_in": "",
        "submission": "(used by pipeline-dag.yaml)",
        "keywords": ["dag", "train", "step", "pipeline"],
        "instance_types": [],
        "use_cases": [
            "training step of a pipeline",
            "consume preprocess output and train",
        ],
        "related": ["pipeline-dag", "dag-step-preprocess"],
    },
}

497

498

# ---------------------------------------------------------------------------
# Doc metadata — used by ``find_docs`` and the docs:// discovery resources to
# describe every markdown file under ``docs/``. Indexed by basename without
# extension (e.g. ``ARCHITECTURE``). The vocabulary in ``topics`` is kept
# small and consistent so topic-based search across docs stays predictable.
#
# Every entry carries four fields: ``summary`` (a single sentence),
# ``topics`` (drawn from the small shared vocabulary described above),
# ``keywords`` (free-form search terms), and ``related`` (keys of other
# entries in this dict).
# ---------------------------------------------------------------------------

DOC_METADATA: dict[str, dict[str, str | list[str]]] = {
    "ANALYTICS": {
        "summary": "Optional SageMaker Studio + EMR Serverless analytics environment, enabled via a single cdk.json toggle.",
        "topics": ["analytics", "storage", "customization", "gpu"],
        "keywords": [
            "sagemaker studio",
            "emr serverless",
            "cognito",
            "data science",
            "notebook",
            "presigned url",
            "studio domain",
            "analytics environment",
            "user pool",
        ],
        "related": ["CLUSTER_SHARED_BUCKET", "CUSTOMIZATION"],
    },
    "API": {
        "summary": "REST API reference for the GCO Manifest Processor service — endpoints, auth, and CLI quick reference.",
        "topics": ["api", "cli", "jobs", "inference", "webhooks", "templates"],
        "keywords": [
            "rest",
            "manifest processor",
            "endpoints",
            "auth",
            "x-gco-auth-token",
            "api gateway",
            "sigv4",
            "openapi",
            "submit job",
        ],
        "related": ["CLI", "ARCHITECTURE"],
    },
    "ARCHITECTURE": {
        "summary": "Deep dive into the multi-region infrastructure, security layers, data flow, and scale characteristics.",
        "topics": [
            "architecture",
            "concepts",
            "security",
            "multi-region",
            "eks",
            "capacity",
            "inference",
            "gpu",
            "monitoring",
            "deployment",
            "nodepools",
            "storage",
            "images",
            "cost",
            "networking",
        ],
        "keywords": [
            "multi-region",
            "eks",
            "vpc",
            "global accelerator",
            "data flow",
            "control plane",
            "data plane",
            "regional stack",
            "global stack",
            "iam",
            "kms",
            "high level design",
            "blast radius",
        ],
        "related": ["CONCEPTS", "CUSTOMIZATION", "API"],
    },
    "CLI": {
        "summary": "Complete command-line interface reference for the gco CLI across jobs, queues, stacks, capacity, inference, and more.",
        "topics": [
            "cli",
            "api",
            "jobs",
            "capacity",
            "inference",
            "cost",
            "gpu",
            "multi-region",
            "images",
            "nodepools",
            "deployment",
        ],
        "keywords": [
            "gco",
            "command-line",
            "subcommand",
            "submit job",
            "stacks deploy",
            "stacks destroy",
            "capacity status",
            "ai_recommend",
            "reserve_capacity",
            "images build",
            "models upload",
        ],
        "related": ["API", "RUNBOOKS"],
    },
    "CLUSTER_SHARED_BUCKET": {
        "summary": "Reference for the always-on Cluster_Shared_Bucket — the S3 bucket every regional cluster can read and write by default.",
        "topics": ["storage", "concepts", "multi-region", "security"],
        "keywords": [
            "s3",
            "shared bucket",
            "cross-region",
            "configmap",
            "envFrom",
            "kms",
            "iam grant",
            "bucket policy",
            "always-on",
        ],
        "related": ["ANALYTICS", "ARCHITECTURE"],
    },
    "CONCEPTS": {
        "summary": "Fundamental concepts behind GCO — what it is, the problems it solves, and how the key components fit together.",
        "topics": [
            "concepts",
            "architecture",
            "multi-region",
            "capacity",
            "gpu",
            "eks",
            "jobs",
            "inference",
        ],
        "keywords": [
            "what is gco",
            "fundamentals",
            "components",
            "global queue",
            "capacity orchestration",
            "ai/ml workloads",
            "gpu allocation",
            "regional clusters",
        ],
        "related": ["ARCHITECTURE", "README"],
    },
    "CUSTOMIZATION": {
        "summary": "How to customize GCO — deployment regions, EKS configuration, GPU nodepools, and more.",
        "topics": [
            "customization",
            "architecture",
            "gpu",
            "eks",
            "nodepools",
            "storage",
            "multi-region",
            "deployment",
        ],
        "keywords": [
            "cdk.json",
            "regions",
            "addons",
            "instance types",
            "fsx",
            "valkey",
            "aurora",
            "feature toggles",
            "queue processor",
            "helm charts",
            "image registry config",
        ],
        "related": ["ARCHITECTURE", "ANALYTICS"],
    },
    "INFERENCE": {
        "summary": "Deploy and manage multi-region GPU inference endpoints, including model weight management and supported frameworks.",
        "topics": [
            "inference",
            "architecture",
            "gpu",
            "multi-region",
            "cost",
            "images",
            "monitoring",
        ],
        "keywords": [
            "vllm",
            "tgi",
            "triton",
            "torchserve",
            "sglang",
            "endpoints",
            "canary",
            "rolling update",
            "model weights",
            "global accelerator",
            "openai-compatible",
            "inference monitor",
        ],
        "related": ["ARCHITECTURE", "RUNBOOKS"],
    },
    "KEDA": {
        "summary": "KEDA event-driven autoscaling integration — scales workloads from external sources like SQS, Kafka, and Prometheus.",
        "topics": ["schedulers", "jobs", "autoscaling"],
        "keywords": [
            "keda",
            "scaledjob",
            "scaledobject",
            "sqs trigger",
            "event-driven",
            "kafka",
            "prometheus",
            "queue depth",
        ],
        "related": ["SCHEDULERS", "VOLCANO"],
    },
    "KUBERAY": {
        "summary": "KubeRay operator integration — runs Ray distributed computing workloads on Kubernetes for training, tuning, and serving.",
        "topics": ["schedulers", "jobs", "gpu", "training", "distributed"],
        "keywords": [
            "kuberay",
            "ray",
            "raycluster",
            "rayjob",
            "rayservice",
            "ray tune",
            "ray train",
            "ray serve",
            "distributed",
        ],
        "related": ["SCHEDULERS", "VOLCANO"],
    },
    "KUEUE": {
        "summary": "Kueue integration for Kubernetes-native job queueing with resource quotas, fair sharing, and priority scheduling.",
        "topics": ["schedulers", "jobs"],
        "keywords": [
            "kueue",
            "clusterqueue",
            "localqueue",
            "resourceflavor",
            "quota",
            "fair sharing",
            "priority",
            "preemption",
        ],
        "related": ["SCHEDULERS", "VOLCANO", "YUNIKORN"],
    },
    "MISSION": {
        "summary": "Goal-directed iteration loop — declare a directive, criteria, tool allowlist, and budget; runs deterministic five-phase iterations until a verdict.",
        "topics": [
            "concepts",
            "cli",
            "api",
            "automation",
            "feature-flags",
        ],
        "keywords": [
            "mission",
            "directive",
            "criteria",
            "verdict",
            "iteration",
            "goal directed",
            "autonomous loop",
            "sampling",
            "sandbox",
            "predicate",
            "checkpoint",
            "budget",
            "final report",
        ],
        "related": ["CLI", "ARCHITECTURE", "RUNBOOKS"],
    },
    "README": {
        "summary": "Documentation index — the top-level guide map for the rest of the docs/ tree.",
        "topics": ["concepts", "multi-region", "gpu", "capacity", "inference", "quickstart"],
        "keywords": [
            "index",
            "overview",
            "guide map",
            "documentation",
            "table of contents",
            "getting started",
        ],
        "related": ["CONCEPTS", "ARCHITECTURE"],
    },
    "RUNBOOKS": {
        "summary": "Operational runbooks — step-by-step procedures for common operational scenarios with symptoms, diagnosis, and resolution.",
        "topics": [
            "runbooks",
            "troubleshooting",
            "jobs",
            "inference",
            "capacity",
            "monitoring",
            "deployment",
        ],
        "keywords": [
            "incident response",
            "operational procedures",
            "stuck job",
            "endpoint down",
            "capacity exhausted",
            "stack rollback",
            "playbook",
            "diagnose",
            "remediation",
        ],
        "related": ["TROUBLESHOOTING", "CLI"],
    },
    "SCHEDULERS": {
        "summary": "Comparison and overview of the six supported scheduling and orchestration tools — Volcano, Kueue, KubeRay, KEDA, Slurm, YuniKorn.",
        "topics": ["schedulers", "concepts", "jobs", "gpu"],
        "keywords": [
            "volcano",
            "kueue",
            "kuberay",
            "keda",
            "slurm",
            "yunikorn",
            "gang scheduling",
            "batch scheduler",
            "scheduler comparison",
            "queueing",
        ],
        "related": ["VOLCANO", "KUEUE", "KUBERAY"],
    },
    "SLURM_OPERATOR": {
        "summary": "Slinky Slurm Operator integration — runs sbatch, srun, and salloc inside an EKS cluster for HPC workflows.",
        "topics": ["schedulers", "jobs", "hpc"],
        "keywords": [
            "slurm",
            "slinky",
            "sbatch",
            "srun",
            "salloc",
            "hpc",
            "scientific computing",
            "mpi",
        ],
        "related": ["SCHEDULERS", "VOLCANO"],
    },
    "TROUBLESHOOTING": {
        "summary": "Troubleshooting guide — common installation, deployment, kubectl, and pod issues with their resolutions.",
        "topics": [
            "troubleshooting",
            "runbooks",
            "deployment",
            "eks",
            "jobs",
            "inference",
            "capacity",
        ],
        "keywords": [
            "kubectl",
            "pod crashloop",
            "imagepullbackoff",
            "stack rollback",
            "deployment failed",
            "credentials",
            "vpc",
            "nodepool not scaling",
            "common errors",
            "fix",
        ],
        "related": ["RUNBOOKS", "CLI"],
    },
    "VOLCANO": {
        "summary": "Volcano batch scheduler integration — gang scheduling, fair-share queuing, and job lifecycle management for AI/ML and HPC.",
        "topics": ["schedulers", "jobs", "gpu", "hpc"],
        "keywords": [
            "volcano",
            "gang scheduling",
            "vcjob",
            "podgroup",
            "queue",
            "fair share",
            "job lifecycle",
            "batch",
        ],
        "related": ["SCHEDULERS", "KUEUE", "YUNIKORN"],
    },
    "YUNIKORN": {
        "summary": "Apache YuniKorn integration — multi-tenant scheduler with hierarchical queues and gang scheduling.",
        "topics": ["schedulers", "jobs"],
        "keywords": [
            "yunikorn",
            "hierarchical queues",
            "multi-tenant",
            "gang scheduling",
            "app-aware scheduling",
            "fair share",
        ],
        "related": ["SCHEDULERS", "KUEUE", "VOLCANO"],
    },
}

894

895

896# ---------------------------------------------------------------------------

897# Package-doc metadata — used by ``find_docs`` and the

898# ``docs://gco/packages/...`` resources to describe the package-level READMEs

899# that live next to the code (under ``mcp/``) rather than in ``docs/``. These

900# are developer-facing internals guides (how a package is structured and how to

901# customize it), kept in a catalog separate from ``DOC_METADATA`` so the strict

902# 1:1 ``docs/*.md`` invariant stays intact. Each entry is keyed by a stable

903# slug and carries a ``path`` relative to the project root. A ``related`` entry

904# may reference either another package slug or a ``DOC_METADATA`` key.

905# ---------------------------------------------------------------------------

906

907PACKAGE_DOC_METADATA: dict[str, dict[str, str | list[str]]] = {

908 "mcp-server": {

909 "path": "mcp/README.md",

910 "summary": "GCO MCP server guide — setup across MCP clients, feature-flag gating, and the full tool and resource catalog.",

911 "topics": ["mcp", "concepts", "feature-flags", "customization"],

912 "keywords": [

913 "mcp server",

914 "fastmcp",

915 "stdio",

916 "feature flags",

917 "gco_enable",

918 "kiro",

919 "claude desktop",

920 "cursor",

921 "tool search",

922 "available tools",

923 "resources",

924 ],

925 "related": ["mcp-tools", "mcp-resources", "CLI", "MISSION"],

926 },

927 "mcp-tools": {

928 "path": "mcp/tools/README.md",

929 "summary": "How MCP tools are defined — one module per domain, the @mcp.tool + audit_logged pattern, and how to add a new tool.",

930 "topics": ["mcp", "customization"],

931 "keywords": [

932 "tool",

933 "mcp.tool",

934 "audit_logged",

935 "cli_runner",

936 "adding a tool",

937 "tool module",

938 "domain",

939 ],

940 "related": ["mcp-server", "mcp-resources"],

941 },

942 "mcp-resources": {

943 "path": "mcp/resources/README.md",

944 "summary": "MCP resource modules by URI scheme (docs://, source://, k8s://, …) and how to add a new resource group.",

945 "topics": ["mcp", "customization"],

946 "keywords": [

947 "resource",

948 "mcp.resource",

949 "uri scheme",

950 "docs scheme",

951 "source scheme",

952 "resource index",

953 "adding a resource",

954 ],

955 "related": ["mcp-server", "mcp-tools"],

956 },

957 "mcp-mission": {

958 "path": "mcp/mission/README.md",

959 "summary": "Mission package internals — the five-phase goal-directed loop, deterministic verdict cascade, sandboxes, and how to extend each piece.",

960 "topics": ["mcp", "automation", "customization", "concepts"],

961 "keywords": [

962 "mission",

963 "engine",

964 "verdict",

965 "five-phase loop",

966 "sandbox",

967 "predicate",

968 "sampling",

969 "criteria",

970 "customize mission",

971 "module map",

972 ],

973 "related": ["MISSION", "mcp-metric-readers", "mcp-mission-judge", "mcp-server"],

974 },

975 "mcp-metric-readers": {

976 "path": "mcp/metric_readers/README.md",

977 "summary": "Pure helpers behind the read-only metric-reader tools — and how to add an aggregation mode, file format, error code, or whole new reader source.",

978 "topics": ["mcp", "metrics", "customization", "automation"],

979 "keywords": [

980 "metric reader",

981 "cloudwatch",

982 "aggregation mode",

983 "file format",

984 "parquet",

985 "jsonl",

986 "error code",

987 "metrics_result",

988 "customize metrics",

989 "local root",

990 ],

991 "related": ["mcp-mission-judge", "mcp-mission", "MISSION", "mcp-server"],

992 },

993 "mcp-mission-judge": {

994 "path": "mcp/mission_judge/README.md",

995 "summary": "LLM-as-judge progress scoring internals — the versioned rubric, deterministic prompt, score parsing, and how to customize the scoring for your use case.",

996 "topics": ["mcp", "metrics", "customization", "automation"],

997 "keywords": [

998 "semantic progress",

999 "judge",

1000 "rubric",

1001 "prompt",

1002 "score parsing",

1003 "progress_score",

1004 "gco_enable_semantic_progress",

1005 "llm as judge",

1006 "customize rubric",

1007 ],

1008 "related": ["mcp-metric-readers", "mcp-mission", "MISSION", "mcp-server"],

1009 },

1010}

1011

1012

@mcp.resource("docs://gco/index")
def docs_index() -> str:
    """List all available GCO documentation, examples, and configuration resources.

    Builds a markdown index with these sections, in order: project overview,
    documentation (one bullet per ``docs/*.md`` file, sorted), package
    internals (one bullet per ``PACKAGE_DOC_METADATA`` entry), example
    manifests (discovery helpers, then one bullet per ``examples/*.yaml``
    grouped under its category heading), live-state URIs, and the other
    resource-group indexes.

    Returns:
        The index as a single newline-joined markdown string.
    """
    out: list[str] = [
        "# GCO Resource Index\n",
        "## Project Overview",
        "- `docs://gco/README` — Project README and overview",
        "- `docs://gco/QUICKSTART` — Quick start guide (deploy in under 60 minutes)",
        "- `docs://gco/CONTRIBUTING` — Contributing guide\n",
        "## Documentation",
        "- `find_docs(query=..., topic=..., limit=...)` tool — search the docs catalog by topic and free-text query",
        "- `docs://gco/docs/by-topic/{topic}` — list every doc tagged with a given topic phrase",
        "- `docs://gco/docs/by-related/{doc_name}` — list every doc related to the given doc",
    ]
    out.extend(
        f"- `docs://gco/docs/{doc.stem}` — {doc.stem}" for doc in sorted(DOCS_DIR.glob("*.md"))
    )

    out += [
        "\n## Package Internals",
        "Developer-facing guides to the code packages under `mcp/` — structure and "
        "how to customize each. Also searchable via `find_docs`.",
    ]
    out.extend(
        f"- `docs://gco/packages/{slug}` — {info.get('summary', '')}"
        for slug, info in PACKAGE_DOC_METADATA.items()
    )

    out += [
        "\n## Example Manifests",
        "- `docs://gco/examples/README` — Examples overview and usage guide",
        "- `docs://gco/examples/guide` — How to create new job manifests (patterns & metadata)",
        "\n### Discovery",
        "- `find_examples(query=..., category=..., gpu=..., opt_in=..., limit=...)` tool — "
        "search the catalog by keyword and filters",
        "- `docs://gco/examples/by-category/{category}` — list every example in a given category",
        "- `docs://gco/examples/by-use-case/{use_case}` — list every example matching a use-case phrase",
        "- `docs://gco/examples/{name}` — full manifest plus metadata header for a single example\n",
    ]

    # Group example bullets under their category; manifests without catalog
    # metadata land in "Other" and use their own name as the summary.
    grouped: dict[str, list[str]] = {}
    for manifest in sorted(EXAMPLES_DIR.glob("*.yaml")):
        stem = manifest.stem
        info = EXAMPLE_METADATA.get(stem, {})
        raw_category = info.get("category", "Other")
        heading = raw_category if isinstance(raw_category, str) else "Other"
        raw_summary = info.get("summary", stem)
        blurb = raw_summary if isinstance(raw_summary, str) else stem
        grouped.setdefault(heading, []).append(f"- `docs://gco/examples/{stem}` — {blurb}")

    for heading, bullets in grouped.items():
        out += [f"### {heading}", *bullets, ""]

    out += [
        "## Live State",
        "- `gco://jobs/{job_name}` — live YAML for a Kubernetes Job in the `gco-jobs` namespace",
        "- `gco://inference/{endpoint_name}` — desired-state record for an inference endpoint "
        "from the DynamoDB store",
        "- `gco://k8s/{namespace}/{kind}/{name}` — live YAML for any Kubernetes resource in any namespace",
        "- `gco://cluster/{region}/topology` — Karpenter NodePools plus Pending pods snapshot for one region",
        "- `costs://gco/summary/{days_window}` — cost summary for the given day window (positive integer)",
        "- `tasks://gco/{task_id}` — current status of a FastMCP background task by ID",
        "",
        "## Other Resource Groups",
        "- `k8s://gco/manifests/index` — Kubernetes manifests deployed to EKS",
        "- `iam://gco/policies/index` — IAM policy templates",
        "- `infra://gco/index` — Dockerfiles, Helm charts, CI/CD config",
        "- `ci://gco/index` — GitHub Actions workflows, composite actions, templates",
        "- `source://gco/index` — Source code browser",
        "- `demos://gco/index` — Demo walkthroughs and presentation materials",
        "- `clients://gco/index` — API client examples (Python, curl, AWS CLI)",
        "- `scripts://gco/index` — Utility scripts",
        "- `tests://gco/index` — Test suite documentation and patterns",
        "- `config://gco/index` — CDK configuration, feature toggles, environment variables",
    ]
    return "\n".join(out)

1115

1116

@mcp.resource("docs://gco/README")
def readme_resource() -> str:
    """The main project README with overview and quickstart information.

    Returns:
        The text of ``README.md`` at the project root, or the literal
        ``README.md not found.`` when the file is absent — the same
        recoverable-string convention ``quickstart_resource`` and
        ``contributing_resource`` use, instead of raising on a missing file.
    """
    path = PROJECT_ROOT / "README.md"
    if not path.is_file():
        return "README.md not found."
    return path.read_text()

1121

1122

@mcp.resource("docs://gco/QUICKSTART")
def quickstart_resource() -> str:
    """Quick start guide — get running in under 60 minutes.

    Returns:
        The text of ``QUICKSTART.md`` at the project root, or the literal
        ``QUICKSTART.md not found.`` when the file does not exist.
    """
    quickstart = PROJECT_ROOT / "QUICKSTART.md"
    return quickstart.read_text() if quickstart.is_file() else "QUICKSTART.md not found."

1130

1131

@mcp.resource("docs://gco/CONTRIBUTING")
def contributing_resource() -> str:
    """Contributing guide — how to contribute to the project.

    Returns:
        The text of ``CONTRIBUTING.md`` at the project root, or the literal
        ``CONTRIBUTING.md not found.`` when the file does not exist.
    """
    guide = PROJECT_ROOT / "CONTRIBUTING.md"
    return guide.read_text() if guide.is_file() else "CONTRIBUTING.md not found."

1139

1140

@mcp.resource("docs://gco/docs/{doc_name}")
def doc_resource(doc_name: str) -> str:
    """Read a documentation file by name (e.g. ARCHITECTURE, CLI, INFERENCE).

    Prepends an HTML-comment header with ``Topics:`` and ``Related:`` lines
    pulled from ``DOC_METADATA`` so an LLM consuming the rendered markdown
    sees the doc's classification without it bleeding into the rendered
    output. HTML comments are used rather than ``#`` because docs are
    markdown — Python-style comments would render as text.

    Args:
        doc_name: Stem of a markdown file directly inside ``docs/``.

    Returns:
        The file content, preceded by the header and a blank line when the
        doc has non-empty ``topics`` and/or ``related`` metadata. An unknown
        name — or one carrying a directory component — returns the literal
        ``Document 'X' not found. Available: ...`` string with the known doc
        names in sorted order.
    """
    path = DOCS_DIR / f"{doc_name}.md"
    # ``Path(doc_name).name`` differs from ``doc_name`` whenever the name
    # carries a directory component, so a crafted name such as ``../secret``
    # cannot make the lookup leave ``docs/``.
    if Path(doc_name).name != doc_name or not path.is_file():
        # glob order depends on the filesystem; sort for a stable message.
        available = sorted(f.stem for f in DOCS_DIR.glob("*.md"))
        return f"Document '{doc_name}' not found. Available: {', '.join(available)}"
    content = path.read_text()
    meta = DOC_METADATA.get(doc_name, {})
    header_lines: list[str] = []
    topics = meta.get("topics", [])
    if isinstance(topics, list) and topics:
        header_lines.append(f"<!-- Topics: {', '.join(topics)} -->")
    related = meta.get("related", [])
    if isinstance(related, list) and related:
        header_lines.append(f"<!-- Related: {', '.join(related)} -->")
    if header_lines:
        return "\n".join(header_lines) + "\n\n" + content
    return content

1167

1168

@mcp.resource("docs://gco/packages/{package_name}")
def package_doc_resource(package_name: str) -> str:
    """Read a package-level README by slug (e.g. mcp-mission, mcp-metric-readers).

    Serves the developer-facing internals guides catalogued in
    ``PACKAGE_DOC_METADATA`` — the README files that live next to the code
    under ``mcp/`` rather than in ``docs/``. Prepends an HTML-comment header
    with ``Topics:`` and ``Related:`` lines (mirroring :func:`doc_resource`)
    so a consuming LLM sees the classification without it bleeding into the
    rendered markdown.

    Args:
        package_name: A key of ``PACKAGE_DOC_METADATA``.

    Returns:
        The README content, preceded by the header and a blank line when the
        entry has non-empty ``topics`` and/or ``related``. An unknown slug
        returns the literal ``Package doc 'X' not found. Available: ...``
        string so callers can recover; a catalogued slug whose file is
        missing returns ``Package doc 'X' file not found at '<path>'.``.
    """
    meta = PACKAGE_DOC_METADATA.get(package_name)
    if meta is None:
        available = ", ".join(sorted(PACKAGE_DOC_METADATA.keys()))
        return f"Package doc '{package_name}' not found. Available: {available}"
    # The path comes from the catalog, never from the caller-supplied slug.
    rel_path = str(meta.get("path", ""))
    path = PROJECT_ROOT / rel_path
    if not path.is_file():
        return f"Package doc '{package_name}' file not found at '{rel_path}'."
    content = path.read_text()
    header_lines: list[str] = []
    topics = meta.get("topics", [])
    if isinstance(topics, list) and topics:
        header_lines.append(f"<!-- Topics: {', '.join(topics)} -->")
    related = meta.get("related", [])
    if isinstance(related, list) and related:
        header_lines.append(f"<!-- Related: {', '.join(related)} -->")
    if header_lines:
        return "\n".join(header_lines) + "\n\n" + content
    return content

1200

1201

@mcp.resource("docs://gco/examples/README")
def examples_readme_resource() -> str:
    """Examples README — overview of all example manifests with usage instructions.

    Returns:
        The text of ``README.md`` inside the examples directory, or the
        literal ``Examples README.md not found.`` when the file is absent.
    """
    readme = EXAMPLES_DIR / "README.md"
    return readme.read_text() if readme.is_file() else "Examples README.md not found."

1209

1210

@mcp.resource("docs://gco/examples/guide")
def examples_guide_resource() -> str:
    """How to create new job manifests — patterns, metadata, and best practices.

    Use this resource when you need to write a new Kubernetes manifest for GCO.
    It provides the metadata for every existing example so you can pick the
    closest one as a starting point and adapt it.

    Returns:
        A markdown document: a table with one row per ``EXAMPLE_METADATA``
        entry (name, category, keywords, GPU, opt-in, submission command),
        followed by "Common Patterns" YAML snippets and the list of
        submission methods.
    """
    lines = [
        "# GCO Example Manifest Guide\n",
        "Use this guide to create new Kubernetes manifests for GCO. Pick the closest",
        "existing example as a starting point, then adapt it.\n",
        "## All Examples with Metadata\n",
        # A markdown table needs a header row above the delimiter row; the
        # six columns match the six cells emitted per example below.
        "| Example | Category | Keywords | GPU | Opt-in | How to Submit |",
        "|---------|----------|----------|-----|--------|---------------|",
    ]
    for name, meta in EXAMPLE_METADATA.items():
        # .get() keeps one incomplete catalog entry from failing the whole guide.
        category = meta.get("category", "Unknown")
        gpu = meta.get("gpu", "no")
        # An empty opt_in string means "none required"; show a dash instead.
        opt_in = meta.get("opt_in", "—") or "—"
        submission = meta.get("submission", "")
        keywords = meta.get("keywords", [])
        keywords_cell = ", ".join(keywords) if isinstance(keywords, list) and keywords else "—"
        lines.append(
            f"| `{name}` | {category} | {keywords_cell} | {gpu} | {opt_in} | "
            f"`{submission}` |"
        )

    # Snippet indentation is significant: it is what expresses YAML nesting.
    lines.extend(
        [
            "\n## Common Patterns\n",
            "### Namespace",
            "All GCO jobs use `namespace: gco-jobs`. Inference uses `namespace: gco-inference`.\n",
            "### Security Context (required)",
            "```yaml",
            "securityContext:",
            "  runAsNonRoot: true",
            "  runAsUser: 1000",
            "  runAsGroup: 1000",
            "containers:",
            "- securityContext:",
            "    allowPrivilegeEscalation: false",
            "    capabilities:",
            '      drop: ["ALL"]',
            "```\n",
            "### GPU Resources",
            "```yaml",
            "resources:",
            "  requests:",
            '    nvidia.com/gpu: "1"',
            "  limits:",
            '    nvidia.com/gpu: "1"',
            "tolerations:",
            "- key: nvidia.com/gpu",
            "  operator: Equal",
            '  value: "true"',
            "  effect: NoSchedule",
            "```\n",
            "### EFS Shared Storage",
            "```yaml",
            "volumeMounts:",
            "- name: shared-storage",
            "  mountPath: /mnt/gco",
            "volumes:",
            "- name: shared-storage",
            "  persistentVolumeClaim:",
            "    claimName: gco-shared-storage",
            "```\n",
            "### Prevent Node Consolidation (long-running jobs)",
            "```yaml",
            "metadata:",
            "  annotations:",
            '    karpenter.sh/do-not-disrupt: "true"',
            "```\n",
            "### Submission Methods",
            "1. **SQS (recommended):** `gco jobs submit-sqs <manifest> --region <region>`",
            "2. **API Gateway:** `gco jobs submit <manifest>`",
            "3. **Direct kubectl:** `gco jobs submit-direct <manifest> -r <region>`",
            "4. **kubectl apply:** `kubectl apply -f <manifest>`",
        ]
    )
    return "\n".join(lines)

1288

1289

@mcp.resource("docs://gco/examples/{example_name}")
def example_resource(example_name: str) -> str:
    """Read an example manifest by name, with metadata context for creating similar jobs.

    Returns the raw YAML manifest preceded by a metadata header that describes
    what the example does, its requirements, and how to submit it.

    Args:
        example_name: Stem of a ``.yaml`` file directly inside the examples
            directory.

    Returns:
        The manifest text, preceded by a ``#``-comment header when the
        example has an ``EXAMPLE_METADATA`` entry. An unknown name — or one
        carrying a directory component — returns the literal
        ``Example 'X' not found. Available: ...`` string with the known
        example names in sorted order.
    """
    path = EXAMPLES_DIR / f"{example_name}.yaml"
    # ``Path(example_name).name`` differs from ``example_name`` whenever the
    # name carries a directory component, so a crafted name cannot make the
    # lookup leave the examples directory.
    if Path(example_name).name != example_name or not path.is_file():
        # glob order depends on the filesystem; sort for a stable message.
        available = sorted(f.stem for f in EXAMPLES_DIR.glob("*.yaml"))
        return f"Example '{example_name}' not found. Available: {', '.join(available)}"

    meta = EXAMPLE_METADATA.get(example_name, {})
    header_lines: list[str] = []
    if meta:
        header_lines.append(f"# Example: {example_name}")
        header_lines.append(f"# Category: {meta.get('category', 'Unknown')}")
        header_lines.append(f"# Summary: {meta.get('summary', '')}")
        # "no" is the catalog's marker for CPU-only examples.
        if meta.get("gpu", "no") != "no":
            header_lines.append(f"# GPU/Accelerator: {meta['gpu']}")
        if meta.get("opt_in"):
            header_lines.append(f"# Opt-in required: {meta['opt_in']}")
        default_submission = f"kubectl apply -f examples/{example_name}.yaml"
        header_lines.append(f"# Submit with: {meta.get('submission', default_submission)}")
        # The list-valued fields are optional; emit a line only when non-empty.
        for label, key in (
            ("Keywords", "keywords"),
            ("Instance Types", "instance_types"),
            ("Use Cases", "use_cases"),
            ("Related", "related"),
        ):
            values = meta.get(key, [])
            if isinstance(values, list) and values:
                header_lines.append(f"# {label}: {', '.join(values)}")
        header_lines.append("#")
        # The trailing newline separates the header from the manifest's first line.
        header_lines.append("# --- Manifest begins below ---\n")

    manifest = path.read_text()
    if header_lines:
        return "\n".join(header_lines) + manifest
    return manifest

1334

1335

@mcp.resource("docs://gco/examples/by-category/{category}")
def examples_by_category_resource(category: str) -> str:
    """List examples grouped by category.

    Compares ``category`` case-insensitively against each
    ``EXAMPLE_METADATA`` entry's ``category`` field.

    Returns:
        A markdown listing (heading plus one bullet per matching example,
        ordered by example name). When no entry matches, the literal
        "Category 'X' not found. Available: ..." string listing the known
        categories so callers can recover.
    """
    wanted = category.lower()
    hits = []
    for slug, info in EXAMPLE_METADATA.items():
        if str(info.get("category", "")).lower() == wanted:
            hits.append((slug, info))

    if not hits:
        known = {str(info.get("category", "")) for info in EXAMPLE_METADATA.values()}
        return f"Category '{category}' not found. Available: {', '.join(sorted(known))}"

    bullets = [
        f"- `docs://gco/examples/{slug}` — {info.get('summary', '')}"
        for slug, info in sorted(hits)
    ]
    return "\n".join([f"# Examples in category: {category}\n", *bullets])

1357

1358

@mcp.resource("docs://gco/examples/by-use-case/{use_case}")
def examples_by_use_case_resource(use_case: str) -> str:
    """List examples whose use_cases include the given phrase (case-insensitive).

    An example matches when the lowercased ``use_case`` is a substring of any
    entry in its ``use_cases`` list.

    Returns:
        A markdown listing (heading plus one bullet per matching example,
        ordered by example name). When nothing matches, the literal "No
        examples match use case 'X'." string followed by a pointer to
        ``find_examples`` for broader search.
    """
    needle = use_case.lower()

    def _mentions(info: dict[str, str | list[str]]) -> bool:
        phrases = info.get("use_cases", [])
        if not isinstance(phrases, list):
            return False
        return any(needle in str(phrase).lower() for phrase in phrases)

    hits: list[tuple[str, dict[str, str | list[str]]]] = [
        (slug, info) for slug, info in EXAMPLE_METADATA.items() if _mentions(info)
    ]
    if not hits:
        return (
            f"No examples match use case '{use_case}'. "
            "Try `find_examples(query=...)` for broader search."
        )

    bullets = [
        f"- `docs://gco/examples/{slug}` — {info.get('summary', '')}"
        for slug, info in sorted(hits)
    ]
    return "\n".join([f"# Examples matching use case: {use_case}\n", *bullets])

1382

1383

@mcp.resource("docs://gco/docs/by-topic/{topic}")
def docs_by_topic_resource(topic: str) -> str:
    """List docs whose topics include the given phrase (case-insensitive).

    A doc matches when the lowercased ``topic`` is a substring of any entry
    in its ``topics`` list in ``DOC_METADATA``.

    Returns:
        A markdown listing (heading plus one bullet per matching doc, ordered
        by doc name). When nothing matches, the literal ``Topic 'X' not
        found. Available: ...`` string with the sorted union of every known
        topic so callers can recover.
    """
    needle = topic.lower()
    hits: list[tuple[str, dict[str, str | list[str]]]] = []
    known_topics: set[str] = set()
    for doc, info in DOC_METADATA.items():
        tags = info.get("topics", [])
        if not isinstance(tags, list):
            continue
        known_topics.update(str(tag) for tag in tags)
        if any(needle in str(tag).lower() for tag in tags):
            hits.append((doc, info))

    if not hits:
        return f"Topic '{topic}' not found. Available: {', '.join(sorted(known_topics))}"

    bullets = [
        f"- `docs://gco/docs/{doc}` — {info.get('summary', '')}" for doc, info in sorted(hits)
    ]
    return "\n".join([f"# Docs matching topic: {topic}\n", *bullets])

1411

1412

@mcp.resource("docs://gco/docs/by-related/{doc_name}")
def docs_by_related_resource(doc_name: str) -> str:
    """List docs related to ``doc_name``.

    Shows both directions of the relation recorded in ``DOC_METADATA``: the
    docs that ``doc_name`` lists in its own ``related`` field, then the docs
    whose ``related`` field lists ``doc_name``. Each group is de-duplicated
    and sorted, and is omitted when empty.

    Returns:
        The markdown listing, or the literal ``Doc 'X' not found. Available:
        ...`` string when ``doc_name`` is not a ``DOC_METADATA`` key.
    """
    if doc_name not in DOC_METADATA:
        return f"Doc '{doc_name}' not found. Available: {', '.join(sorted(DOC_METADATA.keys()))}"

    def _bullet(ref: str) -> str:
        # A name without a catalog entry still gets a bullet, with an empty summary.
        return f"- `docs://gco/docs/{ref}` — {DOC_METADATA.get(ref, {}).get('summary', '')}"

    own = DOC_METADATA[doc_name].get("related", [])
    outbound = sorted({str(ref) for ref in own}) if isinstance(own, list) else []

    inbound = sorted(
        other
        for other, info in DOC_METADATA.items()
        if isinstance(info.get("related", []), list) and doc_name in info.get("related", [])
    )

    out = [f"# Docs related to {doc_name}\n"]
    if outbound:
        out.append("## Referenced by this doc")
        out.extend(_bullet(ref) for ref in outbound)
        out.append("")
    if inbound:
        out.append("## Docs that reference this one")
        out.extend(_bullet(ref) for ref in inbound)
    return "\n".join(out)