Coverage for cli/stacks.py: 93%
933 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
1"""
2Stack management for GCO CLI.
4Provides commands for deploying, updating, and managing CDK stacks.
5This is the largest CLI module (~1600 lines) because it orchestrates the
6full deployment lifecycle including container runtime detection, CDK
7bootstrapping, Lambda source synchronization, and parallel regional deploys.
9This module handles:
10 - Container runtime detection (Docker, Finch, Podman) with automatic fallback
11 - CDK bootstrap across all target regions (idempotent)
12 - Lambda source synchronization (copies handler code + dependencies before synth)
13 - CDK stack deployment with proper dependency ordering:
14 1. Global stack (Global Accelerator, DynamoDB)
15 2. API Gateway stack (auth secret, Lambda proxy)
16 3. Regional stacks in parallel (EKS, VPC, ALB per region)
17 4. Monitoring stack (CloudWatch dashboards, alarms)
18 - Parallel deployment of regional stacks via ThreadPoolExecutor
19 - Stack destruction in reverse dependency order
20 - FSx for Lustre enable/disable toggle
21 - kubectl access configuration (EKS access entries + kubeconfig)
23Key Design Decisions:
24 - Regional stacks deploy in parallel for speed; global/API/monitoring are sequential
25 - Lambda build directories are synced before every deploy to avoid stale code
26 - Container runtime is auto-detected; CDK_DOCKER env var overrides
27 - All destructive operations require -y/--yes confirmation
28 - Stack status is read from CloudFormation, not cached locally
30Environment Variables:
31 CDK_DOCKER: Override container runtime (default: auto-detect Docker/Finch/Podman)
32 AWS_REGION: Default region for single-region operations
33"""
35from __future__ import annotations
37import logging
38import os
39import shutil
40import site
41import subprocess
42import sys
43from collections.abc import Callable
44from concurrent.futures import ThreadPoolExecutor, as_completed
45from dataclasses import dataclass, field
46from datetime import datetime
47from pathlib import Path
48from threading import Event, Lock, Thread
49from typing import TYPE_CHECKING, Any
51from botocore.exceptions import ClientError
53# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit
54# Flowchart(s) generated from this file:
55# * ``StackManager.deploy_orchestrated`` -> ``diagrams/code_diagrams/cli/stacks.StackManager_deploy_orchestrated.html``
56# (PNG: ``diagrams/code_diagrams/cli/stacks.StackManager_deploy_orchestrated.png``)
57# * ``StackManager.destroy_orchestrated`` -> ``diagrams/code_diagrams/cli/stacks.StackManager_destroy_orchestrated.html``
58# (PNG: ``diagrams/code_diagrams/cli/stacks.StackManager_destroy_orchestrated.png``)
59# Regenerate with ``python diagrams/code_diagrams/generate.py``.
60# <pyflowchart-code-diagram> END
63if TYPE_CHECKING:
64 from .config import GCOConfig
66logger = logging.getLogger(__name__)
@dataclass
class StackInfo:
    """Plain-data record describing one CDK stack.

    ``name``, ``status`` and ``region`` are required; timestamps default to
    ``None`` and ``outputs`` / ``tags`` default to fresh empty dicts.
    """

    name: str
    status: str
    region: str
    created_time: datetime | None = None
    updated_time: datetime | None = None
    # default_factory gives every instance its own dict rather than a shared one.
    outputs: dict[str, str] = field(default_factory=dict)
    tags: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a dict, with timestamps rendered via ``isoformat()``.

        Unset timestamps are emitted as ``None``. ``outputs`` and ``tags`` are
        returned by reference, not copied.
        """

        def _iso(moment: datetime | None) -> str | None:
            return moment.isoformat() if moment else None

        return dict(
            name=self.name,
            status=self.status,
            region=self.region,
            created_time=_iso(self.created_time),
            updated_time=_iso(self.updated_time),
            outputs=self.outputs,
            tags=self.tags,
        )
93def _safe_rmtree(path: Path) -> None:
94 """Remove a directory tree, handling broken symlinks on macOS.
96 shutil.rmtree can fail with ``OSError: [Errno 66] Directory not empty``
97 on macOS when pip-installed packages (e.g. botocore) contain broken
98 symlinks or extended-attribute resource forks.
100 Falls back to ``rm -rf`` via subprocess, but only after validating the
101 path is a real directory under the project tree to avoid accidents.
102 """
103 resolved = path.resolve()
105 # Safety: refuse to remove anything that isn't clearly a build artifact
106 # inside the project. The path must contain "lambda" and end with "-build".
107 if "lambda" not in resolved.parts or not resolved.name.endswith("-build"):
108 raise ValueError(f"Refusing to remove unexpected path: {resolved}")
110 try:
111 shutil.rmtree(str(resolved))
112 except OSError:
113 subprocess.run(["rm", "-rf", "--", str(resolved)], check=True)
116# Container runtime detection lives in cli/_container_runtime.py so it can
117# be shared between StackManager (CDK asset bundling) and ImageManager
118# (gco images build/push). The uncached probe is imported from there;
119# this module keeps its own small cache so existing tests that reset
120# ``cli.stacks._container_runtime_cache`` continue to work without
121# touching the new module's cache.
122from cli._container_runtime import ( # noqa: E402
123 _detect_container_runtime_uncached,
124)
# Memo for container runtime detection. Two variables are needed because the
# probe result itself may be ``None``, so "value is None" cannot double as
# "not probed yet".
_container_runtime_cache: str | None = None
_container_runtime_checked: bool = False


def _detect_container_runtime() -> str | None:
    """Return the container runtime used for CDK asset bundling, probing once.

    The first call delegates to
    ``cli._container_runtime._detect_container_runtime_uncached`` and stores
    whatever it returns in this module's globals; later calls return the
    stored value without probing again. The state lives on this module so
    that tests which patch or reset ``cli.stacks._container_runtime_cache``
    keep working.
    """
    global _container_runtime_cache, _container_runtime_checked
    if not _container_runtime_checked:
        _container_runtime_cache = _detect_container_runtime_uncached()
        _container_runtime_checked = True
    return _container_runtime_cache
149class StackManager:
150 """Manages CDK stack operations."""
152 def __init__(self, config: GCOConfig, project_root: Path | None = None):
153 self.config = config
154 self.project_root = project_root or self._find_project_root()
155 self._cdk_path = self._find_cdk()
157 # Ensure Lambda build directory exists before any CDK synth
158 self._ensure_lambda_build()
160 def _find_project_root(self) -> Path:
161 """Find the project root by looking for cdk.json."""
162 current = Path.cwd()
163 for parent in [current] + list(current.parents):
164 if (parent / "cdk.json").exists():
165 return parent
166 return current
168 def _find_cdk(self) -> str:
169 """Find CDK executable."""
170 # Check if cdk is in PATH
171 try:
172 result = subprocess.run(["which", "cdk"], capture_output=True, text=True, check=True)
173 return result.stdout.strip()
174 except subprocess.CalledProcessError:
175 pass
177 # Check common locations
178 for path in ["/usr/local/bin/cdk", "~/.npm-global/bin/cdk"]:
179 expanded = os.path.expanduser(path)
180 if os.path.exists(expanded):
181 return expanded
183 # Fall back to npx
184 return "npx cdk"
186 def _ensure_lambda_build(self) -> None:
187 """Ensure the Lambda build directories exist for CDK synthesis.
189 Called from __init__ so the directory is ready before any CDK synth
190 (even ``cdk list`` needs it because ``app.py`` references the asset).
192 This does a lightweight check — only builds if the directory is
193 missing or incomplete. It does NOT do a full rebuild (no rmtree).
194 The full rebuild happens in ``_rebuild_lambda_packages()``, which
195 is called by ``deploy()`` before the actual CDK deploy.
196 """
197 kubectl_source = self.project_root / "lambda" / "kubectl-applier-simple"
198 kubectl_build = self.project_root / "lambda" / "kubectl-applier-simple-build"
199 helm_source = self.project_root / "lambda" / "helm-installer"
200 helm_build = self.project_root / "lambda" / "helm-installer-build"
202 # Only build if the directory is missing or deps aren't installed
203 if kubectl_source.exists() and (
204 not kubectl_build.exists() or not (kubectl_build / "yaml").exists()
205 ):
206 self._build_kubectl_lambda()
208 if helm_source.exists() and not helm_build.exists(): 208 ↛ 209line 208 didn't jump to line 209 because the condition on line 208 was never true
209 self._build_helm_installer_lambda()
def _check_and_fix_stuck_stack(self, stack_name: str) -> None:
    """Delete ``stack_name`` if CloudFormation reports it in a stuck state.

    A stack whose status is REVIEW_IN_PROGRESS, ROLLBACK_COMPLETE,
    ROLLBACK_FAILED, CREATE_FAILED or DELETE_FAILED is deleted, and the
    call blocks on the ``stack_delete_complete`` waiter (10 s between
    polls, at most 60 polls) so the following deploy can recreate it.

    Nothing happens when no deploy region can be resolved for the stack.
    Every error — including "stack does not exist" — is logged at debug
    level and swallowed; the deploy is expected to create the stack.
    """
    import boto3

    region = self._get_deploy_region(stack_name)
    if not region:
        return

    # Statuses that trigger delete-and-recreate.
    unrecoverable = frozenset(
        (
            "REVIEW_IN_PROGRESS",
            "ROLLBACK_COMPLETE",
            "ROLLBACK_FAILED",
            "CREATE_FAILED",
            "DELETE_FAILED",
        )
    )

    try:
        cfn = boto3.client("cloudformation", region_name=region)
        described = cfn.describe_stacks(StackName=stack_name)
        status = described["Stacks"][0]["StackStatus"]
        if status not in unrecoverable:
            return

        print(f" Stack {stack_name} is in {status} state, cleaning up...")
        cfn.delete_stack(StackName=stack_name)
        deletion_waiter = cfn.get_waiter("stack_delete_complete")
        deletion_waiter.wait(
            StackName=stack_name,
            WaiterConfig={"Delay": 10, "MaxAttempts": 60},
        )
        print(f" Stack {stack_name} cleaned up, will recreate on deploy")
    except Exception as e:
        # Stack doesn't exist or can't be described — fine, deploy will create it
        logger.debug("Stack pre-check for %s: %s", stack_name, e)
def _diagnose_deploy_failure(self, stack_name: str) -> None:
    """Print CloudFormation diagnostics for a stack whose deploy just failed.

    Output has two parts:
      1. Up to five events — taken from the first 20 that
         ``describe_stack_events`` returns — whose ``ResourceStatus``
         contains ``FAILED`` or ``ROLLBACK``, each with its reason.
      2. A suggested fix when the stack's current status is one of
         REVIEW_IN_PROGRESS, ROLLBACK_COMPLETE, ROLLBACK_FAILED or
         UPDATE_ROLLBACK_COMPLETE.

    Strictly best effort: nothing is printed when the region cannot be
    resolved, and every exception is logged at debug level and swallowed
    so diagnostics never make a failed deploy worse.
    """
    import boto3

    region = self._get_deploy_region(stack_name)
    if not region:
        return

    try:
        cfn = boto3.client("cloudformation", region_name=region)

        # Only the head of the event list is inspected.
        recent = cfn.describe_stack_events(StackName=stack_name).get("StackEvents", [])[:20]
        problems = []
        for event in recent:
            resource_status = event.get("ResourceStatus", "")
            if "FAILED" in resource_status or "ROLLBACK" in resource_status:
                problems.append(event)

        if problems:
            print(f"\n CloudFormation failure details for {stack_name}:")
            for event in problems[:5]:
                resource = event.get("LogicalResourceId", "unknown")
                status = event.get("ResourceStatus", "unknown")
                reason = event.get("ResourceStatusReason", "no reason given")
                print(f" {resource}: {status}")
                print(f" {reason}")

        # Map the stack's overall status to actionable advice.
        try:
            status = cfn.describe_stacks(StackName=stack_name)["Stacks"][0]["StackStatus"]

            delete_cmd = (
                f"aws cloudformation delete-stack --stack-name {stack_name} --region {region}"
            )
            advice = {
                "REVIEW_IN_PROGRESS": f"Stack is stuck in REVIEW_IN_PROGRESS. Run: {delete_cmd}",
                "ROLLBACK_COMPLETE": f"Stack rolled back. Delete it and retry: {delete_cmd}",
                "ROLLBACK_FAILED": f"Stack rollback failed. Delete with --retain: {delete_cmd}",
                "UPDATE_ROLLBACK_COMPLETE": (
                    "Update rolled back but stack is stable. "
                    "Check the events above and retry the deploy."
                ),
            }

            if status in advice:
                print(f"\n Suggested fix: {advice[status]}")
        except Exception as e:
            logger.debug("Failed to parse stack events: %s", e)

    except Exception as e:
        # Best effort — don't fail the deploy further
        logger.debug("Failed to diagnose deploy failure for %s: %s", stack_name, e)
320 def _sync_lambda_sources(self) -> None:
321 """
322 Sync latest handler code and manifests into the build directory.
324 Called at the start of ``deploy()`` to ensure CDK picks up the
325 latest source changes. Only copies files — does NOT rebuild pip
326 deps or rmtree. Runs once per StackManager instance.
327 """
328 if getattr(self, "_lambda_sources_synced", False):
329 return
330 self._lambda_sources_synced = True
332 source_dir = self.project_root / "lambda" / "kubectl-applier-simple"
333 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build"
335 if not source_dir.exists() or not build_dir.exists():
336 return
338 # Sync handler.py
339 source_handler = source_dir / "handler.py"
340 build_handler = build_dir / "handler.py"
341 if source_handler.exists(): 341 ↛ 345line 341 didn't jump to line 345 because the condition on line 341 was always true
342 shutil.copy2(source_handler, build_handler)
344 # Sync manifests directory
345 source_manifests = source_dir / "manifests"
346 build_manifests = build_dir / "manifests"
347 if source_manifests.exists(): 347 ↛ exitline 347 didn't return from function '_sync_lambda_sources' because the condition on line 347 was always true
348 build_manifests.mkdir(parents=True, exist_ok=True)
349 for manifest_file in source_manifests.glob("*.yaml"):
350 shutil.copy2(manifest_file, build_manifests / manifest_file.name)
352 def _rebuild_lambda_packages(self) -> None:
353 """Full rebuild of Lambda packages for deploy.
355 Nukes the build directories and recreates them from scratch with
356 fresh pip deps, handler code, and manifests. Called once at the
357 start of a deploy to ensure CDK picks up all changes.
359 Runs once per StackManager instance.
360 """
361 if getattr(self, "_lambda_packages_rebuilt", False): 361 ↛ 362line 361 didn't jump to line 362 because the condition on line 361 was never true
362 return
363 self._lambda_packages_rebuilt = True
364 self._build_lambda_packages()
366 def _build_lambda_packages(self) -> None:
367 """Build Lambda packages for kubectl-applier and helm-installer.
369 Creates build directories with fresh copies of handler code, manifests,
370 charts config, and pip dependencies. This ensures CDK always picks up
371 the latest content regardless of Docker/asset caching.
372 """
373 self._build_kubectl_lambda()
374 self._build_helm_installer_lambda()
376 def _build_kubectl_lambda(self) -> None:
377 """Build the kubectl-applier-simple Lambda package."""
378 source_dir = self.project_root / "lambda" / "kubectl-applier-simple"
379 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build"
380 requirements = source_dir / "requirements.txt"
382 if not source_dir.exists() or not requirements.exists():
383 return
385 print(" Building kubectl-applier-simple Lambda package...")
387 # Clean stale build directory to avoid broken symlinks from previous
388 # pip installs (botocore/data/ is a common source of dangling symlinks
389 # that cause CDK's asset fingerprinting to fail with ENOENT).
390 if build_dir.exists():
391 _safe_rmtree(build_dir)
393 # Create build directory
394 build_dir.mkdir(parents=True, exist_ok=True)
396 # Copy handler and manifests (always overwrite to prevent stale content)
397 shutil.copy2(source_dir / "handler.py", build_dir / "handler.py")
398 build_manifests = build_dir / "manifests"
399 if (source_dir / "manifests").exists(): 399 ↛ 405line 399 didn't jump to line 405 because the condition on line 399 was always true
400 if build_manifests.exists(): 400 ↛ 401line 400 didn't jump to line 401 because the condition on line 400 was never true
401 shutil.rmtree(build_manifests)
402 shutil.copytree(source_dir / "manifests", build_manifests, dirs_exist_ok=True)
404 # Install pip dependencies for Lambda runtime
405 import sys
407 result = subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - static list: sys.executable + pip args + Path objects, no user input
408 [
409 sys.executable,
410 "-m",
411 "pip",
412 "install",
413 "-r",
414 str(requirements),
415 "-t",
416 str(build_dir),
417 "--upgrade",
418 "--platform",
419 "manylinux2014_x86_64",
420 "--only-binary=:all:",
421 "--quiet",
422 ],
423 capture_output=True,
424 text=True,
425 )
426 if result.returncode != 0: 426 ↛ 427line 426 didn't jump to line 427 because the condition on line 426 was never true
427 print(f" Warning: pip install failed: {result.stderr[:200]}")
428 else:
429 print(" Lambda package built successfully")
431 def _build_helm_installer_lambda(self) -> None:
432 """Build the helm-installer Lambda Docker context.
434 Copies all source files into a clean build directory so CDK always
435 detects changes to charts.yaml, handler.py, Dockerfile, etc.
436 """
437 source_dir = self.project_root / "lambda" / "helm-installer"
438 build_dir = self.project_root / "lambda" / "helm-installer-build"
440 if not source_dir.exists():
441 return
443 print(" Building helm-installer Lambda package...")
445 # Clean and recreate build directory
446 if build_dir.exists():
447 _safe_rmtree(build_dir)
448 build_dir.mkdir(parents=True, exist_ok=True)
450 # Copy all source files into the build directory
451 for item in source_dir.iterdir():
452 if item.name == "__pycache__":
453 continue
454 if item.is_file(): 454 ↛ 456line 454 didn't jump to line 456 because the condition on line 454 was always true
455 shutil.copy2(item, build_dir / item.name)
456 elif item.is_dir():
457 shutil.copytree(item, build_dir / item.name, dirs_exist_ok=True)
459 print(" Helm installer package built successfully")
461 def _get_python_path(self) -> str:
462 """
463 Get PYTHONPATH that includes the current Python's site-packages.
465 This is critical for pipx installations where CDK runs `python3 app.py`
466 using the system Python, which doesn't have aws_cdk installed.
467 By setting PYTHONPATH, we ensure CDK's subprocess can find our modules.
468 """
469 # Get all site-packages directories from the current Python
470 site_packages = site.getsitepackages()
472 # Also include user site-packages if available
473 user_site = site.getusersitepackages()
474 if user_site and os.path.isdir(user_site): 474 ↛ 475line 474 didn't jump to line 475 because the condition on line 474 was never true
475 site_packages.append(user_site)
477 # Include the directory containing the current module (for editable installs)
478 current_module_dir = Path(__file__).parent.parent
479 if current_module_dir.exists(): 479 ↛ 483line 479 didn't jump to line 483 because the condition on line 479 was always true
480 site_packages.append(str(current_module_dir))
482 # Combine with existing PYTHONPATH if any
483 existing_path = os.environ.get("PYTHONPATH", "")
484 all_paths = site_packages + ([existing_path] if existing_path else [])
486 return os.pathsep.join(all_paths)
488 def _run_cdk(
489 self,
490 command: list[str],
491 capture_output: bool = False,
492 env: dict[str, str] | None = None,
493 timeout: float | None = None,
494 ) -> subprocess.CompletedProcess[str]:
495 """Run a CDK command.
497 Args:
498 command: CDK subcommand argv (e.g. ``["destroy", "gco-us-east-1", "--force"]``).
499 capture_output: Capture stdout / stderr instead of streaming.
500 env: Extra env vars merged onto the parent process environment.
501 timeout: Wall-clock timeout in seconds. ``None`` (default) waits
502 forever — preserving the old behaviour for ``synth`` / ``list``.
503 When set, on timeout we send SIGTERM, give the CDK process up
504 to 30 seconds to exit cleanly, then SIGKILL, and finally
505 re-raise ``subprocess.TimeoutExpired`` so callers can decide
506 how to handle a hung subprocess. ``deploy()`` and ``destroy()``
507 pass a per-stack budget so a wedged ``cdk destroy`` (e.g. its
508 post-delete polling loop hanging after CloudFormation has
509 already finished) can't block the orchestrator forever.
510 """
511 full_env = os.environ.copy()
513 # Inject PYTHONPATH so CDK's python3 subprocess can find aws_cdk
514 # This is essential for pipx installations
515 full_env["PYTHONPATH"] = self._get_python_path()
517 if env:
518 full_env.update(env)
520 cdk_cmd = self._cdk_path.split() + command
522 try:
523 if capture_output:
524 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection
525 cdk_cmd,
526 cwd=self.project_root,
527 capture_output=True,
528 text=True,
529 env=full_env,
530 timeout=timeout,
531 )
532 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection
533 cdk_cmd,
534 cwd=self.project_root,
535 env=full_env,
536 text=True,
537 timeout=timeout,
538 )
539 except subprocess.TimeoutExpired:
540 # subprocess.run already sent SIGKILL and reaped the child by
541 # the time TimeoutExpired propagates. Surface the timeout so
542 # callers (deploy / destroy) can verify post-state via CFN.
543 logger.warning(
544 "cdk command timed out after %ss: %s",
545 timeout,
546 " ".join(cdk_cmd),
547 )
548 raise
def list_stacks(self) -> list[str]:
    """Return the stack names printed by ``cdk list``.

    Each output line is stripped and blank lines are dropped.

    Raises:
        RuntimeError: ``cdk list`` exited non-zero; the message carries
            its stderr.
    """
    completed = self._run_cdk(["list"], capture_output=True)
    if completed.returncode != 0:
        raise RuntimeError(f"Failed to list stacks: {completed.stderr}")

    stripped = (line.strip() for line in completed.stdout.strip().split("\n"))
    return [name for name in stripped if name]
def synth(self, stack_name: str | None = None, quiet: bool = True) -> str:
    """Run ``cdk synth`` and return its stdout.

    Args:
        stack_name: Stack to synthesize; omitted from the command when falsy.
        quiet: Append ``--quiet`` to the command (default).

    Raises:
        RuntimeError: ``cdk synth`` exited non-zero; the message carries
            its stderr.
    """
    argv = ["synth"]
    argv += [stack_name] if stack_name else []
    argv += ["--quiet"] if quiet else []

    completed = self._run_cdk(argv, capture_output=True)
    if completed.returncode != 0:
        raise RuntimeError(f"CDK synth failed: {completed.stderr}")
    return str(completed.stdout)
def diff(self, stack_name: str | None = None) -> str:
    """Run ``cdk diff --no-color`` and return its output.

    Args:
        stack_name: Stack to diff; omitted from the command when falsy.

    Returns:
        stdout when it is non-empty, otherwise stderr. The exit code is
        deliberately ignored: ``cdk diff`` exits non-zero when differences
        exist, which is the expected case.
    """
    argv = ["diff", "--no-color"] + ([stack_name] if stack_name else [])
    completed = self._run_cdk(argv, capture_output=True)
    return str(completed.stdout or completed.stderr)
def deploy(
    self,
    stack_name: str | None = None,
    require_approval: bool = True,
    all_stacks: bool = False,
    outputs_file: str | None = None,
    parameters: dict[str, str] | None = None,
    tags: dict[str, str] | None = None,
    progress: str = "events",
    output_dir: str | None = None,
    exclusively: bool = False,
) -> bool:
    """Deploy CDK stacks.

    Args:
        stack_name: Name of the stack to deploy
        require_approval: Whether to require approval for changes
        all_stacks: Deploy all stacks
        outputs_file: File to write outputs to
        parameters: CDK parameters
        tags: Tags to apply to stacks
        progress: Progress display type
        output_dir: Custom CDK output directory (for parallel deployments)
        exclusively: Pass ``--exclusively`` to CDK so only the named
            stack is evaluated, not its transitive dependencies. Used by
            ``deploy_orchestrated`` once earlier phases have already
            deployed the globals — re-synthesizing them every phase
            forces custom resources (notably KubectlApplyManifests)
            to re-run each time, adding minutes per phase for no
            actual change.

    Returns:
        ``True`` when ``cdk deploy`` exited 0, or when a single named stack
        is in ``CREATE_COMPLETE`` / ``UPDATE_COMPLETE`` in CloudFormation
        despite a non-zero exit or timeout; ``False`` otherwise. The result
        of the follow-up ``gco-api-gateway`` deploy triggered after an
        analytics stack is not reflected in the return value.

    Raises:
        RuntimeError: No container runtime was detected, or the target
            region could not be bootstrapped.

    Environment:
        GCO_CDK_DEPLOY_TIMEOUT_SECONDS: Per-invocation wall-clock cap,
            default 3600. A value that is not a positive number is ignored
            with a warning and the default is used.
    """
    # Per-stack wall-clock cap so a wedged ``cdk deploy`` (e.g. an
    # IAM eventual-consistency wait that never completes) can't block
    # the orchestrator forever. Default 60 minutes — long enough for
    # a fresh EKS cluster cold start. Resolved first so a malformed
    # override is reported before any rebuild / cleanup work, instead of
    # raising ValueError after that work is done.
    default_timeout_s = 3600.0
    raw_timeout = os.environ.get("GCO_CDK_DEPLOY_TIMEOUT_SECONDS", "3600")
    try:
        timeout_s = float(raw_timeout)
        if not timeout_s > 0:  # also rejects NaN
            raise ValueError(raw_timeout)
    except ValueError:
        logger.warning(
            "Ignoring invalid GCO_CDK_DEPLOY_TIMEOUT_SECONDS=%r; using %ss",
            raw_timeout,
            default_timeout_s,
        )
        timeout_s = default_timeout_s

    # Full rebuild of Lambda packages on first deploy call (once per session),
    # then sync latest source on subsequent calls
    self._rebuild_lambda_packages()
    self._sync_lambda_sources()

    # Check for stuck stacks and auto-recover
    if stack_name:
        self._check_and_fix_stuck_stack(stack_name)

    # Ensure container runtime is available for building images
    runtime = _detect_container_runtime()
    if not runtime:
        raise RuntimeError(
            "No container runtime found. Please install Docker, Finch, or Podman.\n"
            " - Docker: https://docs.docker.com/get-docker/\n"
            " - Finch: brew install finch && finch vm init\n"
            " - Podman: https://podman.io/getting-started/installation"
        )

    # Auto-bootstrap the target region if needed
    if stack_name:
        region = self._get_deploy_region(stack_name)
        if region and not self.ensure_bootstrapped(region):
            raise RuntimeError(
                f"Region {region} could not be bootstrapped. "
                "Run 'gco stacks bootstrap --region "
                f"{region}' manually to diagnose."
            )

    cmd = ["deploy"]

    if all_stacks:
        cmd.append("--all")
    elif stack_name:
        cmd.append(stack_name)

    # --exclusively tells CDK to deploy *only* the named stack, not its
    # transitive dependencies. deploy_orchestrated sets this once the
    # earlier phases (global, api-gateway) are already in place so that
    # the regional and monitoring phases don't re-synthesize and
    # re-evaluate globals on every pass.
    if exclusively and stack_name and not all_stacks:
        cmd.append("--exclusively")

    if not require_approval:
        cmd.extend(["--require-approval", "never"])

    if outputs_file:
        cmd.extend(["--outputs-file", outputs_file])

    if parameters:
        for key, value in parameters.items():
            cmd.extend(["--parameters", f"{key}={value}"])

    if tags:
        for key, value in tags.items():
            cmd.extend(["--tags", f"{key}={value}"])

    cmd.extend(["--progress", progress])

    # Use custom output directory for parallel deployments
    if output_dir:
        cmd.extend(["--output", output_dir])

    # Set CDK_DOCKER env var if not already set
    env = {"CDK_DOCKER": runtime} if not os.environ.get("CDK_DOCKER") else None

    try:
        result = self._run_cdk(cmd, env=env, timeout=timeout_s)
        success = result.returncode == 0
    except subprocess.TimeoutExpired:
        print(
            f" cdk deploy timed out after {timeout_s}s for "
            f"{stack_name or 'all stacks'}; verifying CloudFormation state..."
        )
        success = False

    # Reconcile against CloudFormation. If the stack is in a terminal
    # CREATE/UPDATE_COMPLETE state, the deploy actually succeeded
    # despite cdk's exit code or timeout.
    if stack_name and not all_stacks and not success:
        cfn_status = self._get_stack_status(stack_name)
        if cfn_status in ("CREATE_COMPLETE", "UPDATE_COMPLETE"):
            print(
                f" cdk reported a non-zero exit but {stack_name} is in "
                f"{cfn_status} in CloudFormation — treating as success."
            )
            success = True

    if not success and stack_name:
        self._diagnose_deploy_failure(stack_name)

    # After deploying gco-analytics, automatically redeploy
    # gco-api-gateway to wire in the /studio/* routes (the API gateway
    # imports the Cognito pool ARN and presigned-URL Lambda ARN from
    # the analytics stack).
    if success and stack_name and "analytics" in stack_name and not all_stacks:
        print(" Updating gco-api-gateway with analytics routes...")
        self.deploy(
            stack_name="gco-api-gateway",
            require_approval=require_approval,
            exclusively=True,
        )

    return success
def destroy(
    self,
    stack_name: str | None = None,
    all_stacks: bool = False,
    force: bool = False,
    output_dir: str | None = None,
) -> bool:
    """Destroy CDK stacks.

    If the target stack exists in CloudFormation but isn't in the CDK
    app (e.g. because a toggle was disabled), temporarily enables the
    toggle so CDK can synthesize and destroy the stack properly. This
    ensures custom resource cleanup handlers (like the analytics
    cleanup Lambda) fire during deletion. The toggle is put back in a
    ``finally`` block, so it is restored even when the CDK invocation or
    the API-gateway step raises unexpectedly.

    Args:
        stack_name: Name of the stack to destroy
        all_stacks: Destroy all stacks
        force: Skip confirmation prompts
        output_dir: Custom CDK output directory (for parallel deployments)

    Returns:
        For a single named stack: ``True`` when CloudFormation no longer
        has the stack (whatever CDK reported); the result of a direct
        CloudFormation delete when CDK succeeded but the stack is still
        there; ``False`` when CDK failed and the stack remains, when the
        image-registry preflight refuses, or when the API gateway still
        imports analytics exports. For ``all_stacks``: whether
        ``cdk destroy`` exited 0.

    Environment:
        GCO_CDK_DESTROY_TIMEOUT_SECONDS: Per-invocation wall-clock cap,
            default 2700. A value that is not a positive number is ignored
            with a warning and the default is used.
    """
    # Per-stack wall-clock cap so a wedged ``cdk destroy`` (its
    # post-delete polling loop hanging after CloudFormation has
    # already finished) can't block the orchestrator forever. Default
    # 45 minutes — enough for an EKS regional teardown which can take
    # 25-30 minutes when SG / ENI cleanup serialises. Resolved before any
    # state is touched so a malformed override cannot abort mid-teardown.
    default_timeout_s = 2700.0
    raw_timeout = os.environ.get("GCO_CDK_DESTROY_TIMEOUT_SECONDS", "2700")
    try:
        timeout_s = float(raw_timeout)
        if not timeout_s > 0:  # also rejects NaN
            raise ValueError(raw_timeout)
    except ValueError:
        logger.warning(
            "Ignoring invalid GCO_CDK_DESTROY_TIMEOUT_SECONDS=%r; using %ss",
            raw_timeout,
            default_timeout_s,
        )
        timeout_s = default_timeout_s

    # Image-registry pre-destroy guards. Only fires for the global
    # stack (where the registry lives) and only when the operator
    # has explicitly chosen ``removal_policy: "destroy"``. The
    # default ``retain`` posture is a no-op here. See
    # ``_image_registry_destroy_preflight`` for the exact rules.
    if (
        stack_name == "gco-global"
        and not all_stacks
        and not self._image_registry_destroy_preflight(force=force)
    ):
        return False

    single_analytics = bool(stack_name) and not all_stacks and "analytics" in stack_name

    # If destroying a specific stack that exists in CloudFormation but
    # might not be in the CDK app, temporarily enable its toggle.
    # ``toggle_changed`` records whether there is something to undo.
    toggle_changed = False
    if single_analytics and self._stack_exists_in_cloudformation(stack_name):
        toggle_changed = self._ensure_analytics_enabled_for_destroy()

    aborted = False
    cdk_succeeded = False
    try:
        # The analytics stack exports values (e.g. Cognito pool ARN) that
        # gco-api-gateway imports. CloudFormation blocks deletion of stacks
        # with consumed exports. To break the dependency, redeploy the API
        # gateway with analytics disabled first, then destroy analytics.
        if single_analytics and not self._remove_api_gateway_analytics_dependency():
            aborted = True
        else:
            cmd = ["destroy"]

            if all_stacks:
                cmd.append("--all")
            elif stack_name:
                cmd.append(stack_name)
                # --exclusively prevents CDK from cascading the destroy to
                # dependent stacks (e.g. destroying gco-analytics should not
                # also destroy gco-api-gateway just because it references the
                # presigned-URL Lambda ARN).
                cmd.append("--exclusively")

            if force:
                cmd.append("--force")

            if output_dir:
                cmd.extend(["--output", output_dir])

            try:
                result = self._run_cdk(cmd, timeout=timeout_s)
                cdk_succeeded = result.returncode == 0
            except subprocess.TimeoutExpired:
                # CDK hung. Verify the AWS-side state below — if the stack
                # is gone in CloudFormation, the destroy actually succeeded
                # and the timeout was just CDK's polling loop wedged.
                print(
                    f" cdk destroy timed out after {timeout_s}s; verifying "
                    f"CloudFormation state for {stack_name or 'all stacks'}..."
                )
                cdk_succeeded = False
    finally:
        # Restore the toggle if we changed it — on every exit path,
        # including unexpected exceptions.
        if toggle_changed:
            self._restore_analytics_disabled()

    if aborted:
        print(
            " Aborting gco-analytics destroy: gco-api-gateway still "
            "imports analytics exports. Fix the API gateway and retry."
        )
        return False

    # Reconcile against CloudFormation. Three outcomes:
    # 1. cdk succeeded AND stack is gone → success
    # 2. stack is gone (regardless of cdk return code or timeout) → success
    # 3. stack still exists → fall back to direct CFN delete or surface failure
    if stack_name and not all_stacks:
        still_present = self._stack_exists_in_cloudformation(stack_name)
        if not still_present:
            # Whatever cdk did or didn't do, AWS confirms the stack
            # is gone. Treat as success.
            if not cdk_succeeded:
                print(
                    f" cdk reported a non-zero exit but {stack_name} is "
                    f"already deleted in CloudFormation — treating as success."
                )
            return True
        # Stack still there. If cdk succeeded but CFN still has it,
        # fall back to a direct CFN delete (existing behaviour).
        if cdk_succeeded:
            return self._cloudformation_delete_stack(stack_name)
        # cdk failed AND stack is still present → real failure.
        return False

    return cdk_succeeded
851 # ------------------------------------------------------------------
852 # Image registry pre-destroy guards
853 # ------------------------------------------------------------------
def _read_images_config(self) -> dict[str, Any]:
    """Read the ``images`` block from cdk.json with defaults applied.

    Mirrors the parser in ``gco/stacks/global_stack.py`` so the CLI
    can reason about the same fields without importing the CDK
    module (which pulls aws_cdk and the full constructs surface).
    Any value that fails validation (e.g. an unexpected
    ``removal_policy``) is silently coerced to ``"retain"`` here so the
    CLI never blocks on a typo — the actual deploy-time validation is
    the global stack's responsibility.

    Returns:
        A dict with exactly two keys: ``removal_policy`` (``"retain"``
        or ``"destroy"``) and ``empty_on_delete`` (bool). The defaults
        (``"retain"`` / ``False``) are returned when cdk.json is
        missing, unreadable, not valid JSON, or structurally malformed.
    """
    import json

    defaults: dict[str, Any] = {"removal_policy": "retain", "empty_on_delete": False}

    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        return dict(defaults)
    try:
        with open(cdk_json_path, encoding="utf-8") as f:
            document = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError as well as a
        # UnicodeDecodeError raised while decoding the file.
        logger.debug("Failed to read cdk.json for images config: %s", exc)
        return dict(defaults)

    # A structurally malformed file (top level, ``context`` or ``images``
    # not a JSON object) must fall back to the defaults instead of
    # raising AttributeError on ``.get``.
    ctx = document.get("context") if isinstance(document, dict) else None
    raw = ctx.get("images") if isinstance(ctx, dict) else None
    if not isinstance(raw, dict):
        return dict(defaults)

    removal_policy = str(raw.get("removal_policy", "retain")).strip().lower()
    if removal_policy not in ("retain", "destroy"):
        removal_policy = "retain"
    return {
        "removal_policy": removal_policy,
        "empty_on_delete": bool(raw.get("empty_on_delete", False)),
    }
def _build_image_registry_inventory(self) -> dict[str, Any]:
    """Summarise the image registry for display before a destroy.

    The result always carries the keys ``repo_count``, ``tag_count``,
    ``total_bytes``, ``endpoint_refs`` and ``job_refs``. Collection is
    best effort: if ``ImageManager`` cannot be imported, or any lookup
    raises, the counters gathered so far are returned unchanged and
    the failure is only logged at debug level.
    """
    summary: dict[str, Any] = dict.fromkeys(
        ("repo_count", "tag_count", "total_bytes", "endpoint_refs", "job_refs"), 0
    )

    try:
        from cli.images import ImageManager
    except Exception as exc:  # noqa: BLE001
        logger.debug("ImageManager import failed during preflight: %s", exc)
        return summary

    # (summary key, ImageManager collector method, log line used on failure)
    ref_collectors = (
        (
            "endpoint_refs",
            "_collect_inference_image_refs",
            "inference ref collection failed: %s",
        ),
        (
            "job_refs",
            "_collect_recent_job_image_refs",
            "job ref collection failed: %s",
        ),
    )

    try:
        manager = ImageManager(config=self.config)
        repos = manager.list_repos()
        summary["repo_count"] = len(repos)

        for repo in repos:
            repo_name = repo.get("name", "")
            if repo_name.startswith("gco/"):
                try:
                    tag_rows = manager.list_tags(repo_name.removeprefix("gco/"))
                except Exception as exc:  # noqa: BLE001
                    logger.debug("list_tags failed for %s: %s", repo_name, exc)
                    continue
                summary["tag_count"] += len(tag_rows)
                for tag_row in tag_rows:
                    size_bytes = tag_row.get("size_bytes")
                    # Only integer sizes contribute to the total.
                    if isinstance(size_bytes, int):
                        summary["total_bytes"] += size_bytes

        # Each reference count is collected independently so one failing
        # lookup does not blank the other.
        for key, collector_name, failure_message in ref_collectors:
            try:
                summary[key] = len(getattr(manager, collector_name)())
            except Exception as exc:  # noqa: BLE001
                logger.debug(failure_message, exc)
    except Exception as exc:  # noqa: BLE001
        logger.debug("Image registry inventory failed: %s", exc)
    return summary
941 def _image_registry_destroy_preflight(self, *, force: bool) -> bool:
942 """Validate the image-registry destroy posture before invoking CFN.
944 Two rules:
946 1. ``removal_policy: "destroy"`` AND ``empty_on_delete: false``
947 → refuse with the literal helpful-error message pointing
948 the operator at ``gco images cleanup --all`` or at flipping
949 ``empty_on_delete: true``.
951 2. ``removal_policy: "destroy"`` AND ``empty_on_delete: true``
952 → print the inventory summary first. On a TTY the operator
953 is also prompted for confirmation; non-TTY runs proceed
954 (the operator presumably passed ``-y`` or is automating).
956 Returns True when the destroy may proceed, False when it has
957 been refused or declined.
958 """
959 cfg = self._read_images_config()
960 if cfg["removal_policy"] != "destroy":
961 return True
963 if not cfg["empty_on_delete"]:
964 print(
965 "Repos under gco/* are not empty and empty_on_delete is "
966 "false. Run 'gco images cleanup --all' first, or set "
967 "images.empty_on_delete: true in cdk.json."
968 )
969 return False
971 inventory = self._build_image_registry_inventory()
972 gib = inventory["total_bytes"] / (1024**3) if inventory["total_bytes"] else 0.0
973 print("Image registry inventory before destroy:")
974 print(f" repos: {inventory['repo_count']}")
975 print(f" tags: {inventory['tag_count']}")
976 print(f" total size: {gib:.2f} GiB")
977 print(f" referencing endpoints: {inventory['endpoint_refs']}")
978 print(f" recent job refs: {inventory['job_refs']}")
980 # Already confirmed via -y, or non-interactive — proceed.
981 if force or not sys.stdin.isatty():
982 return True
984 try:
985 response = input("Destroy gco-global and delete every gco/* repo? [y/N]: ")
986 except EOFError, KeyboardInterrupt:
987 print("Aborted.")
988 return False
989 if response.strip().lower() not in ("y", "yes"):
990 print("Aborted.")
991 return False
992 return True
def _stack_exists_in_cloudformation(self, stack_name: str) -> bool:
    """Check if a stack exists and is not in a deleted state.

    Args:
        stack_name: CloudFormation stack name; its region is resolved
            with ``_get_destroy_region``.

    Returns:
        False when the lookup fails for any reason (including "stack
        does not exist") or when the status contains ``DELETE`` — except
        ``DELETE_FAILED``, which returns True. True for every other
        status.
    """
    import boto3

    region = self._get_destroy_region(stack_name)
    cfn = boto3.client("cloudformation", region_name=region)
    try:
        resp = cfn.describe_stacks(StackName=stack_name)
        status = str(resp["Stacks"][0]["StackStatus"])
    except Exception as exc:
        # Kept permissive (an unknown stack surfaces here as an error),
        # but leave a trace so an auth/network failure is diagnosable.
        logger.debug(
            "describe_stacks(%s) in %s failed; treating stack as absent: %s",
            stack_name,
            region,
            exc,
        )
        return False

    # A stack whose deletion failed is still there with its resources;
    # reporting it as gone would turn a failed destroy into a "success".
    if status == "DELETE_FAILED":
        return True
    return "DELETE" not in status
def _get_stack_status(self, stack_name: str) -> str | None:
    """Look up the live CloudFormation status string for ``stack_name``.

    The region is resolved with ``_get_destroy_region``. The result is
    the ``StackStatus`` of the first stack returned by
    ``describe_stacks``, converted to ``str``.

    Returns None when the lookup raises for any reason (stack missing,
    network blip, permissions, ...), so a caller can treat "unknown" as
    "keep the verdict you already had".
    """
    import boto3

    client = boto3.client(
        "cloudformation",
        region_name=self._get_destroy_region(stack_name),
    )
    try:
        described = client.describe_stacks(StackName=stack_name)
        live_status = described["Stacks"][0]["StackStatus"]
        return str(live_status)
    except Exception:
        return None
def _cloudformation_delete_stack(self, stack_name: str) -> bool:
    """Delete a stack directly via the CloudFormation API and wait for it.

    Issues ``delete_stack`` in the region resolved by
    ``_get_destroy_region`` and then blocks on the
    ``stack_delete_complete`` waiter (15 s delay, 120 attempts).

    Returns:
        True once the waiter reports the deletion complete, False when
        the delete call or the waiter raises. The failure reason is
        logged as a warning so the operator can see why the fallback
        delete did not go through.
    """
    import boto3

    region = self._get_destroy_region(stack_name)
    cfn = boto3.client("cloudformation", region_name=region)
    try:
        cfn.delete_stack(StackName=stack_name)
        cfn.get_waiter("stack_delete_complete").wait(
            StackName=stack_name,
            WaiterConfig={"Delay": 15, "MaxAttempts": 120},
        )
    except Exception as exc:
        logger.warning(
            "Direct CloudFormation delete of %s in %s failed: %s",
            stack_name,
            region,
            exc,
        )
        return False
    return True
1045 def _get_destroy_region(self, stack_name: str) -> str:
1046 """Determine the region for a stack based on its name."""
1047 try:
1048 region = self._get_deploy_region(stack_name)
1049 return region or self.config.api_gateway_region
1050 except Exception:
1051 return self.config.api_gateway_region
def _ensure_analytics_enabled_for_destroy(self) -> bool:
    """Switch the analytics toggle on if it is currently off.

    Returns True only when this call actually flipped the toggle to
    enabled. Returns False when analytics was already enabled or when
    reading/updating the config raised (the error is logged at debug
    level).
    """
    toggled = False
    try:
        if not get_analytics_config().get("enabled"):
            update_analytics_config({"enabled": True})
            toggled = True
    except Exception as exc:
        logger.debug(
            "Failed to enable analytics toggle for destroy: %s",
            exc,
            exc_info=True,
        )
    return toggled
def _restore_analytics_disabled(self) -> None:
    """Set the analytics toggle back to disabled.

    Never raises: a failure to write the config is logged as a warning
    (with traceback) and otherwise ignored.
    """
    disabled = {"enabled": False}
    try:
        update_analytics_config(disabled)
    except Exception as exc:
        logger.warning(
            "Failed to restore analytics toggle to disabled after destroy: %s",
            exc,
            exc_info=True,
        )
def _remove_api_gateway_analytics_dependency(self) -> bool:
    """Redeploy gco-api-gateway with analytics disabled to drop cross-stack imports.

    The analytics stack exports values (Cognito pool ARN, presigned-URL
    Lambda ARN) that gco-api-gateway imports for the /studio/* routes.
    CloudFormation blocks deletion of stacks with consumed exports. By
    disabling analytics and redeploying the API gateway, the /studio/*
    routes are removed and the imports are dropped, unblocking the
    analytics stack deletion.

    The analytics toggle is only changed when it was enabled on entry,
    and in that case it is always switched back on afterwards — even if
    the redeploy raises — so the config is never left disabled.

    Returns:
        True if the analytics stack is safe to destroy (either because
        no consumer remains or because the redeploy successfully
        dropped the imports). False if a consumer of the analytics
        exports still exists and the analytics destroy will fail.
    """
    # Fast path: if gco-api-gateway doesn't exist (or has already been
    # deleted/rolled-back into a non-consuming state), there's nothing
    # importing the analytics exports. Skip the redeploy entirely.
    if not self._stack_exists_in_cloudformation("gco-api-gateway"):
        logger.info(
            "gco-api-gateway does not exist in CloudFormation; "
            "skipping redeploy before analytics destroy."
        )
        return True

    # Second fast path: if the deployed api-gateway isn't actually
    # importing anything from the analytics stack, we don't need to
    # touch it. This happens when analytics was never fully wired up.
    if not self._api_gateway_imports_from_analytics():
        logger.info(
            "gco-api-gateway does not import any gco-analytics exports; "
            "skipping redeploy before analytics destroy."
        )
        return True

    try:
        import tempfile

        current = get_analytics_config()
        was_enabled = current.get("enabled", False)
        try:
            # Temporarily disable analytics so CDK drops the /studio/* routes.
            if was_enabled:
                update_analytics_config({"enabled": False})

            print(" Updating gco-api-gateway to remove analytics routes...")
            with tempfile.TemporaryDirectory() as tmp_out:
                success = self.deploy(
                    stack_name="gco-api-gateway",
                    require_approval=False,
                    exclusively=True,
                    output_dir=tmp_out,
                )
        finally:
            # Re-enable analytics so CDK can synthesize the analytics
            # stack for the destroy operation (custom resources need to
            # fire). Done in ``finally`` so an exception from the
            # redeploy cannot leave the toggle switched off.
            if was_enabled:
                update_analytics_config({"enabled": True})

        if success:
            return True

        # The redeploy failed. That's only a real problem if the
        # api-gateway still imports analytics exports. Recheck: the
        # auto-cleanup of ROLLBACK_COMPLETE stacks may have deleted the
        # consumer entirely, in which case the destroy can still proceed.
        if not self._api_gateway_imports_from_analytics():
            logger.info(
                "gco-api-gateway redeploy failed, but the stack no "
                "longer imports analytics exports (likely deleted "
                "during cleanup). Analytics destroy can proceed."
            )
            return True
        logger.error(
            "Failed to redeploy gco-api-gateway to drop analytics "
            "imports, and the stack still consumes analytics exports. "
            "Destroying gco-analytics will fail with "
            "'Export ... cannot be deleted as it is in use'. Fix "
            "gco-api-gateway first (see events above) and retry."
        )
        return False
    except Exception as exc:
        logger.warning(
            "Failed to remove API gateway analytics dependency: %s",
            exc,
            exc_info=True,
        )
        # On unexpected exceptions, recheck whether imports remain.
        # Be permissive only if we can confirm the destroy is safe.
        try:
            return not self._api_gateway_imports_from_analytics()
        except Exception:
            return False
def _api_gateway_imports_from_analytics(self) -> bool:
    """Report whether gco-api-gateway consumes any gco-analytics export.

    Queries CloudFormation (``list_exports`` then ``list_imports``) in
    the region that ``_get_deploy_region("gco-analytics")`` resolves
    to, so the answer reflects what is deployed rather than what the
    CDK app would synthesize.

    Returns False when no region can be resolved, when gco-analytics
    owns no exports, or when no export lists gco-api-gateway as an
    importer. Returns True when an import is found, and also when the
    check itself fails — erring towards attempting the redeploy.
    """
    import boto3

    region = self._get_deploy_region("gco-analytics")
    if not region:
        return False

    try:
        cfn = boto3.client("cloudformation", region_name=region)

        # ExportingStackId is a full ARN, so ownership is matched on the
        # ":stack/<name>/" segment.
        export_pages = cfn.get_paginator("list_exports").paginate()
        owned_exports: list[str] = [
            entry["Name"]
            for export_page in export_pages
            for entry in export_page.get("Exports", [])
            if ":stack/gco-analytics/" in entry.get("ExportingStackId", "")
        ]
        if not owned_exports:
            return False

        importers = cfn.get_paginator("list_imports")
        for export_name in owned_exports:
            try:
                for import_page in importers.paginate(ExportName=export_name):
                    if any(
                        consumer == "gco-api-gateway"
                        for consumer in import_page.get("Imports", [])
                    ):
                        return True
            except Exception as exc:
                # ``list_imports`` raises when an export has zero
                # consumers — treat that as "not imported" and move on.
                logger.debug(
                    "list_imports(%s) failed (likely no consumers): %s",
                    export_name,
                    exc,
                )
        return False
    except Exception as exc:
        logger.debug(
            "Failed to check analytics imports for gco-api-gateway: %s",
            exc,
            exc_info=True,
        )
        # Could not verify: assume the import exists so necessary
        # cleanup is not skipped.
        return True
1232 def bootstrap(
1233 self,
1234 account: str | None = None,
1235 region: str | None = None,
1236 ) -> bool:
1237 """Bootstrap CDK in an AWS account/region."""
1238 cmd = ["bootstrap"]
1240 if account and region:
1241 cmd.append(f"aws://{account}/{region}")
1242 elif region: 1242 ↛ 1245line 1242 didn't jump to line 1245 because the condition on line 1242 was always true
1243 cmd.append(f"aws://unknown-account/{region}")
1245 result = self._run_cdk(cmd)
1246 return result.returncode == 0
def is_bootstrapped(self, region: str) -> bool:
    """Tell whether ``cdk bootstrap`` has been run in ``region``.

    The check looks for the ``CDKToolkit`` CloudFormation stack; any
    status that does not contain ``DELETE`` counts as bootstrapped. A
    missing stack, an empty response or a failed lookup all count as
    "not bootstrapped". The verdict is memoised per region on this
    instance (``_bootstrap_cache``), so each region is queried at most
    once.
    """
    # The cache is created lazily on first use.
    try:
        cache = self._bootstrap_cache
    except AttributeError:
        cache = {}
        self._bootstrap_cache = cache

    if region in cache:
        return cache[region]

    import boto3

    client = boto3.client("cloudformation", region_name=region)
    bootstrapped = False
    try:
        toolkit = client.describe_stacks(StackName="CDKToolkit").get("Stacks", [])
        if toolkit:
            # Any non-deleted state counts as bootstrapped
            bootstrapped = "DELETE" not in toolkit[0].get("StackStatus", "")
    except ClientError:
        pass  # Stack doesn't exist — not bootstrapped
    except Exception as e:
        logger.debug("Failed to check CDK bootstrap in %s: %s", region, e)

    cache[region] = bootstrapped
    return bootstrapped
def ensure_bootstrapped(self, region: str) -> bool:
    """Bootstrap ``region`` for CDK unless it already is bootstrapped.

    Returns True straight away when ``is_bootstrapped`` says so.
    Otherwise runs ``bootstrap(region=...)``, prints the outcome, and
    returns that call's result. A successful bootstrap is recorded in
    the per-instance cache so the region is not checked again.
    """
    if self.is_bootstrapped(region):
        return True

    print(f"ℹ Region {region} is not CDK-bootstrapped. Bootstrapping now...")
    success = self.bootstrap(region=region)

    if not success:
        print(f"✗ Failed to bootstrap CDK in {region}")
        return success

    # Remember the result so later calls skip the CloudFormation lookup.
    if not hasattr(self, "_bootstrap_cache"):
        self._bootstrap_cache = {}
    self._bootstrap_cache[region] = True
    print(f"✓ CDK bootstrapped in {region}")
    return success
def _get_deploy_region(self, stack_name: str) -> str | None:
    """Map a stack name to the AWS region it is deployed in.

    The four fixed stacks take their region from cdk.json (via
    ``_load_cdk_json``) and fall back to the matching attribute on
    ``self.config``. Any other name starting with ``gco-`` is treated
    as a regional stack whose region is the remainder of the name.
    Returns None for names that match neither rule.
    """
    from .config import _load_cdk_json

    cdk_regions = _load_cdk_json()

    # stack name -> (cdk.json key, fallback attribute on self.config).
    # gco-analytics deliberately reuses the API gateway entry: it shares
    # that region so the presigned-URL Lambda can hook into the existing
    # /studio/* routes on the same API Gateway.
    fixed_stacks = {
        "gco-global": ("global", "global_region"),
        "gco-api-gateway": ("api_gateway", "api_gateway_region"),
        "gco-monitoring": ("monitoring", "monitoring_region"),
        "gco-analytics": ("api_gateway", "api_gateway_region"),
    }
    if stack_name in fixed_stacks:
        json_key, config_attr = fixed_stacks[stack_name]
        region: str | None = cdk_regions.get(json_key) or getattr(self.config, config_attr)
        return region

    # Regional stacks: gco-{region}
    if stack_name.startswith("gco-"):
        return stack_name[len("gco-") :]

    return None
def get_outputs(self, stack_name: str, region: str) -> dict[str, str]:
    """Fetch a stack's CloudFormation outputs as a ``{key: value}`` dict.

    Keys and values are converted to ``str``. An empty dict is returned
    when the stack has no outputs, when ``describe_stacks`` returns no
    stacks, or when the lookup raises (logged at debug level).
    """
    import boto3

    cf = boto3.client("cloudformation", region_name=region)
    try:
        found = cf.describe_stacks(StackName=stack_name)["Stacks"]
        if found:
            return {
                str(item["OutputKey"]): str(item["OutputValue"])
                for item in found[0].get("Outputs", [])
            }
    except Exception as e:
        logger.debug("Failed to get outputs for %s in %s: %s", stack_name, region, e)
    return {}
def get_stack_status(self, stack_name: str, region: str) -> StackInfo | None:
    """Describe a stack in CloudFormation and wrap the result in ``StackInfo``.

    The returned object carries the stack's name, status, the region
    that was queried, creation / last-update times (when present) and
    its outputs and tags as plain dicts. Returns None when no stack is
    returned or the lookup raises (logged at debug level).
    """
    import boto3

    cf = boto3.client("cloudformation", region_name=region)
    try:
        found = cf.describe_stacks(StackName=stack_name)["Stacks"]
        if found:
            described = found[0]
            output_map = {
                entry["OutputKey"]: entry["OutputValue"]
                for entry in described.get("Outputs", [])
            }
            tag_map = {entry["Key"]: entry["Value"] for entry in described.get("Tags", [])}
            return StackInfo(
                name=described["StackName"],
                status=described["StackStatus"],
                region=region,
                created_time=described.get("CreationTime"),
                updated_time=described.get("LastUpdatedTime"),
                outputs=output_map,
                tags=tag_map,
            )
    except Exception as e:
        logger.debug("Failed to get stack status for %s in %s: %s", stack_name, region, e)
    return None
def deploy_orchestrated(
    self,
    require_approval: bool = True,
    outputs_file: str | None = None,
    parameters: dict[str, str] | None = None,
    tags: dict[str, str] | None = None,
    progress: str = "events",
    on_stack_start: Callable[[str], None] | None = None,
    on_stack_complete: Callable[[str, bool], None] | None = None,
    parallel: bool = False,
    max_workers: int = 4,
) -> tuple[bool, list[str], list[str]]:
    """
    Deploy every stack, phase by phase, stopping at the first failure.

    Phase 1 deploys the pre-regional stacks (gco-global,
    gco-api-gateway) one at a time. Phase 2 deploys the regional stacks,
    in parallel when requested and more than one exists. Phase 3
    deploys the post-regional stack (gco-monitoring), which depends on
    the regional stacks.

    Args:
        require_approval: Whether to require approval for changes
        outputs_file: File to write outputs to
        parameters: CDK parameters
        tags: Tags to apply to stacks
        progress: Progress display type
        on_stack_start: Callback(stack_name) called when starting a stack
        on_stack_complete: Callback(stack_name, success) called when stack completes
        parallel: Deploy regional stacks in parallel
        max_workers: Maximum number of parallel deployments (default: 4)

    Returns:
        Tuple of (overall_success, successful_stacks, failed_stacks)
    """
    ordered_stacks = get_stack_deployment_order(self.list_stacks())

    pre_regional = {"gco-global", "gco-api-gateway"}
    post_regional = {"gco-monitoring"}

    # Bucket the ordered stacks by phase; anything that is neither
    # pre- nor post-regional is treated as a regional stack.
    pre_regional_stacks: list[str] = []
    regional_stacks: list[str] = []
    post_regional_stacks: list[str] = []
    for name in ordered_stacks:
        if name in pre_regional:
            pre_regional_stacks.append(name)
        elif name in post_regional:
            post_regional_stacks.append(name)
        else:
            regional_stacks.append(name)

    successful: list[str] = []
    failed: list[str] = []

    def deploy_and_record(name: str, **extra: bool) -> bool:
        """Deploy one stack, firing the callbacks and recording the outcome."""
        if on_stack_start:
            on_stack_start(name)
        outcome = self.deploy(
            stack_name=name,
            require_approval=require_approval,
            outputs_file=outputs_file,
            parameters=parameters,
            tags=tags,
            progress=progress,
            **extra,
        )
        (successful if outcome else failed).append(name)
        if on_stack_complete:
            on_stack_complete(name, outcome)
        return outcome

    # Phase 1: pre-regional global stacks, sequentially. A failure
    # aborts the run to prevent cascading issues.
    for name in pre_regional_stacks:
        if not deploy_and_record(name):
            return False, successful, failed

    # Phase 2: regional stacks. They all pass --exclusively: the globals
    # were deployed in Phase 1, so CDK doesn't need to re-evaluate them.
    # Skipping that re-evaluation avoids re-running custom resources
    # (notably KubectlApplyManifests) on the global stacks for every
    # regional deploy.
    if parallel and len(regional_stacks) > 1:
        successful_regional, failed_regional = self._deploy_stacks_parallel(
            stacks=regional_stacks,
            require_approval=require_approval,
            outputs_file=outputs_file,
            parameters=parameters,
            tags=tags,
            progress=progress,
            on_stack_start=on_stack_start,
            on_stack_complete=on_stack_complete,
            max_workers=max_workers,
        )
        successful.extend(successful_regional)
        failed.extend(failed_regional)
        if failed_regional:
            return False, successful, failed
    else:
        for name in regional_stacks:
            if not deploy_and_record(name, exclusively=True):
                return False, successful, failed

    # Phase 3: post-regional stacks (monitoring), sequentially. Every
    # upstream stack is already deployed, so --exclusively prevents a
    # redundant pass over global/api-gateway/regional.
    for name in post_regional_stacks:
        if not deploy_and_record(name, exclusively=True):
            return False, successful, failed

    return not failed, successful, failed
1533 def _deploy_stacks_parallel(
1534 self,
1535 stacks: list[str],
1536 require_approval: bool,
1537 outputs_file: str | None,
1538 parameters: dict[str, str] | None,
1539 tags: dict[str, str] | None,
1540 progress: str,
1541 on_stack_start: Callable[[str], None] | None,
1542 on_stack_complete: Callable[[str, bool], None] | None,
1543 max_workers: int,
1544 ) -> tuple[list[str], list[str]]:
1545 """Deploy multiple stacks in parallel using separate CDK output directories."""
1546 import tempfile
1548 successful: list[str] = []
1549 failed: list[str] = []
1550 lock = Lock()
1552 def deploy_single(stack_name: str) -> tuple[str, bool]:
1553 # Use a unique output directory in /tmp for each parallel deployment
1554 # This avoids CDK copying cdk.out.* directories into assets
1555 output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-")
1557 if on_stack_start: 1557 ↛ 1558line 1557 didn't jump to line 1558 because the condition on line 1557 was never true
1558 with lock:
1559 on_stack_start(stack_name)
1561 success = self.deploy(
1562 stack_name=stack_name,
1563 require_approval=require_approval,
1564 outputs_file=outputs_file,
1565 parameters=parameters,
1566 tags=tags,
1567 progress=progress,
1568 output_dir=output_dir,
1569 exclusively=True,
1570 )
1572 # Clean up the temporary output directory
1573 try:
1574 import shutil
1576 if os.path.exists(output_dir): 1576 ↛ 1581line 1576 didn't jump to line 1581 because the condition on line 1576 was always true
1577 shutil.rmtree(output_dir)
1578 except Exception as e:
1579 logger.debug("Cleanup of %s failed: %s", output_dir, e)
1581 return stack_name, success
1583 with ThreadPoolExecutor(max_workers=max_workers) as executor:
1584 futures = {executor.submit(deploy_single, stack): stack for stack in stacks}
1586 for future in as_completed(futures):
1587 stack_name, success = future.result()
1589 with lock:
1590 if success:
1591 successful.append(stack_name)
1592 else:
1593 failed.append(stack_name)
1595 if on_stack_complete: 1595 ↛ 1596line 1595 didn't jump to line 1596 because the condition on line 1595 was never true
1596 on_stack_complete(stack_name, success)
1598 return successful, failed
1600 def destroy_orchestrated(
1601 self,
1602 force: bool = False,
1603 on_stack_start: Callable[[str], None] | None = None,
1604 on_stack_complete: Callable[[str, bool], None] | None = None,
1605 parallel: bool = False,
1606 max_workers: int = 4,
1607 ) -> tuple[bool, list[str], list[str]]:
1608 """
1609 Destroy all stacks in the correct order.
1611 Destroys monitoring stack first, then regional stacks (optionally in parallel),
1612 then global stacks.
1614 Args:
1615 force: Skip confirmation prompts
1616 on_stack_start: Callback(stack_name) called when starting a stack
1617 on_stack_complete: Callback(stack_name, success) called when stack completes
1618 parallel: Destroy regional stacks in parallel
1619 max_workers: Maximum number of parallel destructions (default: 4)
1621 Returns:
1622 Tuple of (overall_success, successful_stacks, failed_stacks)
1623 """
1624 stacks = self.list_stacks()
1626 # Image-registry pre-destroy guard — fires before ANY teardown so
1627 # operators don't end up with monitoring + regional + api-gateway
1628 # already destroyed when the global-stack ECR delete fails on a
1629 # non-empty repo. Skipped entirely under the default
1630 # ``removal_policy: "retain"`` posture. See
1631 # ``_image_registry_destroy_preflight`` for the exact rules.
1632 if not self._image_registry_destroy_preflight(force=force):
1633 return False, [], list(stacks)
1635 # Phase 0: Clean up backup vault recovery points so the global stack
1636 # can be deleted cleanly by CloudFormation.
1637 self._cleanup_backup_vault()
1639 # Separate stacks into three groups (reverse of deploy order):
1640 pre_regional = {"gco-global", "gco-api-gateway"}
1641 post_regional = {"gco-monitoring"}
1643 post_regional_stacks = [s for s in stacks if s in post_regional]
1644 regional_stacks = [s for s in stacks if s not in pre_regional and s not in post_regional]
1645 pre_regional_stacks = [s for s in stacks if s in pre_regional]
1647 # Sort regional stacks alphabetically (reversed for destroy)
1648 regional_stacks.sort(reverse=True)
1649 # Sort pre-regional stacks in reverse priority order
1650 pre_regional_order = {"gco-api-gateway": 1, "gco-global": 2}
1651 pre_regional_stacks.sort(key=lambda x: pre_regional_order.get(x, 0))
1653 successful: list[str] = []
1654 failed: list[str] = []
1656 # Phase 1: Destroy post-regional stacks (monitoring) first
1657 for stack_name in post_regional_stacks:
1658 if on_stack_start: 1658 ↛ 1659line 1658 didn't jump to line 1659 because the condition on line 1658 was never true
1659 on_stack_start(stack_name)
1661 success = self.destroy(
1662 stack_name=stack_name,
1663 force=force,
1664 )
1666 if success: 1666 ↛ 1669line 1666 didn't jump to line 1669 because the condition on line 1666 was always true
1667 successful.append(stack_name)
1668 else:
1669 failed.append(stack_name)
1671 if on_stack_complete: 1671 ↛ 1672line 1671 didn't jump to line 1672 because the condition on line 1671 was never true
1672 on_stack_complete(stack_name, success)
1674 # Phase 2: Destroy regional stacks (parallel or sequential)
1675 # Each regional destroy has a background watchdog that proactively
1676 # deletes EKS-managed security groups + orphaned ENIs as soon as
1677 # they appear. Without this, CloudFormation's VPC delete step sits
1678 # in DELETE_IN_PROGRESS for ~10 min waiting for EKS to GC the
1679 # ``eks-cluster-sg-<cluster>-*`` security group and its cluster
1680 # ENIs. The SG is owned by the EKS service (not CloudFormation),
1681 # so CFN can't delete it directly — it just polls the VPC's
1682 # dependencies until they drain. See ``_start_eks_sg_watchdog``.
1683 watchdog_stops: dict[str, Event] = {}
1684 watchdog_threads: dict[str, Thread] = {}
1685 for stack_name in regional_stacks:
1686 stop_event = Event()
1687 thread = self._start_eks_sg_watchdog(stack_name, stop_event)
1688 watchdog_stops[stack_name] = stop_event
1689 watchdog_threads[stack_name] = thread
1691 if regional_stacks: 1691 ↛ 1724line 1691 didn't jump to line 1724 because the condition on line 1691 was always true
1692 if parallel and len(regional_stacks) > 1:
1693 # Parallel destruction of regional stacks
1694 successful_regional, failed_regional = self._destroy_stacks_parallel(
1695 stacks=regional_stacks,
1696 force=force,
1697 on_stack_start=on_stack_start,
1698 on_stack_complete=on_stack_complete,
1699 max_workers=max_workers,
1700 )
1701 successful.extend(successful_regional)
1702 failed.extend(failed_regional)
1703 else:
1704 # Sequential destruction
1705 for stack_name in regional_stacks:
1706 if on_stack_start:
1707 on_stack_start(stack_name)
1709 success = self.destroy(
1710 stack_name=stack_name,
1711 force=force,
1712 )
1714 if success:
1715 successful.append(stack_name)
1716 else:
1717 failed.append(stack_name)
1719 if on_stack_complete:
1720 on_stack_complete(stack_name, success)
1722 # Stop all watchdogs and do one final cleanup pass per stack to
1723 # catch anything EKS re-created during the destroy flow.
1724 for stack_name in regional_stacks:
1725 watchdog_stops[stack_name].set()
1726 watchdog_threads[stack_name].join(timeout=5)
1727 self._cleanup_eks_security_groups(stack_name)
1729 # Phase 3: Destroy pre-regional global stacks last
1730 for stack_name in pre_regional_stacks:
1731 if on_stack_start:
1732 on_stack_start(stack_name)
1734 success = self.destroy(
1735 stack_name=stack_name,
1736 force=force,
1737 )
1739 if success: 1739 ↛ 1742line 1739 didn't jump to line 1742 because the condition on line 1739 was always true
1740 successful.append(stack_name)
1741 else:
1742 failed.append(stack_name)
1744 if on_stack_complete:
1745 on_stack_complete(stack_name, success)
1747 return len(failed) == 0, successful, failed
def _destroy_stacks_parallel(
    self,
    stacks: list[str],
    force: bool,
    on_stack_start: Callable[[str], None] | None,
    on_stack_complete: Callable[[str, bool], None] | None,
    max_workers: int,
) -> tuple[list[str], list[str]]:
    """Destroy multiple stacks in parallel using separate CDK output directories.

    Each stack is destroyed on its own worker thread with a private
    temporary output directory, which is removed again whether the destroy
    succeeds, fails, or raises.

    Args:
        stacks: Names of the stacks to destroy concurrently.
        force: Forwarded unchanged to ``self.destroy``.
        on_stack_start: Optional callback invoked with the stack name just
            before its destroy starts. Calls are serialised with a lock.
        on_stack_complete: Optional callback invoked with the stack name and
            its result once the destroy has finished.
        max_workers: Size of the thread pool.

    Returns:
        ``(successful, failed)`` lists of stack names, in completion order.
        A stack whose destroy raised is reported in ``failed`` instead of
        aborting the collection of the remaining results.
    """
    import shutil
    import tempfile

    successful: list[str] = []
    failed: list[str] = []
    lock = Lock()

    def destroy_single(stack_name: str) -> bool:
        # Use a unique output directory in /tmp for each parallel destruction
        output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-")
        try:
            if on_stack_start:
                with lock:
                    on_stack_start(stack_name)

            return self.destroy(
                stack_name=stack_name,
                force=force,
                output_dir=output_dir,
            )
        finally:
            # Always remove the temporary output directory, including when
            # the callback or the destroy itself raised.
            try:
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)
            except Exception as e:
                logger.debug("Cleanup of %s failed: %s", output_dir, e)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(destroy_single, stack): stack for stack in stacks}

        for future in as_completed(futures):
            stack_name = futures[future]
            try:
                success = future.result()
            except Exception as e:
                # One crashing worker must not hide the outcome of the others.
                logger.error("Destroy of %s raised: %s", stack_name, e)
                success = False

            with lock:
                if success:
                    successful.append(stack_name)
                else:
                    failed.append(stack_name)

                if on_stack_complete:
                    on_stack_complete(stack_name, success)

    return successful, failed
def _cleanup_backup_vault(self) -> None:
    """Delete all recovery points from the GCO backup vault.

    This is called before destroy-all so CloudFormation can delete the
    backup vault cleanly. Without this, the vault deletion fails because
    it contains recovery points.

    The vault is located in ``self.config.global_region`` by a
    case-insensitive substring match of ``self.config.project_name``
    against each vault name; the first match wins. Every failure is
    non-fatal: per-recovery-point errors are logged at debug level and any
    other error is printed as a warning.
    """
    import boto3

    global_region = self.config.global_region
    project_name = self.config.project_name

    try:
        backup_client = boto3.client("backup", region_name=global_region)

        # Lower-case BOTH sides: the vault name is lower-cased for the
        # comparison, so a mixed-case project name would otherwise never match.
        needle = project_name.lower()

        # Find the backup vault by listing vaults with the project prefix.
        # NOTE(review): this is a substring match, so an unrelated vault whose
        # name merely contains the project name would also be selected.
        paginator = backup_client.get_paginator("list_backup_vaults")
        vault_name = next(
            (
                vault["BackupVaultName"]
                for page in paginator.paginate()
                for vault in page.get("BackupVaultList", [])
                if needle in vault["BackupVaultName"].lower()
            ),
            None,
        )

        if not vault_name:
            return

        # Delete all recovery points in the vault
        rp_paginator = backup_client.get_paginator("list_recovery_points_by_backup_vault")
        deleted = 0
        for page in rp_paginator.paginate(BackupVaultName=vault_name):
            for rp in page.get("RecoveryPoints", []):
                try:
                    backup_client.delete_recovery_point(
                        BackupVaultName=vault_name,
                        RecoveryPointArn=rp["RecoveryPointArn"],
                    )
                    deleted += 1
                except Exception as e:
                    logger.debug("Failed to delete recovery point: %s", e)

        if deleted > 0:
            print(f" Cleaned up {deleted} backup recovery points from {vault_name}")

    except Exception as e:
        print(f" Warning: Backup vault cleanup failed (non-fatal): {e}")
def cleanup_eks_security_groups(self) -> None:
    """Clean up EKS-managed security groups across all regional stacks.

    Called between destroy retries to remove orphaned security groups
    that block VPC deletion.

    Every stack returned by ``self.list_stacks()`` that is not one of the
    known global stacks is treated as a regional stack and handed to
    ``_cleanup_eks_security_groups``.
    """
    # Global (non-regional) stacks. ``gco-analytics`` is included so it is not
    # mistaken for a regional stack, which would derive the bogus region
    # "analytics" from its name.
    non_regional = {"gco-global", "gco-api-gateway", "gco-analytics", "gco-monitoring"}
    regional_stacks = [s for s in self.list_stacks() if s not in non_regional]
    for stack_name in regional_stacks:
        self._cleanup_eks_security_groups(stack_name)
def _cleanup_eks_security_groups(self, stack_name: str) -> None:
    """Remove leftover EKS cluster security groups, and their ENIs, for one stack.

    EKS creates a cluster security group (eks-cluster-sg-<cluster-name>-*)
    that belongs to EKS rather than CloudFormation, so it and its network
    interfaces can outlive the stack resources and make VPC deletion fail.

    For every security group matching that name pattern, each network
    interface using the group is force-detached (if attached) and deleted,
    then the group itself is deleted. All failures are logged at debug level
    and swallowed; nothing is raised for AWS errors.

    Args:
        stack_name: Regional stack name; also used as the cluster name.
    """
    import time as _time

    import boto3

    # Region = stack name with the first "<project>-" removed
    # (e.g., gco-us-east-1 -> us-east-1).
    prefix = f"{self.config.project_name}-"
    region = stack_name.replace(prefix, "", 1)
    cluster_name = stack_name  # cluster name matches stack name

    try:
        ec2 = boto3.client("ec2", region_name=region)

        # Look up the EKS-managed groups by their well-known name pattern.
        name_filter = {
            "Name": "group-name",
            "Values": [f"eks-cluster-sg-{cluster_name}-*"],
        }
        matches = ec2.describe_security_groups(Filters=[name_filter])

        for group in matches.get("SecurityGroups", []):
            group_id = group["GroupId"]
            group_name = group.get("GroupName", "")

            # Interfaces still referencing the group have to go first.
            attached = ec2.describe_network_interfaces(
                Filters=[{"Name": "group-id", "Values": [group_id]}]
            )
            for interface in attached.get("NetworkInterfaces", []):
                interface_id = interface["NetworkInterfaceId"]
                try:
                    attachment = interface.get("Attachment")
                    if attachment:
                        ec2.detach_network_interface(
                            AttachmentId=attachment["AttachmentId"],
                            Force=True,
                        )
                        # Pause after the forced detach before deleting.
                        _time.sleep(5)
                    ec2.delete_network_interface(NetworkInterfaceId=interface_id)
                    logger.debug("Deleted ENI %s from SG %s", interface_id, group_name)
                except Exception as e:
                    logger.debug("Failed to delete ENI %s: %s", interface_id, e)

            # With the interfaces handled, try the group itself.
            try:
                ec2.delete_security_group(GroupId=group_id)
            except Exception as e:
                logger.debug(
                    "Failed to delete SG %s: %s (will retry on next attempt)", group_id, e
                )
            else:
                print(f" Cleaned up EKS security group: {group_name} ({group_id})")

    except Exception as e:
        # Non-fatal — the retry loop will handle it
        logger.debug("EKS security group cleanup for %s failed: %s", stack_name, e)
def _start_eks_sg_watchdog(self, stack_name: str, stop_event: Event) -> Thread:
    """Launch a daemon thread that repeatedly cleans up orphaned EKS security groups.

    The ``eks-cluster-sg-<cluster-name>-*`` security group is owned by the
    EKS service, not CloudFormation. If it lingers after the cluster is
    gone, the VPC delete is blocked with ``DependencyViolation`` and
    CloudFormation keeps retrying in ``DELETE_IN_PROGRESS``.

    The thread calls ``_cleanup_eks_security_groups(stack_name)`` and then
    waits up to 30 seconds on ``stop_event`` before the next pass. It ends
    once ``stop_event`` is set; errors from a pass are logged at debug
    level and do not stop the loop.

    Args:
        stack_name: Regional stack to watch.
        stop_event: Set by the caller to make the thread exit.

    Returns:
        The already-started watchdog thread.
    """

    def _poll_until_stopped() -> None:
        while True:
            if stop_event.is_set():
                return
            try:
                self._cleanup_eks_security_groups(stack_name)
            except Exception as e:
                logger.debug(
                    "EKS SG watchdog tick for %s failed (non-fatal): %s",
                    stack_name,
                    e,
                )
            # Sleeps between passes, but wakes up immediately on shutdown.
            stop_event.wait(timeout=30)

    watchdog = Thread(
        target=_poll_until_stopped,
        name=f"eks-sg-watchdog-{stack_name}",
        daemon=True,
    )
    watchdog.start()
    return watchdog
def get_stack_manager(config: GCOConfig) -> StackManager:
    """Build and return a StackManager for the given configuration."""
    manager = StackManager(config)
    return manager
def get_stack_deployment_order(stacks: list[str]) -> list[str]:
    """
    Return the given stacks sorted into deployment order.

    Known global stacks come first, ordered gco-global, gco-api-gateway,
    gco-analytics, gco-monitoring. Every other name is treated as a
    regional stack (gco-{region}, e.g., gco-us-east-1) and follows in
    alphabetical order.
    """
    # Lower value = deployed earlier.
    priority: dict[str, float] = {
        "gco-global": 1,
        "gco-api-gateway": 2,
        "gco-analytics": 2.5,
        "gco-monitoring": 3,
    }

    ordered_globals = sorted(
        (name for name in stacks if name in priority),
        key=priority.__getitem__,
    )
    ordered_regionals = sorted(name for name in stacks if name not in priority)

    return ordered_globals + ordered_regionals
def get_stack_destroy_order(stacks: list[str]) -> list[str]:
    """
    Return the given stacks sorted into destroy order.

    This is exactly the deployment order reversed, so regional stacks come
    before the global ones.
    """
    return get_stack_deployment_order(stacks)[::-1]
2031# =============================================================================
2032# Feature toggle helpers
2033# =============================================================================
# Baseline FSx for Lustre settings. ``get_fsx_config`` and ``update_fsx_config``
# pass this dict as the default configuration for the "fsx_lustre" context key.
_FSX_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # the feature is off unless explicitly enabled
    "storage_capacity_gib": 1200,
    "deployment_type": "SCRATCH_2",
    "per_unit_storage_throughput": 200,
    "data_compression_type": "LZ4",
    "import_path": None,  # unset by default
    "export_path": None,  # unset by default
    "auto_import_policy": "NEW_CHANGED_DELETED",
}
def _find_cdk_json() -> Path | None:
    """Locate the nearest cdk.json, searching the working directory and then each parent.

    Returns:
        Path to the first cdk.json found, or None when no directory on the
        way up to the filesystem root contains one.
    """
    start = Path.cwd()
    candidates = (directory / "cdk.json" for directory in (start, *start.parents))
    return next((candidate for candidate in candidates if candidate.exists()), None)
def get_fsx_config(region: str | None = None) -> dict[str, Any]:
    """Read the FSx for Lustre configuration from cdk.json.

    Args:
        region: Optional region. When given, region-specific overrides for
            that region are looked up first.

    Returns:
        FSx configuration dictionary
    """
    return _get_feature_config(
        feature_key="fsx_lustre",
        default_config=_FSX_DEFAULTS,
        region=region,
    )
def update_fsx_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Write FSx for Lustre settings to cdk.json.

    Args:
        settings: FSx settings to update
        region: Optional region for a region-specific entry. When None, the
            global FSx configuration is updated.
    """
    _update_feature_config(
        feature_key="fsx_lustre",
        settings=settings,
        default_config=_FSX_DEFAULTS,
        region=region,
    )
2080# =============================================================================
# Generic feature toggle helpers (used by FSx, Valkey, Aurora, analytics, and future features)
2082# =============================================================================
def _get_feature_config(
    feature_key: str,
    default_config: dict[str, Any],
    region: str | None = None,
) -> dict[str, Any]:
    """Get configuration for a toggleable feature from cdk.json.

    Layers are applied in this order, later ones winning:
    ``default_config``, then the ``context[feature_key]`` block, then (only
    when ``region`` is given and has an entry) the region's block from
    ``context["<feature_key>_regions"]``. Defaults are the base layer on
    every path, so a partially filled global block never yields a result
    with missing keys. Merging is shallow: a nested dict replaces the
    default nested dict as a whole.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        default_config: Default configuration values when the key is missing.
        region: Optional region for region-specific overrides.

    Returns:
        Merged configuration dictionary. ``is_region_specific`` is True and
        ``region`` is set when a region override was applied; otherwise
        ``is_region_specific`` is False.

    Raises:
        RuntimeError: If no cdk.json can be found.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        cdk_config = json.load(f)

    context = cdk_config.get("context", {})

    # Defaults first, then whatever the global block overrides.
    result = {**default_config, **context.get(feature_key, {})}

    if region:
        region_overrides = context.get(f"{feature_key}_regions", {})
        if region in region_overrides:
            # The region block is layered on top of defaults + global block.
            result.update(region_overrides[region])
            result["region"] = region
            result["is_region_specific"] = True
            return result

    result["is_region_specific"] = False
    return result
def _update_feature_config(
    feature_key: str,
    settings: dict[str, Any],
    default_config: dict[str, Any],
    region: str | None = None,
) -> None:
    """Write settings for a toggleable feature into cdk.json.

    With a ``region``, the settings go into
    ``context["<feature_key>_regions"][region]`` (created empty if absent).
    Without one, they go into ``context[feature_key]``, which is seeded from
    ``default_config`` when it does not exist yet. Settings whose value is
    None are skipped, except for the "enabled" key, which is always written.
    The whole file is then rewritten with two-space indentation.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        settings: Settings to update.
        default_config: Default configuration values when the key is missing.
        region: Optional region for region-specific config.

    Raises:
        RuntimeError: If no cdk.json can be found.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        cdk_config = json.load(f)

    context = cdk_config.setdefault("context", {})

    # Pick the dict that receives the settings, creating it when missing.
    if region:
        per_region = context.setdefault(f"{feature_key}_regions", {})
        target = per_region.setdefault(region, {})
    else:
        target = context.setdefault(feature_key, {**default_config})

    target.update(
        (key, value)
        for key, value in settings.items()
        if value is not None or key == "enabled"
    )

    with open(cdk_json_path, "w", encoding="utf-8") as f:
        json.dump(cdk_config, f, indent=2)
2171# =============================================================================
2172# Valkey configuration
2173# =============================================================================
# Baseline Valkey settings. ``get_valkey_config`` and ``update_valkey_config``
# pass this dict as the default configuration for the "valkey" context key.
_VALKEY_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # the feature is off unless explicitly enabled
    "max_data_storage_gb": 5,
    "max_ecpu_per_second": 5000,
    "snapshot_retention_limit": 1,
}
def get_valkey_config(region: str | None = None) -> dict[str, Any]:
    """Read the Valkey Serverless configuration from cdk.json.

    Args:
        region: Optional region whose overrides should be looked up first.

    Returns:
        Valkey configuration dictionary.
    """
    return _get_feature_config(
        feature_key="valkey",
        default_config=_VALKEY_DEFAULTS,
        region=region,
    )
def update_valkey_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Write Valkey Serverless settings to cdk.json.

    Args:
        settings: Valkey settings to update.
        region: Optional region for a region-specific entry. When None, the
            global Valkey configuration is updated.
    """
    _update_feature_config(
        feature_key="valkey",
        settings=settings,
        default_config=_VALKEY_DEFAULTS,
        region=region,
    )
2193# =============================================================================
2194# Aurora pgvector configuration
2195# =============================================================================
# Baseline Aurora pgvector settings. ``get_aurora_config`` and
# ``update_aurora_config`` pass this dict as the default configuration for the
# "aurora_pgvector" context key.
_AURORA_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # the feature is off unless explicitly enabled
    "min_acu": 0,
    "max_acu": 16,
    "backup_retention_days": 7,
    "deletion_protection": False,
}
def get_aurora_config(region: str | None = None) -> dict[str, Any]:
    """Read the Aurora pgvector configuration from cdk.json.

    Args:
        region: Optional region whose overrides should be looked up first.

    Returns:
        Aurora pgvector configuration dictionary.
    """
    return _get_feature_config(
        feature_key="aurora_pgvector",
        default_config=_AURORA_DEFAULTS,
        region=region,
    )
def update_aurora_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Write Aurora pgvector settings to cdk.json.

    Args:
        settings: Aurora settings to update.
        region: Optional region for a region-specific entry. When None, the
            global Aurora configuration is updated.
    """
    _update_feature_config(
        feature_key="aurora_pgvector",
        settings=settings,
        default_config=_AURORA_DEFAULTS,
        region=region,
    )
2216# =============================================================================
2217# Analytics environment configuration
2218# =============================================================================
# Baseline analytics environment settings. ``get_analytics_config`` and
# ``update_analytics_config`` pass this dict as the default configuration for
# the "analytics_environment" context key. Unlike the other feature defaults
# in this module, several values are nested dicts.
_ANALYTICS_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # the feature is off unless explicitly enabled
    "hyperpod": {"enabled": False},
    "canvas": {"enabled": False},
    "cognito": {"domain_prefix": None, "removal_policy": "destroy"},
    "efs": {"removal_policy": "destroy"},
    "studio": {"user_profile_name_prefix": None},
}
def get_analytics_config() -> dict[str, Any]:
    """Read the analytics environment configuration from cdk.json.

    The analytics stack is single-region by construction (it lives in the
    api-gateway region), so no region argument is accepted. The result is
    the analytics defaults overlaid with whatever the operator has set in
    the ``context.analytics_environment`` block.
    """
    return _get_feature_config(
        feature_key="analytics_environment",
        default_config=_ANALYTICS_DEFAULTS,
    )
def update_analytics_config(settings: dict[str, Any]) -> None:
    """Write analytics environment settings to cdk.json.

    Counterpart of ``update_valkey_config`` / ``update_aurora_config`` for
    the ``analytics_environment`` block. Only the top-level keys present in
    ``settings`` (``enabled``, ``hyperpod``, ``canvas``, ``cognito``,
    ``efs``, ``studio``) are written; keys not mentioned keep their stored
    value, so ``enable --hyperpod`` does not clobber
    ``cognito.removal_policy``.

    NOTE(review): a nested dict passed in ``settings`` is stored as given for
    its key, so pass the complete sub-dict when changing one nested field.
    Confirm this is the intended "merged one level deep" contract.

    Args:
        settings: Analytics settings to update.
    """
    _update_feature_config(
        feature_key="analytics_environment",
        settings=settings,
        default_config=_ANALYTICS_DEFAULTS,
    )