Coverage for cli / stacks.py: 88%
710 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
"""
Stack management for GCO CLI.

Provides commands for deploying, updating, and managing CDK stacks.
This is the largest CLI module (~1600 lines) because it orchestrates the
full deployment lifecycle including container runtime detection, CDK
bootstrapping, Lambda source synchronization, and parallel regional deploys.

This module handles:
  - Container runtime detection (Docker, Finch, Podman) with automatic fallback
  - CDK bootstrap across all target regions (idempotent)
  - Lambda source synchronization (copies handler code + dependencies before synth)
  - CDK stack deployment with proper dependency ordering:
      1. Global stack (Global Accelerator, DynamoDB)
      2. API Gateway stack (auth secret, Lambda proxy)
      3. Regional stacks in parallel (EKS, VPC, ALB per region)
      4. Monitoring stack (CloudWatch dashboards, alarms)
  - Parallel deployment of regional stacks via ThreadPoolExecutor
  - Stack destruction in reverse dependency order
  - FSx for Lustre enable/disable toggle
  - kubectl access configuration (EKS access entries + kubeconfig)

Key Design Decisions:
  - Regional stacks deploy in parallel for speed; global/API/monitoring are sequential
  - Lambda build directories are synced before every deploy to avoid stale code
  - Container runtime is auto-detected; CDK_DOCKER env var overrides
  - All destructive operations require -y/--yes confirmation
  - Stack status is read from CloudFormation, not cached locally

Environment Variables:
  CDK_DOCKER: Override container runtime (default: auto-detect Docker/Finch/Podman)
  AWS_REGION: Default region for single-region operations
"""
35from __future__ import annotations
37import logging
38import os
39import shutil
40import site
41import subprocess
42from collections.abc import Callable
43from concurrent.futures import ThreadPoolExecutor, as_completed
44from dataclasses import dataclass, field
45from datetime import datetime
46from pathlib import Path
47from threading import Event, Lock, Thread
48from typing import TYPE_CHECKING, Any
50from botocore.exceptions import ClientError
52if TYPE_CHECKING:
53 from .config import GCOConfig
55logger = logging.getLogger(__name__)
@dataclass
class StackInfo:
    """Snapshot of a single CDK/CloudFormation stack's state.

    Attributes:
        name: CloudFormation stack name.
        status: Current stack status (e.g. CREATE_COMPLETE).
        region: AWS region the stack lives in.
        created_time: Stack creation timestamp, if known.
        updated_time: Last update timestamp, if known.
        outputs: Stack output key/value pairs.
        tags: Stack tag key/value pairs.
    """

    name: str
    status: str
    region: str
    created_time: datetime | None = None
    updated_time: datetime | None = None
    outputs: dict[str, str] = field(default_factory=dict)
    tags: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict (timestamps as ISO strings)."""
        created = self.created_time.isoformat() if self.created_time else None
        updated = self.updated_time.isoformat() if self.updated_time else None
        return {
            "name": self.name,
            "status": self.status,
            "region": self.region,
            "created_time": created,
            "updated_time": updated,
            "outputs": self.outputs,
            "tags": self.tags,
        }
82def _safe_rmtree(path: Path) -> None:
83 """Remove a directory tree, handling broken symlinks on macOS.
85 shutil.rmtree can fail with ``OSError: [Errno 66] Directory not empty``
86 on macOS when pip-installed packages (e.g. botocore) contain broken
87 symlinks or extended-attribute resource forks.
89 Falls back to ``rm -rf`` via subprocess, but only after validating the
90 path is a real directory under the project tree to avoid accidents.
91 """
92 resolved = path.resolve()
94 # Safety: refuse to remove anything that isn't clearly a build artifact
95 # inside the project. The path must contain "lambda" and end with "-build".
96 if "lambda" not in resolved.parts or not resolved.name.endswith("-build"):
97 raise ValueError(f"Refusing to remove unexpected path: {resolved}")
99 try:
100 shutil.rmtree(str(resolved))
101 except OSError:
102 subprocess.run(["rm", "-rf", "--", str(resolved)], check=True)
# Module-level memo for container runtime detection. A separate "checked"
# flag is kept so that a legitimate None result ("no runtime found") is
# also cached and not re-probed on every call.
_container_runtime_cache: str | None = None
_container_runtime_checked: bool = False


def _detect_container_runtime() -> str | None:
    """Return the container runtime CDK should use, probing at most once.

    CDK needs a container runtime to build Lambda function assets. The
    first call delegates to ``_detect_container_runtime_uncached`` and
    caches the answer (including ``None``); later calls return the cache.

    Returns:
        'docker', 'finch', or 'podman' when a running runtime was found,
        the value of CDK_DOCKER when that env var is set, else None.
    """
    global _container_runtime_cache, _container_runtime_checked

    if not _container_runtime_checked:
        _container_runtime_checked = True
        _container_runtime_cache = _detect_container_runtime_uncached()
    return _container_runtime_cache
137def _detect_container_runtime_uncached() -> str | None:
138 """Uncached implementation of container runtime detection."""
139 # Check if CDK_DOCKER is already set
140 if os.environ.get("CDK_DOCKER"):
141 return os.environ["CDK_DOCKER"]
143 # Try docker first
144 if shutil.which("docker"):
145 # Verify docker is actually running
146 try:
147 result = subprocess.run(
148 ["docker", "info"],
149 capture_output=True,
150 timeout=5,
151 )
152 if result.returncode == 0:
153 return "docker"
154 except subprocess.TimeoutExpired, Exception:
155 pass
157 # Try finch as fallback
158 if shutil.which("finch"):
159 try:
160 result = subprocess.run(
161 ["finch", "info"],
162 capture_output=True,
163 timeout=5,
164 )
165 if result.returncode == 0: 165 ↛ 171line 165 didn't jump to line 171 because the condition on line 165 was always true
166 return "finch"
167 except subprocess.TimeoutExpired, Exception:
168 pass
170 # Try podman as last resort
171 if shutil.which("podman"):
172 try:
173 result = subprocess.run(
174 ["podman", "info"],
175 capture_output=True,
176 timeout=5,
177 )
178 if result.returncode == 0: 178 ↛ 183line 178 didn't jump to line 183 because the condition on line 178 was always true
179 return "podman"
180 except subprocess.TimeoutExpired, Exception:
181 pass
183 return None
186class StackManager:
187 """Manages CDK stack operations."""
189 def __init__(self, config: GCOConfig, project_root: Path | None = None):
190 self.config = config
191 self.project_root = project_root or self._find_project_root()
192 self._cdk_path = self._find_cdk()
194 # Ensure Lambda build directory exists before any CDK synth
195 self._ensure_lambda_build()
197 def _find_project_root(self) -> Path:
198 """Find the project root by looking for cdk.json."""
199 current = Path.cwd()
200 for parent in [current] + list(current.parents):
201 if (parent / "cdk.json").exists():
202 return parent
203 return current
205 def _find_cdk(self) -> str:
206 """Find CDK executable."""
207 # Check if cdk is in PATH
208 try:
209 result = subprocess.run(["which", "cdk"], capture_output=True, text=True, check=True)
210 return result.stdout.strip()
211 except subprocess.CalledProcessError:
212 pass
214 # Check common locations
215 for path in ["/usr/local/bin/cdk", "~/.npm-global/bin/cdk"]:
216 expanded = os.path.expanduser(path)
217 if os.path.exists(expanded):
218 return expanded
220 # Fall back to npx
221 return "npx cdk"
223 def _ensure_lambda_build(self) -> None:
224 """Ensure the Lambda build directories exist for CDK synthesis.
226 Called from __init__ so the directory is ready before any CDK synth
227 (even ``cdk list`` needs it because ``app.py`` references the asset).
229 This does a lightweight check — only builds if the directory is
230 missing or incomplete. It does NOT do a full rebuild (no rmtree).
231 The full rebuild happens in ``_rebuild_lambda_packages()``, which
232 is called by ``deploy()`` before the actual CDK deploy.
233 """
234 kubectl_source = self.project_root / "lambda" / "kubectl-applier-simple"
235 kubectl_build = self.project_root / "lambda" / "kubectl-applier-simple-build"
236 helm_source = self.project_root / "lambda" / "helm-installer"
237 helm_build = self.project_root / "lambda" / "helm-installer-build"
239 # Only build if the directory is missing or deps aren't installed
240 if kubectl_source.exists() and (
241 not kubectl_build.exists() or not (kubectl_build / "yaml").exists()
242 ):
243 self._build_kubectl_lambda()
245 if helm_source.exists() and not helm_build.exists(): 245 ↛ 246line 245 didn't jump to line 246 because the condition on line 245 was never true
246 self._build_helm_installer_lambda()
248 def _check_and_fix_stuck_stack(self, stack_name: str) -> None:
249 """Check if a stack is in a stuck state and auto-recover.
251 Stacks can get stuck in REVIEW_IN_PROGRESS, ROLLBACK_COMPLETE, or
252 other non-deployable states. This detects those and cleans up so
253 the next deploy can succeed.
254 """
255 import boto3
257 region = self._get_deploy_region(stack_name)
258 if not region:
259 return
261 try:
262 cfn = boto3.client("cloudformation", region_name=region)
263 response = cfn.describe_stacks(StackName=stack_name)
264 status = response["Stacks"][0]["StackStatus"]
266 stuck_states = {
267 "REVIEW_IN_PROGRESS",
268 "ROLLBACK_COMPLETE",
269 "ROLLBACK_FAILED",
270 "CREATE_FAILED",
271 "DELETE_FAILED",
272 }
274 if status in stuck_states:
275 print(f" Stack {stack_name} is in {status} state, cleaning up...")
276 cfn.delete_stack(StackName=stack_name)
277 waiter = cfn.get_waiter("stack_delete_complete")
278 waiter.wait(StackName=stack_name, WaiterConfig={"Delay": 10, "MaxAttempts": 60})
279 print(f" Stack {stack_name} cleaned up, will recreate on deploy")
281 except Exception as e:
282 logger.debug("Stack pre-check for %s: %s", stack_name, e)
283 # Stack doesn't exist or can't be described — fine, deploy will create it
285 def _diagnose_deploy_failure(self, stack_name: str) -> None:
286 """Fetch CloudFormation events after a failed deploy and print diagnostics.
288 Gives users actionable information instead of just the CDK error message.
289 """
290 import boto3
292 region = self._get_deploy_region(stack_name)
293 if not region:
294 return
296 try:
297 cfn = boto3.client("cloudformation", region_name=region)
299 # Get recent events
300 response = cfn.describe_stack_events(StackName=stack_name)
301 events = response.get("StackEvents", [])
303 # Filter to failed events
304 failed = [
305 e
306 for e in events[:20]
307 if "FAILED" in e.get("ResourceStatus", "")
308 or "ROLLBACK" in e.get("ResourceStatus", "")
309 ]
311 if failed:
312 print(f"\n CloudFormation failure details for {stack_name}:")
313 for event in failed[:5]:
314 resource = event.get("LogicalResourceId", "unknown")
315 status = event.get("ResourceStatus", "unknown")
316 reason = event.get("ResourceStatusReason", "no reason given")
317 print(f" {resource}: {status}")
318 print(f" {reason}")
320 # Check stack status for actionable advice
321 try:
322 stack_resp = cfn.describe_stacks(StackName=stack_name)
323 status = stack_resp["Stacks"][0]["StackStatus"]
325 advice = {
326 "REVIEW_IN_PROGRESS": (
327 "Stack is stuck in REVIEW_IN_PROGRESS. "
328 "Run: aws cloudformation delete-stack "
329 f"--stack-name {stack_name} --region {region}"
330 ),
331 "ROLLBACK_COMPLETE": (
332 "Stack rolled back. Delete it and retry: "
333 f"aws cloudformation delete-stack "
334 f"--stack-name {stack_name} --region {region}"
335 ),
336 "ROLLBACK_FAILED": (
337 "Stack rollback failed. Delete with --retain: "
338 f"aws cloudformation delete-stack "
339 f"--stack-name {stack_name} --region {region}"
340 ),
341 "UPDATE_ROLLBACK_COMPLETE": (
342 "Update rolled back but stack is stable. "
343 "Check the events above and retry the deploy."
344 ),
345 }
347 if status in advice: 347 ↛ exitline 347 didn't return from function '_diagnose_deploy_failure' because the condition on line 347 was always true
348 print(f"\n Suggested fix: {advice[status]}")
350 except Exception as e:
351 logger.debug("Failed to parse stack events: %s", e)
353 except Exception as e:
354 logger.debug("Failed to diagnose deploy failure for %s: %s", stack_name, e)
355 # Best effort — don't fail the deploy further
357 def _sync_lambda_sources(self) -> None:
358 """
359 Sync latest handler code and manifests into the build directory.
361 Called at the start of ``deploy()`` to ensure CDK picks up the
362 latest source changes. Only copies files — does NOT rebuild pip
363 deps or rmtree. Runs once per StackManager instance.
364 """
365 if getattr(self, "_lambda_sources_synced", False): 365 ↛ 366line 365 didn't jump to line 366 because the condition on line 365 was never true
366 return
367 self._lambda_sources_synced = True
369 source_dir = self.project_root / "lambda" / "kubectl-applier-simple"
370 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build"
372 if not source_dir.exists() or not build_dir.exists():
373 return
375 # Sync handler.py
376 source_handler = source_dir / "handler.py"
377 build_handler = build_dir / "handler.py"
378 if source_handler.exists(): 378 ↛ 382line 378 didn't jump to line 382 because the condition on line 378 was always true
379 shutil.copy2(source_handler, build_handler)
381 # Sync manifests directory
382 source_manifests = source_dir / "manifests"
383 build_manifests = build_dir / "manifests"
384 if source_manifests.exists(): 384 ↛ exitline 384 didn't return from function '_sync_lambda_sources' because the condition on line 384 was always true
385 build_manifests.mkdir(parents=True, exist_ok=True)
386 for manifest_file in source_manifests.glob("*.yaml"):
387 shutil.copy2(manifest_file, build_manifests / manifest_file.name)
389 def _rebuild_lambda_packages(self) -> None:
390 """Full rebuild of Lambda packages for deploy.
392 Nukes the build directories and recreates them from scratch with
393 fresh pip deps, handler code, and manifests. Called once at the
394 start of a deploy to ensure CDK picks up all changes.
396 Runs once per StackManager instance.
397 """
398 if getattr(self, "_lambda_packages_rebuilt", False): 398 ↛ 399line 398 didn't jump to line 399 because the condition on line 398 was never true
399 return
400 self._lambda_packages_rebuilt = True
401 self._build_lambda_packages()
403 def _build_lambda_packages(self) -> None:
404 """Build Lambda packages for kubectl-applier and helm-installer.
406 Creates build directories with fresh copies of handler code, manifests,
407 charts config, and pip dependencies. This ensures CDK always picks up
408 the latest content regardless of Docker/asset caching.
409 """
410 self._build_kubectl_lambda()
411 self._build_helm_installer_lambda()
413 def _build_kubectl_lambda(self) -> None:
414 """Build the kubectl-applier-simple Lambda package."""
415 source_dir = self.project_root / "lambda" / "kubectl-applier-simple"
416 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build"
417 requirements = source_dir / "requirements.txt"
419 if not source_dir.exists() or not requirements.exists():
420 return
422 print(" Building kubectl-applier-simple Lambda package...")
424 # Clean stale build directory to avoid broken symlinks from previous
425 # pip installs (botocore/data/ is a common source of dangling symlinks
426 # that cause CDK's asset fingerprinting to fail with ENOENT).
427 if build_dir.exists():
428 _safe_rmtree(build_dir)
430 # Create build directory
431 build_dir.mkdir(parents=True, exist_ok=True)
433 # Copy handler and manifests (always overwrite to prevent stale content)
434 shutil.copy2(source_dir / "handler.py", build_dir / "handler.py")
435 build_manifests = build_dir / "manifests"
436 if (source_dir / "manifests").exists(): 436 ↛ 442line 436 didn't jump to line 442 because the condition on line 436 was always true
437 if build_manifests.exists(): 437 ↛ 438line 437 didn't jump to line 438 because the condition on line 437 was never true
438 shutil.rmtree(build_manifests)
439 shutil.copytree(source_dir / "manifests", build_manifests, dirs_exist_ok=True)
441 # Install pip dependencies for Lambda runtime
442 import sys
444 result = subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - static list: sys.executable + pip args + Path objects, no user input
445 [
446 sys.executable,
447 "-m",
448 "pip",
449 "install",
450 "-r",
451 str(requirements),
452 "-t",
453 str(build_dir),
454 "--upgrade",
455 "--platform",
456 "manylinux2014_x86_64",
457 "--only-binary=:all:",
458 "--quiet",
459 ],
460 capture_output=True,
461 text=True,
462 )
463 if result.returncode != 0: 463 ↛ 464line 463 didn't jump to line 464 because the condition on line 463 was never true
464 print(f" Warning: pip install failed: {result.stderr[:200]}")
465 else:
466 print(" Lambda package built successfully")
468 def _build_helm_installer_lambda(self) -> None:
469 """Build the helm-installer Lambda Docker context.
471 Copies all source files into a clean build directory so CDK always
472 detects changes to charts.yaml, handler.py, Dockerfile, etc.
473 """
474 source_dir = self.project_root / "lambda" / "helm-installer"
475 build_dir = self.project_root / "lambda" / "helm-installer-build"
477 if not source_dir.exists():
478 return
480 print(" Building helm-installer Lambda package...")
482 # Clean and recreate build directory
483 if build_dir.exists():
484 _safe_rmtree(build_dir)
485 build_dir.mkdir(parents=True, exist_ok=True)
487 # Copy all source files into the build directory
488 for item in source_dir.iterdir():
489 if item.name == "__pycache__":
490 continue
491 if item.is_file(): 491 ↛ 493line 491 didn't jump to line 493 because the condition on line 491 was always true
492 shutil.copy2(item, build_dir / item.name)
493 elif item.is_dir():
494 shutil.copytree(item, build_dir / item.name, dirs_exist_ok=True)
496 print(" Helm installer package built successfully")
498 def _get_python_path(self) -> str:
499 """
500 Get PYTHONPATH that includes the current Python's site-packages.
502 This is critical for pipx installations where CDK runs `python3 app.py`
503 using the system Python, which doesn't have aws_cdk installed.
504 By setting PYTHONPATH, we ensure CDK's subprocess can find our modules.
505 """
506 # Get all site-packages directories from the current Python
507 site_packages = site.getsitepackages()
509 # Also include user site-packages if available
510 user_site = site.getusersitepackages()
511 if user_site and os.path.isdir(user_site): 511 ↛ 512line 511 didn't jump to line 512 because the condition on line 511 was never true
512 site_packages.append(user_site)
514 # Include the directory containing the current module (for editable installs)
515 current_module_dir = Path(__file__).parent.parent
516 if current_module_dir.exists(): 516 ↛ 520line 516 didn't jump to line 520 because the condition on line 516 was always true
517 site_packages.append(str(current_module_dir))
519 # Combine with existing PYTHONPATH if any
520 existing_path = os.environ.get("PYTHONPATH", "")
521 all_paths = site_packages + ([existing_path] if existing_path else [])
523 return os.pathsep.join(all_paths)
525 def _run_cdk(
526 self,
527 command: list[str],
528 capture_output: bool = False,
529 env: dict[str, str] | None = None,
530 ) -> subprocess.CompletedProcess[str]:
531 """Run a CDK command."""
532 full_env = os.environ.copy()
534 # Inject PYTHONPATH so CDK's python3 subprocess can find aws_cdk
535 # This is essential for pipx installations
536 full_env["PYTHONPATH"] = self._get_python_path()
538 if env:
539 full_env.update(env)
541 cdk_cmd = self._cdk_path.split() + command
543 if capture_output:
544 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection
545 cdk_cmd,
546 cwd=self.project_root,
547 capture_output=True,
548 text=True,
549 env=full_env,
550 )
551 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection
552 cdk_cmd,
553 cwd=self.project_root,
554 env=full_env,
555 text=True,
556 )
558 def list_stacks(self) -> list[str]:
559 """List all available CDK stacks."""
560 result = self._run_cdk(["list"], capture_output=True)
561 if result.returncode != 0:
562 raise RuntimeError(f"Failed to list stacks: {result.stderr}")
563 return [s.strip() for s in result.stdout.strip().split("\n") if s.strip()]
565 def synth(self, stack_name: str | None = None, quiet: bool = True) -> str:
566 """Synthesize CloudFormation templates."""
567 cmd = ["synth"]
568 if stack_name: 568 ↛ 570line 568 didn't jump to line 570 because the condition on line 568 was always true
569 cmd.append(stack_name)
570 if quiet: 570 ↛ 573line 570 didn't jump to line 573 because the condition on line 570 was always true
571 cmd.append("--quiet")
573 result = self._run_cdk(cmd, capture_output=True)
574 if result.returncode != 0:
575 raise RuntimeError(f"CDK synth failed: {result.stderr}")
576 return str(result.stdout)
578 def diff(self, stack_name: str | None = None) -> str:
579 """Show diff between deployed and local stacks."""
580 cmd = ["diff", "--no-color"]
581 if stack_name: 581 ↛ 584line 581 didn't jump to line 584 because the condition on line 581 was always true
582 cmd.append(stack_name)
584 result = self._run_cdk(cmd, capture_output=True)
585 # diff returns non-zero if there are differences, which is expected
586 return str(result.stdout or result.stderr)
588 def deploy(
589 self,
590 stack_name: str | None = None,
591 require_approval: bool = True,
592 all_stacks: bool = False,
593 outputs_file: str | None = None,
594 parameters: dict[str, str] | None = None,
595 tags: dict[str, str] | None = None,
596 progress: str = "events",
597 output_dir: str | None = None,
598 exclusively: bool = False,
599 ) -> bool:
600 """Deploy CDK stacks.
602 Args:
603 stack_name: Name of the stack to deploy
604 require_approval: Whether to require approval for changes
605 all_stacks: Deploy all stacks
606 outputs_file: File to write outputs to
607 parameters: CDK parameters
608 tags: Tags to apply to stacks
609 progress: Progress display type
610 output_dir: Custom CDK output directory (for parallel deployments)
611 exclusively: Pass ``--exclusively`` to CDK so only the named
612 stack is evaluated, not its transitive dependencies. Used by
613 ``deploy_orchestrated`` once earlier phases have already
614 deployed the globals — re-synthesizing them every phase
615 forces custom resources (notably KubectlApplyManifests)
616 to re-run each time, adding minutes per phase for no
617 actual change.
618 """
619 # Full rebuild of Lambda packages on first deploy call (once per session),
620 # then sync latest source on subsequent calls
621 self._rebuild_lambda_packages()
622 self._sync_lambda_sources()
624 # Check for stuck stacks and auto-recover
625 if stack_name:
626 self._check_and_fix_stuck_stack(stack_name)
628 # Ensure container runtime is available for building images
629 runtime = _detect_container_runtime()
630 if not runtime:
631 raise RuntimeError(
632 "No container runtime found. Please install Docker, Finch, or Podman.\n"
633 " - Docker: https://docs.docker.com/get-docker/\n"
634 " - Finch: brew install finch && finch vm init\n"
635 " - Podman: https://podman.io/getting-started/installation"
636 )
638 # Auto-bootstrap the target region if needed
639 if stack_name:
640 region = self._get_deploy_region(stack_name)
641 if region and not self.ensure_bootstrapped(region):
642 raise RuntimeError(
643 f"Region {region} could not be bootstrapped. "
644 "Run 'gco stacks bootstrap --region "
645 f"{region}' manually to diagnose."
646 )
648 cmd = ["deploy"]
650 if all_stacks:
651 cmd.append("--all")
652 elif stack_name: 652 ↛ 660line 652 didn't jump to line 660 because the condition on line 652 was always true
653 cmd.append(stack_name)
655 # --exclusively tells CDK to deploy *only* the named stack, not its
656 # transitive dependencies. deploy_orchestrated sets this once the
657 # earlier phases (global, api-gateway) are already in place so that
658 # the regional and monitoring phases don't re-synthesize and
659 # re-evaluate globals on every pass.
660 if exclusively and stack_name and not all_stacks:
661 cmd.append("--exclusively")
663 if not require_approval:
664 cmd.extend(["--require-approval", "never"])
666 if outputs_file:
667 cmd.extend(["--outputs-file", outputs_file])
669 if parameters:
670 for key, value in parameters.items():
671 cmd.extend(["--parameters", f"{key}={value}"])
673 if tags:
674 for key, value in tags.items():
675 cmd.extend(["--tags", f"{key}={value}"])
677 cmd.extend(["--progress", progress])
679 # Use custom output directory for parallel deployments
680 if output_dir:
681 cmd.extend(["--output", output_dir])
683 # Set CDK_DOCKER env var if not already set
684 env = {"CDK_DOCKER": runtime} if not os.environ.get("CDK_DOCKER") else None
686 result = self._run_cdk(cmd, env=env)
687 success = result.returncode == 0
689 if not success and stack_name:
690 self._diagnose_deploy_failure(stack_name)
692 return success
694 def destroy(
695 self,
696 stack_name: str | None = None,
697 all_stacks: bool = False,
698 force: bool = False,
699 output_dir: str | None = None,
700 ) -> bool:
701 """Destroy CDK stacks.
703 Args:
704 stack_name: Name of the stack to destroy
705 all_stacks: Destroy all stacks
706 force: Skip confirmation prompts
707 output_dir: Custom CDK output directory (for parallel deployments)
708 """
709 cmd = ["destroy"]
711 if all_stacks:
712 cmd.append("--all")
713 elif stack_name: 713 ↛ 716line 713 didn't jump to line 716 because the condition on line 713 was always true
714 cmd.append(stack_name)
716 if force:
717 cmd.append("--force")
719 # Use custom output directory for parallel deployments
720 if output_dir:
721 cmd.extend(["--output", output_dir])
723 result = self._run_cdk(cmd)
724 return result.returncode == 0
726 def bootstrap(
727 self,
728 account: str | None = None,
729 region: str | None = None,
730 ) -> bool:
731 """Bootstrap CDK in an AWS account/region."""
732 cmd = ["bootstrap"]
734 if account and region:
735 cmd.append(f"aws://{account}/{region}")
736 elif region: 736 ↛ 739line 736 didn't jump to line 739 because the condition on line 736 was always true
737 cmd.append(f"aws://unknown-account/{region}")
739 result = self._run_cdk(cmd)
740 return result.returncode == 0
742 def is_bootstrapped(self, region: str) -> bool:
743 """Check if CDK has been bootstrapped in a region.
745 Looks for the CDKToolkit CloudFormation stack which is created
746 by ``cdk bootstrap``. Result is cached per region for the lifetime
747 of this StackManager instance.
748 """
749 if not hasattr(self, "_bootstrap_cache"): 749 ↛ 752line 749 didn't jump to line 752 because the condition on line 749 was always true
750 self._bootstrap_cache: dict[str, bool] = {}
752 if region in self._bootstrap_cache: 752 ↛ 753line 752 didn't jump to line 753 because the condition on line 752 was never true
753 return self._bootstrap_cache[region]
755 import boto3
757 cf = boto3.client("cloudformation", region_name=region)
758 try:
759 response = cf.describe_stacks(StackName="CDKToolkit")
760 stacks = response.get("Stacks", [])
761 if stacks:
762 status = stacks[0].get("StackStatus", "")
763 # Any non-deleted state counts as bootstrapped
764 result = "DELETE" not in status
765 self._bootstrap_cache[region] = result
766 return result
767 except ClientError:
768 pass # Stack doesn't exist — not bootstrapped
769 except Exception as e:
770 logger.debug("Failed to check CDK bootstrap in %s: %s", region, e)
772 self._bootstrap_cache[region] = False
773 return False
775 def ensure_bootstrapped(self, region: str) -> bool:
776 """Ensure a region is CDK-bootstrapped, auto-bootstrapping if needed.
778 Returns True if the region is (or was successfully) bootstrapped.
779 """
780 if self.is_bootstrapped(region):
781 return True
783 print(f"ℹ Region {region} is not CDK-bootstrapped. Bootstrapping now...")
784 success = self.bootstrap(region=region)
785 if success:
786 # Update cache so we don't re-check this region
787 if not hasattr(self, "_bootstrap_cache"): 787 ↛ 789line 787 didn't jump to line 789 because the condition on line 787 was always true
788 self._bootstrap_cache = {}
789 self._bootstrap_cache[region] = True
790 print(f"✓ CDK bootstrapped in {region}")
791 else:
792 print(f"✗ Failed to bootstrap CDK in {region}")
793 return success
795 def _get_deploy_region(self, stack_name: str) -> str | None:
796 """Determine the target AWS region for a given stack name."""
797 from .config import _load_cdk_json
799 cdk_regions = _load_cdk_json()
801 region: str | None
802 if stack_name == "gco-global":
803 region = cdk_regions.get("global") or self.config.global_region
804 return region
805 if stack_name == "gco-api-gateway":
806 region = cdk_regions.get("api_gateway") or self.config.api_gateway_region
807 return region
808 if stack_name == "gco-monitoring":
809 region = cdk_regions.get("monitoring") or self.config.monitoring_region
810 return region
812 # Regional stacks: gco-{region}
813 prefix = "gco-"
814 if stack_name.startswith(prefix):
815 return stack_name[len(prefix) :]
817 return None
819 def get_outputs(self, stack_name: str, region: str) -> dict[str, str]:
820 """Get stack outputs from CloudFormation."""
821 import boto3
823 cf = boto3.client("cloudformation", region_name=region)
824 try:
825 response = cf.describe_stacks(StackName=stack_name)
826 if response["Stacks"]:
827 stack = response["Stacks"][0]
828 outputs: dict[str, str] = {}
829 for output in stack.get("Outputs", []):
830 outputs[str(output["OutputKey"])] = str(output["OutputValue"])
831 return outputs
832 except Exception as e:
833 logger.debug("Failed to get outputs for %s in %s: %s", stack_name, region, e)
834 return {}
836 def get_stack_status(self, stack_name: str, region: str) -> StackInfo | None:
837 """Get detailed stack status from CloudFormation."""
838 import boto3
840 cf = boto3.client("cloudformation", region_name=region)
841 try:
842 response = cf.describe_stacks(StackName=stack_name)
843 if response["Stacks"]:
844 stack = response["Stacks"][0]
845 return StackInfo(
846 name=stack["StackName"],
847 status=stack["StackStatus"],
848 region=region,
849 created_time=stack.get("CreationTime"),
850 updated_time=stack.get("LastUpdatedTime"),
851 outputs={o["OutputKey"]: o["OutputValue"] for o in stack.get("Outputs", [])},
852 tags={t["Key"]: t["Value"] for t in stack.get("Tags", [])},
853 )
854 except Exception as e:
855 logger.debug("Failed to get stack status for %s in %s: %s", stack_name, region, e)
856 return None
858 def deploy_orchestrated(
859 self,
860 require_approval: bool = True,
861 outputs_file: str | None = None,
862 parameters: dict[str, str] | None = None,
863 tags: dict[str, str] | None = None,
864 progress: str = "events",
865 on_stack_start: Callable[[str], None] | None = None,
866 on_stack_complete: Callable[[str, bool], None] | None = None,
867 parallel: bool = False,
868 max_workers: int = 4,
869 ) -> tuple[bool, list[str], list[str]]:
870 """
871 Deploy all stacks in the correct order.
873 Deploys global stacks first, then regional stacks (optionally in parallel),
874 then the monitoring stack (which depends on regional stacks).
876 Args:
877 require_approval: Whether to require approval for changes
878 outputs_file: File to write outputs to
879 parameters: CDK parameters
880 tags: Tags to apply to stacks
881 progress: Progress display type
882 on_stack_start: Callback(stack_name) called when starting a stack
883 on_stack_complete: Callback(stack_name, success) called when stack completes
884 parallel: Deploy regional stacks in parallel
885 max_workers: Maximum number of parallel deployments (default: 4)
887 Returns:
888 Tuple of (overall_success, successful_stacks, failed_stacks)
889 """
890 stacks = self.list_stacks()
891 ordered_stacks = get_stack_deployment_order(stacks)
893 # Separate stacks into three groups:
894 # 1. Pre-regional global stacks (gco-global, gco-api-gateway)
895 # 2. Regional stacks (can be parallelized)
896 # 3. Post-regional stacks (gco-monitoring - depends on regional stacks)
897 pre_regional = {"gco-global", "gco-api-gateway"}
898 post_regional = {"gco-monitoring"}
900 pre_regional_stacks = [s for s in ordered_stacks if s in pre_regional]
901 regional_stacks = [
902 s for s in ordered_stacks if s not in pre_regional and s not in post_regional
903 ]
904 post_regional_stacks = [s for s in ordered_stacks if s in post_regional]
906 successful: list[str] = []
907 failed: list[str] = []
909 # Phase 1: Deploy pre-regional global stacks sequentially
910 for stack_name in pre_regional_stacks:
911 if on_stack_start:
912 on_stack_start(stack_name)
914 success = self.deploy(
915 stack_name=stack_name,
916 require_approval=require_approval,
917 outputs_file=outputs_file,
918 parameters=parameters,
919 tags=tags,
920 progress=progress,
921 )
923 if success:
924 successful.append(stack_name)
925 else:
926 failed.append(stack_name)
928 if on_stack_complete:
929 on_stack_complete(stack_name, success)
931 # Stop on failure to prevent cascading issues
932 if not success:
933 return False, successful, failed
935 # Phase 2: Deploy regional stacks (parallel or sequential)
936 # All regional stacks pass --exclusively: globals are already deployed
937 # in Phase 1, so CDK doesn't need to re-evaluate them. Skipping that
938 # re-evaluation avoids re-running custom resources (notably
939 # KubectlApplyManifests) on the global stacks every time a regional
940 # stack is deployed — that would otherwise re-apply manifests and
941 # rollout-restart controllers for no actual change.
942 if regional_stacks: 942 ↛ 994line 942 didn't jump to line 994 because the condition on line 942 was always true
943 if parallel and len(regional_stacks) > 1:
944 # Parallel deployment of regional stacks
945 successful_regional, failed_regional = self._deploy_stacks_parallel(
946 stacks=regional_stacks,
947 require_approval=require_approval,
948 outputs_file=outputs_file,
949 parameters=parameters,
950 tags=tags,
951 progress=progress,
952 on_stack_start=on_stack_start,
953 on_stack_complete=on_stack_complete,
954 max_workers=max_workers,
955 )
956 successful.extend(successful_regional)
957 failed.extend(failed_regional)
959 # Stop if any regional stack failed
960 if failed_regional:
961 return False, successful, failed
962 else:
963 # Sequential deployment
964 for stack_name in regional_stacks:
965 if on_stack_start:
966 on_stack_start(stack_name)
968 success = self.deploy(
969 stack_name=stack_name,
970 require_approval=require_approval,
971 outputs_file=outputs_file,
972 parameters=parameters,
973 tags=tags,
974 progress=progress,
975 exclusively=True,
976 )
978 if success: 978 ↛ 981line 978 didn't jump to line 981 because the condition on line 978 was always true
979 successful.append(stack_name)
980 else:
981 failed.append(stack_name)
983 if on_stack_complete:
984 on_stack_complete(stack_name, success)
986 # Stop on failure
987 if not success: 987 ↛ 988line 987 didn't jump to line 988 because the condition on line 987 was never true
988 return False, successful, failed
990 # Phase 3: Deploy post-regional stacks (monitoring) sequentially.
991 # Same rationale as Phase 2: every upstream stack is already
992 # deployed, so --exclusively prevents a redundant pass over
993 # global/api-gateway/regional.
994 for stack_name in post_regional_stacks:
995 if on_stack_start: 995 ↛ 996line 995 didn't jump to line 996 because the condition on line 995 was never true
996 on_stack_start(stack_name)
998 success = self.deploy(
999 stack_name=stack_name,
1000 require_approval=require_approval,
1001 outputs_file=outputs_file,
1002 parameters=parameters,
1003 tags=tags,
1004 progress=progress,
1005 exclusively=True,
1006 )
1008 if success: 1008 ↛ 1011line 1008 didn't jump to line 1011 because the condition on line 1008 was always true
1009 successful.append(stack_name)
1010 else:
1011 failed.append(stack_name)
1013 if on_stack_complete: 1013 ↛ 1014line 1013 didn't jump to line 1014 because the condition on line 1013 was never true
1014 on_stack_complete(stack_name, success)
1016 if not success: 1016 ↛ 1017line 1016 didn't jump to line 1017 because the condition on line 1016 was never true
1017 return False, successful, failed
1019 return len(failed) == 0, successful, failed
1021 def _deploy_stacks_parallel(
1022 self,
1023 stacks: list[str],
1024 require_approval: bool,
1025 outputs_file: str | None,
1026 parameters: dict[str, str] | None,
1027 tags: dict[str, str] | None,
1028 progress: str,
1029 on_stack_start: Callable[[str], None] | None,
1030 on_stack_complete: Callable[[str, bool], None] | None,
1031 max_workers: int,
1032 ) -> tuple[list[str], list[str]]:
1033 """Deploy multiple stacks in parallel using separate CDK output directories."""
1034 import tempfile
1036 successful: list[str] = []
1037 failed: list[str] = []
1038 lock = Lock()
1040 def deploy_single(stack_name: str) -> tuple[str, bool]:
1041 # Use a unique output directory in /tmp for each parallel deployment
1042 # This avoids CDK copying cdk.out.* directories into assets
1043 output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-")
1045 if on_stack_start: 1045 ↛ 1046line 1045 didn't jump to line 1046 because the condition on line 1045 was never true
1046 with lock:
1047 on_stack_start(stack_name)
1049 success = self.deploy(
1050 stack_name=stack_name,
1051 require_approval=require_approval,
1052 outputs_file=outputs_file,
1053 parameters=parameters,
1054 tags=tags,
1055 progress=progress,
1056 output_dir=output_dir,
1057 exclusively=True,
1058 )
1060 # Clean up the temporary output directory
1061 try:
1062 import shutil
1064 if os.path.exists(output_dir): 1064 ↛ 1069line 1064 didn't jump to line 1069 because the condition on line 1064 was always true
1065 shutil.rmtree(output_dir)
1066 except Exception as e:
1067 logger.debug("Cleanup of %s failed: %s", output_dir, e)
1069 return stack_name, success
1071 with ThreadPoolExecutor(max_workers=max_workers) as executor:
1072 futures = {executor.submit(deploy_single, stack): stack for stack in stacks}
1074 for future in as_completed(futures):
1075 stack_name, success = future.result()
1077 with lock:
1078 if success:
1079 successful.append(stack_name)
1080 else:
1081 failed.append(stack_name)
1083 if on_stack_complete: 1083 ↛ 1084line 1083 didn't jump to line 1084 because the condition on line 1083 was never true
1084 on_stack_complete(stack_name, success)
1086 return successful, failed
    def destroy_orchestrated(
        self,
        force: bool = False,
        on_stack_start: Callable[[str], None] | None = None,
        on_stack_complete: Callable[[str, bool], None] | None = None,
        parallel: bool = False,
        max_workers: int = 4,
    ) -> tuple[bool, list[str], list[str]]:
        """
        Destroy all stacks in the correct order.

        Destroys monitoring stack first, then regional stacks (optionally in parallel),
        then global stacks.

        Unlike ``deploy_orchestrated``, this does NOT stop early when a stack
        fails — destruction continues best-effort through all phases, and
        failures are only reflected in the returned tuple.

        Args:
            force: Skip confirmation prompts
            on_stack_start: Callback(stack_name) called when starting a stack
            on_stack_complete: Callback(stack_name, success) called when stack completes
            parallel: Destroy regional stacks in parallel
            max_workers: Maximum number of parallel destructions (default: 4)

        Returns:
            Tuple of (overall_success, successful_stacks, failed_stacks)
        """
        stacks = self.list_stacks()

        # Phase 0: Clean up backup vault recovery points so the global stack
        # can be deleted cleanly by CloudFormation.
        self._cleanup_backup_vault()

        # Separate stacks into three groups (reverse of deploy order):
        pre_regional = {"gco-global", "gco-api-gateway"}
        post_regional = {"gco-monitoring"}

        post_regional_stacks = [s for s in stacks if s in post_regional]
        regional_stacks = [s for s in stacks if s not in pre_regional and s not in post_regional]
        pre_regional_stacks = [s for s in stacks if s in pre_regional]

        # Sort regional stacks alphabetically (reversed for destroy)
        regional_stacks.sort(reverse=True)
        # Sort pre-regional stacks in reverse priority order
        # (ascending by mapped value: api-gateway is destroyed before global).
        pre_regional_order = {"gco-api-gateway": 1, "gco-global": 2}
        pre_regional_stacks.sort(key=lambda x: pre_regional_order.get(x, 0))

        successful: list[str] = []
        failed: list[str] = []

        # Phase 1: Destroy post-regional stacks (monitoring) first
        for stack_name in post_regional_stacks:
            if on_stack_start:
                on_stack_start(stack_name)

            success = self.destroy(
                stack_name=stack_name,
                force=force,
            )

            if success:
                successful.append(stack_name)
            else:
                failed.append(stack_name)

            if on_stack_complete:
                on_stack_complete(stack_name, success)

        # Phase 2: Destroy regional stacks (parallel or sequential)
        # Each regional destroy has a background watchdog that proactively
        # deletes EKS-managed security groups + orphaned ENIs as soon as
        # they appear. Without this, CloudFormation's VPC delete step sits
        # in DELETE_IN_PROGRESS for ~10 min waiting for EKS to GC the
        # ``eks-cluster-sg-<cluster>-*`` security group and its cluster
        # ENIs. The SG is owned by the EKS service (not CloudFormation),
        # so CFN can't delete it directly — it just polls the VPC's
        # dependencies until they drain. See ``_start_eks_sg_watchdog``.
        # Watchdogs are started for ALL regional stacks up front, before any
        # destroy runs, so parallel destroys are covered from the start.
        watchdog_stops: dict[str, Event] = {}
        watchdog_threads: dict[str, Thread] = {}
        for stack_name in regional_stacks:
            stop_event = Event()
            thread = self._start_eks_sg_watchdog(stack_name, stop_event)
            watchdog_stops[stack_name] = stop_event
            watchdog_threads[stack_name] = thread

        if regional_stacks:
            if parallel and len(regional_stacks) > 1:
                # Parallel destruction of regional stacks
                successful_regional, failed_regional = self._destroy_stacks_parallel(
                    stacks=regional_stacks,
                    force=force,
                    on_stack_start=on_stack_start,
                    on_stack_complete=on_stack_complete,
                    max_workers=max_workers,
                )
                successful.extend(successful_regional)
                failed.extend(failed_regional)
            else:
                # Sequential destruction
                for stack_name in regional_stacks:
                    if on_stack_start:
                        on_stack_start(stack_name)

                    success = self.destroy(
                        stack_name=stack_name,
                        force=force,
                    )

                    if success:
                        successful.append(stack_name)
                    else:
                        failed.append(stack_name)

                    if on_stack_complete:
                        on_stack_complete(stack_name, success)

        # Stop all watchdogs and do one final cleanup pass per stack to
        # catch anything EKS re-created during the destroy flow.
        # Watchdog threads are daemons, so a hung join(timeout=5) cannot
        # block process exit.
        for stack_name in regional_stacks:
            watchdog_stops[stack_name].set()
            watchdog_threads[stack_name].join(timeout=5)
            self._cleanup_eks_security_groups(stack_name)

        # Phase 3: Destroy pre-regional global stacks last
        for stack_name in pre_regional_stacks:
            if on_stack_start:
                on_stack_start(stack_name)

            success = self.destroy(
                stack_name=stack_name,
                force=force,
            )

            if success:
                successful.append(stack_name)
            else:
                failed.append(stack_name)

            if on_stack_complete:
                on_stack_complete(stack_name, success)

        return len(failed) == 0, successful, failed
1228 def _destroy_stacks_parallel(
1229 self,
1230 stacks: list[str],
1231 force: bool,
1232 on_stack_start: Callable[[str], None] | None,
1233 on_stack_complete: Callable[[str, bool], None] | None,
1234 max_workers: int,
1235 ) -> tuple[list[str], list[str]]:
1236 """Destroy multiple stacks in parallel using separate CDK output directories."""
1237 import tempfile
1239 successful: list[str] = []
1240 failed: list[str] = []
1241 lock = Lock()
1243 def destroy_single(stack_name: str) -> tuple[str, bool]:
1244 # Use a unique output directory in /tmp for each parallel destruction
1245 output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-")
1247 if on_stack_start: 1247 ↛ 1248line 1247 didn't jump to line 1248 because the condition on line 1247 was never true
1248 with lock:
1249 on_stack_start(stack_name)
1251 success = self.destroy(
1252 stack_name=stack_name,
1253 force=force,
1254 output_dir=output_dir,
1255 )
1257 # Clean up the temporary output directory
1258 try:
1259 import shutil
1261 if os.path.exists(output_dir): 1261 ↛ 1266line 1261 didn't jump to line 1266 because the condition on line 1261 was always true
1262 shutil.rmtree(output_dir)
1263 except Exception as e:
1264 logger.debug("Cleanup of %s failed: %s", output_dir, e)
1266 return stack_name, success
1268 with ThreadPoolExecutor(max_workers=max_workers) as executor:
1269 futures = {executor.submit(destroy_single, stack): stack for stack in stacks}
1271 for future in as_completed(futures):
1272 stack_name, success = future.result()
1274 with lock:
1275 if success:
1276 successful.append(stack_name)
1277 else:
1278 failed.append(stack_name)
1280 if on_stack_complete: 1280 ↛ 1281line 1280 didn't jump to line 1281 because the condition on line 1280 was never true
1281 on_stack_complete(stack_name, success)
1283 return successful, failed
1285 def _cleanup_backup_vault(self) -> None:
1286 """Delete all recovery points from the GCO backup vault.
1288 This is called before destroy-all so CloudFormation can delete the
1289 backup vault cleanly. Without this, the vault deletion fails because
1290 it contains recovery points.
1291 """
1292 import boto3
1294 global_region = self.config.global_region
1295 project_name = self.config.project_name
1297 try:
1298 backup_client = boto3.client("backup", region_name=global_region)
1300 # Find the backup vault by listing vaults with the project prefix
1301 paginator = backup_client.get_paginator("list_backup_vaults")
1302 vault_name = None
1303 for page in paginator.paginate():
1304 for vault in page.get("BackupVaultList", []):
1305 if project_name in vault["BackupVaultName"].lower():
1306 vault_name = vault["BackupVaultName"]
1307 break
1308 if vault_name:
1309 break
1311 if not vault_name:
1312 return
1314 # Delete all recovery points in the vault
1315 rp_paginator = backup_client.get_paginator("list_recovery_points_by_backup_vault")
1316 deleted = 0
1317 for page in rp_paginator.paginate(BackupVaultName=vault_name):
1318 for rp in page.get("RecoveryPoints", []):
1319 try:
1320 backup_client.delete_recovery_point(
1321 BackupVaultName=vault_name,
1322 RecoveryPointArn=rp["RecoveryPointArn"],
1323 )
1324 deleted += 1
1325 except Exception as e:
1326 logger.debug("Failed to delete recovery point: %s", e)
1328 if deleted > 0:
1329 print(f" Cleaned up {deleted} backup recovery points from {vault_name}")
1331 except Exception as e:
1332 print(f" Warning: Backup vault cleanup failed (non-fatal): {e}")
1334 def cleanup_eks_security_groups(self) -> None:
1335 """Clean up EKS-managed security groups across all regional stacks.
1337 Called between destroy retries to remove orphaned security groups
1338 that block VPC deletion.
1339 """
1340 stacks = self.list_stacks()
1341 pre_regional = {"gco-global", "gco-api-gateway", "gco-monitoring"}
1342 regional_stacks = [s for s in stacks if s not in pre_regional]
1343 for stack_name in regional_stacks:
1344 self._cleanup_eks_security_groups(stack_name)
    def _cleanup_eks_security_groups(self, stack_name: str) -> None:
        """Clean up EKS-managed security groups that block VPC deletion.

        EKS creates a cluster security group (eks-cluster-sg-<cluster-name>-*)
        that is owned by EKS, not CloudFormation. When the stack is destroyed,
        this SG and its attached ENIs can linger, causing VPC deletion to fail.

        This method finds and deletes these orphaned security groups and their
        ENIs before the stack destroy runs. Every failure is swallowed and
        logged at debug level — the destroy retry loop and the SG watchdog
        simply try again later.

        Args:
            stack_name: Regional stack name, e.g. ``gco-us-east-1``. The
                region is derived by stripping the ``<project_name>-`` prefix,
                and the EKS cluster name is assumed to equal the stack name.
        """
        import time as _time

        import boto3

        # Extract region from stack name (e.g., gco-us-east-1 -> us-east-1)
        project_name = self.config.project_name
        region = stack_name.replace(f"{project_name}-", "", 1)
        cluster_name = stack_name  # cluster name matches stack name

        try:
            ec2 = boto3.client("ec2", region_name=region)

            # Find EKS-managed security groups by name pattern
            response = ec2.describe_security_groups(
                Filters=[
                    {
                        "Name": "group-name",
                        "Values": [f"eks-cluster-sg-{cluster_name}-*"],
                    }
                ]
            )

            sgs = response.get("SecurityGroups", [])
            if not sgs:
                # Nothing to clean up — the common case once EKS has GC'd.
                return

            for sg in sgs:
                sg_id = sg["GroupId"]
                sg_name = sg.get("GroupName", "")

                # First, detach and delete any ENIs using this SG
                eni_response = ec2.describe_network_interfaces(
                    Filters=[{"Name": "group-id", "Values": [sg_id]}]
                )

                for eni in eni_response.get("NetworkInterfaces", []):
                    eni_id = eni["NetworkInterfaceId"]
                    try:
                        if eni.get("Attachment"):
                            ec2.detach_network_interface(
                                AttachmentId=eni["Attachment"]["AttachmentId"],
                                Force=True,
                            )
                            # Fixed pause — presumably to let the forced
                            # detach settle before the delete call; confirm
                            # whether a waiter would be more reliable.
                            _time.sleep(5)
                        ec2.delete_network_interface(NetworkInterfaceId=eni_id)
                        logger.debug("Deleted ENI %s from SG %s", eni_id, sg_name)
                    except Exception as e:
                        logger.debug("Failed to delete ENI %s: %s", eni_id, e)

                # Now delete the security group
                try:
                    ec2.delete_security_group(GroupId=sg_id)
                    print(f" Cleaned up EKS security group: {sg_name} ({sg_id})")
                except Exception as e:
                    logger.debug(
                        "Failed to delete SG %s: %s (will retry on next attempt)", sg_id, e
                    )

        except Exception as e:
            # Non-fatal — the retry loop will handle it
            logger.debug("EKS security group cleanup for %s failed: %s", stack_name, e)
1418 def _start_eks_sg_watchdog(self, stack_name: str, stop_event: Event) -> Thread:
1419 """Start a background thread that polls for orphaned EKS security groups.
1421 EKS creates an ``eks-cluster-sg-<cluster-name>-*`` security group that
1422 is owned by the EKS service (not CloudFormation). When the stack's
1423 EKS cluster resource deletes, the SG is supposed to GC along with its
1424 cluster ENIs — but on EKS Auto Mode there's a window where the SG
1425 lingers after the cluster is gone, blocking the subsequent VPC
1426 delete with ``DependencyViolation``. CloudFormation then sits in
1427 ``DELETE_IN_PROGRESS`` on the VPC for ~10 minutes retrying.
1429 This watchdog runs for the full duration of the regional-stack
1430 destroy, polling every 30 seconds. As soon as an orphaned SG appears
1431 (which only happens after the cluster delete has progressed past
1432 the cluster resource itself), it deletes the SG and any ENIs still
1433 attached. That unblocks the VPC delete immediately instead of
1434 waiting for EKS's own GC timer.
1436 The thread exits when ``stop_event`` is set by the orchestrator at
1437 the end of the regional phase.
1438 """
1440 def _watchdog() -> None:
1441 while not stop_event.is_set():
1442 try:
1443 self._cleanup_eks_security_groups(stack_name)
1444 except Exception as e:
1445 logger.debug(
1446 "EKS SG watchdog tick for %s failed (non-fatal): %s",
1447 stack_name,
1448 e,
1449 )
1450 # ``wait`` returns immediately when the event is set, so this
1451 # doubles as the sleep-and-shutdown-check in one call.
1452 stop_event.wait(timeout=30)
1454 thread = Thread(
1455 target=_watchdog,
1456 name=f"eks-sg-watchdog-{stack_name}",
1457 daemon=True,
1458 )
1459 thread.start()
1460 return thread
def get_stack_manager(config: GCOConfig) -> StackManager:
    """Factory function to get a StackManager instance.

    Args:
        config: GCO configuration the manager will operate on.

    Returns:
        A new StackManager bound to ``config``.
    """
    return StackManager(config)
def get_stack_deployment_order(stacks: list[str]) -> list[str]:
    """
    Get the correct deployment order for stacks.

    Global stacks come first in a fixed priority order
    (gco-global, gco-api-gateway, gco-monitoring), followed by the
    regional stacks (gco-{region}, e.g. gco-us-east-1) sorted
    alphabetically.
    """
    # Fixed priority for the global stacks (lower value deploys first).
    global_priority = {
        "gco-global": 1,
        "gco-api-gateway": 2,
        "gco-monitoring": 3,
    }

    globals_in_order = sorted(
        (s for s in stacks if s in global_priority),
        key=global_priority.__getitem__,
    )
    regionals_in_order = sorted(s for s in stacks if s not in global_priority)

    return globals_in_order + regionals_in_order
def get_stack_destroy_order(stacks: list[str]) -> list[str]:
    """
    Get the correct destroy order for stacks.

    Exactly the deployment order reversed: regional stacks first, then the
    global stacks last.
    """
    return get_stack_deployment_order(stacks)[::-1]
1509# =============================================================================
1510# Feature toggle helpers
1511# =============================================================================
# Default FSx for Lustre settings used when cdk.json has no "fsx_lustre"
# context key. See get_fsx_config()/update_fsx_config().
_FSX_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature toggle — FSx is off unless explicitly enabled
    "storage_capacity_gib": 1200,
    "deployment_type": "SCRATCH_2",
    "per_unit_storage_throughput": 200,  # presumably MB/s/TiB — confirm against FSx API
    "data_compression_type": "LZ4",
    "import_path": None,  # optional S3 import path (None = not configured)
    "export_path": None,  # optional S3 export path (None = not configured)
    "auto_import_policy": "NEW_CHANGED_DELETED",
}
1525def _find_cdk_json() -> Path | None:
1526 """Find cdk.json in current or parent directories."""
1527 current = Path.cwd()
1528 for parent in [current] + list(current.parents):
1529 cdk_path = parent / "cdk.json"
1530 if cdk_path.exists():
1531 return cdk_path
1532 return None
def get_fsx_config(region: str | None = None) -> dict[str, Any]:
    """Get current FSx for Lustre configuration from cdk.json.

    Args:
        region: Optional region to get config for. If provided, checks for
            region-specific overrides first.

    Returns:
        FSx configuration dictionary (defaults merged with cdk.json values).

    Raises:
        RuntimeError: If no cdk.json can be found from the current directory.
    """
    return _get_feature_config("fsx_lustre", _FSX_DEFAULTS, region)
def update_fsx_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Update FSx for Lustre configuration in cdk.json.

    Args:
        settings: FSx settings to update (None values are skipped except
            for the "enabled" toggle).
        region: Optional region for region-specific config. If None, updates global config.

    Raises:
        RuntimeError: If no cdk.json can be found from the current directory.
    """
    _update_feature_config("fsx_lustre", settings, _FSX_DEFAULTS, region)
1558# =============================================================================
1559# Generic feature toggle helpers (used by FSx, Valkey, Aurora, and future features)
1560# =============================================================================
def _get_feature_config(
    feature_key: str,
    default_config: dict[str, Any],
    region: str | None = None,
) -> dict[str, Any]:
    """Get configuration for a toggleable feature from cdk.json.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        default_config: Default configuration values when the key is missing.
        region: Optional region for region-specific overrides (looked up
            under the "<feature_key>_regions" context key).

    Returns:
        Merged configuration dictionary. When a region override applies, the
        result carries ``region`` and ``is_region_specific=True``; otherwise
        ``is_region_specific`` is False.

    Raises:
        RuntimeError: If no cdk.json can be found from the current directory.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        cdk_config = json.load(f)

    context = cdk_config.get("context", {})
    global_config = context.get(feature_key, default_config)

    if region:
        region_overrides = context.get(f"{feature_key}_regions", {})
        if region in region_overrides:
            # Layer defaults < global config < region override. The defaults
            # layer was previously missing on this branch, so a partial
            # global config plus a region override silently dropped default
            # keys that the non-regional path below would have filled in.
            merged = {**default_config, **global_config, **region_overrides[region]}
            merged["region"] = region
            merged["is_region_specific"] = True
            return merged

    result = {**default_config, **global_config}
    result["is_region_specific"] = False
    return result
def _update_feature_config(
    feature_key: str,
    settings: dict[str, Any],
    default_config: dict[str, Any],
    region: str | None = None,
) -> None:
    """Update configuration for a toggleable feature in cdk.json.

    Keys whose value is None are skipped so unset options don't clobber
    existing values — except "enabled", which is always written because it
    is the feature toggle itself.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        settings: Settings to update.
        default_config: Default configuration values when the key is missing.
        region: Optional region for region-specific config (written under the
            "<feature_key>_regions" context key).

    Raises:
        RuntimeError: If no cdk.json can be found from the current directory.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        cdk_config = json.load(f)

    context = cdk_config.setdefault("context", {})

    # Pick (creating as needed) the dict the settings land in: either the
    # per-region override table or the global feature config.
    if region:
        overrides = context.setdefault(f"{feature_key}_regions", {})
        target = overrides.setdefault(region, {})
    else:
        if feature_key not in context:
            target = context[feature_key] = {**default_config}
        else:
            target = context[feature_key]

    for key, value in settings.items():
        if value is not None or key == "enabled":
            target[key] = value

    with open(cdk_json_path, "w", encoding="utf-8") as f:
        json.dump(cdk_config, f, indent=2)
1649# =============================================================================
1650# Valkey configuration
1651# =============================================================================
# Default Valkey Serverless settings used when cdk.json has no "valkey"
# context key. See get_valkey_config()/update_valkey_config().
_VALKEY_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature toggle — Valkey is off unless explicitly enabled
    "max_data_storage_gb": 5,
    "max_ecpu_per_second": 5000,
    "snapshot_retention_limit": 1,
}
def get_valkey_config(region: str | None = None) -> dict[str, Any]:
    """Get current Valkey Serverless configuration from cdk.json.

    Args:
        region: Optional region; when given, overrides from the
            "valkey_regions" context key take precedence.

    Returns:
        Merged Valkey configuration dictionary.

    Raises:
        RuntimeError: If no cdk.json can be found from the current directory.
    """
    return _get_feature_config("valkey", _VALKEY_DEFAULTS, region)
def update_valkey_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Update Valkey Serverless configuration in cdk.json.

    Args:
        settings: Valkey settings to update (None values are skipped except
            for the "enabled" toggle).
        region: Optional region for region-specific config; None updates the
            global config.

    Raises:
        RuntimeError: If no cdk.json can be found from the current directory.
    """
    _update_feature_config("valkey", settings, _VALKEY_DEFAULTS, region)
1671# =============================================================================
1672# Aurora pgvector configuration
1673# =============================================================================
# Default Aurora pgvector settings used when cdk.json has no
# "aurora_pgvector" context key. See get_aurora_config()/update_aurora_config().
_AURORA_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature toggle — Aurora is off unless explicitly enabled
    "min_acu": 0,  # serverless capacity floor — presumably allows scale-to-zero; confirm
    "max_acu": 16,
    "backup_retention_days": 7,
    "deletion_protection": False,
}
def get_aurora_config(region: str | None = None) -> dict[str, Any]:
    """Get current Aurora pgvector configuration from cdk.json.

    Args:
        region: Optional region; when given, overrides from the
            "aurora_pgvector_regions" context key take precedence.

    Returns:
        Merged Aurora configuration dictionary.

    Raises:
        RuntimeError: If no cdk.json can be found from the current directory.
    """
    return _get_feature_config("aurora_pgvector", _AURORA_DEFAULTS, region)
def update_aurora_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Update Aurora pgvector configuration in cdk.json.

    Args:
        settings: Aurora settings to update (None values are skipped except
            for the "enabled" toggle).
        region: Optional region for region-specific config; None updates the
            global config.

    Raises:
        RuntimeError: If no cdk.json can be found from the current directory.
    """
    _update_feature_config("aurora_pgvector", settings, _AURORA_DEFAULTS, region)