Coverage for cli / stacks.py: 88%

710 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 21:47 +0000

1""" 

2Stack management for GCO CLI. 

3 

4Provides commands for deploying, updating, and managing CDK stacks. 

5This is the largest CLI module (~1600 lines) because it orchestrates the 

6full deployment lifecycle including container runtime detection, CDK 

7bootstrapping, Lambda source synchronization, and parallel regional deploys. 

8 

9This module handles: 

10 - Container runtime detection (Docker, Finch, Podman) with automatic fallback 

11 - CDK bootstrap across all target regions (idempotent) 

12 - Lambda source synchronization (copies handler code + dependencies before synth) 

13 - CDK stack deployment with proper dependency ordering: 

14 1. Global stack (Global Accelerator, DynamoDB) 

15 2. API Gateway stack (auth secret, Lambda proxy) 

16 3. Regional stacks in parallel (EKS, VPC, ALB per region) 

17 4. Monitoring stack (CloudWatch dashboards, alarms) 

18 - Parallel deployment of regional stacks via ThreadPoolExecutor 

19 - Stack destruction in reverse dependency order 

20 - FSx for Lustre enable/disable toggle 

21 - kubectl access configuration (EKS access entries + kubeconfig) 

22 

23Key Design Decisions: 

24 - Regional stacks deploy in parallel for speed; global/API/monitoring are sequential 

25 - Lambda build directories are synced before every deploy to avoid stale code 

26 - Container runtime is auto-detected; CDK_DOCKER env var overrides 

27 - All destructive operations require -y/--yes confirmation 

28 - Stack status is read from CloudFormation, not cached locally 

29 

30Environment Variables: 

31 CDK_DOCKER: Override container runtime (default: auto-detect Docker/Finch/Podman) 

32 AWS_REGION: Default region for single-region operations 

33""" 

34 

35from __future__ import annotations 

36 

37import logging 

38import os 

39import shutil 

40import site 

41import subprocess 

42from collections.abc import Callable 

43from concurrent.futures import ThreadPoolExecutor, as_completed 

44from dataclasses import dataclass, field 

45from datetime import datetime 

46from pathlib import Path 

47from threading import Event, Lock, Thread 

48from typing import TYPE_CHECKING, Any 

49 

50from botocore.exceptions import ClientError 

51 

52if TYPE_CHECKING: 

53 from .config import GCOConfig 

54 

55logger = logging.getLogger(__name__) 

56 

57 

@dataclass
class StackInfo:
    """Information about a CDK stack."""

    name: str
    status: str
    region: str
    created_time: datetime | None = None
    updated_time: datetime | None = None
    outputs: dict[str, str] = field(default_factory=dict)
    tags: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict (timestamps as ISO-8601 or None)."""

        def _iso(ts: datetime | None) -> str | None:
            # Datetimes aren't JSON-serializable; render as ISO-8601 strings.
            return None if ts is None else ts.isoformat()

        return {
            "name": self.name,
            "status": self.status,
            "region": self.region,
            "created_time": _iso(self.created_time),
            "updated_time": _iso(self.updated_time),
            "outputs": self.outputs,
            "tags": self.tags,
        }

80 

81 

82def _safe_rmtree(path: Path) -> None: 

83 """Remove a directory tree, handling broken symlinks on macOS. 

84 

85 shutil.rmtree can fail with ``OSError: [Errno 66] Directory not empty`` 

86 on macOS when pip-installed packages (e.g. botocore) contain broken 

87 symlinks or extended-attribute resource forks. 

88 

89 Falls back to ``rm -rf`` via subprocess, but only after validating the 

90 path is a real directory under the project tree to avoid accidents. 

91 """ 

92 resolved = path.resolve() 

93 

94 # Safety: refuse to remove anything that isn't clearly a build artifact 

95 # inside the project. The path must contain "lambda" and end with "-build". 

96 if "lambda" not in resolved.parts or not resolved.name.endswith("-build"): 

97 raise ValueError(f"Refusing to remove unexpected path: {resolved}") 

98 

99 try: 

100 shutil.rmtree(str(resolved)) 

101 except OSError: 

102 subprocess.run(["rm", "-rf", "--", str(resolved)], check=True) 

103 

104 

# Module-level memo for container runtime detection. Two variables are
# needed because a cached value of None is itself meaningful ("no runtime
# found"), so a separate flag records whether detection has run at all.
_container_runtime_cache: str | None = None
_container_runtime_checked: bool = False

108 

109 

def _detect_container_runtime() -> str | None:
    """
    Detect available container runtime for CDK asset bundling.

    CDK requires a container runtime to build Lambda function assets.
    This memoizing wrapper delegates to
    :func:`_detect_container_runtime_uncached` (which probes runtimes via
    subprocess) at most once per process, storing the result in module
    globals.

    Priority order: docker > finch > podman

    Returns:
        Runtime name ('docker', 'finch', or 'podman') if found and running,
        None if no runtime is available.

    Note:
        If CDK_DOCKER environment variable is already set, that value
        is returned without checking if the runtime is available.
    """
    global _container_runtime_cache, _container_runtime_checked
    if not _container_runtime_checked:
        # Probe first, then flip the flag, so a raising probe is retried
        # on the next call rather than caching a bogus result.
        _container_runtime_cache = _detect_container_runtime_uncached()
        _container_runtime_checked = True
    return _container_runtime_cache

135 

136 

137def _detect_container_runtime_uncached() -> str | None: 

138 """Uncached implementation of container runtime detection.""" 

139 # Check if CDK_DOCKER is already set 

140 if os.environ.get("CDK_DOCKER"): 

141 return os.environ["CDK_DOCKER"] 

142 

143 # Try docker first 

144 if shutil.which("docker"): 

145 # Verify docker is actually running 

146 try: 

147 result = subprocess.run( 

148 ["docker", "info"], 

149 capture_output=True, 

150 timeout=5, 

151 ) 

152 if result.returncode == 0: 

153 return "docker" 

154 except subprocess.TimeoutExpired, Exception: 

155 pass 

156 

157 # Try finch as fallback 

158 if shutil.which("finch"): 

159 try: 

160 result = subprocess.run( 

161 ["finch", "info"], 

162 capture_output=True, 

163 timeout=5, 

164 ) 

165 if result.returncode == 0: 165 ↛ 171line 165 didn't jump to line 171 because the condition on line 165 was always true

166 return "finch" 

167 except subprocess.TimeoutExpired, Exception: 

168 pass 

169 

170 # Try podman as last resort 

171 if shutil.which("podman"): 

172 try: 

173 result = subprocess.run( 

174 ["podman", "info"], 

175 capture_output=True, 

176 timeout=5, 

177 ) 

178 if result.returncode == 0: 178 ↛ 183line 178 didn't jump to line 183 because the condition on line 178 was always true

179 return "podman" 

180 except subprocess.TimeoutExpired, Exception: 

181 pass 

182 

183 return None 

184 

185 

186class StackManager: 

187 """Manages CDK stack operations.""" 

188 

189 def __init__(self, config: GCOConfig, project_root: Path | None = None): 

190 self.config = config 

191 self.project_root = project_root or self._find_project_root() 

192 self._cdk_path = self._find_cdk() 

193 

194 # Ensure Lambda build directory exists before any CDK synth 

195 self._ensure_lambda_build() 

196 

197 def _find_project_root(self) -> Path: 

198 """Find the project root by looking for cdk.json.""" 

199 current = Path.cwd() 

200 for parent in [current] + list(current.parents): 

201 if (parent / "cdk.json").exists(): 

202 return parent 

203 return current 

204 

205 def _find_cdk(self) -> str: 

206 """Find CDK executable.""" 

207 # Check if cdk is in PATH 

208 try: 

209 result = subprocess.run(["which", "cdk"], capture_output=True, text=True, check=True) 

210 return result.stdout.strip() 

211 except subprocess.CalledProcessError: 

212 pass 

213 

214 # Check common locations 

215 for path in ["/usr/local/bin/cdk", "~/.npm-global/bin/cdk"]: 

216 expanded = os.path.expanduser(path) 

217 if os.path.exists(expanded): 

218 return expanded 

219 

220 # Fall back to npx 

221 return "npx cdk" 

222 

223 def _ensure_lambda_build(self) -> None: 

224 """Ensure the Lambda build directories exist for CDK synthesis. 

225 

226 Called from __init__ so the directory is ready before any CDK synth 

227 (even ``cdk list`` needs it because ``app.py`` references the asset). 

228 

229 This does a lightweight check — only builds if the directory is 

230 missing or incomplete. It does NOT do a full rebuild (no rmtree). 

231 The full rebuild happens in ``_rebuild_lambda_packages()``, which 

232 is called by ``deploy()`` before the actual CDK deploy. 

233 """ 

234 kubectl_source = self.project_root / "lambda" / "kubectl-applier-simple" 

235 kubectl_build = self.project_root / "lambda" / "kubectl-applier-simple-build" 

236 helm_source = self.project_root / "lambda" / "helm-installer" 

237 helm_build = self.project_root / "lambda" / "helm-installer-build" 

238 

239 # Only build if the directory is missing or deps aren't installed 

240 if kubectl_source.exists() and ( 

241 not kubectl_build.exists() or not (kubectl_build / "yaml").exists() 

242 ): 

243 self._build_kubectl_lambda() 

244 

245 if helm_source.exists() and not helm_build.exists(): 245 ↛ 246line 245 didn't jump to line 246 because the condition on line 245 was never true

246 self._build_helm_installer_lambda() 

247 

248 def _check_and_fix_stuck_stack(self, stack_name: str) -> None: 

249 """Check if a stack is in a stuck state and auto-recover. 

250 

251 Stacks can get stuck in REVIEW_IN_PROGRESS, ROLLBACK_COMPLETE, or 

252 other non-deployable states. This detects those and cleans up so 

253 the next deploy can succeed. 

254 """ 

255 import boto3 

256 

257 region = self._get_deploy_region(stack_name) 

258 if not region: 

259 return 

260 

261 try: 

262 cfn = boto3.client("cloudformation", region_name=region) 

263 response = cfn.describe_stacks(StackName=stack_name) 

264 status = response["Stacks"][0]["StackStatus"] 

265 

266 stuck_states = { 

267 "REVIEW_IN_PROGRESS", 

268 "ROLLBACK_COMPLETE", 

269 "ROLLBACK_FAILED", 

270 "CREATE_FAILED", 

271 "DELETE_FAILED", 

272 } 

273 

274 if status in stuck_states: 

275 print(f" Stack {stack_name} is in {status} state, cleaning up...") 

276 cfn.delete_stack(StackName=stack_name) 

277 waiter = cfn.get_waiter("stack_delete_complete") 

278 waiter.wait(StackName=stack_name, WaiterConfig={"Delay": 10, "MaxAttempts": 60}) 

279 print(f" Stack {stack_name} cleaned up, will recreate on deploy") 

280 

281 except Exception as e: 

282 logger.debug("Stack pre-check for %s: %s", stack_name, e) 

283 # Stack doesn't exist or can't be described — fine, deploy will create it 

284 

    def _diagnose_deploy_failure(self, stack_name: str) -> None:
        """Fetch CloudFormation events after a failed deploy and print diagnostics.

        Gives users actionable information instead of just the CDK error message.
        Best effort: every error raised in here is swallowed (logged at debug
        level only) so diagnostics can never make a failed deploy worse.
        """
        import boto3

        region = self._get_deploy_region(stack_name)
        if not region:
            # Unrecognized stack name — nowhere to look for events.
            return

        try:
            cfn = boto3.client("cloudformation", region_name=region)

            # Get recent events
            response = cfn.describe_stack_events(StackName=stack_name)
            events = response.get("StackEvents", [])

            # Filter to failed events. Events come back newest-first, so the
            # 20 most recent cover the failing update.
            failed = [
                e
                for e in events[:20]
                if "FAILED" in e.get("ResourceStatus", "")
                or "ROLLBACK" in e.get("ResourceStatus", "")
            ]

            if failed:
                print(f"\n CloudFormation failure details for {stack_name}:")
                # Cap at 5 failures to keep the output readable.
                for event in failed[:5]:
                    resource = event.get("LogicalResourceId", "unknown")
                    status = event.get("ResourceStatus", "unknown")
                    reason = event.get("ResourceStatusReason", "no reason given")
                    print(f" {resource}: {status}")
                    print(f" {reason}")

            # Check stack status for actionable advice
            try:
                stack_resp = cfn.describe_stacks(StackName=stack_name)
                status = stack_resp["Stacks"][0]["StackStatus"]

                # Map known stuck/rolled-back states to a copy-pastable fix.
                advice = {
                    "REVIEW_IN_PROGRESS": (
                        "Stack is stuck in REVIEW_IN_PROGRESS. "
                        "Run: aws cloudformation delete-stack "
                        f"--stack-name {stack_name} --region {region}"
                    ),
                    "ROLLBACK_COMPLETE": (
                        "Stack rolled back. Delete it and retry: "
                        f"aws cloudformation delete-stack "
                        f"--stack-name {stack_name} --region {region}"
                    ),
                    "ROLLBACK_FAILED": (
                        "Stack rollback failed. Delete with --retain: "
                        f"aws cloudformation delete-stack "
                        f"--stack-name {stack_name} --region {region}"
                    ),
                    "UPDATE_ROLLBACK_COMPLETE": (
                        "Update rolled back but stack is stable. "
                        "Check the events above and retry the deploy."
                    ),
                }

                if status in advice:
                    print(f"\n Suggested fix: {advice[status]}")

            except Exception as e:
                logger.debug("Failed to parse stack events: %s", e)

        except Exception as e:
            logger.debug("Failed to diagnose deploy failure for %s: %s", stack_name, e)
            # Best effort — don't fail the deploy further

356 

357 def _sync_lambda_sources(self) -> None: 

358 """ 

359 Sync latest handler code and manifests into the build directory. 

360 

361 Called at the start of ``deploy()`` to ensure CDK picks up the 

362 latest source changes. Only copies files — does NOT rebuild pip 

363 deps or rmtree. Runs once per StackManager instance. 

364 """ 

365 if getattr(self, "_lambda_sources_synced", False): 365 ↛ 366line 365 didn't jump to line 366 because the condition on line 365 was never true

366 return 

367 self._lambda_sources_synced = True 

368 

369 source_dir = self.project_root / "lambda" / "kubectl-applier-simple" 

370 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build" 

371 

372 if not source_dir.exists() or not build_dir.exists(): 

373 return 

374 

375 # Sync handler.py 

376 source_handler = source_dir / "handler.py" 

377 build_handler = build_dir / "handler.py" 

378 if source_handler.exists(): 378 ↛ 382line 378 didn't jump to line 382 because the condition on line 378 was always true

379 shutil.copy2(source_handler, build_handler) 

380 

381 # Sync manifests directory 

382 source_manifests = source_dir / "manifests" 

383 build_manifests = build_dir / "manifests" 

384 if source_manifests.exists(): 384 ↛ exitline 384 didn't return from function '_sync_lambda_sources' because the condition on line 384 was always true

385 build_manifests.mkdir(parents=True, exist_ok=True) 

386 for manifest_file in source_manifests.glob("*.yaml"): 

387 shutil.copy2(manifest_file, build_manifests / manifest_file.name) 

388 

389 def _rebuild_lambda_packages(self) -> None: 

390 """Full rebuild of Lambda packages for deploy. 

391 

392 Nukes the build directories and recreates them from scratch with 

393 fresh pip deps, handler code, and manifests. Called once at the 

394 start of a deploy to ensure CDK picks up all changes. 

395 

396 Runs once per StackManager instance. 

397 """ 

398 if getattr(self, "_lambda_packages_rebuilt", False): 398 ↛ 399line 398 didn't jump to line 399 because the condition on line 398 was never true

399 return 

400 self._lambda_packages_rebuilt = True 

401 self._build_lambda_packages() 

402 

    def _build_lambda_packages(self) -> None:
        """Build Lambda packages for kubectl-applier and helm-installer.

        Creates build directories with fresh copies of handler code, manifests,
        charts config, and pip dependencies. This ensures CDK always picks up
        the latest content regardless of Docker/asset caching.
        """
        # Each builder is a no-op when its source directory is absent.
        self._build_kubectl_lambda()
        self._build_helm_installer_lambda()

412 

413 def _build_kubectl_lambda(self) -> None: 

414 """Build the kubectl-applier-simple Lambda package.""" 

415 source_dir = self.project_root / "lambda" / "kubectl-applier-simple" 

416 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build" 

417 requirements = source_dir / "requirements.txt" 

418 

419 if not source_dir.exists() or not requirements.exists(): 

420 return 

421 

422 print(" Building kubectl-applier-simple Lambda package...") 

423 

424 # Clean stale build directory to avoid broken symlinks from previous 

425 # pip installs (botocore/data/ is a common source of dangling symlinks 

426 # that cause CDK's asset fingerprinting to fail with ENOENT). 

427 if build_dir.exists(): 

428 _safe_rmtree(build_dir) 

429 

430 # Create build directory 

431 build_dir.mkdir(parents=True, exist_ok=True) 

432 

433 # Copy handler and manifests (always overwrite to prevent stale content) 

434 shutil.copy2(source_dir / "handler.py", build_dir / "handler.py") 

435 build_manifests = build_dir / "manifests" 

436 if (source_dir / "manifests").exists(): 436 ↛ 442line 436 didn't jump to line 442 because the condition on line 436 was always true

437 if build_manifests.exists(): 437 ↛ 438line 437 didn't jump to line 438 because the condition on line 437 was never true

438 shutil.rmtree(build_manifests) 

439 shutil.copytree(source_dir / "manifests", build_manifests, dirs_exist_ok=True) 

440 

441 # Install pip dependencies for Lambda runtime 

442 import sys 

443 

444 result = subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - static list: sys.executable + pip args + Path objects, no user input 

445 [ 

446 sys.executable, 

447 "-m", 

448 "pip", 

449 "install", 

450 "-r", 

451 str(requirements), 

452 "-t", 

453 str(build_dir), 

454 "--upgrade", 

455 "--platform", 

456 "manylinux2014_x86_64", 

457 "--only-binary=:all:", 

458 "--quiet", 

459 ], 

460 capture_output=True, 

461 text=True, 

462 ) 

463 if result.returncode != 0: 463 ↛ 464line 463 didn't jump to line 464 because the condition on line 463 was never true

464 print(f" Warning: pip install failed: {result.stderr[:200]}") 

465 else: 

466 print(" Lambda package built successfully") 

467 

468 def _build_helm_installer_lambda(self) -> None: 

469 """Build the helm-installer Lambda Docker context. 

470 

471 Copies all source files into a clean build directory so CDK always 

472 detects changes to charts.yaml, handler.py, Dockerfile, etc. 

473 """ 

474 source_dir = self.project_root / "lambda" / "helm-installer" 

475 build_dir = self.project_root / "lambda" / "helm-installer-build" 

476 

477 if not source_dir.exists(): 

478 return 

479 

480 print(" Building helm-installer Lambda package...") 

481 

482 # Clean and recreate build directory 

483 if build_dir.exists(): 

484 _safe_rmtree(build_dir) 

485 build_dir.mkdir(parents=True, exist_ok=True) 

486 

487 # Copy all source files into the build directory 

488 for item in source_dir.iterdir(): 

489 if item.name == "__pycache__": 

490 continue 

491 if item.is_file(): 491 ↛ 493line 491 didn't jump to line 493 because the condition on line 491 was always true

492 shutil.copy2(item, build_dir / item.name) 

493 elif item.is_dir(): 

494 shutil.copytree(item, build_dir / item.name, dirs_exist_ok=True) 

495 

496 print(" Helm installer package built successfully") 

497 

498 def _get_python_path(self) -> str: 

499 """ 

500 Get PYTHONPATH that includes the current Python's site-packages. 

501 

502 This is critical for pipx installations where CDK runs `python3 app.py` 

503 using the system Python, which doesn't have aws_cdk installed. 

504 By setting PYTHONPATH, we ensure CDK's subprocess can find our modules. 

505 """ 

506 # Get all site-packages directories from the current Python 

507 site_packages = site.getsitepackages() 

508 

509 # Also include user site-packages if available 

510 user_site = site.getusersitepackages() 

511 if user_site and os.path.isdir(user_site): 511 ↛ 512line 511 didn't jump to line 512 because the condition on line 511 was never true

512 site_packages.append(user_site) 

513 

514 # Include the directory containing the current module (for editable installs) 

515 current_module_dir = Path(__file__).parent.parent 

516 if current_module_dir.exists(): 516 ↛ 520line 516 didn't jump to line 520 because the condition on line 516 was always true

517 site_packages.append(str(current_module_dir)) 

518 

519 # Combine with existing PYTHONPATH if any 

520 existing_path = os.environ.get("PYTHONPATH", "") 

521 all_paths = site_packages + ([existing_path] if existing_path else []) 

522 

523 return os.pathsep.join(all_paths) 

524 

525 def _run_cdk( 

526 self, 

527 command: list[str], 

528 capture_output: bool = False, 

529 env: dict[str, str] | None = None, 

530 ) -> subprocess.CompletedProcess[str]: 

531 """Run a CDK command.""" 

532 full_env = os.environ.copy() 

533 

534 # Inject PYTHONPATH so CDK's python3 subprocess can find aws_cdk 

535 # This is essential for pipx installations 

536 full_env["PYTHONPATH"] = self._get_python_path() 

537 

538 if env: 

539 full_env.update(env) 

540 

541 cdk_cmd = self._cdk_path.split() + command 

542 

543 if capture_output: 

544 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection 

545 cdk_cmd, 

546 cwd=self.project_root, 

547 capture_output=True, 

548 text=True, 

549 env=full_env, 

550 ) 

551 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection 

552 cdk_cmd, 

553 cwd=self.project_root, 

554 env=full_env, 

555 text=True, 

556 ) 

557 

558 def list_stacks(self) -> list[str]: 

559 """List all available CDK stacks.""" 

560 result = self._run_cdk(["list"], capture_output=True) 

561 if result.returncode != 0: 

562 raise RuntimeError(f"Failed to list stacks: {result.stderr}") 

563 return [s.strip() for s in result.stdout.strip().split("\n") if s.strip()] 

564 

565 def synth(self, stack_name: str | None = None, quiet: bool = True) -> str: 

566 """Synthesize CloudFormation templates.""" 

567 cmd = ["synth"] 

568 if stack_name: 568 ↛ 570line 568 didn't jump to line 570 because the condition on line 568 was always true

569 cmd.append(stack_name) 

570 if quiet: 570 ↛ 573line 570 didn't jump to line 573 because the condition on line 570 was always true

571 cmd.append("--quiet") 

572 

573 result = self._run_cdk(cmd, capture_output=True) 

574 if result.returncode != 0: 

575 raise RuntimeError(f"CDK synth failed: {result.stderr}") 

576 return str(result.stdout) 

577 

578 def diff(self, stack_name: str | None = None) -> str: 

579 """Show diff between deployed and local stacks.""" 

580 cmd = ["diff", "--no-color"] 

581 if stack_name: 581 ↛ 584line 581 didn't jump to line 584 because the condition on line 581 was always true

582 cmd.append(stack_name) 

583 

584 result = self._run_cdk(cmd, capture_output=True) 

585 # diff returns non-zero if there are differences, which is expected 

586 return str(result.stdout or result.stderr) 

587 

    def deploy(
        self,
        stack_name: str | None = None,
        require_approval: bool = True,
        all_stacks: bool = False,
        outputs_file: str | None = None,
        parameters: dict[str, str] | None = None,
        tags: dict[str, str] | None = None,
        progress: str = "events",
        output_dir: str | None = None,
        exclusively: bool = False,
    ) -> bool:
        """Deploy CDK stacks.

        Args:
            stack_name: Name of the stack to deploy
            require_approval: Whether to require approval for changes
            all_stacks: Deploy all stacks
            outputs_file: File to write outputs to
            parameters: CDK parameters
            tags: Tags to apply to stacks
            progress: Progress display type
            output_dir: Custom CDK output directory (for parallel deployments)
            exclusively: Pass ``--exclusively`` to CDK so only the named
                stack is evaluated, not its transitive dependencies. Used by
                ``deploy_orchestrated`` once earlier phases have already
                deployed the globals — re-synthesizing them every phase
                forces custom resources (notably KubectlApplyManifests)
                to re-run each time, adding minutes per phase for no
                actual change.

        Returns:
            True when the CDK deploy exited successfully.

        Raises:
            RuntimeError: when no container runtime is available, or the
                target region cannot be CDK-bootstrapped.
        """
        # Full rebuild of Lambda packages on first deploy call (once per session),
        # then sync latest source on subsequent calls
        self._rebuild_lambda_packages()
        self._sync_lambda_sources()

        # Check for stuck stacks and auto-recover
        if stack_name:
            self._check_and_fix_stuck_stack(stack_name)

        # Ensure container runtime is available for building images
        runtime = _detect_container_runtime()
        if not runtime:
            raise RuntimeError(
                "No container runtime found. Please install Docker, Finch, or Podman.\n"
                " - Docker: https://docs.docker.com/get-docker/\n"
                " - Finch: brew install finch && finch vm init\n"
                " - Podman: https://podman.io/getting-started/installation"
            )

        # Auto-bootstrap the target region if needed
        if stack_name:
            region = self._get_deploy_region(stack_name)
            if region and not self.ensure_bootstrapped(region):
                raise RuntimeError(
                    f"Region {region} could not be bootstrapped. "
                    "Run 'gco stacks bootstrap --region "
                    f"{region}' manually to diagnose."
                )

        cmd = ["deploy"]

        if all_stacks:
            cmd.append("--all")
        elif stack_name:
            cmd.append(stack_name)

        # --exclusively tells CDK to deploy *only* the named stack, not its
        # transitive dependencies. deploy_orchestrated sets this once the
        # earlier phases (global, api-gateway) are already in place so that
        # the regional and monitoring phases don't re-synthesize and
        # re-evaluate globals on every pass.
        if exclusively and stack_name and not all_stacks:
            cmd.append("--exclusively")

        if not require_approval:
            cmd.extend(["--require-approval", "never"])

        if outputs_file:
            cmd.extend(["--outputs-file", outputs_file])

        if parameters:
            for key, value in parameters.items():
                cmd.extend(["--parameters", f"{key}={value}"])

        if tags:
            for key, value in tags.items():
                cmd.extend(["--tags", f"{key}={value}"])

        cmd.extend(["--progress", progress])

        # Use custom output directory for parallel deployments
        if output_dir:
            cmd.extend(["--output", output_dir])

        # Set CDK_DOCKER env var if not already set
        env = {"CDK_DOCKER": runtime} if not os.environ.get("CDK_DOCKER") else None

        result = self._run_cdk(cmd, env=env)
        success = result.returncode == 0

        if not success and stack_name:
            # Surface CloudFormation events so the user sees the real cause,
            # not just CDK's generic error message.
            self._diagnose_deploy_failure(stack_name)

        return success

693 

694 def destroy( 

695 self, 

696 stack_name: str | None = None, 

697 all_stacks: bool = False, 

698 force: bool = False, 

699 output_dir: str | None = None, 

700 ) -> bool: 

701 """Destroy CDK stacks. 

702 

703 Args: 

704 stack_name: Name of the stack to destroy 

705 all_stacks: Destroy all stacks 

706 force: Skip confirmation prompts 

707 output_dir: Custom CDK output directory (for parallel deployments) 

708 """ 

709 cmd = ["destroy"] 

710 

711 if all_stacks: 

712 cmd.append("--all") 

713 elif stack_name: 713 ↛ 716line 713 didn't jump to line 716 because the condition on line 713 was always true

714 cmd.append(stack_name) 

715 

716 if force: 

717 cmd.append("--force") 

718 

719 # Use custom output directory for parallel deployments 

720 if output_dir: 

721 cmd.extend(["--output", output_dir]) 

722 

723 result = self._run_cdk(cmd) 

724 return result.returncode == 0 

725 

726 def bootstrap( 

727 self, 

728 account: str | None = None, 

729 region: str | None = None, 

730 ) -> bool: 

731 """Bootstrap CDK in an AWS account/region.""" 

732 cmd = ["bootstrap"] 

733 

734 if account and region: 

735 cmd.append(f"aws://{account}/{region}") 

736 elif region: 736 ↛ 739line 736 didn't jump to line 739 because the condition on line 736 was always true

737 cmd.append(f"aws://unknown-account/{region}") 

738 

739 result = self._run_cdk(cmd) 

740 return result.returncode == 0 

741 

742 def is_bootstrapped(self, region: str) -> bool: 

743 """Check if CDK has been bootstrapped in a region. 

744 

745 Looks for the CDKToolkit CloudFormation stack which is created 

746 by ``cdk bootstrap``. Result is cached per region for the lifetime 

747 of this StackManager instance. 

748 """ 

749 if not hasattr(self, "_bootstrap_cache"): 749 ↛ 752line 749 didn't jump to line 752 because the condition on line 749 was always true

750 self._bootstrap_cache: dict[str, bool] = {} 

751 

752 if region in self._bootstrap_cache: 752 ↛ 753line 752 didn't jump to line 753 because the condition on line 752 was never true

753 return self._bootstrap_cache[region] 

754 

755 import boto3 

756 

757 cf = boto3.client("cloudformation", region_name=region) 

758 try: 

759 response = cf.describe_stacks(StackName="CDKToolkit") 

760 stacks = response.get("Stacks", []) 

761 if stacks: 

762 status = stacks[0].get("StackStatus", "") 

763 # Any non-deleted state counts as bootstrapped 

764 result = "DELETE" not in status 

765 self._bootstrap_cache[region] = result 

766 return result 

767 except ClientError: 

768 pass # Stack doesn't exist — not bootstrapped 

769 except Exception as e: 

770 logger.debug("Failed to check CDK bootstrap in %s: %s", region, e) 

771 

772 self._bootstrap_cache[region] = False 

773 return False 

774 

775 def ensure_bootstrapped(self, region: str) -> bool: 

776 """Ensure a region is CDK-bootstrapped, auto-bootstrapping if needed. 

777 

778 Returns True if the region is (or was successfully) bootstrapped. 

779 """ 

780 if self.is_bootstrapped(region): 

781 return True 

782 

783 print(f"ℹ Region {region} is not CDK-bootstrapped. Bootstrapping now...") 

784 success = self.bootstrap(region=region) 

785 if success: 

786 # Update cache so we don't re-check this region 

787 if not hasattr(self, "_bootstrap_cache"): 787 ↛ 789line 787 didn't jump to line 789 because the condition on line 787 was always true

788 self._bootstrap_cache = {} 

789 self._bootstrap_cache[region] = True 

790 print(f"✓ CDK bootstrapped in {region}") 

791 else: 

792 print(f"✗ Failed to bootstrap CDK in {region}") 

793 return success 

794 

795 def _get_deploy_region(self, stack_name: str) -> str | None: 

796 """Determine the target AWS region for a given stack name.""" 

797 from .config import _load_cdk_json 

798 

799 cdk_regions = _load_cdk_json() 

800 

801 region: str | None 

802 if stack_name == "gco-global": 

803 region = cdk_regions.get("global") or self.config.global_region 

804 return region 

805 if stack_name == "gco-api-gateway": 

806 region = cdk_regions.get("api_gateway") or self.config.api_gateway_region 

807 return region 

808 if stack_name == "gco-monitoring": 

809 region = cdk_regions.get("monitoring") or self.config.monitoring_region 

810 return region 

811 

812 # Regional stacks: gco-{region} 

813 prefix = "gco-" 

814 if stack_name.startswith(prefix): 

815 return stack_name[len(prefix) :] 

816 

817 return None 

818 

819 def get_outputs(self, stack_name: str, region: str) -> dict[str, str]: 

820 """Get stack outputs from CloudFormation.""" 

821 import boto3 

822 

823 cf = boto3.client("cloudformation", region_name=region) 

824 try: 

825 response = cf.describe_stacks(StackName=stack_name) 

826 if response["Stacks"]: 

827 stack = response["Stacks"][0] 

828 outputs: dict[str, str] = {} 

829 for output in stack.get("Outputs", []): 

830 outputs[str(output["OutputKey"])] = str(output["OutputValue"]) 

831 return outputs 

832 except Exception as e: 

833 logger.debug("Failed to get outputs for %s in %s: %s", stack_name, region, e) 

834 return {} 

835 

836 def get_stack_status(self, stack_name: str, region: str) -> StackInfo | None: 

837 """Get detailed stack status from CloudFormation.""" 

838 import boto3 

839 

840 cf = boto3.client("cloudformation", region_name=region) 

841 try: 

842 response = cf.describe_stacks(StackName=stack_name) 

843 if response["Stacks"]: 

844 stack = response["Stacks"][0] 

845 return StackInfo( 

846 name=stack["StackName"], 

847 status=stack["StackStatus"], 

848 region=region, 

849 created_time=stack.get("CreationTime"), 

850 updated_time=stack.get("LastUpdatedTime"), 

851 outputs={o["OutputKey"]: o["OutputValue"] for o in stack.get("Outputs", [])}, 

852 tags={t["Key"]: t["Value"] for t in stack.get("Tags", [])}, 

853 ) 

854 except Exception as e: 

855 logger.debug("Failed to get stack status for %s in %s: %s", stack_name, region, e) 

856 return None 

857 

858 def deploy_orchestrated( 

859 self, 

860 require_approval: bool = True, 

861 outputs_file: str | None = None, 

862 parameters: dict[str, str] | None = None, 

863 tags: dict[str, str] | None = None, 

864 progress: str = "events", 

865 on_stack_start: Callable[[str], None] | None = None, 

866 on_stack_complete: Callable[[str, bool], None] | None = None, 

867 parallel: bool = False, 

868 max_workers: int = 4, 

869 ) -> tuple[bool, list[str], list[str]]: 

870 """ 

871 Deploy all stacks in the correct order. 

872 

873 Deploys global stacks first, then regional stacks (optionally in parallel), 

874 then the monitoring stack (which depends on regional stacks). 

875 

876 Args: 

877 require_approval: Whether to require approval for changes 

878 outputs_file: File to write outputs to 

879 parameters: CDK parameters 

880 tags: Tags to apply to stacks 

881 progress: Progress display type 

882 on_stack_start: Callback(stack_name) called when starting a stack 

883 on_stack_complete: Callback(stack_name, success) called when stack completes 

884 parallel: Deploy regional stacks in parallel 

885 max_workers: Maximum number of parallel deployments (default: 4) 

886 

887 Returns: 

888 Tuple of (overall_success, successful_stacks, failed_stacks) 

889 """ 

890 stacks = self.list_stacks() 

891 ordered_stacks = get_stack_deployment_order(stacks) 

892 

893 # Separate stacks into three groups: 

894 # 1. Pre-regional global stacks (gco-global, gco-api-gateway) 

895 # 2. Regional stacks (can be parallelized) 

896 # 3. Post-regional stacks (gco-monitoring - depends on regional stacks) 

897 pre_regional = {"gco-global", "gco-api-gateway"} 

898 post_regional = {"gco-monitoring"} 

899 

900 pre_regional_stacks = [s for s in ordered_stacks if s in pre_regional] 

901 regional_stacks = [ 

902 s for s in ordered_stacks if s not in pre_regional and s not in post_regional 

903 ] 

904 post_regional_stacks = [s for s in ordered_stacks if s in post_regional] 

905 

906 successful: list[str] = [] 

907 failed: list[str] = [] 

908 

909 # Phase 1: Deploy pre-regional global stacks sequentially 

910 for stack_name in pre_regional_stacks: 

911 if on_stack_start: 

912 on_stack_start(stack_name) 

913 

914 success = self.deploy( 

915 stack_name=stack_name, 

916 require_approval=require_approval, 

917 outputs_file=outputs_file, 

918 parameters=parameters, 

919 tags=tags, 

920 progress=progress, 

921 ) 

922 

923 if success: 

924 successful.append(stack_name) 

925 else: 

926 failed.append(stack_name) 

927 

928 if on_stack_complete: 

929 on_stack_complete(stack_name, success) 

930 

931 # Stop on failure to prevent cascading issues 

932 if not success: 

933 return False, successful, failed 

934 

935 # Phase 2: Deploy regional stacks (parallel or sequential) 

936 # All regional stacks pass --exclusively: globals are already deployed 

937 # in Phase 1, so CDK doesn't need to re-evaluate them. Skipping that 

938 # re-evaluation avoids re-running custom resources (notably 

939 # KubectlApplyManifests) on the global stacks every time a regional 

940 # stack is deployed — that would otherwise re-apply manifests and 

941 # rollout-restart controllers for no actual change. 

942 if regional_stacks: 942 ↛ 994line 942 didn't jump to line 994 because the condition on line 942 was always true

943 if parallel and len(regional_stacks) > 1: 

944 # Parallel deployment of regional stacks 

945 successful_regional, failed_regional = self._deploy_stacks_parallel( 

946 stacks=regional_stacks, 

947 require_approval=require_approval, 

948 outputs_file=outputs_file, 

949 parameters=parameters, 

950 tags=tags, 

951 progress=progress, 

952 on_stack_start=on_stack_start, 

953 on_stack_complete=on_stack_complete, 

954 max_workers=max_workers, 

955 ) 

956 successful.extend(successful_regional) 

957 failed.extend(failed_regional) 

958 

959 # Stop if any regional stack failed 

960 if failed_regional: 

961 return False, successful, failed 

962 else: 

963 # Sequential deployment 

964 for stack_name in regional_stacks: 

965 if on_stack_start: 

966 on_stack_start(stack_name) 

967 

968 success = self.deploy( 

969 stack_name=stack_name, 

970 require_approval=require_approval, 

971 outputs_file=outputs_file, 

972 parameters=parameters, 

973 tags=tags, 

974 progress=progress, 

975 exclusively=True, 

976 ) 

977 

978 if success: 978 ↛ 981line 978 didn't jump to line 981 because the condition on line 978 was always true

979 successful.append(stack_name) 

980 else: 

981 failed.append(stack_name) 

982 

983 if on_stack_complete: 

984 on_stack_complete(stack_name, success) 

985 

986 # Stop on failure 

987 if not success: 987 ↛ 988line 987 didn't jump to line 988 because the condition on line 987 was never true

988 return False, successful, failed 

989 

990 # Phase 3: Deploy post-regional stacks (monitoring) sequentially. 

991 # Same rationale as Phase 2: every upstream stack is already 

992 # deployed, so --exclusively prevents a redundant pass over 

993 # global/api-gateway/regional. 

994 for stack_name in post_regional_stacks: 

995 if on_stack_start: 995 ↛ 996line 995 didn't jump to line 996 because the condition on line 995 was never true

996 on_stack_start(stack_name) 

997 

998 success = self.deploy( 

999 stack_name=stack_name, 

1000 require_approval=require_approval, 

1001 outputs_file=outputs_file, 

1002 parameters=parameters, 

1003 tags=tags, 

1004 progress=progress, 

1005 exclusively=True, 

1006 ) 

1007 

1008 if success: 1008 ↛ 1011line 1008 didn't jump to line 1011 because the condition on line 1008 was always true

1009 successful.append(stack_name) 

1010 else: 

1011 failed.append(stack_name) 

1012 

1013 if on_stack_complete: 1013 ↛ 1014line 1013 didn't jump to line 1014 because the condition on line 1013 was never true

1014 on_stack_complete(stack_name, success) 

1015 

1016 if not success: 1016 ↛ 1017line 1016 didn't jump to line 1017 because the condition on line 1016 was never true

1017 return False, successful, failed 

1018 

1019 return len(failed) == 0, successful, failed 

1020 

1021 def _deploy_stacks_parallel( 

1022 self, 

1023 stacks: list[str], 

1024 require_approval: bool, 

1025 outputs_file: str | None, 

1026 parameters: dict[str, str] | None, 

1027 tags: dict[str, str] | None, 

1028 progress: str, 

1029 on_stack_start: Callable[[str], None] | None, 

1030 on_stack_complete: Callable[[str, bool], None] | None, 

1031 max_workers: int, 

1032 ) -> tuple[list[str], list[str]]: 

1033 """Deploy multiple stacks in parallel using separate CDK output directories.""" 

1034 import tempfile 

1035 

1036 successful: list[str] = [] 

1037 failed: list[str] = [] 

1038 lock = Lock() 

1039 

1040 def deploy_single(stack_name: str) -> tuple[str, bool]: 

1041 # Use a unique output directory in /tmp for each parallel deployment 

1042 # This avoids CDK copying cdk.out.* directories into assets 

1043 output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-") 

1044 

1045 if on_stack_start: 1045 ↛ 1046line 1045 didn't jump to line 1046 because the condition on line 1045 was never true

1046 with lock: 

1047 on_stack_start(stack_name) 

1048 

1049 success = self.deploy( 

1050 stack_name=stack_name, 

1051 require_approval=require_approval, 

1052 outputs_file=outputs_file, 

1053 parameters=parameters, 

1054 tags=tags, 

1055 progress=progress, 

1056 output_dir=output_dir, 

1057 exclusively=True, 

1058 ) 

1059 

1060 # Clean up the temporary output directory 

1061 try: 

1062 import shutil 

1063 

1064 if os.path.exists(output_dir): 1064 ↛ 1069line 1064 didn't jump to line 1069 because the condition on line 1064 was always true

1065 shutil.rmtree(output_dir) 

1066 except Exception as e: 

1067 logger.debug("Cleanup of %s failed: %s", output_dir, e) 

1068 

1069 return stack_name, success 

1070 

1071 with ThreadPoolExecutor(max_workers=max_workers) as executor: 

1072 futures = {executor.submit(deploy_single, stack): stack for stack in stacks} 

1073 

1074 for future in as_completed(futures): 

1075 stack_name, success = future.result() 

1076 

1077 with lock: 

1078 if success: 

1079 successful.append(stack_name) 

1080 else: 

1081 failed.append(stack_name) 

1082 

1083 if on_stack_complete: 1083 ↛ 1084line 1083 didn't jump to line 1084 because the condition on line 1083 was never true

1084 on_stack_complete(stack_name, success) 

1085 

1086 return successful, failed 

1087 

1088 def destroy_orchestrated( 

1089 self, 

1090 force: bool = False, 

1091 on_stack_start: Callable[[str], None] | None = None, 

1092 on_stack_complete: Callable[[str, bool], None] | None = None, 

1093 parallel: bool = False, 

1094 max_workers: int = 4, 

1095 ) -> tuple[bool, list[str], list[str]]: 

1096 """ 

1097 Destroy all stacks in the correct order. 

1098 

1099 Destroys monitoring stack first, then regional stacks (optionally in parallel), 

1100 then global stacks. 

1101 

1102 Args: 

1103 force: Skip confirmation prompts 

1104 on_stack_start: Callback(stack_name) called when starting a stack 

1105 on_stack_complete: Callback(stack_name, success) called when stack completes 

1106 parallel: Destroy regional stacks in parallel 

1107 max_workers: Maximum number of parallel destructions (default: 4) 

1108 

1109 Returns: 

1110 Tuple of (overall_success, successful_stacks, failed_stacks) 

1111 """ 

1112 stacks = self.list_stacks() 

1113 

1114 # Phase 0: Clean up backup vault recovery points so the global stack 

1115 # can be deleted cleanly by CloudFormation. 

1116 self._cleanup_backup_vault() 

1117 

1118 # Separate stacks into three groups (reverse of deploy order): 

1119 pre_regional = {"gco-global", "gco-api-gateway"} 

1120 post_regional = {"gco-monitoring"} 

1121 

1122 post_regional_stacks = [s for s in stacks if s in post_regional] 

1123 regional_stacks = [s for s in stacks if s not in pre_regional and s not in post_regional] 

1124 pre_regional_stacks = [s for s in stacks if s in pre_regional] 

1125 

1126 # Sort regional stacks alphabetically (reversed for destroy) 

1127 regional_stacks.sort(reverse=True) 

1128 # Sort pre-regional stacks in reverse priority order 

1129 pre_regional_order = {"gco-api-gateway": 1, "gco-global": 2} 

1130 pre_regional_stacks.sort(key=lambda x: pre_regional_order.get(x, 0)) 

1131 

1132 successful: list[str] = [] 

1133 failed: list[str] = [] 

1134 

1135 # Phase 1: Destroy post-regional stacks (monitoring) first 

1136 for stack_name in post_regional_stacks: 

1137 if on_stack_start: 1137 ↛ 1138line 1137 didn't jump to line 1138 because the condition on line 1137 was never true

1138 on_stack_start(stack_name) 

1139 

1140 success = self.destroy( 

1141 stack_name=stack_name, 

1142 force=force, 

1143 ) 

1144 

1145 if success: 1145 ↛ 1148line 1145 didn't jump to line 1148 because the condition on line 1145 was always true

1146 successful.append(stack_name) 

1147 else: 

1148 failed.append(stack_name) 

1149 

1150 if on_stack_complete: 1150 ↛ 1151line 1150 didn't jump to line 1151 because the condition on line 1150 was never true

1151 on_stack_complete(stack_name, success) 

1152 

1153 # Phase 2: Destroy regional stacks (parallel or sequential) 

1154 # Each regional destroy has a background watchdog that proactively 

1155 # deletes EKS-managed security groups + orphaned ENIs as soon as 

1156 # they appear. Without this, CloudFormation's VPC delete step sits 

1157 # in DELETE_IN_PROGRESS for ~10 min waiting for EKS to GC the 

1158 # ``eks-cluster-sg-<cluster>-*`` security group and its cluster 

1159 # ENIs. The SG is owned by the EKS service (not CloudFormation), 

1160 # so CFN can't delete it directly — it just polls the VPC's 

1161 # dependencies until they drain. See ``_start_eks_sg_watchdog``. 

1162 watchdog_stops: dict[str, Event] = {} 

1163 watchdog_threads: dict[str, Thread] = {} 

1164 for stack_name in regional_stacks: 

1165 stop_event = Event() 

1166 thread = self._start_eks_sg_watchdog(stack_name, stop_event) 

1167 watchdog_stops[stack_name] = stop_event 

1168 watchdog_threads[stack_name] = thread 

1169 

1170 if regional_stacks: 1170 ↛ 1203line 1170 didn't jump to line 1203 because the condition on line 1170 was always true

1171 if parallel and len(regional_stacks) > 1: 

1172 # Parallel destruction of regional stacks 

1173 successful_regional, failed_regional = self._destroy_stacks_parallel( 

1174 stacks=regional_stacks, 

1175 force=force, 

1176 on_stack_start=on_stack_start, 

1177 on_stack_complete=on_stack_complete, 

1178 max_workers=max_workers, 

1179 ) 

1180 successful.extend(successful_regional) 

1181 failed.extend(failed_regional) 

1182 else: 

1183 # Sequential destruction 

1184 for stack_name in regional_stacks: 

1185 if on_stack_start: 

1186 on_stack_start(stack_name) 

1187 

1188 success = self.destroy( 

1189 stack_name=stack_name, 

1190 force=force, 

1191 ) 

1192 

1193 if success: 

1194 successful.append(stack_name) 

1195 else: 

1196 failed.append(stack_name) 

1197 

1198 if on_stack_complete: 

1199 on_stack_complete(stack_name, success) 

1200 

1201 # Stop all watchdogs and do one final cleanup pass per stack to 

1202 # catch anything EKS re-created during the destroy flow. 

1203 for stack_name in regional_stacks: 

1204 watchdog_stops[stack_name].set() 

1205 watchdog_threads[stack_name].join(timeout=5) 

1206 self._cleanup_eks_security_groups(stack_name) 

1207 

1208 # Phase 3: Destroy pre-regional global stacks last 

1209 for stack_name in pre_regional_stacks: 

1210 if on_stack_start: 

1211 on_stack_start(stack_name) 

1212 

1213 success = self.destroy( 

1214 stack_name=stack_name, 

1215 force=force, 

1216 ) 

1217 

1218 if success: 1218 ↛ 1221line 1218 didn't jump to line 1221 because the condition on line 1218 was always true

1219 successful.append(stack_name) 

1220 else: 

1221 failed.append(stack_name) 

1222 

1223 if on_stack_complete: 

1224 on_stack_complete(stack_name, success) 

1225 

1226 return len(failed) == 0, successful, failed 

1227 

1228 def _destroy_stacks_parallel( 

1229 self, 

1230 stacks: list[str], 

1231 force: bool, 

1232 on_stack_start: Callable[[str], None] | None, 

1233 on_stack_complete: Callable[[str, bool], None] | None, 

1234 max_workers: int, 

1235 ) -> tuple[list[str], list[str]]: 

1236 """Destroy multiple stacks in parallel using separate CDK output directories.""" 

1237 import tempfile 

1238 

1239 successful: list[str] = [] 

1240 failed: list[str] = [] 

1241 lock = Lock() 

1242 

1243 def destroy_single(stack_name: str) -> tuple[str, bool]: 

1244 # Use a unique output directory in /tmp for each parallel destruction 

1245 output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-") 

1246 

1247 if on_stack_start: 1247 ↛ 1248line 1247 didn't jump to line 1248 because the condition on line 1247 was never true

1248 with lock: 

1249 on_stack_start(stack_name) 

1250 

1251 success = self.destroy( 

1252 stack_name=stack_name, 

1253 force=force, 

1254 output_dir=output_dir, 

1255 ) 

1256 

1257 # Clean up the temporary output directory 

1258 try: 

1259 import shutil 

1260 

1261 if os.path.exists(output_dir): 1261 ↛ 1266line 1261 didn't jump to line 1266 because the condition on line 1261 was always true

1262 shutil.rmtree(output_dir) 

1263 except Exception as e: 

1264 logger.debug("Cleanup of %s failed: %s", output_dir, e) 

1265 

1266 return stack_name, success 

1267 

1268 with ThreadPoolExecutor(max_workers=max_workers) as executor: 

1269 futures = {executor.submit(destroy_single, stack): stack for stack in stacks} 

1270 

1271 for future in as_completed(futures): 

1272 stack_name, success = future.result() 

1273 

1274 with lock: 

1275 if success: 

1276 successful.append(stack_name) 

1277 else: 

1278 failed.append(stack_name) 

1279 

1280 if on_stack_complete: 1280 ↛ 1281line 1280 didn't jump to line 1281 because the condition on line 1280 was never true

1281 on_stack_complete(stack_name, success) 

1282 

1283 return successful, failed 

1284 

1285 def _cleanup_backup_vault(self) -> None: 

1286 """Delete all recovery points from the GCO backup vault. 

1287 

1288 This is called before destroy-all so CloudFormation can delete the 

1289 backup vault cleanly. Without this, the vault deletion fails because 

1290 it contains recovery points. 

1291 """ 

1292 import boto3 

1293 

1294 global_region = self.config.global_region 

1295 project_name = self.config.project_name 

1296 

1297 try: 

1298 backup_client = boto3.client("backup", region_name=global_region) 

1299 

1300 # Find the backup vault by listing vaults with the project prefix 

1301 paginator = backup_client.get_paginator("list_backup_vaults") 

1302 vault_name = None 

1303 for page in paginator.paginate(): 

1304 for vault in page.get("BackupVaultList", []): 

1305 if project_name in vault["BackupVaultName"].lower(): 

1306 vault_name = vault["BackupVaultName"] 

1307 break 

1308 if vault_name: 

1309 break 

1310 

1311 if not vault_name: 

1312 return 

1313 

1314 # Delete all recovery points in the vault 

1315 rp_paginator = backup_client.get_paginator("list_recovery_points_by_backup_vault") 

1316 deleted = 0 

1317 for page in rp_paginator.paginate(BackupVaultName=vault_name): 

1318 for rp in page.get("RecoveryPoints", []): 

1319 try: 

1320 backup_client.delete_recovery_point( 

1321 BackupVaultName=vault_name, 

1322 RecoveryPointArn=rp["RecoveryPointArn"], 

1323 ) 

1324 deleted += 1 

1325 except Exception as e: 

1326 logger.debug("Failed to delete recovery point: %s", e) 

1327 

1328 if deleted > 0: 

1329 print(f" Cleaned up {deleted} backup recovery points from {vault_name}") 

1330 

1331 except Exception as e: 

1332 print(f" Warning: Backup vault cleanup failed (non-fatal): {e}") 

1333 

1334 def cleanup_eks_security_groups(self) -> None: 

1335 """Clean up EKS-managed security groups across all regional stacks. 

1336 

1337 Called between destroy retries to remove orphaned security groups 

1338 that block VPC deletion. 

1339 """ 

1340 stacks = self.list_stacks() 

1341 pre_regional = {"gco-global", "gco-api-gateway", "gco-monitoring"} 

1342 regional_stacks = [s for s in stacks if s not in pre_regional] 

1343 for stack_name in regional_stacks: 

1344 self._cleanup_eks_security_groups(stack_name) 

1345 

1346 def _cleanup_eks_security_groups(self, stack_name: str) -> None: 

1347 """Clean up EKS-managed security groups that block VPC deletion. 

1348 

1349 EKS creates a cluster security group (eks-cluster-sg-<cluster-name>-*) 

1350 that is owned by EKS, not CloudFormation. When the stack is destroyed, 

1351 this SG and its attached ENIs can linger, causing VPC deletion to fail. 

1352 

1353 This method finds and deletes these orphaned security groups and their 

1354 ENIs before the stack destroy runs. 

1355 """ 

1356 import time as _time 

1357 

1358 import boto3 

1359 

1360 # Extract region from stack name (e.g., gco-us-east-1 -> us-east-1) 

1361 project_name = self.config.project_name 

1362 region = stack_name.replace(f"{project_name}-", "", 1) 

1363 cluster_name = stack_name # cluster name matches stack name 

1364 

1365 try: 

1366 ec2 = boto3.client("ec2", region_name=region) 

1367 

1368 # Find EKS-managed security groups by name pattern 

1369 response = ec2.describe_security_groups( 

1370 Filters=[ 

1371 { 

1372 "Name": "group-name", 

1373 "Values": [f"eks-cluster-sg-{cluster_name}-*"], 

1374 } 

1375 ] 

1376 ) 

1377 

1378 sgs = response.get("SecurityGroups", []) 

1379 if not sgs: 

1380 return 

1381 

1382 for sg in sgs: 

1383 sg_id = sg["GroupId"] 

1384 sg_name = sg.get("GroupName", "") 

1385 

1386 # First, detach and delete any ENIs using this SG 

1387 eni_response = ec2.describe_network_interfaces( 

1388 Filters=[{"Name": "group-id", "Values": [sg_id]}] 

1389 ) 

1390 

1391 for eni in eni_response.get("NetworkInterfaces", []): 

1392 eni_id = eni["NetworkInterfaceId"] 

1393 try: 

1394 if eni.get("Attachment"): 1394 ↛ 1400line 1394 didn't jump to line 1400 because the condition on line 1394 was always true

1395 ec2.detach_network_interface( 

1396 AttachmentId=eni["Attachment"]["AttachmentId"], 

1397 Force=True, 

1398 ) 

1399 _time.sleep(5) 

1400 ec2.delete_network_interface(NetworkInterfaceId=eni_id) 

1401 logger.debug("Deleted ENI %s from SG %s", eni_id, sg_name) 

1402 except Exception as e: 

1403 logger.debug("Failed to delete ENI %s: %s", eni_id, e) 

1404 

1405 # Now delete the security group 

1406 try: 

1407 ec2.delete_security_group(GroupId=sg_id) 

1408 print(f" Cleaned up EKS security group: {sg_name} ({sg_id})") 

1409 except Exception as e: 

1410 logger.debug( 

1411 "Failed to delete SG %s: %s (will retry on next attempt)", sg_id, e 

1412 ) 

1413 

1414 except Exception as e: 

1415 # Non-fatal — the retry loop will handle it 

1416 logger.debug("EKS security group cleanup for %s failed: %s", stack_name, e) 

1417 

1418 def _start_eks_sg_watchdog(self, stack_name: str, stop_event: Event) -> Thread: 

1419 """Start a background thread that polls for orphaned EKS security groups. 

1420 

1421 EKS creates an ``eks-cluster-sg-<cluster-name>-*`` security group that 

1422 is owned by the EKS service (not CloudFormation). When the stack's 

1423 EKS cluster resource deletes, the SG is supposed to GC along with its 

1424 cluster ENIs — but on EKS Auto Mode there's a window where the SG 

1425 lingers after the cluster is gone, blocking the subsequent VPC 

1426 delete with ``DependencyViolation``. CloudFormation then sits in 

1427 ``DELETE_IN_PROGRESS`` on the VPC for ~10 minutes retrying. 

1428 

1429 This watchdog runs for the full duration of the regional-stack 

1430 destroy, polling every 30 seconds. As soon as an orphaned SG appears 

1431 (which only happens after the cluster delete has progressed past 

1432 the cluster resource itself), it deletes the SG and any ENIs still 

1433 attached. That unblocks the VPC delete immediately instead of 

1434 waiting for EKS's own GC timer. 

1435 

1436 The thread exits when ``stop_event`` is set by the orchestrator at 

1437 the end of the regional phase. 

1438 """ 

1439 

1440 def _watchdog() -> None: 

1441 while not stop_event.is_set(): 

1442 try: 

1443 self._cleanup_eks_security_groups(stack_name) 

1444 except Exception as e: 

1445 logger.debug( 

1446 "EKS SG watchdog tick for %s failed (non-fatal): %s", 

1447 stack_name, 

1448 e, 

1449 ) 

1450 # ``wait`` returns immediately when the event is set, so this 

1451 # doubles as the sleep-and-shutdown-check in one call. 

1452 stop_event.wait(timeout=30) 

1453 

1454 thread = Thread( 

1455 target=_watchdog, 

1456 name=f"eks-sg-watchdog-{stack_name}", 

1457 daemon=True, 

1458 ) 

1459 thread.start() 

1460 return thread 

1461 

1462 

def get_stack_manager(config: GCOConfig) -> StackManager:
    """Factory function to get a StackManager instance."""
    manager = StackManager(config)
    return manager

1466 

1467 

def get_stack_deployment_order(stacks: list[str]) -> list[str]:
    """
    Get the correct deployment order for stacks.

    Order: global stacks first, then regional stacks.
    Global stacks: gco-global, gco-api-gateway, gco-monitoring
    Regional stacks: gco-{region} (e.g., gco-us-east-1)
    """
    # Fixed precedence for the known global stacks (lower deploys first).
    global_priority = {
        "gco-global": 1,
        "gco-api-gateway": 2,
        "gco-monitoring": 3,
    }

    ordered_globals = sorted(
        (s for s in stacks if s in global_priority),
        key=global_priority.__getitem__,
    )
    # Everything else is a regional stack, ordered alphabetically.
    ordered_regionals = sorted(s for s in stacks if s not in global_priority)

    return ordered_globals + ordered_regionals

1497 

1498 

def get_stack_destroy_order(stacks: list[str]) -> list[str]:
    """
    Get the correct destroy order for stacks.

    Order: regional stacks first, then global stacks (reverse of deploy).
    """
    return get_stack_deployment_order(stacks)[::-1]

1507 

1508 

1509# ============================================================================= 

1510# Feature toggle helpers 

1511# ============================================================================= 

1512 

# Built-in defaults for the FSx for Lustre feature toggle (stored under the
# "fsx_lustre" context key in cdk.json); overlaid by cdk.json entries via
# _get_feature_config / _update_feature_config.
_FSX_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature is opt-in
    "storage_capacity_gib": 1200,
    "deployment_type": "SCRATCH_2",
    "per_unit_storage_throughput": 200,
    "data_compression_type": "LZ4",
    # Optional S3 data-repository paths; None leaves them unset.
    # NOTE(review): presumably S3 URIs — confirm against the FSx stack code.
    "import_path": None,
    "export_path": None,
    "auto_import_policy": "NEW_CHANGED_DELETED",
}

1523 

1524 

1525def _find_cdk_json() -> Path | None: 

1526 """Find cdk.json in current or parent directories.""" 

1527 current = Path.cwd() 

1528 for parent in [current] + list(current.parents): 

1529 cdk_path = parent / "cdk.json" 

1530 if cdk_path.exists(): 

1531 return cdk_path 

1532 return None 

1533 

1534 

def get_fsx_config(region: str | None = None) -> dict[str, Any]:
    """Return the effective FSx for Lustre configuration from cdk.json.

    Args:
        region: Optional region to get config for. When given, a matching
            region-specific override takes precedence.

    Returns:
        FSx configuration dictionary.
    """
    return _get_feature_config("fsx_lustre", _FSX_DEFAULTS, region)

1546 

1547 

def update_fsx_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Persist FSx for Lustre settings into cdk.json.

    Args:
        settings: FSx settings to update.
        region: Optional region for region-specific config. When None, the
            global config is updated instead.
    """
    _update_feature_config("fsx_lustre", settings, _FSX_DEFAULTS, region)

1556 

1557 

1558# ============================================================================= 

1559# Generic feature toggle helpers (used by FSx, Valkey, Aurora, and future features) 

1560# ============================================================================= 

1561 

1562 

def _get_feature_config(
    feature_key: str,
    default_config: dict[str, Any],
    region: str | None = None,
) -> dict[str, Any]:
    """Get configuration for a toggleable feature from cdk.json.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        default_config: Default configuration values when the key is missing.
        region: Optional region for region-specific overrides.

    Returns:
        Merged configuration dictionary. Precedence (lowest to highest):
        built-in defaults, the global context entry, then — when ``region``
        is given and has an entry — the region-specific override.

    Raises:
        RuntimeError: If no cdk.json can be found.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        cdk_config = json.load(f)

    context = cdk_config.get("context", {})
    global_config = context.get(feature_key, default_config)

    if region:
        region_key = f"{feature_key}_regions"
        region_overrides = context.get(region_key, {})
        if region in region_overrides:
            # Layer defaults under the global config and the override so a
            # partial region entry still inherits default values — previously
            # only the non-regional path below layered in default_config,
            # leaving region-specific results without defaults whenever the
            # global entry was present but partial.
            merged = {**default_config, **global_config, **region_overrides[region]}
            merged["region"] = region
            merged["is_region_specific"] = True
            return merged

    result = {**default_config, **global_config}
    result["is_region_specific"] = False
    return result

1601 

1602 

def _update_feature_config(
    feature_key: str,
    settings: dict[str, Any],
    default_config: dict[str, Any],
    region: str | None = None,
) -> None:
    """Update configuration for a toggleable feature in cdk.json.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        settings: Settings to update.
        default_config: Default configuration values when the key is missing.
        region: Optional region for region-specific config.

    Raises:
        RuntimeError: If no cdk.json can be found.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        cdk_config = json.load(f)

    context = cdk_config.setdefault("context", {})

    # Pick the dict the settings get written into: either the per-region
    # override table or the global feature entry (seeded with defaults).
    if region:
        target = context.setdefault(f"{feature_key}_regions", {}).setdefault(region, {})
    else:
        target = context.setdefault(feature_key, {**default_config})

    # A None value means "leave unchanged" — except for the "enabled" key,
    # which is written even when None.
    for key, value in settings.items():
        if value is not None or key == "enabled":
            target[key] = value

    with open(cdk_json_path, "w", encoding="utf-8") as f:
        json.dump(cdk_config, f, indent=2)

1647 

1648 

1649# ============================================================================= 

1650# Valkey configuration 

1651# ============================================================================= 

1652 

# Built-in defaults for the Valkey Serverless feature toggle (cdk.json
# context key "valkey"); overlaid by cdk.json entries via _get_feature_config.
_VALKEY_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature is opt-in
    "max_data_storage_gb": 5,
    "max_ecpu_per_second": 5000,
    "snapshot_retention_limit": 1,
}

1659 

1660 

def get_valkey_config(region: str | None = None) -> dict[str, Any]:
    """Return the current Valkey Serverless configuration from cdk.json.

    Args:
        region: Optional region whose override, if any, takes precedence.
    """
    return _get_feature_config("valkey", _VALKEY_DEFAULTS, region)

1664 

1665 

def update_valkey_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Persist Valkey Serverless settings into cdk.json.

    Args:
        settings: Settings to update.
        region: Optional region for a region-specific entry; None for global.
    """
    _update_feature_config("valkey", settings, _VALKEY_DEFAULTS, region)

1669 

1670 

1671# ============================================================================= 

1672# Aurora pgvector configuration 

1673# ============================================================================= 

1674 

# Built-in defaults for the Aurora pgvector feature toggle (cdk.json context
# key "aurora_pgvector"); overlaid by cdk.json entries via _get_feature_config.
_AURORA_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature is opt-in
    "min_acu": 0,
    "max_acu": 16,
    "backup_retention_days": 7,
    "deletion_protection": False,
}

1682 

1683 

def get_aurora_config(region: str | None = None) -> dict[str, Any]:
    """Return the current Aurora pgvector configuration from cdk.json.

    Args:
        region: Optional region whose override, if any, takes precedence.
    """
    return _get_feature_config("aurora_pgvector", _AURORA_DEFAULTS, region)

1687 

1688 

def update_aurora_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Persist Aurora pgvector settings into cdk.json.

    Args:
        settings: Settings to update.
        region: Optional region for a region-specific entry; None for global.
    """
    _update_feature_config("aurora_pgvector", settings, _AURORA_DEFAULTS, region)