Coverage for cli/stacks.py: 93%

933 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1""" 

2Stack management for GCO CLI. 

3 

4Provides commands for deploying, updating, and managing CDK stacks. 

5This is the largest CLI module (~1600 lines) because it orchestrates the 

6full deployment lifecycle including container runtime detection, CDK 

7bootstrapping, Lambda source synchronization, and parallel regional deploys. 

8 

9This module handles: 

10 - Container runtime detection (Docker, Finch, Podman) with automatic fallback 

11 - CDK bootstrap across all target regions (idempotent) 

12 - Lambda source synchronization (copies handler code + dependencies before synth) 

13 - CDK stack deployment with proper dependency ordering: 

14 1. Global stack (Global Accelerator, DynamoDB) 

15 2. API Gateway stack (auth secret, Lambda proxy) 

16 3. Regional stacks in parallel (EKS, VPC, ALB per region) 

17 4. Monitoring stack (CloudWatch dashboards, alarms) 

18 - Parallel deployment of regional stacks via ThreadPoolExecutor 

19 - Stack destruction in reverse dependency order 

20 - FSx for Lustre enable/disable toggle 

21 - kubectl access configuration (EKS access entries + kubeconfig) 

22 

23Key Design Decisions: 

24 - Regional stacks deploy in parallel for speed; global/API/monitoring are sequential 

25 - Lambda build directories are synced before every deploy to avoid stale code 

26 - Container runtime is auto-detected; CDK_DOCKER env var overrides 

27 - All destructive operations require -y/--yes confirmation 

28 - Stack status is read from CloudFormation, not cached locally 

29 

30Environment Variables: 

31 CDK_DOCKER: Override container runtime (default: auto-detect Docker/Finch/Podman) 

32 AWS_REGION: Default region for single-region operations 

33""" 

34 

35from __future__ import annotations 

36 

37import logging 

38import os 

39import shutil 

40import site 

41import subprocess 

42import sys 

43from collections.abc import Callable 

44from concurrent.futures import ThreadPoolExecutor, as_completed 

45from dataclasses import dataclass, field 

46from datetime import datetime 

47from pathlib import Path 

48from threading import Event, Lock, Thread 

49from typing import TYPE_CHECKING, Any 

50 

51from botocore.exceptions import ClientError 

52 

53# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit 

54# Flowchart(s) generated from this file: 

55# * ``StackManager.deploy_orchestrated`` -> ``diagrams/code_diagrams/cli/stacks.StackManager_deploy_orchestrated.html`` 

56# (PNG: ``diagrams/code_diagrams/cli/stacks.StackManager_deploy_orchestrated.png``) 

57# * ``StackManager.destroy_orchestrated`` -> ``diagrams/code_diagrams/cli/stacks.StackManager_destroy_orchestrated.html`` 

58# (PNG: ``diagrams/code_diagrams/cli/stacks.StackManager_destroy_orchestrated.png``) 

59# Regenerate with ``python diagrams/code_diagrams/generate.py``. 

60# <pyflowchart-code-diagram> END 

61 

62 

63if TYPE_CHECKING: 

64 from .config import GCOConfig 

65 

66logger = logging.getLogger(__name__) 

67 

68 

@dataclass
class StackInfo:
    """Snapshot of one CDK / CloudFormation stack.

    ``name``, ``status`` and ``region`` are required; the timestamps default
    to ``None`` and ``outputs`` / ``tags`` default to fresh empty dicts.
    """

    name: str
    status: str
    region: str
    created_time: datetime | None = None
    updated_time: datetime | None = None
    outputs: dict[str, str] = field(default_factory=dict)
    tags: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict.

        Timestamps are rendered with ``isoformat()``; unset timestamps become
        ``None``. ``outputs`` and ``tags`` are returned as-is (not copied).
        """
        created = self.created_time.isoformat() if self.created_time else None
        updated = self.updated_time.isoformat() if self.updated_time else None
        return dict(
            name=self.name,
            status=self.status,
            region=self.region,
            created_time=created,
            updated_time=updated,
            outputs=self.outputs,
            tags=self.tags,
        )

91 

92 

def _safe_rmtree(path: Path) -> None:
    """Delete a Lambda build directory, falling back to ``rm -rf`` if needed.

    ``shutil.rmtree`` can fail with ``OSError: [Errno 66] Directory not empty``
    on macOS when pip-installed packages (e.g. botocore) contain broken
    symlinks or extended-attribute resource forks, so any ``OSError`` from it
    triggers a second attempt via ``rm -rf``.

    Raises:
        ValueError: If the resolved path has no component named ``lambda`` or
            its final component does not end with ``-build``.
        subprocess.CalledProcessError: If the ``rm -rf`` fallback fails.
    """
    target = path.resolve()

    # Guard rail: only build artifacts may be deleted. The resolved path needs
    # a "lambda" component and a final component ending in "-build".
    looks_like_build_dir = "lambda" in target.parts and target.name.endswith("-build")
    if not looks_like_build_dir:
        raise ValueError(f"Refusing to remove unexpected path: {target}")

    target_str = str(target)
    try:
        shutil.rmtree(target_str)
    except OSError:
        subprocess.run(["rm", "-rf", "--", target_str], check=True)

114 

115 

116# Container runtime detection lives in cli/_container_runtime.py so it can 

117# be shared between StackManager (CDK asset bundling) and ImageManager 

118# (gco images build/push). The uncached probe is imported from there; 

119# this module keeps its own small cache so existing tests that reset 

120# ``cli.stacks._container_runtime_cache`` continue to work without 

121# touching the new module's cache. 

122from cli._container_runtime import ( # noqa: E402 

123 _detect_container_runtime_uncached, 

124) 

125 

126# Cached result for container runtime detection (None = not yet checked) 

127_container_runtime_cache: str | None = None 

128_container_runtime_checked: bool = False 

129 

130 

def _detect_container_runtime() -> str | None:
    """
    Return the container runtime to use for CDK asset bundling, if any.

    Memoizes the result of the shared
    ``cli._container_runtime._detect_container_runtime_uncached`` probe in
    this module's ``_container_runtime_cache`` / ``_container_runtime_checked``
    globals, so tests that patch or reset those names keep working. A ``None``
    result is cached too; the "checked" flag is what marks the cache as valid.
    """
    global _container_runtime_cache, _container_runtime_checked
    if not _container_runtime_checked:
        _container_runtime_cache = _detect_container_runtime_uncached()
        _container_runtime_checked = True
    return _container_runtime_cache

147 

148 

149class StackManager: 

150 """Manages CDK stack operations.""" 

151 

152 def __init__(self, config: GCOConfig, project_root: Path | None = None): 

153 self.config = config 

154 self.project_root = project_root or self._find_project_root() 

155 self._cdk_path = self._find_cdk() 

156 

157 # Ensure Lambda build directory exists before any CDK synth 

158 self._ensure_lambda_build() 

159 

160 def _find_project_root(self) -> Path: 

161 """Find the project root by looking for cdk.json.""" 

162 current = Path.cwd() 

163 for parent in [current] + list(current.parents): 

164 if (parent / "cdk.json").exists(): 

165 return parent 

166 return current 

167 

168 def _find_cdk(self) -> str: 

169 """Find CDK executable.""" 

170 # Check if cdk is in PATH 

171 try: 

172 result = subprocess.run(["which", "cdk"], capture_output=True, text=True, check=True) 

173 return result.stdout.strip() 

174 except subprocess.CalledProcessError: 

175 pass 

176 

177 # Check common locations 

178 for path in ["/usr/local/bin/cdk", "~/.npm-global/bin/cdk"]: 

179 expanded = os.path.expanduser(path) 

180 if os.path.exists(expanded): 

181 return expanded 

182 

183 # Fall back to npx 

184 return "npx cdk" 

185 

186 def _ensure_lambda_build(self) -> None: 

187 """Ensure the Lambda build directories exist for CDK synthesis. 

188 

189 Called from __init__ so the directory is ready before any CDK synth 

190 (even ``cdk list`` needs it because ``app.py`` references the asset). 

191 

192 This does a lightweight check — only builds if the directory is 

193 missing or incomplete. It does NOT do a full rebuild (no rmtree). 

194 The full rebuild happens in ``_rebuild_lambda_packages()``, which 

195 is called by ``deploy()`` before the actual CDK deploy. 

196 """ 

197 kubectl_source = self.project_root / "lambda" / "kubectl-applier-simple" 

198 kubectl_build = self.project_root / "lambda" / "kubectl-applier-simple-build" 

199 helm_source = self.project_root / "lambda" / "helm-installer" 

200 helm_build = self.project_root / "lambda" / "helm-installer-build" 

201 

202 # Only build if the directory is missing or deps aren't installed 

203 if kubectl_source.exists() and ( 

204 not kubectl_build.exists() or not (kubectl_build / "yaml").exists() 

205 ): 

206 self._build_kubectl_lambda() 

207 

208 if helm_source.exists() and not helm_build.exists(): 208 ↛ 209line 208 didn't jump to line 209 because the condition on line 208 was never true

209 self._build_helm_installer_lambda() 

210 

211 def _check_and_fix_stuck_stack(self, stack_name: str) -> None: 

212 """Check if a stack is in a stuck state and auto-recover. 

213 

214 Stacks can get stuck in REVIEW_IN_PROGRESS, ROLLBACK_COMPLETE, or 

215 other non-deployable states. This detects those and cleans up so 

216 the next deploy can succeed. 

217 """ 

218 import boto3 

219 

220 region = self._get_deploy_region(stack_name) 

221 if not region: 

222 return 

223 

224 try: 

225 cfn = boto3.client("cloudformation", region_name=region) 

226 response = cfn.describe_stacks(StackName=stack_name) 

227 status = response["Stacks"][0]["StackStatus"] 

228 

229 stuck_states = { 

230 "REVIEW_IN_PROGRESS", 

231 "ROLLBACK_COMPLETE", 

232 "ROLLBACK_FAILED", 

233 "CREATE_FAILED", 

234 "DELETE_FAILED", 

235 } 

236 

237 if status in stuck_states: 

238 print(f" Stack {stack_name} is in {status} state, cleaning up...") 

239 cfn.delete_stack(StackName=stack_name) 

240 waiter = cfn.get_waiter("stack_delete_complete") 

241 waiter.wait(StackName=stack_name, WaiterConfig={"Delay": 10, "MaxAttempts": 60}) 

242 print(f" Stack {stack_name} cleaned up, will recreate on deploy") 

243 

244 except Exception as e: 

245 logger.debug("Stack pre-check for %s: %s", stack_name, e) 

246 # Stack doesn't exist or can't be described — fine, deploy will create it 

247 

248 def _diagnose_deploy_failure(self, stack_name: str) -> None: 

249 """Fetch CloudFormation events after a failed deploy and print diagnostics. 

250 

251 Gives users actionable information instead of just the CDK error message. 

252 """ 

253 import boto3 

254 

255 region = self._get_deploy_region(stack_name) 

256 if not region: 

257 return 

258 

259 try: 

260 cfn = boto3.client("cloudformation", region_name=region) 

261 

262 # Get recent events 

263 response = cfn.describe_stack_events(StackName=stack_name) 

264 events = response.get("StackEvents", []) 

265 

266 # Filter to failed events 

267 failed = [ 

268 e 

269 for e in events[:20] 

270 if "FAILED" in e.get("ResourceStatus", "") 

271 or "ROLLBACK" in e.get("ResourceStatus", "") 

272 ] 

273 

274 if failed: 

275 print(f"\n CloudFormation failure details for {stack_name}:") 

276 for event in failed[:5]: 

277 resource = event.get("LogicalResourceId", "unknown") 

278 status = event.get("ResourceStatus", "unknown") 

279 reason = event.get("ResourceStatusReason", "no reason given") 

280 print(f" {resource}: {status}") 

281 print(f" {reason}") 

282 

283 # Check stack status for actionable advice 

284 try: 

285 stack_resp = cfn.describe_stacks(StackName=stack_name) 

286 status = stack_resp["Stacks"][0]["StackStatus"] 

287 

288 advice = { 

289 "REVIEW_IN_PROGRESS": ( 

290 "Stack is stuck in REVIEW_IN_PROGRESS. " 

291 "Run: aws cloudformation delete-stack " 

292 f"--stack-name {stack_name} --region {region}" 

293 ), 

294 "ROLLBACK_COMPLETE": ( 

295 "Stack rolled back. Delete it and retry: " 

296 f"aws cloudformation delete-stack " 

297 f"--stack-name {stack_name} --region {region}" 

298 ), 

299 "ROLLBACK_FAILED": ( 

300 "Stack rollback failed. Delete with --retain: " 

301 f"aws cloudformation delete-stack " 

302 f"--stack-name {stack_name} --region {region}" 

303 ), 

304 "UPDATE_ROLLBACK_COMPLETE": ( 

305 "Update rolled back but stack is stable. " 

306 "Check the events above and retry the deploy." 

307 ), 

308 } 

309 

310 if status in advice: 310 ↛ exitline 310 didn't return from function '_diagnose_deploy_failure' because the condition on line 310 was always true

311 print(f"\n Suggested fix: {advice[status]}") 

312 

313 except Exception as e: 

314 logger.debug("Failed to parse stack events: %s", e) 

315 

316 except Exception as e: 

317 logger.debug("Failed to diagnose deploy failure for %s: %s", stack_name, e) 

318 # Best effort — don't fail the deploy further 

319 

320 def _sync_lambda_sources(self) -> None: 

321 """ 

322 Sync latest handler code and manifests into the build directory. 

323 

324 Called at the start of ``deploy()`` to ensure CDK picks up the 

325 latest source changes. Only copies files — does NOT rebuild pip 

326 deps or rmtree. Runs once per StackManager instance. 

327 """ 

328 if getattr(self, "_lambda_sources_synced", False): 

329 return 

330 self._lambda_sources_synced = True 

331 

332 source_dir = self.project_root / "lambda" / "kubectl-applier-simple" 

333 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build" 

334 

335 if not source_dir.exists() or not build_dir.exists(): 

336 return 

337 

338 # Sync handler.py 

339 source_handler = source_dir / "handler.py" 

340 build_handler = build_dir / "handler.py" 

341 if source_handler.exists(): 341 ↛ 345line 341 didn't jump to line 345 because the condition on line 341 was always true

342 shutil.copy2(source_handler, build_handler) 

343 

344 # Sync manifests directory 

345 source_manifests = source_dir / "manifests" 

346 build_manifests = build_dir / "manifests" 

347 if source_manifests.exists(): 347 ↛ exitline 347 didn't return from function '_sync_lambda_sources' because the condition on line 347 was always true

348 build_manifests.mkdir(parents=True, exist_ok=True) 

349 for manifest_file in source_manifests.glob("*.yaml"): 

350 shutil.copy2(manifest_file, build_manifests / manifest_file.name) 

351 

352 def _rebuild_lambda_packages(self) -> None: 

353 """Full rebuild of Lambda packages for deploy. 

354 

355 Nukes the build directories and recreates them from scratch with 

356 fresh pip deps, handler code, and manifests. Called once at the 

357 start of a deploy to ensure CDK picks up all changes. 

358 

359 Runs once per StackManager instance. 

360 """ 

361 if getattr(self, "_lambda_packages_rebuilt", False): 361 ↛ 362line 361 didn't jump to line 362 because the condition on line 361 was never true

362 return 

363 self._lambda_packages_rebuilt = True 

364 self._build_lambda_packages() 

365 

366 def _build_lambda_packages(self) -> None: 

367 """Build Lambda packages for kubectl-applier and helm-installer. 

368 

369 Creates build directories with fresh copies of handler code, manifests, 

370 charts config, and pip dependencies. This ensures CDK always picks up 

371 the latest content regardless of Docker/asset caching. 

372 """ 

373 self._build_kubectl_lambda() 

374 self._build_helm_installer_lambda() 

375 

376 def _build_kubectl_lambda(self) -> None: 

377 """Build the kubectl-applier-simple Lambda package.""" 

378 source_dir = self.project_root / "lambda" / "kubectl-applier-simple" 

379 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build" 

380 requirements = source_dir / "requirements.txt" 

381 

382 if not source_dir.exists() or not requirements.exists(): 

383 return 

384 

385 print(" Building kubectl-applier-simple Lambda package...") 

386 

387 # Clean stale build directory to avoid broken symlinks from previous 

388 # pip installs (botocore/data/ is a common source of dangling symlinks 

389 # that cause CDK's asset fingerprinting to fail with ENOENT). 

390 if build_dir.exists(): 

391 _safe_rmtree(build_dir) 

392 

393 # Create build directory 

394 build_dir.mkdir(parents=True, exist_ok=True) 

395 

396 # Copy handler and manifests (always overwrite to prevent stale content) 

397 shutil.copy2(source_dir / "handler.py", build_dir / "handler.py") 

398 build_manifests = build_dir / "manifests" 

399 if (source_dir / "manifests").exists(): 399 ↛ 405line 399 didn't jump to line 405 because the condition on line 399 was always true

400 if build_manifests.exists(): 400 ↛ 401line 400 didn't jump to line 401 because the condition on line 400 was never true

401 shutil.rmtree(build_manifests) 

402 shutil.copytree(source_dir / "manifests", build_manifests, dirs_exist_ok=True) 

403 

404 # Install pip dependencies for Lambda runtime 

405 import sys 

406 

407 result = subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - static list: sys.executable + pip args + Path objects, no user input 

408 [ 

409 sys.executable, 

410 "-m", 

411 "pip", 

412 "install", 

413 "-r", 

414 str(requirements), 

415 "-t", 

416 str(build_dir), 

417 "--upgrade", 

418 "--platform", 

419 "manylinux2014_x86_64", 

420 "--only-binary=:all:", 

421 "--quiet", 

422 ], 

423 capture_output=True, 

424 text=True, 

425 ) 

426 if result.returncode != 0: 426 ↛ 427line 426 didn't jump to line 427 because the condition on line 426 was never true

427 print(f" Warning: pip install failed: {result.stderr[:200]}") 

428 else: 

429 print(" Lambda package built successfully") 

430 

431 def _build_helm_installer_lambda(self) -> None: 

432 """Build the helm-installer Lambda Docker context. 

433 

434 Copies all source files into a clean build directory so CDK always 

435 detects changes to charts.yaml, handler.py, Dockerfile, etc. 

436 """ 

437 source_dir = self.project_root / "lambda" / "helm-installer" 

438 build_dir = self.project_root / "lambda" / "helm-installer-build" 

439 

440 if not source_dir.exists(): 

441 return 

442 

443 print(" Building helm-installer Lambda package...") 

444 

445 # Clean and recreate build directory 

446 if build_dir.exists(): 

447 _safe_rmtree(build_dir) 

448 build_dir.mkdir(parents=True, exist_ok=True) 

449 

450 # Copy all source files into the build directory 

451 for item in source_dir.iterdir(): 

452 if item.name == "__pycache__": 

453 continue 

454 if item.is_file(): 454 ↛ 456line 454 didn't jump to line 456 because the condition on line 454 was always true

455 shutil.copy2(item, build_dir / item.name) 

456 elif item.is_dir(): 

457 shutil.copytree(item, build_dir / item.name, dirs_exist_ok=True) 

458 

459 print(" Helm installer package built successfully") 

460 

461 def _get_python_path(self) -> str: 

462 """ 

463 Get PYTHONPATH that includes the current Python's site-packages. 

464 

465 This is critical for pipx installations where CDK runs `python3 app.py` 

466 using the system Python, which doesn't have aws_cdk installed. 

467 By setting PYTHONPATH, we ensure CDK's subprocess can find our modules. 

468 """ 

469 # Get all site-packages directories from the current Python 

470 site_packages = site.getsitepackages() 

471 

472 # Also include user site-packages if available 

473 user_site = site.getusersitepackages() 

474 if user_site and os.path.isdir(user_site): 474 ↛ 475line 474 didn't jump to line 475 because the condition on line 474 was never true

475 site_packages.append(user_site) 

476 

477 # Include the directory containing the current module (for editable installs) 

478 current_module_dir = Path(__file__).parent.parent 

479 if current_module_dir.exists(): 479 ↛ 483line 479 didn't jump to line 483 because the condition on line 479 was always true

480 site_packages.append(str(current_module_dir)) 

481 

482 # Combine with existing PYTHONPATH if any 

483 existing_path = os.environ.get("PYTHONPATH", "") 

484 all_paths = site_packages + ([existing_path] if existing_path else []) 

485 

486 return os.pathsep.join(all_paths) 

487 

488 def _run_cdk( 

489 self, 

490 command: list[str], 

491 capture_output: bool = False, 

492 env: dict[str, str] | None = None, 

493 timeout: float | None = None, 

494 ) -> subprocess.CompletedProcess[str]: 

495 """Run a CDK command. 

496 

497 Args: 

498 command: CDK subcommand argv (e.g. ``["destroy", "gco-us-east-1", "--force"]``). 

499 capture_output: Capture stdout / stderr instead of streaming. 

500 env: Extra env vars merged onto the parent process environment. 

501 timeout: Wall-clock timeout in seconds. ``None`` (default) waits 

502 forever — preserving the old behaviour for ``synth`` / ``list``. 

503 When set, on timeout we send SIGTERM, give the CDK process up 

504 to 30 seconds to exit cleanly, then SIGKILL, and finally 

505 re-raise ``subprocess.TimeoutExpired`` so callers can decide 

506 how to handle a hung subprocess. ``deploy()`` and ``destroy()`` 

507 pass a per-stack budget so a wedged ``cdk destroy`` (e.g. its 

508 post-delete polling loop hanging after CloudFormation has 

509 already finished) can't block the orchestrator forever. 

510 """ 

511 full_env = os.environ.copy() 

512 

513 # Inject PYTHONPATH so CDK's python3 subprocess can find aws_cdk 

514 # This is essential for pipx installations 

515 full_env["PYTHONPATH"] = self._get_python_path() 

516 

517 if env: 

518 full_env.update(env) 

519 

520 cdk_cmd = self._cdk_path.split() + command 

521 

522 try: 

523 if capture_output: 

524 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection 

525 cdk_cmd, 

526 cwd=self.project_root, 

527 capture_output=True, 

528 text=True, 

529 env=full_env, 

530 timeout=timeout, 

531 ) 

532 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection 

533 cdk_cmd, 

534 cwd=self.project_root, 

535 env=full_env, 

536 text=True, 

537 timeout=timeout, 

538 ) 

539 except subprocess.TimeoutExpired: 

540 # subprocess.run already sent SIGKILL and reaped the child by 

541 # the time TimeoutExpired propagates. Surface the timeout so 

542 # callers (deploy / destroy) can verify post-state via CFN. 

543 logger.warning( 

544 "cdk command timed out after %ss: %s", 

545 timeout, 

546 " ".join(cdk_cmd), 

547 ) 

548 raise 

549 

550 def list_stacks(self) -> list[str]: 

551 """List all available CDK stacks.""" 

552 result = self._run_cdk(["list"], capture_output=True) 

553 if result.returncode != 0: 

554 raise RuntimeError(f"Failed to list stacks: {result.stderr}") 

555 return [s.strip() for s in result.stdout.strip().split("\n") if s.strip()] 

556 

557 def synth(self, stack_name: str | None = None, quiet: bool = True) -> str: 

558 """Synthesize CloudFormation templates.""" 

559 cmd = ["synth"] 

560 if stack_name: 560 ↛ 562line 560 didn't jump to line 562 because the condition on line 560 was always true

561 cmd.append(stack_name) 

562 if quiet: 562 ↛ 565line 562 didn't jump to line 565 because the condition on line 562 was always true

563 cmd.append("--quiet") 

564 

565 result = self._run_cdk(cmd, capture_output=True) 

566 if result.returncode != 0: 

567 raise RuntimeError(f"CDK synth failed: {result.stderr}") 

568 return str(result.stdout) 

569 

570 def diff(self, stack_name: str | None = None) -> str: 

571 """Show diff between deployed and local stacks.""" 

572 cmd = ["diff", "--no-color"] 

573 if stack_name: 573 ↛ 576line 573 didn't jump to line 576 because the condition on line 573 was always true

574 cmd.append(stack_name) 

575 

576 result = self._run_cdk(cmd, capture_output=True) 

577 # diff returns non-zero if there are differences, which is expected 

578 return str(result.stdout or result.stderr) 

579 

    def deploy(
        self,
        stack_name: str | None = None,
        require_approval: bool = True,
        all_stacks: bool = False,
        outputs_file: str | None = None,
        parameters: dict[str, str] | None = None,
        tags: dict[str, str] | None = None,
        progress: str = "events",
        output_dir: str | None = None,
        exclusively: bool = False,
    ) -> bool:
        """Deploy CDK stacks.

        Runs the pre-flight steps (Lambda package build/sync, stuck-stack
        cleanup, container runtime check, region bootstrap), assembles the
        ``cdk deploy`` command line, runs it under a wall-clock timeout, and
        finally reconciles the result against CloudFormation.

        Args:
            stack_name: Name of the stack to deploy
            require_approval: Whether to require approval for changes
            all_stacks: Deploy all stacks
            outputs_file: File to write outputs to
            parameters: CDK parameters
            tags: Tags to apply to stacks
            progress: Progress display type
            output_dir: Custom CDK output directory (for parallel deployments)
            exclusively: Pass ``--exclusively`` to CDK so only the named
                stack is evaluated, not its transitive dependencies. Used by
                ``deploy_orchestrated`` once earlier phases have already
                deployed the globals — re-synthesizing them every phase
                forces custom resources (notably KubectlApplyManifests)
                to re-run each time, adding minutes per phase for no
                actual change.

        Returns:
            ``True`` when ``cdk deploy`` exited with status 0, or when it
            failed / timed out but the named stack is in ``CREATE_COMPLETE``
            or ``UPDATE_COMPLETE`` in CloudFormation; ``False`` otherwise.

        Raises:
            RuntimeError: If no container runtime is detected, or the target
                region could not be bootstrapped.
            ValueError: If ``GCO_CDK_DEPLOY_TIMEOUT_SECONDS`` is set to a
                value that ``float()`` cannot parse.
        """
        # Full rebuild of Lambda packages on first deploy call (once per session),
        # then sync latest source on subsequent calls
        self._rebuild_lambda_packages()
        self._sync_lambda_sources()

        # Check for stuck stacks and auto-recover
        if stack_name:
            self._check_and_fix_stuck_stack(stack_name)

        # Ensure container runtime is available for building images
        runtime = _detect_container_runtime()
        if not runtime:
            raise RuntimeError(
                "No container runtime found. Please install Docker, Finch, or Podman.\n"
                " - Docker: https://docs.docker.com/get-docker/\n"
                " - Finch: brew install finch && finch vm init\n"
                " - Podman: https://podman.io/getting-started/installation"
            )

        # Auto-bootstrap the target region if needed
        if stack_name:
            region = self._get_deploy_region(stack_name)
            if region and not self.ensure_bootstrapped(region):
                raise RuntimeError(
                    f"Region {region} could not be bootstrapped. "
                    "Run 'gco stacks bootstrap --region "
                    f"{region}' manually to diagnose."
                )

        cmd = ["deploy"]

        # ``all_stacks`` takes precedence over ``stack_name`` when both are given.
        if all_stacks:
            cmd.append("--all")
        elif stack_name:
            cmd.append(stack_name)

        # --exclusively tells CDK to deploy *only* the named stack, not its
        # transitive dependencies. deploy_orchestrated sets this once the
        # earlier phases (global, api-gateway) are already in place so that
        # the regional and monitoring phases don't re-synthesize and
        # re-evaluate globals on every pass.
        if exclusively and stack_name and not all_stacks:
            cmd.append("--exclusively")

        if not require_approval:
            cmd.extend(["--require-approval", "never"])

        if outputs_file:
            cmd.extend(["--outputs-file", outputs_file])

        # One ``--parameters KEY=VALUE`` pair per entry.
        if parameters:
            for key, value in parameters.items():
                cmd.extend(["--parameters", f"{key}={value}"])

        # One ``--tags KEY=VALUE`` pair per entry.
        if tags:
            for key, value in tags.items():
                cmd.extend(["--tags", f"{key}={value}"])

        cmd.extend(["--progress", progress])

        # Use custom output directory for parallel deployments
        if output_dir:
            cmd.extend(["--output", output_dir])

        # Set CDK_DOCKER env var if not already set (an empty value in the
        # environment counts as "not set")
        env = {"CDK_DOCKER": runtime} if not os.environ.get("CDK_DOCKER") else None

        # Per-stack wall-clock cap so a wedged ``cdk deploy`` (e.g. an
        # IAM eventual-consistency wait that never completes) can't block
        # the orchestrator forever. Default 60 minutes — long enough for
        # a fresh EKS cluster cold start. Override via
        # GCO_CDK_DEPLOY_TIMEOUT_SECONDS.
        timeout_s = float(os.environ.get("GCO_CDK_DEPLOY_TIMEOUT_SECONDS", "3600"))

        try:
            result = self._run_cdk(cmd, env=env, timeout=timeout_s)
            success = result.returncode == 0
        except subprocess.TimeoutExpired:
            print(
                f" cdk deploy timed out after {timeout_s}s for "
                f"{stack_name or 'all stacks'}; verifying CloudFormation state..."
            )
            success = False

        # Reconcile against CloudFormation. If the stack is in a terminal
        # CREATE/UPDATE_COMPLETE state, the deploy actually succeeded
        # despite cdk's exit code or timeout. Only done for a single named
        # stack; ``--all`` deployments keep cdk's verdict.
        if stack_name and not all_stacks and not success:
            cfn_status = self._get_stack_status(stack_name)
            if cfn_status in ("CREATE_COMPLETE", "UPDATE_COMPLETE"):
                print(
                    f" cdk reported a non-zero exit but {stack_name} is in "
                    f"{cfn_status} in CloudFormation — treating as success."
                )
                success = True

        if not success and stack_name:
            self._diagnose_deploy_failure(stack_name)

        # After deploying gco-analytics, automatically redeploy
        # gco-api-gateway to wire in the /studio/* routes (the API gateway
        # imports the Cognito pool ARN and presigned-URL Lambda ARN from
        # the analytics stack).
        # NOTE(review): the trigger is a substring match on "analytics", and
        # the result of this follow-up deploy is discarded — a failed
        # gco-api-gateway update does not change the value returned below.
        # Confirm that is intended.
        if success and stack_name and "analytics" in stack_name and not all_stacks:
            print(" Updating gco-api-gateway with analytics routes...")
            self.deploy(
                stack_name="gco-api-gateway",
                require_approval=require_approval,
                exclusively=True,
            )

        return success

723 

724 def destroy( 

725 self, 

726 stack_name: str | None = None, 

727 all_stacks: bool = False, 

728 force: bool = False, 

729 output_dir: str | None = None, 

730 ) -> bool: 

731 """Destroy CDK stacks. 

732 

733 If the target stack exists in CloudFormation but isn't in the CDK 

734 app (e.g. because a toggle was disabled), temporarily enables the 

735 toggle so CDK can synthesize and destroy the stack properly. This 

736 ensures custom resource cleanup handlers (like the analytics 

737 cleanup Lambda) fire during deletion. 

738 

739 Args: 

740 stack_name: Name of the stack to destroy 

741 all_stacks: Destroy all stacks 

742 force: Skip confirmation prompts 

743 output_dir: Custom CDK output directory (for parallel deployments) 

744 """ 

745 # Image-registry pre-destroy guards. Only fires for the global 

746 # stack (where the registry lives) and only when the operator 

747 # has explicitly chosen ``removal_policy: "destroy"``. The 

748 # default ``retain`` posture is a no-op here. See 

749 # ``_image_registry_destroy_preflight`` for the exact rules. 

750 if ( 

751 stack_name == "gco-global" 

752 and not all_stacks 

753 and not self._image_registry_destroy_preflight(force=force) 

754 ): 

755 return False 

756 

757 # If destroying a specific stack that exists in CloudFormation but 

758 # might not be in the CDK app, temporarily enable its toggle. 

759 toggle_restored = False 

760 if ( 

761 stack_name 

762 and not all_stacks 

763 and "analytics" in stack_name 

764 and self._stack_exists_in_cloudformation(stack_name) 

765 ): 

766 toggle_restored = self._ensure_analytics_enabled_for_destroy() 

767 

768 # The analytics stack exports values (e.g. Cognito pool ARN) that 

769 # gco-api-gateway imports. CloudFormation blocks deletion of stacks 

770 # with consumed exports. To break the dependency, redeploy the API 

771 # gateway with analytics disabled first, then destroy analytics. 

772 if stack_name and not all_stacks and "analytics" in stack_name: 

773 safe_to_destroy = self._remove_api_gateway_analytics_dependency() 

774 if not safe_to_destroy: 

775 # Restore analytics toggle before bailing out. 

776 if toggle_restored: 776 ↛ 777line 776 didn't jump to line 777 because the condition on line 776 was never true

777 self._restore_analytics_disabled() 

778 print( 

779 " Aborting gco-analytics destroy: gco-api-gateway still " 

780 "imports analytics exports. Fix the API gateway and retry." 

781 ) 

782 return False 

783 

784 cmd = ["destroy"] 

785 

786 if all_stacks: 

787 cmd.append("--all") 

788 elif stack_name: 788 ↛ 796line 788 didn't jump to line 796 because the condition on line 788 was always true

789 cmd.append(stack_name) 

790 # --exclusively prevents CDK from cascading the destroy to 

791 # dependent stacks (e.g. destroying gco-analytics should not 

792 # also destroy gco-api-gateway just because it references the 

793 # presigned-URL Lambda ARN). 

794 cmd.append("--exclusively") 

795 

796 if force: 

797 cmd.append("--force") 

798 

799 if output_dir: 

800 cmd.extend(["--output", output_dir]) 

801 

802 # Per-stack wall-clock cap so a wedged ``cdk destroy`` (its 

803 # post-delete polling loop hanging after CloudFormation has 

804 # already finished) can't block the orchestrator forever. Default 

805 # 45 minutes — enough for an EKS regional teardown which can take 

806 # 25-30 minutes when SG / ENI cleanup serialises. Override via 

807 # GCO_CDK_DESTROY_TIMEOUT_SECONDS. 

808 timeout_s = float(os.environ.get("GCO_CDK_DESTROY_TIMEOUT_SECONDS", "2700")) 

809 

810 try: 

811 result = self._run_cdk(cmd, timeout=timeout_s) 

812 cdk_succeeded = result.returncode == 0 

813 except subprocess.TimeoutExpired: 

814 # CDK hung. Verify the AWS-side state below — if the stack 

815 # is gone in CloudFormation, the destroy actually succeeded 

816 # and the timeout was just CDK's polling loop wedged. 

817 print( 

818 f" cdk destroy timed out after {timeout_s}s; verifying " 

819 f"CloudFormation state for {stack_name}..." 

820 ) 

821 cdk_succeeded = False 

822 

823 # Restore the toggle if we changed it 

824 if toggle_restored: 

825 self._restore_analytics_disabled() 

826 

827 # Reconcile against CloudFormation. Three outcomes: 

828 # 1. cdk succeeded AND stack is gone → success 

829 # 2. stack is gone (regardless of cdk return code or timeout) → success 

830 # 3. stack still exists → fall back to direct CFN delete or surface failure 

831 if stack_name and not all_stacks: 

832 still_present = self._stack_exists_in_cloudformation(stack_name) 

833 if not still_present: 

834 # Whatever cdk did or didn't do, AWS confirms the stack 

835 # is gone. Treat as success. 

836 if not cdk_succeeded: 

837 print( 

838 f" cdk reported a non-zero exit but {stack_name} is " 

839 f"already deleted in CloudFormation — treating as success." 

840 ) 

841 return True 

842 # Stack still there. If cdk succeeded but CFN still has it, 

843 # fall back to a direct CFN delete (existing behaviour). 

844 if cdk_succeeded: 

845 return self._cloudformation_delete_stack(stack_name) 

846 # cdk failed AND stack is still present → real failure. 

847 return False 

848 

849 return cdk_succeeded 

850 

851 # ------------------------------------------------------------------ 

852 # Image registry pre-destroy guards 

853 # ------------------------------------------------------------------ 

def _read_images_config(self) -> dict[str, Any]:
    """Read the ``images`` block from cdk.json with defaults applied.

    Mirrors the parser in ``gco/stacks/global_stack.py`` so the CLI
    can reason about the same fields without importing the CDK
    module (which pulls aws_cdk and the full constructs surface).

    The CLI must never block on a malformed file, so every failure
    mode degrades to the safe default posture
    (``removal_policy="retain"``, ``empty_on_delete=False``):

    - cdk.json cannot be located, opened, decoded or parsed;
    - the JSON root, ``context`` or ``images`` is not an object;
    - ``removal_policy`` is anything other than ``retain``/``destroy``.

    Deploy-time validation remains the global stack's responsibility.

    Returns:
        Dict with exactly two keys: ``removal_policy`` (``"retain"`` or
        ``"destroy"``) and ``empty_on_delete`` (bool).
    """
    import json

    def _defaults() -> dict[str, Any]:
        # Fresh dict per call so callers can never mutate a shared default.
        return {"removal_policy": "retain", "empty_on_delete": False}

    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        return _defaults()

    try:
        with open(cdk_json_path, encoding="utf-8") as f:
            document = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for a file that is not valid UTF-8.
        logger.debug("Failed to read cdk.json for images config: %s", exc)
        return _defaults()

    # Structural typos (e.g. ``"images": "destroy"``) previously raised
    # AttributeError on ``.get``; treat them like any other invalid value.
    ctx = document.get("context") if isinstance(document, dict) else None
    raw = ctx.get("images") if isinstance(ctx, dict) else None
    if not isinstance(raw, dict):
        return _defaults()

    removal_policy = str(raw.get("removal_policy", "retain")).strip().lower()
    if removal_policy not in ("retain", "destroy"):
        removal_policy = "retain"
    return {
        "removal_policy": removal_policy,
        "empty_on_delete": bool(raw.get("empty_on_delete", False)),
    }

889 

def _build_image_registry_inventory(self) -> dict[str, Any]:
    """Collect headline numbers about the image registry for display.

    The result always contains ``repo_count``, ``tag_count``,
    ``total_bytes``, ``endpoint_refs`` and ``job_refs``. Collection is
    best effort: if ``cli.images.ImageManager`` cannot be imported, or
    any call on it raises, the failure is logged at debug level and the
    counters gathered so far (zero by default) are returned instead of
    propagating the error.
    """
    summary: dict[str, Any] = dict.fromkeys(
        ("repo_count", "tag_count", "total_bytes", "endpoint_refs", "job_refs"), 0
    )

    try:
        from cli.images import ImageManager
    except Exception as exc:  # noqa: BLE001
        logger.debug("ImageManager import failed during preflight: %s", exc)
        return summary

    try:
        manager = ImageManager(config=self.config)
        repos = manager.list_repos()
        # Counts every repo returned, while tags are only walked for
        # names under the ``gco/`` prefix below.
        summary["repo_count"] = len(repos)

        for repo in repos:
            full_name = repo.get("name", "")
            if full_name.startswith("gco/"):
                try:
                    tag_rows = manager.list_tags(full_name.removeprefix("gco/"))
                except Exception as exc:  # noqa: BLE001
                    logger.debug("list_tags failed for %s: %s", full_name, exc)
                    continue
                summary["tag_count"] += len(tag_rows)
                for tag_row in tag_rows:
                    reported_size = tag_row.get("size_bytes")
                    # Rows without an integer size contribute nothing.
                    if isinstance(reported_size, int):
                        summary["total_bytes"] += reported_size

        # Each reference count is collected independently so that one
        # failing collector does not prevent the other from running.
        try:
            endpoint_refs = manager._collect_inference_image_refs()
            summary["endpoint_refs"] = len(endpoint_refs)
        except Exception as exc:  # noqa: BLE001
            logger.debug("inference ref collection failed: %s", exc)

        try:
            job_refs = manager._collect_recent_job_image_refs()
            summary["job_refs"] = len(job_refs)
        except Exception as exc:  # noqa: BLE001
            logger.debug("job ref collection failed: %s", exc)
    except Exception as exc:  # noqa: BLE001
        logger.debug("Image registry inventory failed: %s", exc)

    return summary

940 

941 def _image_registry_destroy_preflight(self, *, force: bool) -> bool: 

942 """Validate the image-registry destroy posture before invoking CFN. 

943 

944 Two rules: 

945 

946 1. ``removal_policy: "destroy"`` AND ``empty_on_delete: false`` 

947 → refuse with the literal helpful-error message pointing 

948 the operator at ``gco images cleanup --all`` or at flipping 

949 ``empty_on_delete: true``. 

950 

951 2. ``removal_policy: "destroy"`` AND ``empty_on_delete: true`` 

952 → print the inventory summary first. On a TTY the operator 

953 is also prompted for confirmation; non-TTY runs proceed 

954 (the operator presumably passed ``-y`` or is automating). 

955 

956 Returns True when the destroy may proceed, False when it has 

957 been refused or declined. 

958 """ 

959 cfg = self._read_images_config() 

960 if cfg["removal_policy"] != "destroy": 

961 return True 

962 

963 if not cfg["empty_on_delete"]: 

964 print( 

965 "Repos under gco/* are not empty and empty_on_delete is " 

966 "false. Run 'gco images cleanup --all' first, or set " 

967 "images.empty_on_delete: true in cdk.json." 

968 ) 

969 return False 

970 

971 inventory = self._build_image_registry_inventory() 

972 gib = inventory["total_bytes"] / (1024**3) if inventory["total_bytes"] else 0.0 

973 print("Image registry inventory before destroy:") 

974 print(f" repos: {inventory['repo_count']}") 

975 print(f" tags: {inventory['tag_count']}") 

976 print(f" total size: {gib:.2f} GiB") 

977 print(f" referencing endpoints: {inventory['endpoint_refs']}") 

978 print(f" recent job refs: {inventory['job_refs']}") 

979 

980 # Already confirmed via -y, or non-interactive — proceed. 

981 if force or not sys.stdin.isatty(): 

982 return True 

983 

984 try: 

985 response = input("Destroy gco-global and delete every gco/* repo? [y/N]: ") 

986 except EOFError, KeyboardInterrupt: 

987 print("Aborted.") 

988 return False 

989 if response.strip().lower() not in ("y", "yes"): 

990 print("Aborted.") 

991 return False 

992 return True 

993 

def _stack_exists_in_cloudformation(self, stack_name: str) -> bool:
    """Report whether ``stack_name`` is live in CloudFormation.

    The stack is looked up in the region returned by
    ``_get_destroy_region``. The answer is False when the reported
    status contains ``DELETE`` (any ``DELETE_*`` status), and also
    when the lookup itself raises for any reason — callers cannot
    distinguish "absent" from "could not check".
    """
    import boto3

    target_region = self._get_destroy_region(stack_name)
    client = boto3.client("cloudformation", region_name=target_region)
    try:
        described = client.describe_stacks(StackName=stack_name)
        return "DELETE" not in described["Stacks"][0]["StackStatus"]
    except Exception:
        return False

1006 

def _get_stack_status(self, stack_name: str) -> str | None:
    """Fetch the current CloudFormation status string for ``stack_name``.

    Intended for ``deploy()`` to reconcile against AWS-side state when
    ``cdk deploy`` exits non-zero or times out: if CloudFormation shows
    the stack reached CREATE_COMPLETE or UPDATE_COMPLETE, the deploy
    succeeded regardless of what cdk reported.

    Returns:
        The ``StackStatus`` value as a ``str``, or None when the lookup
        raises for any reason (stack missing, network blip, permissions,
        ...). None lets callers treat the unknown case as "cdk's
        verdict stands".
    """
    import boto3

    target_region = self._get_destroy_region(stack_name)
    client = boto3.client("cloudformation", region_name=target_region)
    try:
        first_stack = client.describe_stacks(StackName=stack_name)["Stacks"][0]
        return str(first_stack["StackStatus"])
    except Exception:
        return None

1027 

def _cloudformation_delete_stack(self, stack_name: str) -> bool:
    """Delete ``stack_name`` through the CloudFormation API and wait.

    Issues ``delete_stack`` in the region from ``_get_destroy_region``
    and then blocks on the ``stack_delete_complete`` waiter, polling
    every 15 seconds for at most 120 attempts.

    Returns:
        True once the waiter finishes, False if the delete call or the
        waiter raises anything.
    """
    import boto3

    client = boto3.client(
        "cloudformation",
        region_name=self._get_destroy_region(stack_name),
    )
    polling = {"Delay": 15, "MaxAttempts": 120}
    try:
        client.delete_stack(StackName=stack_name)
        client.get_waiter("stack_delete_complete").wait(
            StackName=stack_name,
            WaiterConfig=polling,
        )
    except Exception:
        return False
    return True

1044 

1045 def _get_destroy_region(self, stack_name: str) -> str: 

1046 """Determine the region for a stack based on its name.""" 

1047 try: 

1048 region = self._get_deploy_region(stack_name) 

1049 return region or self.config.api_gateway_region 

1050 except Exception: 

1051 return self.config.api_gateway_region 

1052 

def _ensure_analytics_enabled_for_destroy(self) -> bool:
    """Switch the analytics toggle on if it is currently off.

    Needed so CDK includes the analytics stack when destroying it.

    Returns:
        True only when this call flipped the toggle from disabled to
        enabled (the caller should flip it back afterwards). False when
        analytics was already enabled or when reading/updating the
        config raised; the failure is logged at debug level.
    """
    try:
        if get_analytics_config().get("enabled"):
            return False
        update_analytics_config({"enabled": True})
    except Exception as exc:
        logger.debug(
            "Failed to enable analytics toggle for destroy: %s",
            exc,
            exc_info=True,
        )
        return False
    return True

1067 

def _restore_analytics_disabled(self) -> None:
    """Set the analytics toggle back to disabled after a destroy.

    Never raises: a failed update is logged as a warning (with
    traceback) and otherwise ignored.
    """
    disabled = {"enabled": False}
    try:
        update_analytics_config(disabled)
    except Exception as exc:
        logger.warning(
            "Failed to restore analytics toggle to disabled after destroy: %s",
            exc,
            exc_info=True,
        )

1078 

def _remove_api_gateway_analytics_dependency(self) -> bool:
    """Redeploy gco-api-gateway with analytics disabled to drop cross-stack imports.

    The analytics stack exports values (Cognito pool ARN, presigned-URL
    Lambda ARN) that gco-api-gateway imports for the /studio/* routes.
    CloudFormation blocks deletion of stacks with consumed exports. By
    disabling analytics and redeploying the API gateway, the /studio/*
    routes are removed and the imports are dropped, unblocking the
    analytics stack deletion.

    The analytics toggle is only changed when it was enabled on entry,
    and in that case it is always switched back on afterwards — also
    when the redeploy raises — so the on-disk config is not left
    disabled by a crashed redeploy.

    Returns:
        True if the analytics stack is safe to destroy (either because
        no consumer remains or because the redeploy successfully
        dropped the imports). False if a consumer of the analytics
        exports still exists and the analytics destroy will fail.
    """
    # Fast path: if gco-api-gateway doesn't exist (or has already been
    # deleted/rolled-back into a non-consuming state), there's nothing
    # importing the analytics exports. Skip the redeploy entirely.
    if not self._stack_exists_in_cloudformation("gco-api-gateway"):
        logger.info(
            "gco-api-gateway does not exist in CloudFormation; "
            "skipping redeploy before analytics destroy."
        )
        return True

    # Second fast path: if the deployed api-gateway isn't actually
    # importing anything from the analytics stack, we don't need to
    # touch it. This happens when analytics was never fully wired up.
    if not self._api_gateway_imports_from_analytics():
        logger.info(
            "gco-api-gateway does not import any gco-analytics exports; "
            "skipping redeploy before analytics destroy."
        )
        return True

    try:
        import tempfile

        # Temporarily disable analytics so CDK drops the /studio/* routes.
        current = get_analytics_config()
        was_enabled = current.get("enabled", False)
        if was_enabled:
            update_analytics_config({"enabled": False})

        try:
            print(" Updating gco-api-gateway to remove analytics routes...")
            with tempfile.TemporaryDirectory() as tmp_out:
                success = self.deploy(
                    stack_name="gco-api-gateway",
                    require_approval=False,
                    exclusively=True,
                    output_dir=tmp_out,
                )
        finally:
            # Re-enable analytics so CDK can synthesize the analytics stack
            # for the destroy operation (custom resources need to fire).
            # In ``finally`` so a raising redeploy cannot leave the
            # toggle stuck on disabled.
            if was_enabled:
                update_analytics_config({"enabled": True})

        if not success:
            # The redeploy failed. That's only a real problem if the
            # api-gateway still imports analytics exports. Recheck:
            # the auto-cleanup of ROLLBACK_COMPLETE stacks may have
            # deleted the consumer entirely, in which case the destroy
            # can still proceed.
            if not self._api_gateway_imports_from_analytics():
                logger.info(
                    "gco-api-gateway redeploy failed, but the stack no "
                    "longer imports analytics exports (likely deleted "
                    "during cleanup). Analytics destroy can proceed."
                )
                return True
            logger.error(
                "Failed to redeploy gco-api-gateway to drop analytics "
                "imports, and the stack still consumes analytics exports. "
                "Destroying gco-analytics will fail with "
                "'Export ... cannot be deleted as it is in use'. Fix "
                "gco-api-gateway first (see events above) and retry."
            )
            return False

        return True
    except Exception as exc:
        logger.warning(
            "Failed to remove API gateway analytics dependency: %s",
            exc,
            exc_info=True,
        )
        # On unexpected exceptions, recheck whether imports remain.
        # Be permissive only if we can confirm the destroy is safe.
        try:
            return not self._api_gateway_imports_from_analytics()
        except Exception:
            return False

1173 

def _api_gateway_imports_from_analytics(self) -> bool:
    """Tell whether gco-api-gateway consumes any gco-analytics export.

    Works from deployed state rather than the CDK app: every export
    whose ``ExportingStackId`` contains ``:stack/gco-analytics/`` is
    collected via ``list_exports``, then ``list_imports`` is queried
    per export looking for ``gco-api-gateway`` among the importers.

    Returns:
        True if an import is found, or if the overall check raises
        (erring towards attempting the redeploy). False when no region
        can be resolved for gco-analytics, when it owns no exports, or
        when no export lists gco-api-gateway as an importer.
    """
    import boto3

    analytics_region = self._get_deploy_region("gco-analytics")
    if not analytics_region:
        return False

    try:
        client = boto3.client("cloudformation", region_name=analytics_region)

        # ExportingStackId is a full ARN, so ownership is matched on the
        # stack-name segment of it.
        owned_exports: list[str] = [
            entry["Name"]
            for page in client.get_paginator("list_exports").paginate()
            for entry in page.get("Exports", [])
            if ":stack/gco-analytics/" in entry.get("ExportingStackId", "")
        ]
        if not owned_exports:
            return False

        imports_pager = client.get_paginator("list_imports")
        for export_name in owned_exports:
            try:
                for page in imports_pager.paginate(ExportName=export_name):
                    consumers = page.get("Imports", [])
                    if any(consumer == "gco-api-gateway" for consumer in consumers):
                        return True
            except Exception as exc:
                # ``list_imports`` raises when an export has zero
                # consumers — treat that as "not imported" and move on.
                logger.debug(
                    "list_imports(%s) failed (likely no consumers): %s",
                    export_name,
                    exc,
                )
        return False
    except Exception as exc:
        logger.debug(
            "Failed to check analytics imports for gco-api-gateway: %s",
            exc,
            exc_info=True,
        )
        # When the check itself breaks, answer "yes" so the caller does
        # not skip a cleanup that may be necessary.
        return True

1231 

1232 def bootstrap( 

1233 self, 

1234 account: str | None = None, 

1235 region: str | None = None, 

1236 ) -> bool: 

1237 """Bootstrap CDK in an AWS account/region.""" 

1238 cmd = ["bootstrap"] 

1239 

1240 if account and region: 

1241 cmd.append(f"aws://{account}/{region}") 

1242 elif region: 1242 ↛ 1245line 1242 didn't jump to line 1245 because the condition on line 1242 was always true

1243 cmd.append(f"aws://unknown-account/{region}") 

1244 

1245 result = self._run_cdk(cmd) 

1246 return result.returncode == 0 

1247 

def is_bootstrapped(self, region: str) -> bool:
    """Tell whether ``cdk bootstrap`` has been run in ``region``.

    Looks up the ``CDKToolkit`` CloudFormation stack; any status that
    does not contain ``DELETE`` counts as bootstrapped. A missing stack
    list, a ``ClientError`` or any other exception counts as not
    bootstrapped. The verdict (either way) is stored in
    ``self._bootstrap_cache`` and reused for later calls on the same
    instance.
    """
    try:
        cache = self._bootstrap_cache
    except AttributeError:
        # Cache is created lazily on first use.
        self._bootstrap_cache: dict[str, bool] = {}
        cache = self._bootstrap_cache

    if region in cache:
        return cache[region]

    import boto3

    client = boto3.client("cloudformation", region_name=region)
    verdict = False
    try:
        found = client.describe_stacks(StackName="CDKToolkit").get("Stacks", [])
        if found:
            # Any non-deleted state counts as bootstrapped
            verdict = "DELETE" not in found[0].get("StackStatus", "")
    except ClientError:
        pass  # Stack doesn't exist — not bootstrapped
    except Exception as e:
        logger.debug("Failed to check CDK bootstrap in %s: %s", region, e)

    cache[region] = verdict
    return verdict

1280 

def ensure_bootstrapped(self, region: str) -> bool:
    """Bootstrap ``region`` on demand.

    Returns True straight away when ``is_bootstrapped`` already says
    so. Otherwise runs ``bootstrap(region=region)``, prints the outcome
    and returns its result; a successful bootstrap is also recorded in
    ``self._bootstrap_cache`` so the region is not checked again.
    """
    if self.is_bootstrapped(region):
        return True

    print(f"ℹ Region {region} is not CDK-bootstrapped. Bootstrapping now...")
    outcome = self.bootstrap(region=region)
    if not outcome:
        print(f"✗ Failed to bootstrap CDK in {region}")
        return outcome

    # Remember the success; create the cache first if nothing has yet.
    if not hasattr(self, "_bootstrap_cache"):
        self._bootstrap_cache = {}
    self._bootstrap_cache[region] = True
    print(f"✓ CDK bootstrapped in {region}")
    return outcome

1300 

def _get_deploy_region(self, stack_name: str) -> str | None:
    """Map a stack name to the AWS region it is deployed in.

    The four fixed stacks take their region from the mapping returned
    by ``_load_cdk_json()``, falling back to the matching attribute on
    ``self.config``. Any other name starting with ``gco-`` is treated
    as a regional stack whose suffix is the region. Returns None for
    names that match neither rule.
    """
    from .config import _load_cdk_json

    cdk_regions = _load_cdk_json()

    # stack name -> (cdk.json region key, fallback attribute on self.config)
    # gco-analytics shares the API gateway region so the presigned-URL
    # Lambda can hook into the existing /studio/* routes on the same
    # API Gateway.
    fixed_stacks = {
        "gco-global": ("global", "global_region"),
        "gco-api-gateway": ("api_gateway", "api_gateway_region"),
        "gco-monitoring": ("monitoring", "monitoring_region"),
        "gco-analytics": ("api_gateway", "api_gateway_region"),
    }
    if stack_name in fixed_stacks:
        region_key, fallback_attr = fixed_stacks[stack_name]
        region: str | None = cdk_regions.get(region_key) or getattr(self.config, fallback_attr)
        return region

    # Regional stacks: gco-{region}
    if stack_name.startswith("gco-"):
        return stack_name.removeprefix("gco-")

    return None

1330 

def get_outputs(self, stack_name: str, region: str) -> dict[str, str]:
    """Return the CloudFormation outputs of ``stack_name`` in ``region``.

    Keys and values are coerced to ``str``. An empty dict is returned
    when the stack list is empty or when the lookup raises; the error
    is logged at debug level.
    """
    import boto3

    client = boto3.client("cloudformation", region_name=region)
    try:
        found = client.describe_stacks(StackName=stack_name)["Stacks"]
        if not found:
            return {}
        return {
            str(entry["OutputKey"]): str(entry["OutputValue"])
            for entry in found[0].get("Outputs", [])
        }
    except Exception as e:
        logger.debug("Failed to get outputs for %s in %s: %s", stack_name, region, e)
        return {}

1347 

def get_stack_status(self, stack_name: str, region: str) -> StackInfo | None:
    """Describe ``stack_name`` in ``region`` as a ``StackInfo``.

    The result carries the stack's name, status, creation and
    last-update times, outputs and tags as reported by
    ``describe_stacks``. Returns None when the stack list is empty or
    the lookup raises; the error is logged at debug level.
    """
    import boto3

    client = boto3.client("cloudformation", region_name=region)
    try:
        found = client.describe_stacks(StackName=stack_name)["Stacks"]
        if not found:
            return None
        described = found[0]
        output_map = {o["OutputKey"]: o["OutputValue"] for o in described.get("Outputs", [])}
        tag_map = {t["Key"]: t["Value"] for t in described.get("Tags", [])}
        return StackInfo(
            name=described["StackName"],
            status=described["StackStatus"],
            region=region,
            created_time=described.get("CreationTime"),
            updated_time=described.get("LastUpdatedTime"),
            outputs=output_map,
            tags=tag_map,
        )
    except Exception as e:
        logger.debug("Failed to get stack status for %s in %s: %s", stack_name, region, e)
        return None

1369 

def deploy_orchestrated(
    self,
    require_approval: bool = True,
    outputs_file: str | None = None,
    parameters: dict[str, str] | None = None,
    tags: dict[str, str] | None = None,
    progress: str = "events",
    on_stack_start: Callable[[str], None] | None = None,
    on_stack_complete: Callable[[str, bool], None] | None = None,
    parallel: bool = False,
    max_workers: int = 4,
) -> tuple[bool, list[str], list[str]]:
    """
    Deploy every stack in dependency order, stopping at the first failure.

    Three phases run back to back:

    1. ``gco-global`` and ``gco-api-gateway``, one after the other.
    2. All remaining stacks except ``gco-monitoring`` (the regional
       stacks) — through ``_deploy_stacks_parallel`` when ``parallel``
       is set and there is more than one, otherwise sequentially.
    3. ``gco-monitoring``, which depends on the regional stacks.

    Phases 2 and 3 pass ``exclusively=True``: everything upstream is
    already deployed, so CDK does not need to re-evaluate it. Skipping
    that re-evaluation avoids re-running custom resources (notably
    KubectlApplyManifests) on the global stacks for no actual change.

    Args:
        require_approval: Whether to require approval for changes
        outputs_file: File to write outputs to
        parameters: CDK parameters
        tags: Tags to apply to stacks
        progress: Progress display type
        on_stack_start: Callback(stack_name) called when starting a stack
        on_stack_complete: Callback(stack_name, success) called when stack completes
        parallel: Deploy regional stacks in parallel
        max_workers: Maximum number of parallel deployments (default: 4)

    Returns:
        Tuple of (overall_success, successful_stacks, failed_stacks)
    """
    ordered_stacks = get_stack_deployment_order(self.list_stacks())

    pre_regional = {"gco-global", "gco-api-gateway"}
    post_regional = {"gco-monitoring"}

    first_wave = [name for name in ordered_stacks if name in pre_regional]
    regional_wave = [
        name for name in ordered_stacks if name not in pre_regional and name not in post_regional
    ]
    last_wave = [name for name in ordered_stacks if name in post_regional]

    successful: list[str] = []
    failed: list[str] = []

    def deploy_one(name: str, **extra: bool) -> bool:
        # One sequential deployment: notify, deploy, record, notify.
        if on_stack_start:
            on_stack_start(name)
        outcome = self.deploy(
            stack_name=name,
            require_approval=require_approval,
            outputs_file=outputs_file,
            parameters=parameters,
            tags=tags,
            progress=progress,
            **extra,
        )
        (successful if outcome else failed).append(name)
        if on_stack_complete:
            on_stack_complete(name, outcome)
        return outcome

    # Phase 1: global stacks, sequential. ``exclusively`` is left at the
    # ``deploy`` default here. Stop on failure to prevent cascading issues.
    for name in first_wave:
        if not deploy_one(name):
            return False, successful, failed

    # Phase 2: regional stacks, parallel only when asked for and useful.
    if parallel and len(regional_wave) > 1:
        regional_ok, regional_failed = self._deploy_stacks_parallel(
            stacks=regional_wave,
            require_approval=require_approval,
            outputs_file=outputs_file,
            parameters=parameters,
            tags=tags,
            progress=progress,
            on_stack_start=on_stack_start,
            on_stack_complete=on_stack_complete,
            max_workers=max_workers,
        )
        successful.extend(regional_ok)
        failed.extend(regional_failed)
        if regional_failed:
            return False, successful, failed
    else:
        for name in regional_wave:
            if not deploy_one(name, exclusively=True):
                return False, successful, failed

    # Phase 3: post-regional stacks (monitoring), sequential.
    for name in last_wave:
        if not deploy_one(name, exclusively=True):
            return False, successful, failed

    return not failed, successful, failed

1532 

1533 def _deploy_stacks_parallel( 

1534 self, 

1535 stacks: list[str], 

1536 require_approval: bool, 

1537 outputs_file: str | None, 

1538 parameters: dict[str, str] | None, 

1539 tags: dict[str, str] | None, 

1540 progress: str, 

1541 on_stack_start: Callable[[str], None] | None, 

1542 on_stack_complete: Callable[[str, bool], None] | None, 

1543 max_workers: int, 

1544 ) -> tuple[list[str], list[str]]: 

1545 """Deploy multiple stacks in parallel using separate CDK output directories.""" 

1546 import tempfile 

1547 

1548 successful: list[str] = [] 

1549 failed: list[str] = [] 

1550 lock = Lock() 

1551 

1552 def deploy_single(stack_name: str) -> tuple[str, bool]: 

1553 # Use a unique output directory in /tmp for each parallel deployment 

1554 # This avoids CDK copying cdk.out.* directories into assets 

1555 output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-") 

1556 

1557 if on_stack_start: 1557 ↛ 1558line 1557 didn't jump to line 1558 because the condition on line 1557 was never true

1558 with lock: 

1559 on_stack_start(stack_name) 

1560 

1561 success = self.deploy( 

1562 stack_name=stack_name, 

1563 require_approval=require_approval, 

1564 outputs_file=outputs_file, 

1565 parameters=parameters, 

1566 tags=tags, 

1567 progress=progress, 

1568 output_dir=output_dir, 

1569 exclusively=True, 

1570 ) 

1571 

1572 # Clean up the temporary output directory 

1573 try: 

1574 import shutil 

1575 

1576 if os.path.exists(output_dir): 1576 ↛ 1581line 1576 didn't jump to line 1581 because the condition on line 1576 was always true

1577 shutil.rmtree(output_dir) 

1578 except Exception as e: 

1579 logger.debug("Cleanup of %s failed: %s", output_dir, e) 

1580 

1581 return stack_name, success 

1582 

1583 with ThreadPoolExecutor(max_workers=max_workers) as executor: 

1584 futures = {executor.submit(deploy_single, stack): stack for stack in stacks} 

1585 

1586 for future in as_completed(futures): 

1587 stack_name, success = future.result() 

1588 

1589 with lock: 

1590 if success: 

1591 successful.append(stack_name) 

1592 else: 

1593 failed.append(stack_name) 

1594 

1595 if on_stack_complete: 1595 ↛ 1596line 1595 didn't jump to line 1596 because the condition on line 1595 was never true

1596 on_stack_complete(stack_name, success) 

1597 

1598 return successful, failed 

1599 

1600 def destroy_orchestrated( 

1601 self, 

1602 force: bool = False, 

1603 on_stack_start: Callable[[str], None] | None = None, 

1604 on_stack_complete: Callable[[str, bool], None] | None = None, 

1605 parallel: bool = False, 

1606 max_workers: int = 4, 

1607 ) -> tuple[bool, list[str], list[str]]: 

1608 """ 

1609 Destroy all stacks in the correct order. 

1610 

1611 Destroys monitoring stack first, then regional stacks (optionally in parallel), 

1612 then global stacks. 

1613 

1614 Args: 

1615 force: Skip confirmation prompts 

1616 on_stack_start: Callback(stack_name) called when starting a stack 

1617 on_stack_complete: Callback(stack_name, success) called when stack completes 

1618 parallel: Destroy regional stacks in parallel 

1619 max_workers: Maximum number of parallel destructions (default: 4) 

1620 

1621 Returns: 

1622 Tuple of (overall_success, successful_stacks, failed_stacks) 

1623 """ 

1624 stacks = self.list_stacks() 

1625 

1626 # Image-registry pre-destroy guard — fires before ANY teardown so 

1627 # operators don't end up with monitoring + regional + api-gateway 

1628 # already destroyed when the global-stack ECR delete fails on a 

1629 # non-empty repo. Skipped entirely under the default 

1630 # ``removal_policy: "retain"`` posture. See 

1631 # ``_image_registry_destroy_preflight`` for the exact rules. 

1632 if not self._image_registry_destroy_preflight(force=force): 

1633 return False, [], list(stacks) 

1634 

1635 # Phase 0: Clean up backup vault recovery points so the global stack 

1636 # can be deleted cleanly by CloudFormation. 

1637 self._cleanup_backup_vault() 

1638 

1639 # Separate stacks into three groups (reverse of deploy order): 

1640 pre_regional = {"gco-global", "gco-api-gateway"} 

1641 post_regional = {"gco-monitoring"} 

1642 

1643 post_regional_stacks = [s for s in stacks if s in post_regional] 

1644 regional_stacks = [s for s in stacks if s not in pre_regional and s not in post_regional] 

1645 pre_regional_stacks = [s for s in stacks if s in pre_regional] 

1646 

1647 # Sort regional stacks alphabetically (reversed for destroy) 

1648 regional_stacks.sort(reverse=True) 

1649 # Sort pre-regional stacks in reverse priority order 

1650 pre_regional_order = {"gco-api-gateway": 1, "gco-global": 2} 

1651 pre_regional_stacks.sort(key=lambda x: pre_regional_order.get(x, 0)) 

1652 

1653 successful: list[str] = [] 

1654 failed: list[str] = [] 

1655 

1656 # Phase 1: Destroy post-regional stacks (monitoring) first 

1657 for stack_name in post_regional_stacks: 

1658 if on_stack_start: 1658 ↛ 1659line 1658 didn't jump to line 1659 because the condition on line 1658 was never true

1659 on_stack_start(stack_name) 

1660 

1661 success = self.destroy( 

1662 stack_name=stack_name, 

1663 force=force, 

1664 ) 

1665 

1666 if success: 1666 ↛ 1669line 1666 didn't jump to line 1669 because the condition on line 1666 was always true

1667 successful.append(stack_name) 

1668 else: 

1669 failed.append(stack_name) 

1670 

1671 if on_stack_complete: 1671 ↛ 1672line 1671 didn't jump to line 1672 because the condition on line 1671 was never true

1672 on_stack_complete(stack_name, success) 

1673 

1674 # Phase 2: Destroy regional stacks (parallel or sequential) 

1675 # Each regional destroy has a background watchdog that proactively 

1676 # deletes EKS-managed security groups + orphaned ENIs as soon as 

1677 # they appear. Without this, CloudFormation's VPC delete step sits 

1678 # in DELETE_IN_PROGRESS for ~10 min waiting for EKS to GC the 

1679 # ``eks-cluster-sg-<cluster>-*`` security group and its cluster 

1680 # ENIs. The SG is owned by the EKS service (not CloudFormation), 

1681 # so CFN can't delete it directly — it just polls the VPC's 

1682 # dependencies until they drain. See ``_start_eks_sg_watchdog``. 

1683 watchdog_stops: dict[str, Event] = {} 

1684 watchdog_threads: dict[str, Thread] = {} 

1685 for stack_name in regional_stacks: 

1686 stop_event = Event() 

1687 thread = self._start_eks_sg_watchdog(stack_name, stop_event) 

1688 watchdog_stops[stack_name] = stop_event 

1689 watchdog_threads[stack_name] = thread 

1690 

1691 if regional_stacks: 1691 ↛ 1724line 1691 didn't jump to line 1724 because the condition on line 1691 was always true

1692 if parallel and len(regional_stacks) > 1: 

1693 # Parallel destruction of regional stacks 

1694 successful_regional, failed_regional = self._destroy_stacks_parallel( 

1695 stacks=regional_stacks, 

1696 force=force, 

1697 on_stack_start=on_stack_start, 

1698 on_stack_complete=on_stack_complete, 

1699 max_workers=max_workers, 

1700 ) 

1701 successful.extend(successful_regional) 

1702 failed.extend(failed_regional) 

1703 else: 

1704 # Sequential destruction 

1705 for stack_name in regional_stacks: 

1706 if on_stack_start: 

1707 on_stack_start(stack_name) 

1708 

1709 success = self.destroy( 

1710 stack_name=stack_name, 

1711 force=force, 

1712 ) 

1713 

1714 if success: 

1715 successful.append(stack_name) 

1716 else: 

1717 failed.append(stack_name) 

1718 

1719 if on_stack_complete: 

1720 on_stack_complete(stack_name, success) 

1721 

1722 # Stop all watchdogs and do one final cleanup pass per stack to 

1723 # catch anything EKS re-created during the destroy flow. 

1724 for stack_name in regional_stacks: 

1725 watchdog_stops[stack_name].set() 

1726 watchdog_threads[stack_name].join(timeout=5) 

1727 self._cleanup_eks_security_groups(stack_name) 

1728 

1729 # Phase 3: Destroy pre-regional global stacks last 

1730 for stack_name in pre_regional_stacks: 

1731 if on_stack_start: 

1732 on_stack_start(stack_name) 

1733 

1734 success = self.destroy( 

1735 stack_name=stack_name, 

1736 force=force, 

1737 ) 

1738 

1739 if success: 1739 ↛ 1742line 1739 didn't jump to line 1742 because the condition on line 1739 was always true

1740 successful.append(stack_name) 

1741 else: 

1742 failed.append(stack_name) 

1743 

1744 if on_stack_complete: 

1745 on_stack_complete(stack_name, success) 

1746 

1747 return len(failed) == 0, successful, failed 

1748 

1749 def _destroy_stacks_parallel( 

1750 self, 

1751 stacks: list[str], 

1752 force: bool, 

1753 on_stack_start: Callable[[str], None] | None, 

1754 on_stack_complete: Callable[[str, bool], None] | None, 

1755 max_workers: int, 

1756 ) -> tuple[list[str], list[str]]: 

1757 """Destroy multiple stacks in parallel using separate CDK output directories.""" 

1758 import tempfile 

1759 

1760 successful: list[str] = [] 

1761 failed: list[str] = [] 

1762 lock = Lock() 

1763 

1764 def destroy_single(stack_name: str) -> tuple[str, bool]: 

1765 # Use a unique output directory in /tmp for each parallel destruction 

1766 output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-") 

1767 

1768 if on_stack_start: 1768 ↛ 1769line 1768 didn't jump to line 1769 because the condition on line 1768 was never true

1769 with lock: 

1770 on_stack_start(stack_name) 

1771 

1772 success = self.destroy( 

1773 stack_name=stack_name, 

1774 force=force, 

1775 output_dir=output_dir, 

1776 ) 

1777 

1778 # Clean up the temporary output directory 

1779 try: 

1780 import shutil 

1781 

1782 if os.path.exists(output_dir): 1782 ↛ 1787line 1782 didn't jump to line 1787 because the condition on line 1782 was always true

1783 shutil.rmtree(output_dir) 

1784 except Exception as e: 

1785 logger.debug("Cleanup of %s failed: %s", output_dir, e) 

1786 

1787 return stack_name, success 

1788 

1789 with ThreadPoolExecutor(max_workers=max_workers) as executor: 

1790 futures = {executor.submit(destroy_single, stack): stack for stack in stacks} 

1791 

1792 for future in as_completed(futures): 

1793 stack_name, success = future.result() 

1794 

1795 with lock: 

1796 if success: 

1797 successful.append(stack_name) 

1798 else: 

1799 failed.append(stack_name) 

1800 

1801 if on_stack_complete: 1801 ↛ 1802line 1801 didn't jump to line 1802 because the condition on line 1801 was never true

1802 on_stack_complete(stack_name, success) 

1803 

1804 return successful, failed 

1805 

def _cleanup_backup_vault(self) -> None:
    """Delete all recovery points from the GCO backup vault.

    This is called before destroy-all so CloudFormation can delete the
    backup vault cleanly. Without this, the vault deletion fails because
    it contains recovery points.

    The vault is located by a case-insensitive substring match of the
    configured project name against each vault name; the first match wins.
    Any failure is reported as a warning and never raised.
    """
    import boto3

    global_region = self.config.global_region
    project_name = self.config.project_name

    try:
        backup_client = boto3.client("backup", region_name=global_region)

        # Find the backup vault by listing vaults with the project prefix.
        # Both sides are lowercased so a mixed-case project name still matches.
        # NOTE(review): this is a substring match, so an unrelated vault whose
        # name merely contains the project name would also be selected —
        # confirm the vault naming scheme makes that impossible.
        needle = project_name.lower()
        vault_pages = backup_client.get_paginator("list_backup_vaults").paginate()
        vault_name = next(
            (
                vault["BackupVaultName"]
                for page in vault_pages
                for vault in page.get("BackupVaultList", [])
                if needle in vault["BackupVaultName"].lower()
            ),
            None,
        )

        if not vault_name:
            return

        # Delete all recovery points in the vault
        rp_paginator = backup_client.get_paginator("list_recovery_points_by_backup_vault")
        deleted = 0
        for page in rp_paginator.paginate(BackupVaultName=vault_name):
            for rp in page.get("RecoveryPoints", []):
                try:
                    backup_client.delete_recovery_point(
                        BackupVaultName=vault_name,
                        RecoveryPointArn=rp["RecoveryPointArn"],
                    )
                    deleted += 1
                except Exception as e:
                    # Best effort: one stuck recovery point must not stop the rest.
                    logger.debug("Failed to delete recovery point: %s", e)

        if deleted > 0:
            print(f" Cleaned up {deleted} backup recovery points from {vault_name}")

    except Exception as e:
        print(f" Warning: Backup vault cleanup failed (non-fatal): {e}")

1854 

def cleanup_eks_security_groups(self) -> None:
    """Run the EKS security-group cleanup for every regional stack.

    Intended to be called between destroy retries, to remove orphaned
    security groups that block VPC deletion.
    """
    # NOTE(review): every name outside this set is treated as regional,
    # including "gco-analytics" if list_stacks() returns it — confirm.
    non_regional = {"gco-global", "gco-api-gateway", "gco-monitoring"}
    for name in self.list_stacks():
        if name in non_regional:
            continue
        self._cleanup_eks_security_groups(name)

1866 

def _cleanup_eks_security_groups(self, stack_name: str) -> None:
    """Delete EKS-managed security groups (and their ENIs) for one stack.

    EKS creates a cluster security group (eks-cluster-sg-<cluster-name>-*)
    that is owned by EKS, not CloudFormation. When the stack is destroyed,
    this SG and its attached ENIs can linger, causing VPC deletion to fail.

    For each matching security group this method force-detaches and deletes
    the network interfaces that reference it, then deletes the group itself.
    Every failure is logged at debug level and swallowed.
    """
    import time as _time

    import boto3

    # Extract region from stack name (e.g., gco-us-east-1 -> us-east-1)
    stack_prefix = f"{self.config.project_name}-"
    region = stack_name.replace(stack_prefix, "", 1)
    cluster_name = stack_name  # cluster name matches stack name

    def _purge_enis(client, group_id, group_name):
        # Detach (if attached) and delete every ENI that references the group.
        listing = client.describe_network_interfaces(
            Filters=[{"Name": "group-id", "Values": [group_id]}]
        )
        for interface in listing.get("NetworkInterfaces", []):
            interface_id = interface["NetworkInterfaceId"]
            try:
                attachment = interface.get("Attachment")
                if attachment:
                    client.detach_network_interface(
                        AttachmentId=attachment["AttachmentId"],
                        Force=True,
                    )
                    # Pause after the forced detach before attempting the delete.
                    _time.sleep(5)
                client.delete_network_interface(NetworkInterfaceId=interface_id)
                logger.debug("Deleted ENI %s from SG %s", interface_id, group_name)
            except Exception as e:
                logger.debug("Failed to delete ENI %s: %s", interface_id, e)

    try:
        ec2 = boto3.client("ec2", region_name=region)

        # Find EKS-managed security groups by name pattern
        name_filter = {
            "Name": "group-name",
            "Values": [f"eks-cluster-sg-{cluster_name}-*"],
        }
        found = ec2.describe_security_groups(Filters=[name_filter])

        for group in found.get("SecurityGroups", []):
            sg_id = group["GroupId"]
            sg_name = group.get("GroupName", "")

            # ENIs must go first; the group cannot be deleted while in use.
            _purge_enis(ec2, sg_id, sg_name)

            try:
                ec2.delete_security_group(GroupId=sg_id)
                print(f" Cleaned up EKS security group: {sg_name} ({sg_id})")
            except Exception as e:
                logger.debug(
                    "Failed to delete SG %s: %s (will retry on next attempt)", sg_id, e
                )

    except Exception as e:
        # Non-fatal — the retry loop will handle it
        logger.debug("EKS security group cleanup for %s failed: %s", stack_name, e)

1938 

1939 def _start_eks_sg_watchdog(self, stack_name: str, stop_event: Event) -> Thread: 

1940 """Start a background thread that polls for orphaned EKS security groups. 

1941 

1942 EKS creates an ``eks-cluster-sg-<cluster-name>-*`` security group that 

1943 is owned by the EKS service (not CloudFormation). When the stack's 

1944 EKS cluster resource deletes, the SG is supposed to GC along with its 

1945 cluster ENIs — but on EKS Auto Mode there's a window where the SG 

1946 lingers after the cluster is gone, blocking the subsequent VPC 

1947 delete with ``DependencyViolation``. CloudFormation then sits in 

1948 ``DELETE_IN_PROGRESS`` on the VPC for ~10 minutes retrying. 

1949 

1950 This watchdog runs for the full duration of the regional-stack 

1951 destroy, polling every 30 seconds. As soon as an orphaned SG appears 

1952 (which only happens after the cluster delete has progressed past 

1953 the cluster resource itself), it deletes the SG and any ENIs still 

1954 attached. That unblocks the VPC delete immediately instead of 

1955 waiting for EKS's own GC timer. 

1956 

1957 The thread exits when ``stop_event`` is set by the orchestrator at 

1958 the end of the regional phase. 

1959 """ 

1960 

1961 def _watchdog() -> None: 

1962 while not stop_event.is_set(): 

1963 try: 

1964 self._cleanup_eks_security_groups(stack_name) 

1965 except Exception as e: 

1966 logger.debug( 

1967 "EKS SG watchdog tick for %s failed (non-fatal): %s", 

1968 stack_name, 

1969 e, 

1970 ) 

1971 # ``wait`` returns immediately when the event is set, so this 

1972 # doubles as the sleep-and-shutdown-check in one call. 

1973 stop_event.wait(timeout=30) 

1974 

1975 thread = Thread( 

1976 target=_watchdog, 

1977 name=f"eks-sg-watchdog-{stack_name}", 

1978 daemon=True, 

1979 ) 

1980 thread.start() 

1981 return thread 

1982 

1983 

def get_stack_manager(config: GCOConfig) -> StackManager:
    """Build and return a ``StackManager`` for the given configuration."""
    manager = StackManager(config)
    return manager

1987 

1988 

def get_stack_deployment_order(stacks: list[str]) -> list[str]:
    """
    Return the given stack names in deployment order.

    Ranked (global) stacks come first, ordered gco-global, gco-api-gateway,
    gco-analytics, gco-monitoring. Every other name is treated as a regional
    stack (gco-{region}, e.g. gco-us-east-1) and follows in alphabetical order.
    """
    # Lower rank deploys earlier.
    rank = {
        "gco-global": 1,
        "gco-api-gateway": 2,
        "gco-analytics": 2.5,
        "gco-monitoring": 3,
    }

    ranked: list[str] = []
    regional: list[str] = []
    for name in stacks:
        bucket = ranked if name in rank else regional
        bucket.append(name)

    ranked.sort(key=rank.__getitem__)
    regional.sort()
    return ranked + regional

2019 

2020 

def get_stack_destroy_order(stacks: list[str]) -> list[str]:
    """
    Return the given stack names in destroy order.

    This is exactly the reverse of ``get_stack_deployment_order``: regional
    stacks first, then the global stacks.
    """
    ordered = get_stack_deployment_order(stacks)
    ordered.reverse()
    return ordered

2029 

2030 

2031# ============================================================================= 

2032# Feature toggle helpers 

2033# ============================================================================= 

2034 

# Baseline FSx for Lustre settings. ``get_fsx_config`` and ``update_fsx_config``
# pass this dict as ``default_config`` to the generic feature helpers, which
# use it when cdk.json has no "fsx_lustre" context entry.
_FSX_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature is off unless explicitly enabled
    "storage_capacity_gib": 1200,
    "deployment_type": "SCRATCH_2",
    "per_unit_storage_throughput": 200,
    "data_compression_type": "LZ4",
    "import_path": None,  # no import path by default
    "export_path": None,  # no export path by default
    "auto_import_policy": "NEW_CHANGED_DELETED",
}

2045 

2046 

2047def _find_cdk_json() -> Path | None: 

2048 """Find cdk.json in current or parent directories.""" 

2049 current = Path.cwd() 

2050 for parent in [current] + list(current.parents): 

2051 cdk_path = parent / "cdk.json" 

2052 if cdk_path.exists(): 

2053 return cdk_path 

2054 return None 

2055 

2056 

def get_fsx_config(region: str | None = None) -> dict[str, Any]:
    """Read the FSx for Lustre configuration from cdk.json.

    Args:
        region: Optional region. When given, a region-specific override for
            that region is applied if one exists.

    Returns:
        FSx configuration dictionary
    """
    return _get_feature_config(
        feature_key="fsx_lustre",
        default_config=_FSX_DEFAULTS,
        region=region,
    )

2068 

2069 

def update_fsx_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Write FSx for Lustre settings into cdk.json.

    Args:
        settings: FSx settings to update
        region: Optional region for a region-specific entry. If None, the
            global FSx configuration is updated.
    """
    _update_feature_config(
        feature_key="fsx_lustre",
        settings=settings,
        default_config=_FSX_DEFAULTS,
        region=region,
    )

2078 

2079 

2080# ============================================================================= 

2081# Generic feature toggle helpers (used by FSx, Valkey, Aurora, and future features) 

2082# ============================================================================= 

2083 

2084 

def _get_feature_config(
    feature_key: str,
    default_config: dict[str, Any],
    region: str | None = None,
) -> dict[str, Any]:
    """Get configuration for a toggleable feature from cdk.json.

    Layers are merged in increasing precedence: ``default_config``, the
    ``context[feature_key]`` block, then (when ``region`` is given and has an
    entry) ``context["<feature_key>_regions"][region]``. The merge is shallow:
    a nested dict in a higher layer replaces the lower layer's value whole.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        default_config: Default configuration values when the key is missing.
        region: Optional region for region-specific overrides.

    Returns:
        Merged configuration dictionary. ``is_region_specific`` is True only
        when a region override was applied, in which case ``region`` is set too.

    Raises:
        RuntimeError: If no cdk.json is found in the current directory or any
            parent directory.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        cdk_config = json.load(f)

    context = cdk_config.get("context", {})
    global_config = context.get(feature_key, default_config)

    # Always start from the defaults so a partial feature block in cdk.json
    # yields the same key set on both the regional and the global path.
    result = {**default_config, **global_config}

    if region:
        region_overrides = context.get(f"{feature_key}_regions", {})
        if region in region_overrides:
            merged = {**result, **region_overrides[region]}
            merged["region"] = region
            merged["is_region_specific"] = True
            return merged

    result["is_region_specific"] = False
    return result

2123 

2124 

def _update_feature_config(
    feature_key: str,
    settings: dict[str, Any],
    default_config: dict[str, Any],
    region: str | None = None,
) -> None:
    """Update configuration for a toggleable feature in cdk.json.

    Settings whose value is ``None`` are skipped, except ``enabled``, which is
    always written. When both the stored value and the new value for a key
    are dicts they are merged one level deep (new keys win) instead of the
    stored dict being replaced, so updating one nested option does not drop
    its siblings. Scalar values are overwritten.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        settings: Settings to update.
        default_config: Default configuration values when the key is missing.
            Only used to seed the global block; regional blocks start empty.
        region: Optional region for region-specific config.

    Raises:
        RuntimeError: If no cdk.json is found in the current directory or any
            parent directory.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        cdk_config = json.load(f)

    context = cdk_config.setdefault("context", {})

    if region:
        region_blocks = context.setdefault(f"{feature_key}_regions", {})
        target = region_blocks.setdefault(region, {})
    else:
        if feature_key not in context:
            context[feature_key] = {**default_config}
        target = context[feature_key]

    for key, value in settings.items():
        if value is None and key != "enabled":
            continue
        current = target.get(key)
        if isinstance(value, dict) and isinstance(current, dict):
            # Build a new dict so a block seeded from default_config is never
            # mutated in place.
            target[key] = {**current, **value}
        else:
            target[key] = value

    # Serialize before opening for write: opening with "w" truncates the file,
    # so a serialization error must surface while cdk.json is still intact.
    serialized = json.dumps(cdk_config, indent=2)
    with open(cdk_json_path, "w", encoding="utf-8") as f:
        f.write(serialized)

2169 

2170 

2171# ============================================================================= 

2172# Valkey configuration 

2173# ============================================================================= 

2174 

# Baseline Valkey Serverless settings. ``get_valkey_config`` and
# ``update_valkey_config`` pass this dict as ``default_config`` to the generic
# feature helpers, which use it when cdk.json has no "valkey" context entry.
_VALKEY_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature is off unless explicitly enabled
    "max_data_storage_gb": 5,
    "max_ecpu_per_second": 5000,
    "snapshot_retention_limit": 1,
}

2181 

2182 

def get_valkey_config(region: str | None = None) -> dict[str, Any]:
    """Read the Valkey Serverless configuration from cdk.json."""
    return _get_feature_config(
        feature_key="valkey",
        default_config=_VALKEY_DEFAULTS,
        region=region,
    )

2186 

2187 

def update_valkey_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Write Valkey Serverless settings into cdk.json."""
    _update_feature_config(
        feature_key="valkey",
        settings=settings,
        default_config=_VALKEY_DEFAULTS,
        region=region,
    )

2191 

2192 

2193# ============================================================================= 

2194# Aurora pgvector configuration 

2195# ============================================================================= 

2196 

# Baseline Aurora pgvector settings. ``get_aurora_config`` and
# ``update_aurora_config`` pass this dict as ``default_config`` to the generic
# feature helpers, which use it when cdk.json has no "aurora_pgvector" entry.
_AURORA_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature is off unless explicitly enabled
    "min_acu": 0,
    "max_acu": 16,
    "backup_retention_days": 7,
    "deletion_protection": False,
}

2204 

2205 

def get_aurora_config(region: str | None = None) -> dict[str, Any]:
    """Read the Aurora pgvector configuration from cdk.json."""
    return _get_feature_config(
        feature_key="aurora_pgvector",
        default_config=_AURORA_DEFAULTS,
        region=region,
    )

2209 

2210 

def update_aurora_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Write Aurora pgvector settings into cdk.json."""
    _update_feature_config(
        feature_key="aurora_pgvector",
        settings=settings,
        default_config=_AURORA_DEFAULTS,
        region=region,
    )

2214 

2215 

2216# ============================================================================= 

2217# Analytics environment configuration 

2218# ============================================================================= 

2219 

# Baseline analytics-environment settings. ``get_analytics_config`` and
# ``update_analytics_config`` pass this dict as ``default_config`` to the
# generic feature helpers, which use it when cdk.json has no
# "analytics_environment" context entry. Unlike the other feature defaults,
# several values here are nested dicts.
_ANALYTICS_DEFAULTS: dict[str, Any] = {
    "enabled": False,  # feature is off unless explicitly enabled
    "hyperpod": {"enabled": False},
    "canvas": {"enabled": False},
    "cognito": {"domain_prefix": None, "removal_policy": "destroy"},
    "efs": {"removal_policy": "destroy"},
    "studio": {"user_profile_name_prefix": None},
}

2228 

2229 

def get_analytics_config() -> dict[str, Any]:
    """Read the analytics environment configuration from cdk.json.

    The analytics stack is single-region by construction (lives in the
    api-gateway region), so this helper does not accept a region argument.
    The result is the defaults merged with any operator overrides found in
    the ``context.analytics_environment`` block.
    """
    return _get_feature_config(
        feature_key="analytics_environment",
        default_config=_ANALYTICS_DEFAULTS,
    )

2239 

2240 

def update_analytics_config(settings: dict[str, Any]) -> None:
    """Write analytics environment settings into cdk.json.

    Mirrors ``update_valkey_config`` / ``update_aurora_config``, without a
    region argument. The update is delegated to ``_update_feature_config``
    under the ``analytics_environment`` context key.

    Nested blocks (``hyperpod``, ``canvas``, ``cognito``, ``efs``,
    ``studio``) are meant to be merged one level deep rather than replaced
    wholesale — ``enable --hyperpod`` must not clobber
    ``cognito.removal_policy``. That merging is the responsibility of
    ``_update_feature_config``; verify its behavior there before relying on it.
    """
    _update_feature_config(
        feature_key="analytics_environment",
        settings=settings,
        default_config=_ANALYTICS_DEFAULTS,
    )