Coverage for cli/stacks.py: 88%

1"""

2Stack management for GCO CLI.

4Provides commands for deploying, updating, and managing CDK stacks.

5This is the largest CLI module (~1600 lines) because it orchestrates the

6full deployment lifecycle including container runtime detection, CDK

7bootstrapping, Lambda source synchronization, and parallel regional deploys.

9This module handles:

10 - Container runtime detection (Docker, Finch, Podman) with automatic fallback

11 - CDK bootstrap across all target regions (idempotent)

12 - Lambda source synchronization (copies handler code + dependencies before synth)

13 - CDK stack deployment with proper dependency ordering:

14 1. Global stack (Global Accelerator, DynamoDB)

15 2. API Gateway stack (auth secret, Lambda proxy)

16 3. Regional stacks in parallel (EKS, VPC, ALB per region)

17 4. Monitoring stack (CloudWatch dashboards, alarms)

18 - Parallel deployment of regional stacks via ThreadPoolExecutor

19 - Stack destruction in reverse dependency order

20 - FSx for Lustre enable/disable toggle

21 - kubectl access configuration (EKS access entries + kubeconfig)

23Key Design Decisions:

24 - Regional stacks deploy in parallel for speed; global/API/monitoring are sequential

25 - Lambda build directories are synced before every deploy to avoid stale code

26 - Container runtime is auto-detected; CDK_DOCKER env var overrides

27 - All destructive operations require -y/--yes confirmation

28 - Stack status is read from CloudFormation, not cached locally

30Environment Variables:

31 CDK_DOCKER: Override container runtime (default: auto-detect Docker/Finch/Podman)

32 AWS_REGION: Default region for single-region operations

33"""

35from __future__ import annotations

37import logging

38import os

39import shutil

40import site

41import subprocess

42from collections.abc import Callable

43from concurrent.futures import ThreadPoolExecutor, as_completed

44from dataclasses import dataclass, field

45from datetime import datetime

46from pathlib import Path

47from threading import Event, Lock, Thread

48from typing import TYPE_CHECKING, Any

50from botocore.exceptions import ClientError

52if TYPE_CHECKING:

53 from .config import GCOConfig

55logger = logging.getLogger(__name__)

@dataclass
class StackInfo:
    """Snapshot of a single CDK stack.

    Carries the stack's name, status and region together with optional
    creation/update timestamps, outputs and tags.
    """

    name: str
    status: str
    region: str
    created_time: datetime | None = None
    updated_time: datetime | None = None
    outputs: dict[str, str] = field(default_factory=dict)
    tags: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the stack details as a plain dict.

        Timestamps are rendered with ``datetime.isoformat()``; a missing
        timestamp is rendered as ``None``.
        """
        created = None
        if self.created_time:
            created = self.created_time.isoformat()

        updated = None
        if self.updated_time:
            updated = self.updated_time.isoformat()

        payload: dict[str, Any] = {
            "name": self.name,
            "status": self.status,
            "region": self.region,
        }
        payload["created_time"] = created
        payload["updated_time"] = updated
        payload["outputs"] = self.outputs
        payload["tags"] = self.tags
        return payload

82def _safe_rmtree(path: Path) -> None:

83 """Remove a directory tree, handling broken symlinks on macOS.

85 shutil.rmtree can fail with ``OSError: [Errno 66] Directory not empty``

86 on macOS when pip-installed packages (e.g. botocore) contain broken

87 symlinks or extended-attribute resource forks.

89 Falls back to ``rm -rf`` via subprocess, but only after validating the

90 path is a real directory under the project tree to avoid accidents.

91 """

92 resolved = path.resolve()

94 # Safety: refuse to remove anything that isn't clearly a build artifact

95 # inside the project. The path must contain "lambda" and end with "-build".

96 if "lambda" not in resolved.parts or not resolved.name.endswith("-build"):

97 raise ValueError(f"Refusing to remove unexpected path: {resolved}")

99 try:

100 shutil.rmtree(str(resolved))

101 except OSError:

102 subprocess.run(["rm", "-rf", "--", str(resolved)], check=True)

103

104

# Memo for container runtime detection. A separate "checked" flag is kept
# because ``None`` is itself a valid cached answer (no runtime available).
_container_runtime_cache: str | None = None
_container_runtime_checked: bool = False


def _detect_container_runtime() -> str | None:
    """Return the container runtime to use for CDK asset bundling (memoized).

    The first call delegates to ``_detect_container_runtime_uncached()`` and
    stores its answer in module-level state; every later call returns that
    stored answer without probing again.

    Returns:
        The value produced by the uncached detector: a runtime name such as
        ``'docker'``, ``'finch'`` or ``'podman'`` (or the ``CDK_DOCKER``
        override), or ``None`` when no runtime is available.
    """
    global _container_runtime_cache, _container_runtime_checked

    if not _container_runtime_checked:
        detected = _detect_container_runtime_uncached()
        _container_runtime_cache = detected
        _container_runtime_checked = True

    return _container_runtime_cache

135

136

137def _detect_container_runtime_uncached() -> str | None:

138 """Uncached implementation of container runtime detection."""

139 # Check if CDK_DOCKER is already set

140 if os.environ.get("CDK_DOCKER"):

141 return os.environ["CDK_DOCKER"]

142

143 # Try docker first

144 if shutil.which("docker"):

145 # Verify docker is actually running

146 try:

147 result = subprocess.run(

148 ["docker", "info"],

149 capture_output=True,

150 timeout=5,

151 )

152 if result.returncode == 0:

153 return "docker"

154 except subprocess.TimeoutExpired, Exception:

155 pass

156

157 # Try finch as fallback

158 if shutil.which("finch"):

159 try:

160 result = subprocess.run(

161 ["finch", "info"],

162 capture_output=True,

163 timeout=5,

164 )

165 if result.returncode == 0: 165 ↛ 171line 165 didn't jump to line 171 because the condition on line 165 was always true

166 return "finch"

167 except subprocess.TimeoutExpired, Exception:

168 pass

169

170 # Try podman as last resort

171 if shutil.which("podman"):

172 try:

173 result = subprocess.run(

174 ["podman", "info"],

175 capture_output=True,

176 timeout=5,

177 )

178 if result.returncode == 0: 178 ↛ 183line 178 didn't jump to line 183 because the condition on line 178 was always true

179 return "podman"

180 except subprocess.TimeoutExpired, Exception:

181 pass

182

183 return None

184

185

186class StackManager:

187 """Manages CDK stack operations."""

188

189 def __init__(self, config: GCOConfig, project_root: Path | None = None):

190 self.config = config

191 self.project_root = project_root or self._find_project_root()

192 self._cdk_path = self._find_cdk()

193

194 # Ensure Lambda build directory exists before any CDK synth

195 self._ensure_lambda_build()

196

197 def _find_project_root(self) -> Path:

198 """Find the project root by looking for cdk.json."""

199 current = Path.cwd()

200 for parent in [current] + list(current.parents):

201 if (parent / "cdk.json").exists():

202 return parent

203 return current

204

205 def _find_cdk(self) -> str:

206 """Find CDK executable."""

207 # Check if cdk is in PATH

208 try:

209 result = subprocess.run(["which", "cdk"], capture_output=True, text=True, check=True)

210 return result.stdout.strip()

211 except subprocess.CalledProcessError:

212 pass

213

214 # Check common locations

215 for path in ["/usr/local/bin/cdk", "~/.npm-global/bin/cdk"]:

216 expanded = os.path.expanduser(path)

217 if os.path.exists(expanded):

218 return expanded

219

220 # Fall back to npx

221 return "npx cdk"

222

223 def _ensure_lambda_build(self) -> None:

224 """Ensure the Lambda build directories exist for CDK synthesis.

225

226 Called from __init__ so the directory is ready before any CDK synth

227 (even ``cdk list`` needs it because ``app.py`` references the asset).

228

229 This does a lightweight check — only builds if the directory is

230 missing or incomplete. It does NOT do a full rebuild (no rmtree).

231 The full rebuild happens in ``_rebuild_lambda_packages()``, which

232 is called by ``deploy()`` before the actual CDK deploy.

233 """

234 kubectl_source = self.project_root / "lambda" / "kubectl-applier-simple"

235 kubectl_build = self.project_root / "lambda" / "kubectl-applier-simple-build"

236 helm_source = self.project_root / "lambda" / "helm-installer"

237 helm_build = self.project_root / "lambda" / "helm-installer-build"

238

239 # Only build if the directory is missing or deps aren't installed

240 if kubectl_source.exists() and (

241 not kubectl_build.exists() or not (kubectl_build / "yaml").exists()

242 ):

243 self._build_kubectl_lambda()

244

245 if helm_source.exists() and not helm_build.exists(): 245 ↛ 246line 245 didn't jump to line 246 because the condition on line 245 was never true

246 self._build_helm_installer_lambda()

247

248 def _check_and_fix_stuck_stack(self, stack_name: str) -> None:

249 """Check if a stack is in a stuck state and auto-recover.

250

251 Stacks can get stuck in REVIEW_IN_PROGRESS, ROLLBACK_COMPLETE, or

252 other non-deployable states. This detects those and cleans up so

253 the next deploy can succeed.

254 """

255 import boto3

256

257 region = self._get_deploy_region(stack_name)

258 if not region:

259 return

260

261 try:

262 cfn = boto3.client("cloudformation", region_name=region)

263 response = cfn.describe_stacks(StackName=stack_name)

264 status = response["Stacks"][0]["StackStatus"]

265

266 stuck_states = {

267 "REVIEW_IN_PROGRESS",

268 "ROLLBACK_COMPLETE",

269 "ROLLBACK_FAILED",

270 "CREATE_FAILED",

271 "DELETE_FAILED",

272 }

273

274 if status in stuck_states:

275 print(f" Stack {stack_name} is in {status} state, cleaning up...")

276 cfn.delete_stack(StackName=stack_name)

277 waiter = cfn.get_waiter("stack_delete_complete")

278 waiter.wait(StackName=stack_name, WaiterConfig={"Delay": 10, "MaxAttempts": 60})

279 print(f" Stack {stack_name} cleaned up, will recreate on deploy")

280

281 except Exception as e:

282 logger.debug("Stack pre-check for %s: %s", stack_name, e)

283 # Stack doesn't exist or can't be described — fine, deploy will create it

284

285 def _diagnose_deploy_failure(self, stack_name: str) -> None:

286 """Fetch CloudFormation events after a failed deploy and print diagnostics.

287

288 Gives users actionable information instead of just the CDK error message.

289 """

290 import boto3

291

292 region = self._get_deploy_region(stack_name)

293 if not region:

294 return

295

296 try:

297 cfn = boto3.client("cloudformation", region_name=region)

298

299 # Get recent events

300 response = cfn.describe_stack_events(StackName=stack_name)

301 events = response.get("StackEvents", [])

302

303 # Filter to failed events

304 failed = [

305 e

306 for e in events[:20]

307 if "FAILED" in e.get("ResourceStatus", "")

308 or "ROLLBACK" in e.get("ResourceStatus", "")

309 ]

310

311 if failed:

312 print(f"\n CloudFormation failure details for {stack_name}:")

313 for event in failed[:5]:

314 resource = event.get("LogicalResourceId", "unknown")

315 status = event.get("ResourceStatus", "unknown")

316 reason = event.get("ResourceStatusReason", "no reason given")

317 print(f" {resource}: {status}")

318 print(f" {reason}")

319

320 # Check stack status for actionable advice

321 try:

322 stack_resp = cfn.describe_stacks(StackName=stack_name)

323 status = stack_resp["Stacks"][0]["StackStatus"]

324

325 advice = {

326 "REVIEW_IN_PROGRESS": (

327 "Stack is stuck in REVIEW_IN_PROGRESS. "

328 "Run: aws cloudformation delete-stack "

329 f"--stack-name {stack_name} --region {region}"

330 ),

331 "ROLLBACK_COMPLETE": (

332 "Stack rolled back. Delete it and retry: "

333 f"aws cloudformation delete-stack "

334 f"--stack-name {stack_name} --region {region}"

335 ),

336 "ROLLBACK_FAILED": (

337 "Stack rollback failed. Delete with --retain: "

338 f"aws cloudformation delete-stack "

339 f"--stack-name {stack_name} --region {region}"

340 ),

341 "UPDATE_ROLLBACK_COMPLETE": (

342 "Update rolled back but stack is stable. "

343 "Check the events above and retry the deploy."

344 ),

345 }

346

347 if status in advice: 347 ↛ exitline 347 didn't return from function '_diagnose_deploy_failure' because the condition on line 347 was always true

348 print(f"\n Suggested fix: {advice[status]}")

349

350 except Exception as e:

351 logger.debug("Failed to parse stack events: %s", e)

352

353 except Exception as e:

354 logger.debug("Failed to diagnose deploy failure for %s: %s", stack_name, e)

355 # Best effort — don't fail the deploy further

356

357 def _sync_lambda_sources(self) -> None:

358 """

359 Sync latest handler code and manifests into the build directory.

360

361 Called at the start of ``deploy()`` to ensure CDK picks up the

362 latest source changes. Only copies files — does NOT rebuild pip

363 deps or rmtree. Runs once per StackManager instance.

364 """

365 if getattr(self, "_lambda_sources_synced", False): 365 ↛ 366line 365 didn't jump to line 366 because the condition on line 365 was never true

366 return

367 self._lambda_sources_synced = True

368

369 source_dir = self.project_root / "lambda" / "kubectl-applier-simple"

370 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build"

371

372 if not source_dir.exists() or not build_dir.exists():

373 return

374

375 # Sync handler.py

376 source_handler = source_dir / "handler.py"

377 build_handler = build_dir / "handler.py"

378 if source_handler.exists(): 378 ↛ 382line 378 didn't jump to line 382 because the condition on line 378 was always true

379 shutil.copy2(source_handler, build_handler)

380

381 # Sync manifests directory

382 source_manifests = source_dir / "manifests"

383 build_manifests = build_dir / "manifests"

384 if source_manifests.exists(): 384 ↛ exitline 384 didn't return from function '_sync_lambda_sources' because the condition on line 384 was always true

385 build_manifests.mkdir(parents=True, exist_ok=True)

386 for manifest_file in source_manifests.glob("*.yaml"):

387 shutil.copy2(manifest_file, build_manifests / manifest_file.name)

388

389 def _rebuild_lambda_packages(self) -> None:

390 """Full rebuild of Lambda packages for deploy.

391

392 Nukes the build directories and recreates them from scratch with

393 fresh pip deps, handler code, and manifests. Called once at the

394 start of a deploy to ensure CDK picks up all changes.

395

396 Runs once per StackManager instance.

397 """

398 if getattr(self, "_lambda_packages_rebuilt", False): 398 ↛ 399line 398 didn't jump to line 399 because the condition on line 398 was never true

399 return

400 self._lambda_packages_rebuilt = True

401 self._build_lambda_packages()

402

403 def _build_lambda_packages(self) -> None:

404 """Build Lambda packages for kubectl-applier and helm-installer.

405

406 Creates build directories with fresh copies of handler code, manifests,

407 charts config, and pip dependencies. This ensures CDK always picks up

408 the latest content regardless of Docker/asset caching.

409 """

410 self._build_kubectl_lambda()

411 self._build_helm_installer_lambda()

412

413 def _build_kubectl_lambda(self) -> None:

414 """Build the kubectl-applier-simple Lambda package."""

415 source_dir = self.project_root / "lambda" / "kubectl-applier-simple"

416 build_dir = self.project_root / "lambda" / "kubectl-applier-simple-build"

417 requirements = source_dir / "requirements.txt"

418

419 if not source_dir.exists() or not requirements.exists():

420 return

421

422 print(" Building kubectl-applier-simple Lambda package...")

423

424 # Clean stale build directory to avoid broken symlinks from previous

425 # pip installs (botocore/data/ is a common source of dangling symlinks

426 # that cause CDK's asset fingerprinting to fail with ENOENT).

427 if build_dir.exists():

428 _safe_rmtree(build_dir)

429

430 # Create build directory

431 build_dir.mkdir(parents=True, exist_ok=True)

432

433 # Copy handler and manifests (always overwrite to prevent stale content)

434 shutil.copy2(source_dir / "handler.py", build_dir / "handler.py")

435 build_manifests = build_dir / "manifests"

436 if (source_dir / "manifests").exists(): 436 ↛ 442line 436 didn't jump to line 442 because the condition on line 436 was always true

437 if build_manifests.exists(): 437 ↛ 438line 437 didn't jump to line 438 because the condition on line 437 was never true

438 shutil.rmtree(build_manifests)

439 shutil.copytree(source_dir / "manifests", build_manifests, dirs_exist_ok=True)

440

441 # Install pip dependencies for Lambda runtime

442 import sys

443

444 result = subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - static list: sys.executable + pip args + Path objects, no user input

445 [

446 sys.executable,

447 "-m",

448 "pip",

449 "install",

450 "-r",

451 str(requirements),

452 "-t",

453 str(build_dir),

454 "--upgrade",

455 "--platform",

456 "manylinux2014_x86_64",

457 "--only-binary=:all:",

458 "--quiet",

459 ],

460 capture_output=True,

461 text=True,

462 )

463 if result.returncode != 0: 463 ↛ 464line 463 didn't jump to line 464 because the condition on line 463 was never true

464 print(f" Warning: pip install failed: {result.stderr[:200]}")

465 else:

466 print(" Lambda package built successfully")

467

468 def _build_helm_installer_lambda(self) -> None:

469 """Build the helm-installer Lambda Docker context.

470

471 Copies all source files into a clean build directory so CDK always

472 detects changes to charts.yaml, handler.py, Dockerfile, etc.

473 """

474 source_dir = self.project_root / "lambda" / "helm-installer"

475 build_dir = self.project_root / "lambda" / "helm-installer-build"

476

477 if not source_dir.exists():

478 return

479

480 print(" Building helm-installer Lambda package...")

481

482 # Clean and recreate build directory

483 if build_dir.exists():

484 _safe_rmtree(build_dir)

485 build_dir.mkdir(parents=True, exist_ok=True)

486

487 # Copy all source files into the build directory

488 for item in source_dir.iterdir():

489 if item.name == "__pycache__":

490 continue

491 if item.is_file(): 491 ↛ 493line 491 didn't jump to line 493 because the condition on line 491 was always true

492 shutil.copy2(item, build_dir / item.name)

493 elif item.is_dir():

494 shutil.copytree(item, build_dir / item.name, dirs_exist_ok=True)

495

496 print(" Helm installer package built successfully")

497

498 def _get_python_path(self) -> str:

499 """

500 Get PYTHONPATH that includes the current Python's site-packages.

501

502 This is critical for pipx installations where CDK runs `python3 app.py`

503 using the system Python, which doesn't have aws_cdk installed.

504 By setting PYTHONPATH, we ensure CDK's subprocess can find our modules.

505 """

506 # Get all site-packages directories from the current Python

507 site_packages = site.getsitepackages()

508

509 # Also include user site-packages if available

510 user_site = site.getusersitepackages()

511 if user_site and os.path.isdir(user_site): 511 ↛ 512line 511 didn't jump to line 512 because the condition on line 511 was never true

512 site_packages.append(user_site)

513

514 # Include the directory containing the current module (for editable installs)

515 current_module_dir = Path(__file__).parent.parent

516 if current_module_dir.exists(): 516 ↛ 520line 516 didn't jump to line 520 because the condition on line 516 was always true

517 site_packages.append(str(current_module_dir))

518

519 # Combine with existing PYTHONPATH if any

520 existing_path = os.environ.get("PYTHONPATH", "")

521 all_paths = site_packages + ([existing_path] if existing_path else [])

522

523 return os.pathsep.join(all_paths)

524

525 def _run_cdk(

526 self,

527 command: list[str],

528 capture_output: bool = False,

529 env: dict[str, str] | None = None,

530 ) -> subprocess.CompletedProcess[str]:

531 """Run a CDK command."""

532 full_env = os.environ.copy()

533

534 # Inject PYTHONPATH so CDK's python3 subprocess can find aws_cdk

535 # This is essential for pipx installations

536 full_env["PYTHONPATH"] = self._get_python_path()

537

538 if env:

539 full_env.update(env)

540

541 cdk_cmd = self._cdk_path.split() + command

542

543 if capture_output:

544 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection

545 cdk_cmd,

546 cwd=self.project_root,

547 capture_output=True,

548 text=True,

549 env=full_env,

550 )

551 return subprocess.run( # nosemgrep: dangerous-subprocess-use-audit - cdk_cmd is a list of static CDK subcommands, no user-controlled shell injection

552 cdk_cmd,

553 cwd=self.project_root,

554 env=full_env,

555 text=True,

556 )

557

558 def list_stacks(self) -> list[str]:

559 """List all available CDK stacks."""

560 result = self._run_cdk(["list"], capture_output=True)

561 if result.returncode != 0:

562 raise RuntimeError(f"Failed to list stacks: {result.stderr}")

563 return [s.strip() for s in result.stdout.strip().split("\n") if s.strip()]

564

565 def synth(self, stack_name: str | None = None, quiet: bool = True) -> str:

566 """Synthesize CloudFormation templates."""

567 cmd = ["synth"]

568 if stack_name: 568 ↛ 570line 568 didn't jump to line 570 because the condition on line 568 was always true

569 cmd.append(stack_name)

570 if quiet: 570 ↛ 573line 570 didn't jump to line 573 because the condition on line 570 was always true

571 cmd.append("--quiet")

572

573 result = self._run_cdk(cmd, capture_output=True)

574 if result.returncode != 0:

575 raise RuntimeError(f"CDK synth failed: {result.stderr}")

576 return str(result.stdout)

577

578 def diff(self, stack_name: str | None = None) -> str:

579 """Show diff between deployed and local stacks."""

580 cmd = ["diff", "--no-color"]

581 if stack_name: 581 ↛ 584line 581 didn't jump to line 584 because the condition on line 581 was always true

582 cmd.append(stack_name)

583

584 result = self._run_cdk(cmd, capture_output=True)

585 # diff returns non-zero if there are differences, which is expected

586 return str(result.stdout or result.stderr)

587

588 def deploy(

589 self,

590 stack_name: str | None = None,

591 require_approval: bool = True,

592 all_stacks: bool = False,

593 outputs_file: str | None = None,

594 parameters: dict[str, str] | None = None,

595 tags: dict[str, str] | None = None,

596 progress: str = "events",

597 output_dir: str | None = None,

598 exclusively: bool = False,

599 ) -> bool:

600 """Deploy CDK stacks.

601

602 Args:

603 stack_name: Name of the stack to deploy

604 require_approval: Whether to require approval for changes

605 all_stacks: Deploy all stacks

606 outputs_file: File to write outputs to

607 parameters: CDK parameters

608 tags: Tags to apply to stacks

609 progress: Progress display type

610 output_dir: Custom CDK output directory (for parallel deployments)

611 exclusively: Pass ``--exclusively`` to CDK so only the named

612 stack is evaluated, not its transitive dependencies. Used by

613 ``deploy_orchestrated`` once earlier phases have already

614 deployed the globals — re-synthesizing them every phase

615 forces custom resources (notably KubectlApplyManifests)

616 to re-run each time, adding minutes per phase for no

617 actual change.

618 """

619 # Full rebuild of Lambda packages on first deploy call (once per session),

620 # then sync latest source on subsequent calls

621 self._rebuild_lambda_packages()

622 self._sync_lambda_sources()

623

624 # Check for stuck stacks and auto-recover

625 if stack_name:

626 self._check_and_fix_stuck_stack(stack_name)

627

628 # Ensure container runtime is available for building images

629 runtime = _detect_container_runtime()

630 if not runtime:

631 raise RuntimeError(

632 "No container runtime found. Please install Docker, Finch, or Podman.\n"

633 " - Docker: https://docs.docker.com/get-docker/\n"

634 " - Finch: brew install finch && finch vm init\n"

635 " - Podman: https://podman.io/getting-started/installation"

636 )

637

638 # Auto-bootstrap the target region if needed

639 if stack_name:

640 region = self._get_deploy_region(stack_name)

641 if region and not self.ensure_bootstrapped(region):

642 raise RuntimeError(

643 f"Region {region} could not be bootstrapped. "

644 "Run 'gco stacks bootstrap --region "

645 f"{region}' manually to diagnose."

646 )

647

648 cmd = ["deploy"]

649

650 if all_stacks:

651 cmd.append("--all")

652 elif stack_name: 652 ↛ 660line 652 didn't jump to line 660 because the condition on line 652 was always true

653 cmd.append(stack_name)

654

655 # --exclusively tells CDK to deploy *only* the named stack, not its

656 # transitive dependencies. deploy_orchestrated sets this once the

657 # earlier phases (global, api-gateway) are already in place so that

658 # the regional and monitoring phases don't re-synthesize and

659 # re-evaluate globals on every pass.

660 if exclusively and stack_name and not all_stacks:

661 cmd.append("--exclusively")

662

663 if not require_approval:

664 cmd.extend(["--require-approval", "never"])

665

666 if outputs_file:

667 cmd.extend(["--outputs-file", outputs_file])

668

669 if parameters:

670 for key, value in parameters.items():

671 cmd.extend(["--parameters", f"{key}={value}"])

672

673 if tags:

674 for key, value in tags.items():

675 cmd.extend(["--tags", f"{key}={value}"])

676

677 cmd.extend(["--progress", progress])

678

679 # Use custom output directory for parallel deployments

680 if output_dir:

681 cmd.extend(["--output", output_dir])

682

683 # Set CDK_DOCKER env var if not already set

684 env = {"CDK_DOCKER": runtime} if not os.environ.get("CDK_DOCKER") else None

685

686 result = self._run_cdk(cmd, env=env)

687 success = result.returncode == 0

688

689 if not success and stack_name:

690 self._diagnose_deploy_failure(stack_name)

691

692 return success

693

694 def destroy(

695 self,

696 stack_name: str | None = None,

697 all_stacks: bool = False,

698 force: bool = False,

699 output_dir: str | None = None,

700 ) -> bool:

701 """Destroy CDK stacks.

702

703 Args:

704 stack_name: Name of the stack to destroy

705 all_stacks: Destroy all stacks

706 force: Skip confirmation prompts

707 output_dir: Custom CDK output directory (for parallel deployments)

708 """

709 cmd = ["destroy"]

710

711 if all_stacks:

712 cmd.append("--all")

713 elif stack_name: 713 ↛ 716line 713 didn't jump to line 716 because the condition on line 713 was always true

714 cmd.append(stack_name)

715

716 if force:

717 cmd.append("--force")

718

719 # Use custom output directory for parallel deployments

720 if output_dir:

721 cmd.extend(["--output", output_dir])

722

723 result = self._run_cdk(cmd)

724 return result.returncode == 0

725

726 def bootstrap(

727 self,

728 account: str | None = None,

729 region: str | None = None,

730 ) -> bool:

731 """Bootstrap CDK in an AWS account/region."""

732 cmd = ["bootstrap"]

733

734 if account and region:

735 cmd.append(f"aws://{account}/{region}")

736 elif region: 736 ↛ 739line 736 didn't jump to line 739 because the condition on line 736 was always true

737 cmd.append(f"aws://unknown-account/{region}")

738

739 result = self._run_cdk(cmd)

740 return result.returncode == 0

741

742 def is_bootstrapped(self, region: str) -> bool:

743 """Check if CDK has been bootstrapped in a region.

744

745 Looks for the CDKToolkit CloudFormation stack which is created

746 by ``cdk bootstrap``. Result is cached per region for the lifetime

747 of this StackManager instance.

748 """

749 if not hasattr(self, "_bootstrap_cache"): 749 ↛ 752line 749 didn't jump to line 752 because the condition on line 749 was always true

750 self._bootstrap_cache: dict[str, bool] = {}

751

752 if region in self._bootstrap_cache: 752 ↛ 753line 752 didn't jump to line 753 because the condition on line 752 was never true

753 return self._bootstrap_cache[region]

754

755 import boto3

756

757 cf = boto3.client("cloudformation", region_name=region)

758 try:

759 response = cf.describe_stacks(StackName="CDKToolkit")

760 stacks = response.get("Stacks", [])

761 if stacks:

762 status = stacks[0].get("StackStatus", "")

763 # Any non-deleted state counts as bootstrapped

764 result = "DELETE" not in status

765 self._bootstrap_cache[region] = result

766 return result

767 except ClientError:

768 pass # Stack doesn't exist — not bootstrapped

769 except Exception as e:

770 logger.debug("Failed to check CDK bootstrap in %s: %s", region, e)

771

772 self._bootstrap_cache[region] = False

773 return False

774

775 def ensure_bootstrapped(self, region: str) -> bool:

776 """Ensure a region is CDK-bootstrapped, auto-bootstrapping if needed.

777

778 Returns True if the region is (or was successfully) bootstrapped.

779 """

780 if self.is_bootstrapped(region):

781 return True

782

783 print(f"ℹ Region {region} is not CDK-bootstrapped. Bootstrapping now...")

784 success = self.bootstrap(region=region)

785 if success:

786 # Update cache so we don't re-check this region

787 if not hasattr(self, "_bootstrap_cache"): 787 ↛ 789line 787 didn't jump to line 789 because the condition on line 787 was always true

788 self._bootstrap_cache = {}

789 self._bootstrap_cache[region] = True

790 print(f"✓ CDK bootstrapped in {region}")

791 else:

792 print(f"✗ Failed to bootstrap CDK in {region}")

793 return success

794

795 def _get_deploy_region(self, stack_name: str) -> str | None:

796 """Determine the target AWS region for a given stack name."""

797 from .config import _load_cdk_json

798

799 cdk_regions = _load_cdk_json()

800

801 region: str | None

802 if stack_name == "gco-global":

803 region = cdk_regions.get("global") or self.config.global_region

804 return region

805 if stack_name == "gco-api-gateway":

806 region = cdk_regions.get("api_gateway") or self.config.api_gateway_region

807 return region

808 if stack_name == "gco-monitoring":

809 region = cdk_regions.get("monitoring") or self.config.monitoring_region

810 return region

811

812 # Regional stacks: gco-{region}

813 prefix = "gco-"

814 if stack_name.startswith(prefix):

815 return stack_name[len(prefix) :]

816

817 return None

818

819 def get_outputs(self, stack_name: str, region: str) -> dict[str, str]:

820 """Get stack outputs from CloudFormation."""

821 import boto3

822

823 cf = boto3.client("cloudformation", region_name=region)

824 try:

825 response = cf.describe_stacks(StackName=stack_name)

826 if response["Stacks"]:

827 stack = response["Stacks"][0]

828 outputs: dict[str, str] = {}

829 for output in stack.get("Outputs", []):

830 outputs[str(output["OutputKey"])] = str(output["OutputValue"])

831 return outputs

832 except Exception as e:

833 logger.debug("Failed to get outputs for %s in %s: %s", stack_name, region, e)

834 return {}

835

836 def get_stack_status(self, stack_name: str, region: str) -> StackInfo | None:

837 """Get detailed stack status from CloudFormation."""

838 import boto3

839

840 cf = boto3.client("cloudformation", region_name=region)

841 try:

842 response = cf.describe_stacks(StackName=stack_name)

843 if response["Stacks"]:

844 stack = response["Stacks"][0]

845 return StackInfo(

846 name=stack["StackName"],

847 status=stack["StackStatus"],

848 region=region,

849 created_time=stack.get("CreationTime"),

850 updated_time=stack.get("LastUpdatedTime"),

851 outputs={o["OutputKey"]: o["OutputValue"] for o in stack.get("Outputs", [])},

852 tags={t["Key"]: t["Value"] for t in stack.get("Tags", [])},

853 )

854 except Exception as e:

855 logger.debug("Failed to get stack status for %s in %s: %s", stack_name, region, e)

856 return None

857

def deploy_orchestrated(
    self,
    require_approval: bool = True,
    outputs_file: str | None = None,
    parameters: dict[str, str] | None = None,
    tags: dict[str, str] | None = None,
    progress: str = "events",
    on_stack_start: Callable[[str], None] | None = None,
    on_stack_complete: Callable[[str, bool], None] | None = None,
    parallel: bool = False,
    max_workers: int = 4,
) -> tuple[bool, list[str], list[str]]:
    """
    Deploy every stack in dependency order, stopping at the first failure.

    The stacks are split into three phases: ``gco-global`` and
    ``gco-api-gateway`` first, then every other stack (the regional
    stacks, optionally in parallel), and finally ``gco-monitoring``.

    Args:
        require_approval: Whether to require approval for changes
        outputs_file: File to write outputs to
        parameters: CDK parameters
        tags: Tags to apply to stacks
        progress: Progress display type
        on_stack_start: Callback(stack_name) called when starting a stack
        on_stack_complete: Callback(stack_name, success) called when stack completes
        parallel: Deploy regional stacks in parallel (only takes effect
            when there is more than one regional stack)
        max_workers: Maximum number of parallel deployments (default: 4)

    Returns:
        Tuple of (overall_success, successful_stacks, failed_stacks)
    """
    deploy_sequence = get_stack_deployment_order(self.list_stacks())

    leading = {"gco-global", "gco-api-gateway"}
    trailing = {"gco-monitoring"}
    fixed = leading | trailing

    head_stacks = [name for name in deploy_sequence if name in leading]
    region_stacks = [name for name in deploy_sequence if name not in fixed]
    tail_stacks = [name for name in deploy_sequence if name in trailing]

    successful: list[str] = []
    failed: list[str] = []

    shared_kwargs = {
        "require_approval": require_approval,
        "outputs_file": outputs_file,
        "parameters": parameters,
        "tags": tags,
        "progress": progress,
    }

    def run_in_order(names: list[str], **extra: bool) -> bool:
        """Deploy ``names`` one by one; return False at the first failure."""
        for name in names:
            if on_stack_start:
                on_stack_start(name)

            ok = self.deploy(stack_name=name, **shared_kwargs, **extra)
            (successful if ok else failed).append(name)

            if on_stack_complete:
                on_stack_complete(name, ok)

            # Stop on failure to prevent cascading issues
            if not ok:
                return False
        return True

    # Phase 1: pre-regional global stacks, sequentially.
    if not run_in_order(head_stacks):
        return False, successful, failed

    # Phase 2: regional stacks. They all pass --exclusively: the globals
    # were deployed in Phase 1, so CDK doesn't need to re-evaluate them.
    # Skipping that re-evaluation avoids re-running custom resources
    # (notably KubectlApplyManifests) on the global stacks every time a
    # regional stack is deployed.
    if parallel and len(region_stacks) > 1:
        done, broken = self._deploy_stacks_parallel(
            stacks=region_stacks,
            require_approval=require_approval,
            outputs_file=outputs_file,
            parameters=parameters,
            tags=tags,
            progress=progress,
            on_stack_start=on_stack_start,
            on_stack_complete=on_stack_complete,
            max_workers=max_workers,
        )
        successful.extend(done)
        failed.extend(broken)

        # Stop if any regional stack failed
        if broken:
            return False, successful, failed
    elif not run_in_order(region_stacks, exclusively=True):
        return False, successful, failed

    # Phase 3: post-regional stacks (monitoring), sequentially. Same
    # rationale as Phase 2: every upstream stack is already deployed, so
    # --exclusively prevents a redundant pass over them.
    if not run_in_order(tail_stacks, exclusively=True):
        return False, successful, failed

    return not failed, successful, failed

1020

def _deploy_stacks_parallel(
    self,
    stacks: list[str],
    require_approval: bool,
    outputs_file: str | None,
    parameters: dict[str, str] | None,
    tags: dict[str, str] | None,
    progress: str,
    on_stack_start: Callable[[str], None] | None,
    on_stack_complete: Callable[[str, bool], None] | None,
    max_workers: int,
) -> tuple[list[str], list[str]]:
    """Deploy multiple stacks in parallel using separate CDK output directories.

    Each stack is deployed on a worker thread with ``exclusively=True`` and
    its own temporary ``output_dir``. The directory is removed once the
    deploy call finishes, including when it raises.

    Args:
        stacks: Names of the stacks to deploy.
        require_approval: Forwarded to ``self.deploy``.
        outputs_file: Forwarded to ``self.deploy``.
        parameters: Forwarded to ``self.deploy``.
        tags: Forwarded to ``self.deploy``.
        progress: Forwarded to ``self.deploy``.
        on_stack_start: Optional callback(stack_name), invoked on the worker
            thread under a lock before the deploy starts.
        on_stack_complete: Optional callback(stack_name, success), invoked
            on the calling thread under the same lock as results arrive.
        max_workers: Size of the thread pool.

    Returns:
        Tuple of (successful_stacks, failed_stacks) in completion order.

    Raises:
        Whatever ``self.deploy`` or a callback raises; the exception
        surfaces when the corresponding future's result is collected.
    """
    import tempfile

    successful: list[str] = []
    failed: list[str] = []
    lock = Lock()

    def deploy_single(stack_name: str) -> tuple[str, bool]:
        # Use a unique output directory in /tmp for each parallel deployment
        # This avoids CDK copying cdk.out.* directories into assets
        output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-")
        try:
            if on_stack_start:
                with lock:
                    on_stack_start(stack_name)

            success = self.deploy(
                stack_name=stack_name,
                require_approval=require_approval,
                outputs_file=outputs_file,
                parameters=parameters,
                tags=tags,
                progress=progress,
                output_dir=output_dir,
                exclusively=True,
            )
        finally:
            # Always remove the temporary output directory, even when the
            # deploy raised, so failed runs don't leave directories behind.
            try:
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)
            except Exception as e:
                logger.debug("Cleanup of %s failed: %s", output_dir, e)

        return stack_name, success

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(deploy_single, stack): stack for stack in stacks}

        for future in as_completed(futures):
            stack_name, success = future.result()

            with lock:
                if success:
                    successful.append(stack_name)
                else:
                    failed.append(stack_name)

                if on_stack_complete:
                    on_stack_complete(stack_name, success)

    return successful, failed

1087

def destroy_orchestrated(
    self,
    force: bool = False,
    on_stack_start: Callable[[str], None] | None = None,
    on_stack_complete: Callable[[str, bool], None] | None = None,
    parallel: bool = False,
    max_workers: int = 4,
) -> tuple[bool, list[str], list[str]]:
    """
    Destroy all stacks in the correct order.

    Destroys the monitoring stack first, then regional stacks (optionally
    in parallel), then ``gco-api-gateway`` and ``gco-global``. Unlike the
    deploy flow, a failed destroy does not stop the remaining phases.

    Args:
        force: Skip confirmation prompts
        on_stack_start: Callback(stack_name) called when starting a stack
        on_stack_complete: Callback(stack_name, success) called when stack completes
        parallel: Destroy regional stacks in parallel (only takes effect
            when there is more than one regional stack)
        max_workers: Maximum number of parallel destructions (default: 4)

    Returns:
        Tuple of (overall_success, successful_stacks, failed_stacks)
    """
    stacks = self.list_stacks()

    # Phase 0: Clean up backup vault recovery points so the global stack
    # can be deleted cleanly by CloudFormation.
    self._cleanup_backup_vault()

    # Separate stacks into three groups (reverse of deploy order).
    pre_regional = {"gco-global", "gco-api-gateway"}
    post_regional = {"gco-monitoring"}

    post_regional_stacks = [s for s in stacks if s in post_regional]
    # Regional stacks are destroyed in reverse alphabetical order.
    regional_stacks = sorted(
        (s for s in stacks if s not in pre_regional and s not in post_regional),
        reverse=True,
    )
    # API gateway goes before the global stack (reverse of deploy priority).
    pre_regional_order = {"gco-api-gateway": 1, "gco-global": 2}
    pre_regional_stacks = sorted(
        (s for s in stacks if s in pre_regional),
        key=lambda x: pre_regional_order.get(x, 0),
    )

    successful: list[str] = []
    failed: list[str] = []

    def destroy_in_order(names: list[str]) -> None:
        """Destroy ``names`` one by one, recording each result."""
        for name in names:
            if on_stack_start:
                on_stack_start(name)

            success = self.destroy(stack_name=name, force=force)
            (successful if success else failed).append(name)

            if on_stack_complete:
                on_stack_complete(name, success)

    # Phase 1: Destroy post-regional stacks (monitoring) first
    destroy_in_order(post_regional_stacks)

    # Phase 2: Destroy regional stacks (parallel or sequential)
    # Each regional destroy has a background watchdog that proactively
    # deletes EKS-managed security groups + orphaned ENIs as soon as
    # they appear. Without this, CloudFormation's VPC delete step sits
    # in DELETE_IN_PROGRESS for ~10 min waiting for EKS to GC the
    # ``eks-cluster-sg-<cluster>-*`` security group and its cluster
    # ENIs. See ``_start_eks_sg_watchdog``.
    watchdog_stops: dict[str, Event] = {}
    watchdog_threads: dict[str, Thread] = {}
    try:
        for stack_name in regional_stacks:
            stop_event = Event()
            watchdog_stops[stack_name] = stop_event
            watchdog_threads[stack_name] = self._start_eks_sg_watchdog(stack_name, stop_event)

        if parallel and len(regional_stacks) > 1:
            # Parallel destruction of regional stacks
            successful_regional, failed_regional = self._destroy_stacks_parallel(
                stacks=regional_stacks,
                force=force,
                on_stack_start=on_stack_start,
                on_stack_complete=on_stack_complete,
                max_workers=max_workers,
            )
            successful.extend(successful_regional)
            failed.extend(failed_regional)
        else:
            # Sequential destruction
            destroy_in_order(regional_stacks)
    finally:
        # Signal every watchdog to stop even when a destroy raised, so no
        # watchdog keeps polling after this method has exited.
        for stop_event in watchdog_stops.values():
            stop_event.set()
        for thread in watchdog_threads.values():
            thread.join(timeout=5)

    # One final cleanup pass per stack to catch anything EKS re-created
    # during the destroy flow.
    for stack_name in regional_stacks:
        self._cleanup_eks_security_groups(stack_name)

    # Phase 3: Destroy pre-regional global stacks last
    destroy_in_order(pre_regional_stacks)

    return len(failed) == 0, successful, failed

1227

def _destroy_stacks_parallel(
    self,
    stacks: list[str],
    force: bool,
    on_stack_start: Callable[[str], None] | None,
    on_stack_complete: Callable[[str, bool], None] | None,
    max_workers: int,
) -> tuple[list[str], list[str]]:
    """Destroy multiple stacks in parallel using separate CDK output directories.

    Each stack is destroyed on a worker thread with its own temporary
    ``output_dir``. The directory is removed once the destroy call
    finishes, including when it raises.

    Args:
        stacks: Names of the stacks to destroy.
        force: Forwarded to ``self.destroy``.
        on_stack_start: Optional callback(stack_name), invoked on the worker
            thread under a lock before the destroy starts.
        on_stack_complete: Optional callback(stack_name, success), invoked
            on the calling thread under the same lock as results arrive.
        max_workers: Size of the thread pool.

    Returns:
        Tuple of (successful_stacks, failed_stacks) in completion order.

    Raises:
        Whatever ``self.destroy`` or a callback raises; the exception
        surfaces when the corresponding future's result is collected.
    """
    import tempfile

    successful: list[str] = []
    failed: list[str] = []
    lock = Lock()

    def destroy_single(stack_name: str) -> tuple[str, bool]:
        # Use a unique output directory in /tmp for each parallel destruction
        output_dir = tempfile.mkdtemp(prefix=f"cdk-{stack_name}-")
        try:
            if on_stack_start:
                with lock:
                    on_stack_start(stack_name)

            success = self.destroy(
                stack_name=stack_name,
                force=force,
                output_dir=output_dir,
            )
        finally:
            # Always remove the temporary output directory, even when the
            # destroy raised, so failed runs don't leave directories behind.
            try:
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)
            except Exception as e:
                logger.debug("Cleanup of %s failed: %s", output_dir, e)

        return stack_name, success

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(destroy_single, stack): stack for stack in stacks}

        for future in as_completed(futures):
            stack_name, success = future.result()

            with lock:
                if success:
                    successful.append(stack_name)
                else:
                    failed.append(stack_name)

                if on_stack_complete:
                    on_stack_complete(stack_name, success)

    return successful, failed

1284

def _cleanup_backup_vault(self) -> None:
    """Delete all recovery points from the GCO backup vault.

    This is called before destroy-all so CloudFormation can delete the
    backup vault cleanly. Without this, the vault deletion fails because
    it contains recovery points.

    The vault is the first one (in listing order) whose name contains the
    configured project name, compared case-insensitively. The whole
    operation is best-effort: a failure to delete one recovery point is
    logged at debug level, and any other failure is printed as a
    non-fatal warning.
    """
    import boto3

    global_region = self.config.global_region
    project_name = self.config.project_name

    try:
        backup_client = boto3.client("backup", region_name=global_region)

        # The vault name is lower-cased for the comparison, so the project
        # name has to be lower-cased too or a mixed-case name never matches.
        needle = project_name.lower()

        # Find the backup vault by listing vaults with the project prefix;
        # ``next`` stops paginating at the first match.
        vault_pages = backup_client.get_paginator("list_backup_vaults").paginate()
        vault_name = next(
            (
                vault["BackupVaultName"]
                for page in vault_pages
                for vault in page.get("BackupVaultList", [])
                if needle in vault["BackupVaultName"].lower()
            ),
            None,
        )

        if not vault_name:
            return

        # Delete all recovery points in the vault
        rp_paginator = backup_client.get_paginator("list_recovery_points_by_backup_vault")
        deleted = 0
        for page in rp_paginator.paginate(BackupVaultName=vault_name):
            for rp in page.get("RecoveryPoints", []):
                try:
                    backup_client.delete_recovery_point(
                        BackupVaultName=vault_name,
                        RecoveryPointArn=rp["RecoveryPointArn"],
                    )
                    deleted += 1
                except Exception as e:
                    logger.debug("Failed to delete recovery point: %s", e)

        if deleted > 0:
            print(f" Cleaned up {deleted} backup recovery points from {vault_name}")

    except Exception as e:
        print(f" Warning: Backup vault cleanup failed (non-fatal): {e}")

1333

def cleanup_eks_security_groups(self) -> None:
    """Run the EKS security-group cleanup for every regional stack.

    Every stack returned by ``list_stacks`` other than ``gco-global``,
    ``gco-api-gateway`` and ``gco-monitoring`` is treated as regional and
    passed to ``_cleanup_eks_security_groups``. Called between destroy
    retries to remove orphaned security groups that block VPC deletion.
    """
    non_regional = {"gco-global", "gco-api-gateway", "gco-monitoring"}
    for name in self.list_stacks():
        if name in non_regional:
            continue
        self._cleanup_eks_security_groups(name)

1345

def _cleanup_eks_security_groups(self, stack_name: str) -> None:
    """Delete EKS-managed security groups (and their ENIs) for one stack.

    EKS creates a cluster security group (eks-cluster-sg-<cluster-name>-*)
    that is owned by EKS, not CloudFormation. When the stack is destroyed,
    this SG and its attached ENIs can linger, causing VPC deletion to fail.

    The region is derived by stripping the ``<project_name>-`` part from
    ``stack_name`` (e.g. gco-us-east-1 -> us-east-1), and the cluster name
    is taken to be the stack name itself. For each matching security
    group, every network interface using it is detached (if attached) and
    deleted, then the group is deleted. Every failure is logged at debug
    level and never raised.
    """
    import time as _time

    import boto3

    region = stack_name.replace(f"{self.config.project_name}-", "", 1)
    # The cluster name matches the stack name.
    name_pattern = f"eks-cluster-sg-{stack_name}-*"

    try:
        ec2 = boto3.client("ec2", region_name=region)

        # Find EKS-managed security groups by name pattern
        groups = ec2.describe_security_groups(
            Filters=[{"Name": "group-name", "Values": [name_pattern]}]
        ).get("SecurityGroups", [])

        for group in groups:
            group_id = group["GroupId"]
            group_name = group.get("GroupName", "")

            # First, detach and delete any ENIs using this SG
            interfaces = ec2.describe_network_interfaces(
                Filters=[{"Name": "group-id", "Values": [group_id]}]
            ).get("NetworkInterfaces", [])

            for interface in interfaces:
                interface_id = interface["NetworkInterfaceId"]
                try:
                    if interface.get("Attachment"):
                        ec2.detach_network_interface(
                            AttachmentId=interface["Attachment"]["AttachmentId"],
                            Force=True,
                        )
                        # Pause after the forced detach before deleting.
                        _time.sleep(5)
                    ec2.delete_network_interface(NetworkInterfaceId=interface_id)
                    logger.debug("Deleted ENI %s from SG %s", interface_id, group_name)
                except Exception as e:
                    logger.debug("Failed to delete ENI %s: %s", interface_id, e)

            # Now delete the security group
            try:
                ec2.delete_security_group(GroupId=group_id)
                print(f" Cleaned up EKS security group: {group_name} ({group_id})")
            except Exception as e:
                logger.debug(
                    "Failed to delete SG %s: %s (will retry on next attempt)", group_id, e
                )

    except Exception as e:
        # Non-fatal — the retry loop will handle it
        logger.debug("EKS security group cleanup for %s failed: %s", stack_name, e)

1417

def _start_eks_sg_watchdog(self, stack_name: str, stop_event: Event) -> Thread:
    """Start a daemon thread that repeatedly cleans up EKS security groups.

    EKS creates an ``eks-cluster-sg-<cluster-name>-*`` security group that
    is owned by the EKS service (not CloudFormation). If that group lingers
    after the cluster is gone it blocks the subsequent VPC delete, and
    CloudFormation sits in ``DELETE_IN_PROGRESS`` retrying.

    The thread calls ``_cleanup_eks_security_groups(stack_name)`` and then
    waits up to 30 seconds on ``stop_event``, repeating until the event is
    set. Exceptions from a cleanup pass are logged at debug level and do
    not end the loop.

    Args:
        stack_name: Regional stack whose security groups are cleaned up.
        stop_event: Set by the orchestrator to make the thread exit.

    Returns:
        The already-started thread, named ``eks-sg-watchdog-<stack_name>``.
    """

    def poll_until_stopped() -> None:
        while True:
            if stop_event.is_set():
                return
            try:
                self._cleanup_eks_security_groups(stack_name)
            except Exception as e:
                logger.debug(
                    "EKS SG watchdog tick for %s failed (non-fatal): %s",
                    stack_name,
                    e,
                )
            # ``wait`` returns as soon as the event is set, so it serves as
            # both the sleep between ticks and the shutdown check.
            stop_event.wait(timeout=30)

    watchdog = Thread(
        target=poll_until_stopped,
        name=f"eks-sg-watchdog-{stack_name}",
        daemon=True,
    )
    watchdog.start()
    return watchdog

1461

1462

def get_stack_manager(config: GCOConfig) -> StackManager:
    """Build a ``StackManager`` for the given configuration."""
    manager = StackManager(config)
    return manager

1466

1467

def get_stack_deployment_order(stacks: list[str]) -> list[str]:
    """
    Sort stack names into deployment order.

    The three fixed stacks come first, in the order gco-global,
    gco-api-gateway, gco-monitoring. Every other name is treated as a
    regional stack (gco-{region}, e.g. gco-us-east-1) and follows in
    alphabetical order.
    """
    # Lower rank = deploy first.
    rank = {
        "gco-global": 1,
        "gco-api-gateway": 2,
        "gco-monitoring": 3,
    }

    fixed = sorted((name for name in stacks if name in rank), key=rank.__getitem__)
    regional = sorted(name for name in stacks if name not in rank)
    return fixed + regional

1497

1498

def get_stack_destroy_order(stacks: list[str]) -> list[str]:
    """
    Sort stack names into destroy order.

    This is the exact reverse of ``get_stack_deployment_order``: regional
    stacks first, then the global stacks.
    """
    return get_stack_deployment_order(stacks)[::-1]

1507

1508

1509# =============================================================================

1510# Feature toggle helpers

1511# =============================================================================

1512

# Default FSx for Lustre settings, passed as the fallback configuration to
# the generic feature-config helpers by get_fsx_config / update_fsx_config.
_FSX_DEFAULTS: dict[str, Any] = dict(
    enabled=False,
    storage_capacity_gib=1200,
    deployment_type="SCRATCH_2",
    per_unit_storage_throughput=200,
    data_compression_type="LZ4",
    import_path=None,
    export_path=None,
    auto_import_policy="NEW_CHANGED_DELETED",
)

1523

1524

def _find_cdk_json() -> Path | None:
    """Locate ``cdk.json`` by walking up from the current working directory.

    Returns:
        The path of the nearest ``cdk.json`` (the working directory is
        checked first, then each parent in turn), or ``None`` if no
        directory on the way up contains one.
    """
    start = Path.cwd()
    candidates = (folder / "cdk.json" for folder in (start, *start.parents))
    return next((candidate for candidate in candidates if candidate.exists()), None)

1533

1534

def get_fsx_config(region: str | None = None) -> dict[str, Any]:
    """Read the FSx for Lustre configuration from cdk.json.

    Thin wrapper over ``_get_feature_config`` for the ``fsx_lustre``
    context key, using ``_FSX_DEFAULTS`` as the fallback values.

    Args:
        region: Optional region; when given, region-specific overrides
            are looked up first.

    Returns:
        FSx configuration dictionary
    """
    return _get_feature_config(
        feature_key="fsx_lustre",
        default_config=_FSX_DEFAULTS,
        region=region,
    )

1546

1547

def update_fsx_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Write FSx for Lustre settings into cdk.json.

    Thin wrapper over ``_update_feature_config`` for the ``fsx_lustre``
    context key, using ``_FSX_DEFAULTS`` as the fallback values.

    Args:
        settings: FSx settings to update
        region: Optional region for region-specific config. If None,
            updates global config.
    """
    _update_feature_config(
        feature_key="fsx_lustre",
        settings=settings,
        default_config=_FSX_DEFAULTS,
        region=region,
    )

1556

1557

1558# =============================================================================

1559# Generic feature toggle helpers (used by FSx, Valkey, Aurora, and future features)

1560# =============================================================================

1561

1562

def _get_feature_config(
    feature_key: str,
    default_config: dict[str, Any],
    region: str | None = None,
) -> dict[str, Any]:
    """Get configuration for a toggleable feature from cdk.json.

    Values are layered, lowest priority first: ``default_config``, the
    ``context[feature_key]`` entry in cdk.json, and (when ``region`` has
    an entry under ``context["<feature_key>_regions"]``) that region's
    overrides. Defaults are applied in both the global and the
    region-specific case, so a partial cdk.json entry never yields a
    result that is missing default keys.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        default_config: Default configuration values when the key is missing.
        region: Optional region for region-specific overrides.

    Returns:
        Merged configuration dictionary. ``is_region_specific`` is always
        present; ``region`` is added only when a region override was applied.

    Raises:
        RuntimeError: If no cdk.json can be found.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        context = json.load(f).get("context", {})

    # Defaults first, then whatever cdk.json sets globally for the feature.
    base = {**default_config, **context.get(feature_key, {})}

    if region:
        region_overrides = context.get(f"{feature_key}_regions", {})
        if region in region_overrides:
            return {
                **base,
                **region_overrides[region],
                "region": region,
                "is_region_specific": True,
            }

    return {**base, "is_region_specific": False}

1601

1602

def _update_feature_config(
    feature_key: str,
    settings: dict[str, Any],
    default_config: dict[str, Any],
    region: str | None = None,
) -> None:
    """Update configuration for a toggleable feature in cdk.json.

    With a ``region``, the settings are written under
    ``context["<feature_key>_regions"][region]``; otherwise under
    ``context[feature_key]``, which is first seeded with a copy of
    ``default_config`` if it does not exist yet. Settings whose value is
    ``None`` are skipped, except for the ``enabled`` key, which is always
    written. The file is rewritten with two-space indentation.

    Args:
        feature_key: The cdk.json context key (e.g. "valkey", "aurora_pgvector").
        settings: Settings to update.
        default_config: Default configuration values when the key is missing.
        region: Optional region for region-specific config.

    Raises:
        RuntimeError: If no cdk.json can be found.
    """
    cdk_json_path = _find_cdk_json()
    if not cdk_json_path:
        raise RuntimeError("cdk.json not found")

    import json

    with open(cdk_json_path, encoding="utf-8") as f:
        document = json.load(f)

    context = document.setdefault("context", {})

    # Pick (creating it if needed) the dict the settings are written into.
    if region:
        target = context.setdefault(f"{feature_key}_regions", {}).setdefault(region, {})
    else:
        target = context.setdefault(feature_key, {**default_config})

    target.update(
        {name: value for name, value in settings.items() if value is not None or name == "enabled"}
    )

    with open(cdk_json_path, "w", encoding="utf-8") as f:
        json.dump(document, f, indent=2)

1647

1648

1649# =============================================================================

1650# Valkey configuration

1651# =============================================================================

1652

# Default Valkey settings, passed as the fallback configuration to the
# generic feature-config helpers by get_valkey_config / update_valkey_config.
_VALKEY_DEFAULTS: dict[str, Any] = dict(
    enabled=False,
    max_data_storage_gb=5,
    max_ecpu_per_second=5000,
    snapshot_retention_limit=1,
)

1659

1660

def get_valkey_config(region: str | None = None) -> dict[str, Any]:
    """Read the Valkey Serverless configuration from cdk.json.

    Thin wrapper over ``_get_feature_config`` for the ``valkey`` context
    key, using ``_VALKEY_DEFAULTS`` as the fallback values.
    """
    return _get_feature_config(
        feature_key="valkey",
        default_config=_VALKEY_DEFAULTS,
        region=region,
    )

1664

1665

def update_valkey_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Write Valkey Serverless settings into cdk.json.

    Thin wrapper over ``_update_feature_config`` for the ``valkey`` context
    key, using ``_VALKEY_DEFAULTS`` as the fallback values.
    """
    _update_feature_config(
        feature_key="valkey",
        settings=settings,
        default_config=_VALKEY_DEFAULTS,
        region=region,
    )

1669

1670

1671# =============================================================================

1672# Aurora pgvector configuration

1673# =============================================================================

1674

# Default Aurora pgvector settings, passed as the fallback configuration to
# the generic feature-config helpers by get_aurora_config / update_aurora_config.
_AURORA_DEFAULTS: dict[str, Any] = dict(
    enabled=False,
    min_acu=0,
    max_acu=16,
    backup_retention_days=7,
    deletion_protection=False,
)

1682

1683

def get_aurora_config(region: str | None = None) -> dict[str, Any]:
    """Read the Aurora pgvector configuration from cdk.json.

    Thin wrapper over ``_get_feature_config`` for the ``aurora_pgvector``
    context key, using ``_AURORA_DEFAULTS`` as the fallback values.
    """
    return _get_feature_config(
        feature_key="aurora_pgvector",
        default_config=_AURORA_DEFAULTS,
        region=region,
    )

1687

1688

def update_aurora_config(settings: dict[str, Any], region: str | None = None) -> None:
    """Write Aurora pgvector settings into cdk.json.

    Thin wrapper over ``_update_feature_config`` for the ``aurora_pgvector``
    context key, using ``_AURORA_DEFAULTS`` as the fallback values.
    """
    _update_feature_config(
        feature_key="aurora_pgvector",
        settings=settings,
        default_config=_AURORA_DEFAULTS,
        region=region,
    )

Coverage for cli / stacks.py: 88%

710 statements