Coverage for gco/stacks/global_stack.py: 97%

198 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1""" 

2Global stack for GCO (Global Capacity Orchestrator on AWS) - AWS Global Accelerator configuration. 

3 

4This stack creates the global-level resources that span all regions: 

5- AWS Global Accelerator with TCP listeners on ports 80 and 443 

6- Endpoint groups for each configured region 

7- SSM parameters for cross-region endpoint group ARN sharing 

8- DynamoDB tables for templates, webhooks, jobs, inference endpoints, and missions (global, replicated) 

9 

10The Global Accelerator provides: 

11- Single global endpoint for all regions 

12- Automatic health-based routing to nearest healthy region 

13- DDoS protection via AWS Shield Standard 

14- Reduced latency through AWS global network 

15 

16Architecture: 

17 Global Accelerator → Listener (80, 443) → Endpoint Groups (per region) 

18 

19 Regional ALBs (registered separately) 

20""" 

21 

22from typing import Any 

23 

24from aws_cdk import ( 

25 CfnOutput, 

26 Duration, 

27 Fn, 

28 RemovalPolicy, 

29 Stack, 

30) 

31from aws_cdk import aws_backup as backup 

32from aws_cdk import aws_dynamodb as dynamodb 

33from aws_cdk import aws_ecr as ecr 

34from aws_cdk import aws_events as events 

35from aws_cdk import aws_globalaccelerator as ga 

36from aws_cdk import aws_iam as iam 

37from aws_cdk import aws_kms as kms 

38from aws_cdk import aws_lambda as lambda_ 

39from aws_cdk import aws_s3 as s3 

40from aws_cdk import aws_ssm as ssm 

41from constructs import Construct 

42 

43from gco.config.config_loader import ConfigLoader 

44from gco.stacks.constants import ( 

45 CLUSTER_SHARED_BUCKET_NAME_PREFIX, 

46 CLUSTER_SHARED_SSM_PARAMETER_PREFIX, 

47 LAMBDA_PYTHON_RUNTIME, 

48) 

49 

50# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit 

51# Flowchart(s) generated from this file: 

52# * ``GCOGlobalStack.__init__`` -> ``diagrams/code_diagrams/gco/stacks/global_stack.GCOGlobalStack___init__.html`` 

53# (PNG: ``diagrams/code_diagrams/gco/stacks/global_stack.GCOGlobalStack___init__.png``) 

54# Regenerate with ``python diagrams/code_diagrams/generate.py``. 

55# <pyflowchart-code-diagram> END 

56 

57 

58# Default values for the ``images`` cdk.json block. The defaults match the 

59# documented retention posture: repos survive a stack destroy by default 

60# (``retain``), non-empty repos block destroy unless the operator explicitly 

61# flips ``empty_on_delete`` to true, lifecycle keeps the latest 20 tagged 

62# images and expires untagged ones after 7 days, and replication is enabled 

63# by default to every deployed region. 

64_IMAGES_DEFAULT_REMOVAL_POLICY = "retain" 

65_IMAGES_DEFAULT_EMPTY_ON_DELETE = False 

66_IMAGES_DEFAULT_KEEP_TAGGED = 20 

67_IMAGES_DEFAULT_EXPIRE_UNTAGGED_DAYS = 7 

68_IMAGES_DEFAULT_REPLICATION_ENABLED = True 

69_IMAGES_DEFAULT_REPLICATION_DESTINATIONS = "all_deployed_regions" 

70 

71_IMAGES_VALID_REMOVAL_POLICIES = ("retain", "destroy") 

72 

73 

74def _parse_images_config(cdk_context: dict[str, Any] | None) -> dict[str, Any]: 

75 """Parse the ``images`` block from cdk.json with defaults applied. 

76 

77 Returns a normalized dict shape that the rest of the global stack 

78 can consume without re-parsing. Validates ``removal_policy`` against 

79 the set ``{"retain", "destroy"}`` and ``replication.destinations`` 

80 against either the literal string ``"all_deployed_regions"`` or a 

81 ``list[str]``. 

82 

83 Args: 

84 cdk_context: The dict returned by ``self.node.try_get_context("images")``. 

85 ``None`` (the key being absent) is equivalent to an empty dict. 

86 

87 Returns: 

88 A dict with keys ``removal_policy``, ``empty_on_delete``, 

89 ``lifecycle`` (with ``keep_tagged`` and ``expire_untagged_days``), 

90 and ``replication`` (with ``enabled`` and ``destinations``). 

91 """ 

92 raw = cdk_context or {} 

93 

94 removal_policy = raw.get("removal_policy", _IMAGES_DEFAULT_REMOVAL_POLICY) 

95 if not isinstance(removal_policy, str) or removal_policy not in _IMAGES_VALID_REMOVAL_POLICIES: 

96 raise ValueError( 

97 f"images.removal_policy must be 'retain' or 'destroy', got {removal_policy!r}" 

98 ) 

99 

100 empty_on_delete = bool(raw.get("empty_on_delete", _IMAGES_DEFAULT_EMPTY_ON_DELETE)) 

101 

102 lifecycle_raw = raw.get("lifecycle") or {} 

103 if not isinstance(lifecycle_raw, dict): 

104 raise ValueError(f"images.lifecycle must be a mapping, got {type(lifecycle_raw).__name__}") 

105 keep_tagged = int(lifecycle_raw.get("keep_tagged", _IMAGES_DEFAULT_KEEP_TAGGED)) 

106 expire_untagged_days = int( 

107 lifecycle_raw.get("expire_untagged_days", _IMAGES_DEFAULT_EXPIRE_UNTAGGED_DAYS) 

108 ) 

109 

110 replication_raw = raw.get("replication") or {} 

111 if not isinstance(replication_raw, dict): 

112 raise ValueError( 

113 f"images.replication must be a mapping, got {type(replication_raw).__name__}" 

114 ) 

115 replication_enabled = bool(replication_raw.get("enabled", _IMAGES_DEFAULT_REPLICATION_ENABLED)) 

116 destinations = replication_raw.get("destinations", _IMAGES_DEFAULT_REPLICATION_DESTINATIONS) 

117 if isinstance(destinations, str): 

118 if destinations != _IMAGES_DEFAULT_REPLICATION_DESTINATIONS: 

119 raise ValueError( 

120 "images.replication.destinations must be the string " 

121 f"{_IMAGES_DEFAULT_REPLICATION_DESTINATIONS!r} or a list of region names, " 

122 f"got {destinations!r}" 

123 ) 

124 elif isinstance(destinations, list): 

125 if not all(isinstance(item, str) for item in destinations): 

126 raise ValueError( 

127 "images.replication.destinations list must contain only region name strings" 

128 ) 

129 else: 

130 raise ValueError( 

131 "images.replication.destinations must be the string " 

132 f"{_IMAGES_DEFAULT_REPLICATION_DESTINATIONS!r} or a list of region names, " 

133 f"got {type(destinations).__name__}" 

134 ) 

135 

136 return { 

137 "removal_policy": removal_policy, 

138 "empty_on_delete": empty_on_delete, 

139 "lifecycle": { 

140 "keep_tagged": keep_tagged, 

141 "expire_untagged_days": expire_untagged_days, 

142 }, 

143 "replication": { 

144 "enabled": replication_enabled, 

145 "destinations": destinations, 

146 }, 

147 } 

148 

149 

150class GCOGlobalStack(Stack): 

151 """ 

152 Global resources stack including AWS Global Accelerator. 

153 

154 This stack must be deployed before regional stacks. Regional stacks 

155 will register their ALBs with the endpoint groups created here. 

156 

157 Attributes: 

158 accelerator: The Global Accelerator resource 

159 listener: TCP listener for HTTP/HTTPS traffic 

160 endpoint_groups: Dict mapping region names to endpoint groups 

161 templates_table: DynamoDB table for job templates 

162 webhooks_table: DynamoDB table for webhooks 

163 missions_table: DynamoDB table for mission session state 

164 """ 

165 

def __init__(
    self, scope: Construct, construct_id: str, config: ConfigLoader, **kwargs: Any
) -> None:
    """Build every global-level resource of the deployment.

    Resources are created in this order: DynamoDB tables, model-weights
    bucket, cluster-shared KMS key / bucket / SSM parameters, the AWS Backup
    plan, the container-image replication rule and lookup Lambda, and finally
    the Global Accelerator with its listener, one endpoint group per
    configured region, stack outputs, and cdk-nag suppressions.

    Args:
        scope: Parent construct.
        construct_id: Logical ID of this stack within ``scope``.
        config: Loader that supplies project, region, and accelerator settings.
        **kwargs: Forwarded unchanged to ``Stack.__init__``.
    """
    super().__init__(scope, construct_id, **kwargs)

    self.config = config
    self.regional_endpoints: dict[str, str] = {}
    self.endpoint_groups: dict[str, ga.EndpointGroup] = {}

    accelerator_settings = self.config.get_global_accelerator_config()

    # Kept on the instance so other stacks can look the name up.
    self.accelerator_name = accelerator_settings["name"]

    # --- Data plane: tables and buckets -------------------------------------
    self._create_dynamodb_tables()
    self._create_model_bucket()

    # Cluster_Shared_Bucket, its KMS key and SSM parameters are always created
    # (there is no feature toggle): every Regional_Stack consumes them, and so
    # does GCOAnalyticsStack when analytics is enabled.
    self._create_cluster_shared_kms_key()
    self._create_cluster_shared_bucket()
    self._publish_cluster_shared_bucket_ssm_params()

    # The backup plan references the tables created above.
    self._create_backup_plan()

    # --- Container image registry -------------------------------------------
    # Parse the cdk.json ``images`` block, provision the optional ECR
    # replication rule for ``gco/*`` repos, and create the lookup-or-create
    # custom resource Lambda that ``cli images init`` invokes per repo on
    # demand. The Lambda is created regardless of the replication settings so
    # its function ARN is always available for downstream invocations.
    images_context = self.node.try_get_context("images")
    self.images_config = _parse_images_config(images_context)
    self._create_image_replication_rule()
    self._create_image_lookup_lambda()

    # --- Global Accelerator -------------------------------------------------
    self.accelerator = ga.Accelerator(
        self,
        "GCOAccelerator",
        accelerator_name=self.accelerator_name,
        enabled=True,
    )

    # CloudWatch metrics are keyed by the accelerator ID (a UUID), not by the
    # name or the ARN. The ARN has the form
    #   arn:aws:globalaccelerator::<account>:accelerator/<accelerator-id>
    # so the ID is the second "/"-separated segment, extracted at deploy time
    # with CloudFormation intrinsics.
    arn_segments = Fn.split("/", self.accelerator.accelerator_arn)
    self.accelerator_id = Fn.select(1, arn_segments)

    # A single TCP listener serves both HTTP (80) and HTTPS (443). Client
    # affinity decides whether GA pins a client to the same endpoint across
    # connections (see ``_resolve_client_affinity``).
    self.listener = self.accelerator.add_listener(
        "GCOListener",
        port_ranges=[ga.PortRange(from_port=port, to_port=port) for port in (80, 443)],
        protocol=ga.ConnectionProtocol.TCP,
        client_affinity=self._resolve_client_affinity(accelerator_settings),
    )

    # One endpoint group per configured region.
    for region in self.config.get_regions():
        self._create_endpoint_group(region)

    # Publish accelerator details for the other stacks.
    self._create_outputs()

    # cdk-nag suppressions are applied last, once all resources exist.
    self._apply_nag_suppressions()

239 

@staticmethod
def _resolve_client_affinity(ga_config: dict[str, Any]) -> ga.ClientAffinity:
    """Translate the ``client_affinity`` cdk.json setting into the CDK enum.

    Global Accelerator offers two client-affinity modes:

    - ``NONE`` (default): each new connection may go to any healthy
      endpoint, which spreads load most evenly.
    - ``SOURCE_IP``: connections from one source IP stay pinned to the
      same endpoint, useful when a workload keeps per-client state in a
      single region.

    The lookup is case-insensitive. A missing key, or any value other than
    ``SOURCE_IP``, resolves to ``NONE`` so the stack always synthesizes.
    Rejecting unknown strings is expected to be done earlier by
    ``ConfigLoader._validate_global_accelerator_config`` (not visible in this
    file).

    Args:
        ga_config: The Global Accelerator configuration mapping.

    Returns:
        The matching ``ga.ClientAffinity`` member.
    """
    requested = str(ga_config.get("client_affinity", "NONE")).upper()
    if requested == "SOURCE_IP":
        return ga.ClientAffinity.SOURCE_IP
    # "NONE" and every unrecognized value fall through to the default mode.
    return ga.ClientAffinity.NONE

263 

def _create_outputs(self) -> None:
    """Publish the accelerator's DNS name, ARN, and listener ARN as exports.

    Each value becomes a ``CfnOutput`` whose export name is prefixed with the
    project name so other stacks can import it.
    """
    export_prefix = self.config.get_project_name()

    # (construct id, value, description, export-name suffix)
    output_specs = (
        (
            "GlobalAcceleratorDnsName",
            self.accelerator.dns_name,
            "Global Accelerator DNS name for global endpoint",
            "global-accelerator-dns",
        ),
        (
            "GlobalAcceleratorArn",
            self.accelerator.accelerator_arn,
            "Global Accelerator ARN",
            "global-accelerator-arn",
        ),
        (
            "GlobalAcceleratorListenerArn",
            self.listener.listener_arn,
            "Global Accelerator Listener ARN",
            "global-accelerator-listener-arn",
        ),
    )

    for output_id, output_value, summary, export_suffix in output_specs:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=summary,
            export_name=f"{export_prefix}-{export_suffix}",
        )

291 

def _apply_nag_suppressions(self) -> None:
    """Register the cdk-nag suppressions that belong to the global stack."""
    # The import is deliberately function-scoped, as in the original code
    # (presumably to avoid an import cycle or import-time cost; verify).
    from gco.stacks.nag_suppressions import apply_all_suppressions as suppress_all

    suppress_all(self, stack_type="global")

297 

def _create_endpoint_group(self, region: str) -> None:
    """Add a Global Accelerator endpoint group for ``region``.

    The group uses HTTP health checks on port 80 so the accelerator checks
    that the services behind the ALB actually respond, rather than only that
    the port is open. The path and interval come from the accelerator
    configuration (defaults: ``/api/v1/health`` and 30 seconds); the
    threshold is fixed at 3.

    The new group is recorded in ``self.endpoint_groups`` and its ARN is
    published twice: as a CloudFormation export and as an SSM parameter that
    regional stacks read to register their ALBs.

    Args:
        region: AWS region name (e.g., 'us-east-1').
    """
    export_prefix = self.config.get_project_name()
    # "us-east-1" -> "Useast1": a dash-free token for construct IDs.
    region_token = region.replace("-", "").title()
    settings = self.config.get_global_accelerator_config()

    # By default the probe path targets the health-monitor service behind the ALB.
    probe_path = settings.get("health_check_path", "/api/v1/health")
    probe_interval = Duration.seconds(settings.get("health_check_interval", 30))

    group = self.listener.add_endpoint_group(
        f"EndpointGroup{region_token}",
        region=region,
        health_check_port=80,
        health_check_protocol=ga.HealthCheckProtocol.HTTP,
        health_check_path=probe_path,
        health_check_interval=probe_interval,
        health_check_threshold=3,
    )
    self.endpoint_groups[region] = group

    group_arn = group.endpoint_group_arn

    # CloudFormation export for regional stacks.
    CfnOutput(
        self,
        f"EndpointGroup{region_token}Arn",
        value=group_arn,
        description=f"Endpoint group ARN for {region}",
        export_name=f"{export_prefix}-endpoint-group-{region}-arn",
    )

    # SSM copy of the ARN for cross-region access by regional stacks.
    ssm.StringParameter(
        self,
        f"EndpointGroup{region_token}ArnParam",
        parameter_name=f"/{export_prefix}/endpoint-group-{region}-arn",
        string_value=group_arn,
        description=f"Global Accelerator endpoint group ARN for {region}",
    )

350 

def add_regional_endpoint(self, region: str, alb_arn: str) -> None:
    """Record the ALB ARN that serves ``region``.

    This only stores the ARN in ``self.regional_endpoints`` for reference; it
    does not register anything with the Global Accelerator. Because of CDK's
    cross-region reference limitations, the actual registration is done by a
    custom resource in the regional stack, which should use the endpoint
    group ARN exported by this stack (via an AwsCustomResource).

    Args:
        region: AWS region name the ALB belongs to.
        alb_arn: ARN of the regional Application Load Balancer.
    """
    # Bookkeeping only; a later call for the same region overwrites the entry.
    self.regional_endpoints.update({region: alb_arn})

363 

def get_accelerator_dns_name(self) -> str:
    """Return the Global Accelerator DNS name, coerced to ``str``."""
    dns_name = self.accelerator.dns_name
    return str(dns_name)

367 

def get_accelerator_arn(self) -> str:
    """Return the Global Accelerator ARN, coerced to ``str``."""
    accelerator_arn = self.accelerator.accelerator_arn
    return str(accelerator_arn)

371 

def get_listener_arn(self) -> str:
    """Return the Global Accelerator listener ARN, coerced to ``str``."""
    listener_arn = self.listener.listener_arn
    return str(listener_arn)

375 

def get_endpoint_group_arn(self, region: str) -> str:
    """Return the endpoint group ARN registered for ``region``.

    Args:
        region: AWS region name used as the key in ``self.endpoint_groups``.

    Returns:
        The endpoint group ARN, coerced to ``str``.

    Raises:
        ValueError: If no endpoint group was created for ``region``.
    """
    if region not in self.endpoint_groups:
        raise ValueError(f"No endpoint group found for region: {region}")
    group = self.endpoint_groups[region]
    return str(group.endpoint_group_arn)

381 

def _create_dynamodb_tables(self) -> None:
    """Create the five DynamoDB tables and publish their names and ARNs.

    Tables created (all on-demand billing, AWS-managed encryption,
    point-in-time recovery enabled, ``RemovalPolicy.DESTROY``):

    - ``<project>-job-templates`` keyed by ``template_name``
    - ``<project>-webhooks`` keyed by ``webhook_id`` (GSI ``namespace-index``)
    - ``<project>-jobs`` keyed by ``job_id`` with a ``ttl`` attribute
      (GSIs ``region-status-index``, ``namespace-index``, ``status-index``)
    - ``<project>-inference-endpoints`` keyed by ``endpoint_name``
    - ``<project>-missions`` keyed by ``session_id`` (GSI ``status-index``)

    Every table gets ``...TableName`` / ``...TableArn`` CloudFormation exports
    and an SSM parameter holding its name. The sequence of construct creation
    matches the original hand-written sequence.
    """
    prefix = self.config.get_project_name()

    def string_attr(attr_name: str) -> dynamodb.Attribute:
        # Every key attribute in these tables is a string.
        return dynamodb.Attribute(name=attr_name, type=dynamodb.AttributeType.STRING)

    def make_table(
        construct_id: str, name_suffix: str, key_name: str, **extra: Any
    ) -> dynamodb.Table:
        # Settings shared by all five tables; ``extra`` carries per-table options.
        return dynamodb.Table(
            self,
            construct_id,
            table_name=f"{prefix}-{name_suffix}",
            partition_key=string_attr(key_name),
            billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
            removal_policy=RemovalPolicy.DESTROY,
            point_in_time_recovery_specification=dynamodb.PointInTimeRecoverySpecification(
                point_in_time_recovery_enabled=True
            ),
            encryption=dynamodb.TableEncryption.AWS_MANAGED,
            **extra,
        )

    def add_index(
        table: dynamodb.Table, index_name: str, hash_key: str, range_key: str | None = None
    ) -> None:
        # All indexes project every attribute; the sort key is optional.
        index_kwargs: dict[str, Any] = {
            "index_name": index_name,
            "partition_key": string_attr(hash_key),
            "projection_type": dynamodb.ProjectionType.ALL,
        }
        if range_key is not None:
            index_kwargs["sort_key"] = string_attr(range_key)
        table.add_global_secondary_index(**index_kwargs)

    def export_table(
        table: dynamodb.Table, id_stem: str, export_stem: str, purpose: str
    ) -> None:
        # Name first, then ARN, as CloudFormation exports for regional stacks.
        CfnOutput(
            self,
            f"{id_stem}TableName",
            value=table.table_name,
            description=f"DynamoDB table name for {purpose}",
            export_name=f"{prefix}-{export_stem}-table-name",
        )
        CfnOutput(
            self,
            f"{id_stem}TableArn",
            value=table.table_arn,
            description=f"DynamoDB table ARN for {purpose}",
            export_name=f"{prefix}-{export_stem}-table-arn",
        )

    def publish_name(
        table: dynamodb.Table, id_stem: str, param_stem: str, purpose: str
    ) -> None:
        # SSM copy of the table name for cross-region access.
        ssm.StringParameter(
            self,
            f"{id_stem}TableNameParam",
            parameter_name=f"/{prefix}/{param_stem}-table-name",
            string_value=table.table_name,
            description=f"DynamoDB table name for {purpose}",
        )

    # Reusable job templates.
    self.templates_table = make_table("JobTemplatesTable", "job-templates", "template_name")

    # Webhook registrations, queryable by namespace.
    self.webhooks_table = make_table("WebhooksTable", "webhooks", "webhook_id")
    add_index(self.webhooks_table, "namespace-index", "namespace")

    # Centralized job tracking and queue: global submission, regional pickup.
    # The ``ttl`` attribute lets DynamoDB clean up old completed jobs.
    self.jobs_table = make_table("JobsTable", "jobs", "job_id", time_to_live_attribute="ttl")
    # Regional polling by target region and status.
    add_index(self.jobs_table, "region-status-index", "target_region", "status")
    # Jobs per namespace, ordered by submission time.
    add_index(self.jobs_table, "namespace-index", "namespace", "submitted_at")
    # Jobs per status across all regions.
    add_index(self.jobs_table, "status-index", "status", "submitted_at")

    export_table(self.templates_table, "Templates", "templates", "job templates")
    export_table(self.webhooks_table, "Webhooks", "webhooks", "webhooks")
    export_table(self.jobs_table, "Jobs", "jobs", "centralized job tracking")

    # Desired state for inference deployments; polled by the inference_monitor
    # in each regional cluster.
    self.inference_endpoints_table = make_table(
        "InferenceEndpointsTable", "inference-endpoints", "endpoint_name"
    )
    export_table(
        self.inference_endpoints_table,
        "InferenceEndpoints",
        "inference-endpoints",
        "inference endpoint state",
    )

    # Goal-directed iteration session state, partitioned by session_id. The
    # status index supports paginated listing by status (e.g. running,
    # completed, terminated, failed), sorted by creation time.
    self.missions_table = make_table("MissionsTable", "missions", "session_id")
    add_index(self.missions_table, "status-index", "status", "created_at")
    export_table(self.missions_table, "Missions", "missions", "mission session state")

    # Table names in SSM for cross-region access.
    publish_name(self.templates_table, "Templates", "templates", "job templates")
    publish_name(self.webhooks_table, "Webhooks", "webhooks", "webhooks")
    publish_name(self.jobs_table, "Jobs", "jobs", "centralized job tracking")
    publish_name(
        self.inference_endpoints_table,
        "InferenceEndpoints",
        "inference-endpoints",
        "inference endpoint state",
    )
    publish_name(self.missions_table, "Missions", "missions", "mission session state")

663 

def _create_model_bucket(self) -> None:
    """Create the S3 bucket for model weights, its KMS key, and its log bucket.

    The model bucket is the central model registry: users upload model
    weights once, and the inference_monitor's init containers sync them to
    each region's local EFS at pod startup.

    No explicit bucket name is passed, so CDK generates one (avoiding naming
    collisions). The name is exported via ``CfnOutput`` and SSM for CLI
    discovery.

    Server access logs go to a separate SSE-S3 bucket. Retention is read from
    the cdk.json context field ``s3_access_logs.retention_days`` (default 90).
    Because that bucket is versioned, the lifecycle rule expires both the
    current objects and, one day later, the noncurrent versions that
    expiration leaves behind; without the second action expired logs would be
    stored indefinitely.
    """
    project_name = self.config.get_project_name()

    # Customer-managed key for the model bucket, with rotation enabled.
    self.model_bucket_key = kms.Key(
        self,
        "ModelBucketKey",
        description="KMS key for GCO model weights bucket",
        enable_key_rotation=True,
        removal_policy=RemovalPolicy.DESTROY,
    )

    # Access-log retention comes from cdk.json (``s3_access_logs.retention_days``).
    s3_access_logs_ctx = self.node.try_get_context("s3_access_logs") or {}
    access_logs_retention_days = int(s3_access_logs_ctx.get("retention_days", 90))

    # Access logs bucket (required for compliance).
    self.model_bucket_access_logs = s3.Bucket(
        self,
        "ModelWeightsAccessLogsBucket",
        encryption=s3.BucketEncryption.S3_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        lifecycle_rules=[
            s3.LifecycleRule(
                id="ExpireAccessLogs",
                enabled=True,
                expiration=Duration.days(access_logs_retention_days),
                # In a versioned bucket ``expiration`` only adds a delete
                # marker and demotes the log to a noncurrent version. This
                # action removes that version, so retention is enforced.
                noncurrent_version_expiration=Duration.days(1),
            )
        ],
    )

    # Model weights bucket: KMS-encrypted, versioned, logging to the bucket above.
    self.model_bucket = s3.Bucket(
        self,
        "ModelWeightsBucket",
        encryption=s3.BucketEncryption.KMS,
        encryption_key=self.model_bucket_key,
        bucket_key_enabled=True,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        server_access_logs_bucket=self.model_bucket_access_logs,
        server_access_logs_prefix="model-bucket-logs/",
    )

    # cdk-nag suppressions. The same finding exists in three compliance packs,
    # so the rule IDs are generated per pack in a fixed order.
    from cdk_nag import NagSuppressions

    compliance_packs = ("HIPAA.Security", "NIST.800.53.R5", "PCI.DSS.321")

    # Model bucket: only replication is suppressed.
    replication_reason = (
        "Model weights are user-uploaded artifacts that can be re-uploaded. "
        "Cross-region replication is not required; the inference_monitor "
        "syncs models from S3 to each region's EFS at pod startup."
    )
    NagSuppressions.add_resource_suppressions(
        self.model_bucket,
        [
            {"id": f"{pack}-S3BucketReplicationEnabled", "reason": replication_reason}
            for pack in compliance_packs
        ],
    )

    # Log bucket: it is the logging destination, needs no replication, and
    # uses SSE-S3 rather than KMS.
    logs_reason = "This is the server access logs destination bucket."
    logs_suppressions = [{"id": "AwsSolutions-S1", "reason": logs_reason}]
    for pack in compliance_packs:
        logs_suppressions.extend(
            [
                {"id": f"{pack}-S3BucketLoggingEnabled", "reason": logs_reason},
                {
                    "id": f"{pack}-S3BucketReplicationEnabled",
                    "reason": "Access logs do not require replication.",
                },
                {
                    "id": f"{pack}-S3DefaultEncryptionKMS",
                    "reason": "SSE-S3 is sufficient for access logs.",
                },
            ]
        )
    NagSuppressions.add_resource_suppressions(self.model_bucket_access_logs, logs_suppressions)

    # Publish the bucket name and ARN for other stacks and for the CLI.
    CfnOutput(
        self,
        "ModelBucketName",
        value=self.model_bucket.bucket_name,
        description="S3 bucket for model weights",
        export_name=f"{project_name}-model-bucket-name",
    )

    CfnOutput(
        self,
        "ModelBucketArn",
        value=self.model_bucket.bucket_arn,
        description="S3 bucket ARN for model weights",
        export_name=f"{project_name}-model-bucket-arn",
    )

    ssm.StringParameter(
        self,
        "ModelBucketNameParam",
        parameter_name=f"/{project_name}/model-bucket-name",
        string_value=self.model_bucket.bucket_name,
        description="S3 bucket name for model weights",
    )

810 

def _create_backup_plan(self) -> None:
    """Create the AWS Backup vault, plan, and selection for the DynamoDB tables.

    The plan has two rules that store into the same vault:

    - ``DailyBackup``: cron at hour 3, minute 0; retained for 35 days; with
      continuous backup (point-in-time recovery) enabled.
    - ``WeeklyBackup``: cron on Sunday at hour 4, minute 0; retained for
      90 days.

    The templates, webhooks, jobs, inference-endpoints, and missions tables
    are all added to the selection, so ``_create_dynamodb_tables`` must have
    run first. The plan and vault ARNs are exported as CloudFormation outputs.
    """
    # Vault that stores the recovery points.
    vault = backup.BackupVault(
        self,
        "DynamoDBBackupVault",
        removal_policy=RemovalPolicy.DESTROY,
    )
    self.backup_vault = vault

    daily_rule = backup.BackupPlanRule(
        rule_name="DailyBackup",
        backup_vault=vault,
        schedule_expression=events.Schedule.cron(hour="3", minute="0"),
        delete_after=Duration.days(35),
        enable_continuous_backup=True,  # enables PITR for DynamoDB
    )
    weekly_rule = backup.BackupPlanRule(
        rule_name="WeeklyBackup",
        backup_vault=vault,
        schedule_expression=events.Schedule.cron(hour="4", minute="0", week_day="SUN"),
        delete_after=Duration.days(90),
    )

    self.backup_plan = backup.BackupPlan(
        self,
        "DynamoDBBackupPlan",
        backup_plan_rules=[daily_rule, weekly_rule],
    )

    # Every DynamoDB table owned by this stack is protected by the plan.
    protected_tables = (
        self.templates_table,
        self.webhooks_table,
        self.jobs_table,
        self.inference_endpoints_table,
        self.missions_table,
    )
    self.backup_plan.add_selection(
        "DynamoDBTablesSelection",
        resources=[
            backup.BackupResource.from_dynamo_db_table(table) for table in protected_tables
        ],
    )

    # Export the plan and vault ARNs.
    export_prefix = self.config.get_project_name()
    CfnOutput(
        self,
        "BackupPlanArn",
        value=self.backup_plan.backup_plan_arn,
        description="AWS Backup plan ARN for DynamoDB tables",
        export_name=f"{export_prefix}-backup-plan-arn",
    )
    CfnOutput(
        self,
        "BackupVaultArn",
        value=vault.backup_vault_arn,
        description="AWS Backup vault ARN for DynamoDB backups",
        export_name=f"{export_prefix}-backup-vault-arn",
    )

885 

def _create_cluster_shared_kms_key(self) -> None:
    """Create the customer-managed KMS key used by ``Cluster_Shared_Bucket``.

    The key is stored on ``self.cluster_shared_kms_key`` and is configured
    with:

    - automatic key rotation enabled,
    - a 7-day pending-deletion window,
    - ``RemovalPolicy.DESTROY`` so it is removed together with the stack.

    Two statements are appended to the key policy. Each allows one service
    principal (``s3.amazonaws.com`` and ``logs.<region>.amazonaws.com``) to
    call ``kms:Encrypt``, ``kms:Decrypt``, ``kms:ReEncrypt*``,
    ``kms:GenerateDataKey*`` and ``kms:DescribeKey`` on the key.

    No role-side grants are created here; presumably downstream stacks
    attach them to their own roles (verify against the consumers).
    """
    shared_key = kms.Key(
        self,
        "ClusterSharedKmsKey",
        description=(
            "Customer-managed KMS key for the always-on Cluster_Shared_Bucket "
            "in GCOGlobalStack. Consumed by every regional EKS cluster and by "
            "GCOAnalyticsStack when analytics is enabled."
        ),
        enable_key_rotation=True,
        pending_window=Duration.days(7),
        removal_policy=RemovalPolicy.DESTROY,
    )
    self.cluster_shared_kms_key = shared_key

    # Actions granted to each service principal below.
    service_actions = [
        "kms:Encrypt",
        "kms:Decrypt",
        "kms:ReEncrypt*",
        "kms:GenerateDataKey*",
        "kms:DescribeKey",
    ]

    # (statement SID, service principal) pairs, added in this order.
    # NOTE(review): S3 server access logs are written by the S3 logging
    # service rather than CloudWatch Logs; confirm the logs.<region>
    # grant is still required.
    service_grants = (
        ("AllowS3ServiceEncryptDecrypt", "s3.amazonaws.com"),
        ("AllowCloudWatchLogsEncryptDecrypt", f"logs.{self.region}.amazonaws.com"),
    )
    for statement_sid, service_name in service_grants:
        shared_key.add_to_resource_policy(
            iam.PolicyStatement(
                sid=statement_sid,
                effect=iam.Effect.ALLOW,
                principals=[iam.ServicePrincipal(service_name)],
                actions=service_actions,
                resources=["*"],
            )
        )

952 

def _create_cluster_shared_bucket(self) -> None:
    """Create the always-on ``Cluster_Shared_Bucket`` and its access-logs bucket.

    Two buckets are created:

    1. ``self.cluster_shared_access_logs_bucket``: the dedicated server
       access-logs destination for the primary bucket. It uses SSE-S3
       (``BucketEncryption.S3_MANAGED``) because Amazon S3 server access
       logging does not deliver to a destination bucket whose default
       encryption is SSE-KMS. Block-public-access on, SSL enforced,
       versioned, destroy-on-teardown, with a lifecycle rule expiring
       objects after ``s3_access_logs.retention_days`` (context value,
       default 90).
    2. ``self.cluster_shared_bucket``: the primary bucket, named
       ``<CLUSTER_SHARED_BUCKET_NAME_PREFIX>-<account>-<region>``.
       KMS-encrypted with ``self.cluster_shared_kms_key`` (bucket key
       enabled), block-public-access on, SSL enforced, versioned,
       destroy-on-teardown, logging to the bucket above under the
       ``cluster-shared/`` prefix.

    An explicit ``Deny`` statement (SID ``DenyInsecureTransport``) for
    ``aws:SecureTransport=false`` is added to the primary bucket's policy in
    addition to ``enforce_ssl=True`` so the deny is present under a known
    SID in the synthesized template.

    No Allow grants are added to the bucket policy here.

    Requires ``self.cluster_shared_kms_key`` to have been created first.
    """
    # Retention for the access-logs bucket comes from the `s3_access_logs`
    # context object; a missing object or key falls back to 90 days.
    s3_access_logs_ctx = self.node.try_get_context("s3_access_logs") or {}
    access_logs_retention_days = int(s3_access_logs_ctx.get("retention_days", 90))

    # Dedicated access-logs bucket for Cluster_Shared_Bucket.
    #
    # Encryption is SSE-S3 on purpose: S3 server access logging only
    # supports SSE-S3 default encryption on the destination bucket; with
    # SSE-KMS the log objects are not delivered. The primary bucket below
    # still uses the customer-managed key.
    #
    # NOTE(review): the bucket is versioned and the lifecycle rule only sets
    # `expiration`; confirm whether noncurrent versions also need an expiry
    # so expired log objects are eventually removed.
    self.cluster_shared_access_logs_bucket = s3.Bucket(
        self,
        "ClusterSharedAccessLogsBucket",
        encryption=s3.BucketEncryption.S3_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        lifecycle_rules=[
            s3.LifecycleRule(
                id="ExpireAccessLogs",
                enabled=True,
                expiration=Duration.days(access_logs_retention_days),
            )
        ],
    )

    # Primary Cluster_Shared_Bucket. The name is built from the constant
    # prefix plus account and region. `bucket_key_enabled=True` reduces
    # per-object KMS request costs.
    self.cluster_shared_bucket = s3.Bucket(
        self,
        "ClusterSharedBucket",
        bucket_name=f"{CLUSTER_SHARED_BUCKET_NAME_PREFIX}-{self.account}-{self.region}",
        encryption=s3.BucketEncryption.KMS,
        encryption_key=self.cluster_shared_kms_key,
        bucket_key_enabled=True,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        server_access_logs_bucket=self.cluster_shared_access_logs_bucket,
        server_access_logs_prefix="cluster-shared/",
    )

    # Explicit Deny for insecure transport. `enforce_ssl=True` already adds
    # an equivalent statement; repeating it under a fixed SID makes the deny
    # easy to assert on in the synthesized template.
    self.cluster_shared_bucket.add_to_resource_policy(
        iam.PolicyStatement(
            sid="DenyInsecureTransport",
            effect=iam.Effect.DENY,
            principals=[iam.AnyPrincipal()],
            actions=["s3:*"],
            resources=[
                self.cluster_shared_bucket.bucket_arn,
                f"{self.cluster_shared_bucket.bucket_arn}/*",
            ],
            conditions={"Bool": {"aws:SecureTransport": "false"}},
        )
    )

    # cdk-nag suppressions are attached per resource, next to the construct
    # they apply to, and each one carries an explicit reason string.
    from cdk_nag import NagSuppressions

    shared_replication_reason = (
        "Cluster_Shared_Bucket is a regional scratch sink; cluster jobs "
        "publish to it from a single region, and there is no durability "
        "requirement that warrants cross-region replication. Access logs "
        "do not require replication for the same reason."
    )

    NagSuppressions.add_resource_suppressions(
        self.cluster_shared_bucket,
        [
            {
                "id": "HIPAA.Security-S3BucketReplicationEnabled",
                "reason": shared_replication_reason,
            },
            {
                "id": "NIST.800.53.R5-S3BucketReplicationEnabled",
                "reason": shared_replication_reason,
            },
            {
                "id": "PCI.DSS.321-S3BucketReplicationEnabled",
                "reason": shared_replication_reason,
            },
        ],
    )

    access_logs_is_self_target_reason = (
        "This is the server access logs destination bucket for Cluster_Shared_Bucket."
    )
    # The log sink cannot use SSE-KMS (see the bucket definition above), so
    # the "default encryption must be KMS" rules are suppressed for it only.
    access_logs_sse_s3_reason = (
        "Amazon S3 server access logging cannot deliver to a destination "
        "bucket whose default encryption is SSE-KMS, so this log sink uses "
        "SSE-S3 instead."
    )
    NagSuppressions.add_resource_suppressions(
        self.cluster_shared_access_logs_bucket,
        [
            {
                "id": "AwsSolutions-S1",
                "reason": access_logs_is_self_target_reason,
            },
            {
                "id": "HIPAA.Security-S3BucketLoggingEnabled",
                "reason": access_logs_is_self_target_reason,
            },
            {
                "id": "NIST.800.53.R5-S3BucketLoggingEnabled",
                "reason": access_logs_is_self_target_reason,
            },
            {
                "id": "PCI.DSS.321-S3BucketLoggingEnabled",
                "reason": access_logs_is_self_target_reason,
            },
            {
                "id": "HIPAA.Security-S3BucketReplicationEnabled",
                "reason": shared_replication_reason,
            },
            {
                "id": "NIST.800.53.R5-S3BucketReplicationEnabled",
                "reason": shared_replication_reason,
            },
            {
                "id": "PCI.DSS.321-S3BucketReplicationEnabled",
                "reason": shared_replication_reason,
            },
            {
                "id": "HIPAA.Security-S3DefaultEncryptionKMS",
                "reason": access_logs_sse_s3_reason,
            },
            {
                "id": "NIST.800.53.R5-S3DefaultEncryptionKMS",
                "reason": access_logs_sse_s3_reason,
            },
            {
                "id": "PCI.DSS.321-S3DefaultEncryptionKMS",
                "reason": access_logs_sse_s3_reason,
            },
        ],
    )

1115 

def _publish_cluster_shared_bucket_ssm_params(self) -> None:
    """Publish the cluster-shared bucket's coordinates via SSM and stack outputs.

    Three SSM string parameters are written under
    ``CLUSTER_SHARED_SSM_PARAMETER_PREFIX``:

    - ``<prefix>/name``: bucket name
    - ``<prefix>/arn``: bucket ARN
    - ``<prefix>/region``: this stack's region

    Four ``CfnOutput`` values are then emitted: the same three values plus
    the ARN of ``self.cluster_shared_kms_key``. Each is exported as
    ``{project_name}-cluster-shared-{suffix}``.

    Requires ``self.cluster_shared_bucket`` and ``self.cluster_shared_kms_key``
    to exist already.
    """
    project_name = self.config.get_project_name()
    shared_bucket = self.cluster_shared_bucket

    # (construct id, parameter leaf, value, description)
    ssm_entries = (
        (
            "ClusterSharedBucketNameParam",
            "name",
            shared_bucket.bucket_name,
            "Name of the always-on Cluster_Shared_Bucket (owned by GCOGlobalStack).",
        ),
        (
            "ClusterSharedBucketArnParam",
            "arn",
            shared_bucket.bucket_arn,
            "ARN of the always-on Cluster_Shared_Bucket (owned by GCOGlobalStack).",
        ),
        (
            "ClusterSharedBucketRegionParam",
            "region",
            self.region,
            "Home region of the always-on Cluster_Shared_Bucket (the global region).",
        ),
    )
    for param_id, leaf, param_value, param_description in ssm_entries:
        ssm.StringParameter(
            self,
            param_id,
            parameter_name=f"{CLUSTER_SHARED_SSM_PARAMETER_PREFIX}/{leaf}",
            string_value=param_value,
            description=param_description,
        )

    # (construct id, value, description, export-name suffix)
    output_entries = (
        (
            "ClusterSharedBucketName",
            shared_bucket.bucket_name,
            "Name of the always-on Cluster_Shared_Bucket.",
            "bucket-name",
        ),
        (
            "ClusterSharedBucketArn",
            shared_bucket.bucket_arn,
            "ARN of the always-on Cluster_Shared_Bucket.",
            "bucket-arn",
        ),
        (
            "ClusterSharedBucketRegion",
            self.region,
            "Home region of the always-on Cluster_Shared_Bucket.",
            "bucket-region",
        ),
        (
            "ClusterSharedKmsKeyArn",
            self.cluster_shared_kms_key.key_arn,
            "ARN of the always-on KMS key encrypting Cluster_Shared_Bucket.",
            "kms-key-arn",
        ),
    )
    for output_id, output_value, output_description, suffix in output_entries:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=output_description,
            export_name=f"{project_name}-cluster-shared-{suffix}",
        )

1196 

1197 def _resolve_replication_destinations(self, destinations: str | list[str]) -> list[str]: 

1198 """Resolve the configured replication destinations into a region list. 

1199 

1200 When ``destinations`` is the literal ``"all_deployed_regions"``, the 

1201 list comes from ``self.config.get_regions()`` (the same source the 

1202 rest of the stack uses for cross-region wiring). When it is an 

1203 explicit list, it is returned as-is. The source region (the global 

1204 stack's deploy region) is excluded — ECR replication is point-to-point 

1205 and a self-referential destination is rejected by the API. 

1206 """ 

1207 if isinstance(destinations, str): 1207 ↛ 1210line 1207 didn't jump to line 1210 because the condition on line 1207 was always true

1208 candidate_regions = list(self.config.get_regions()) 

1209 else: 

1210 candidate_regions = list(destinations) 

1211 return [region for region in candidate_regions if region != self.region] 

1212 

1213 def _create_image_replication_rule(self) -> None: 

1214 """Provision the ECR replication rule for ``gco/*`` repositories. 

1215 

1216 When ``images.replication.enabled`` is True and at least one 

1217 non-source destination resolves, creates one 

1218 ``aws_ecr.CfnReplicationConfiguration`` rule with a single 

1219 ``PREFIX_MATCH`` filter on ``gco/`` and one destination per resolved 

1220 region. When replication is disabled or the destination list is 

1221 empty (e.g. single-region deploy), no replication resource is 

1222 provisioned and the method becomes a no-op. 

1223 """ 

1224 if not self.images_config["replication"]["enabled"]: 1224 ↛ 1225line 1224 didn't jump to line 1225 because the condition on line 1224 was never true

1225 return 

1226 

1227 destinations = self._resolve_replication_destinations( 

1228 self.images_config["replication"]["destinations"] 

1229 ) 

1230 if not destinations: 1230 ↛ 1231line 1230 didn't jump to line 1231 because the condition on line 1230 was never true

1231 return 

1232 

1233 ecr.CfnReplicationConfiguration( 

1234 self, 

1235 "GcoImageReplicationConfig", 

1236 replication_configuration=ecr.CfnReplicationConfiguration.ReplicationConfigurationProperty( 

1237 rules=[ 

1238 ecr.CfnReplicationConfiguration.ReplicationRuleProperty( 

1239 destinations=[ 

1240 ecr.CfnReplicationConfiguration.ReplicationDestinationProperty( 

1241 region=region, 

1242 registry_id=self.account, 

1243 ) 

1244 for region in destinations 

1245 ], 

1246 repository_filters=[ 

1247 ecr.CfnReplicationConfiguration.RepositoryFilterProperty( 

1248 filter="gco/", 

1249 filter_type="PREFIX_MATCH", 

1250 ) 

1251 ], 

1252 ) 

1253 ] 

1254 ), 

1255 ) 

1256 

def _create_image_lookup_lambda(self) -> None:
    """Create the Lambda backing the lookup-or-create resource for image repos.

    Steps performed:

    - Creates ``self.image_lookup_lambda`` from the ``lambda/image-lookup``
      asset (handler ``handler.lambda_handler``, 5-minute timeout, runtime
      taken from ``LAMBDA_PYTHON_RUNTIME``).
    - Adds an Allow statement to the function's role covering repository
      management and image listing/deletion actions on
      ``arn:aws:ecr:*:<account>:repository/gco/*``.
    - Exports the function ARN as
      ``{project_name}-image-lookup-function-arn``.
    - Suppresses cdk-nag ``AwsSolutions-IAM5`` on the role (and its
      children) for that one ARN pattern.

    How and when the function is invoked is decided outside this method.
    """
    project_name = self.config.get_project_name()

    # Every ECR permission below is limited to repositories under ``gco/``
    # in this account, in any region.
    repo_arn = f"arn:aws:ecr:*:{self.account}:repository/gco/*"

    lookup_fn = lambda_.Function(
        self,
        "ImageLookupFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/image-lookup"),
        timeout=Duration.minutes(5),
        description=(
            "Lookup-or-create custom resource handler for ECR "
            "repositories under the project's gco/* prefix."
        ),
    )
    self.image_lookup_lambda = lookup_fn

    # ``role`` is optional on the construct type; narrow it before use.
    assert self.image_lookup_lambda.role is not None
    execution_role = self.image_lookup_lambda.role

    ecr_actions = [
        "ecr:DescribeRepositories",
        "ecr:CreateRepository",
        "ecr:DeleteRepository",
        "ecr:PutLifecyclePolicy",
        "ecr:GetLifecyclePolicy",
        "ecr:TagResource",
        "ecr:ListTagsForResource",
        "ecr:BatchDeleteImage",
        "ecr:DescribeImages",
        "ecr:ListImages",
    ]
    execution_role.add_to_principal_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=ecr_actions,
            resources=[repo_arn],
        )
    )

    CfnOutput(
        self,
        "ImageLookupFunctionArn",
        value=lookup_fn.function_arn,
        description=(
            "Lambda ARN for the lookup-or-create custom resource that "
            "manages ECR repositories under the gco/* prefix."
        ),
        export_name=f"{project_name}-image-lookup-function-arn",
    )

    # cdk-nag reports ``AwsSolutions-IAM5`` for the trailing ``*`` in the
    # repository ARN. The suppression is limited to that exact resource
    # pattern through ``appliesTo`` instead of silencing the rule entirely.
    #
    # The ``appliesTo`` entry spells the account as ``<AWS::AccountId>``
    # instead of interpolating ``self.account``; presumably this is the
    # literal form cdk-nag prints in its findings (verify against a nag
    # report before changing it).
    from cdk_nag import NagSuppressions

    iam5_reason = (
        "The ImageLookupFunction's contract is to look up "
        "or create any ECR repository under the project's "
        "``gco/*`` prefix. The ARN pattern "
        "``arn:aws:ecr:*:<account>:repository/gco/*`` is "
        "the documented IAM way to express that scope: it "
        "covers exactly the repositories the function is "
        "allowed to touch and nothing else."
    )
    NagSuppressions.add_resource_suppressions(
        execution_role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": iam5_reason,
                "appliesTo": [
                    "Resource::arn:aws:ecr:*:<AWS::AccountId>:repository/gco/*",
                ],
            },
        ],
        apply_to_children=True,
    )