Coverage for gco/stacks/global_stack.py: 97%
198 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
"""
Global stack for GCO (Global Capacity Orchestrator on AWS) - AWS Global Accelerator configuration.

This stack creates the global-level resources that span all regions:
- AWS Global Accelerator with a single TCP listener covering ports 80 and 443
- Endpoint groups for each configured region
- SSM parameters for cross-region endpoint group ARN sharing
- DynamoDB tables for templates, webhooks, jobs, inference endpoints, and missions
  (created once in this stack; table names are published via SSM for cross-region access)

The Global Accelerator provides:
- Single global endpoint for all regions
- Automatic health-based routing to nearest healthy region
- DDoS protection via AWS Shield Standard
- Reduced latency through AWS global network

Architecture:
    Global Accelerator → Listener (80, 443) → Endpoint Groups (per region)
                                                        ↓
                                        Regional ALBs (registered separately)
"""
22from typing import Any
24from aws_cdk import (
25 CfnOutput,
26 Duration,
27 Fn,
28 RemovalPolicy,
29 Stack,
30)
31from aws_cdk import aws_backup as backup
32from aws_cdk import aws_dynamodb as dynamodb
33from aws_cdk import aws_ecr as ecr
34from aws_cdk import aws_events as events
35from aws_cdk import aws_globalaccelerator as ga
36from aws_cdk import aws_iam as iam
37from aws_cdk import aws_kms as kms
38from aws_cdk import aws_lambda as lambda_
39from aws_cdk import aws_s3 as s3
40from aws_cdk import aws_ssm as ssm
41from constructs import Construct
43from gco.config.config_loader import ConfigLoader
44from gco.stacks.constants import (
45 CLUSTER_SHARED_BUCKET_NAME_PREFIX,
46 CLUSTER_SHARED_SSM_PARAMETER_PREFIX,
47 LAMBDA_PYTHON_RUNTIME,
48)
50# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit
51# Flowchart(s) generated from this file:
52# * ``GCOGlobalStack.__init__`` -> ``diagrams/code_diagrams/gco/stacks/global_stack.GCOGlobalStack___init__.html``
53# (PNG: ``diagrams/code_diagrams/gco/stacks/global_stack.GCOGlobalStack___init__.png``)
54# Regenerate with ``python diagrams/code_diagrams/generate.py``.
55# <pyflowchart-code-diagram> END
58# Default values for the ``images`` cdk.json block. The defaults match the
59# documented retention posture: repos survive a stack destroy by default
60# (``retain``), non-empty repos block destroy unless the operator explicitly
61# flips ``empty_on_delete`` to true, lifecycle keeps the latest 20 tagged
62# images and expires untagged ones after 7 days, and replication is enabled
63# by default to every deployed region.
64_IMAGES_DEFAULT_REMOVAL_POLICY = "retain"
65_IMAGES_DEFAULT_EMPTY_ON_DELETE = False
66_IMAGES_DEFAULT_KEEP_TAGGED = 20
67_IMAGES_DEFAULT_EXPIRE_UNTAGGED_DAYS = 7
68_IMAGES_DEFAULT_REPLICATION_ENABLED = True
69_IMAGES_DEFAULT_REPLICATION_DESTINATIONS = "all_deployed_regions"
71_IMAGES_VALID_REMOVAL_POLICIES = ("retain", "destroy")
74def _parse_images_config(cdk_context: dict[str, Any] | None) -> dict[str, Any]:
75 """Parse the ``images`` block from cdk.json with defaults applied.
77 Returns a normalized dict shape that the rest of the global stack
78 can consume without re-parsing. Validates ``removal_policy`` against
79 the set ``{"retain", "destroy"}`` and ``replication.destinations``
80 against either the literal string ``"all_deployed_regions"`` or a
81 ``list[str]``.
83 Args:
84 cdk_context: The dict returned by ``self.node.try_get_context("images")``.
85 ``None`` (the key being absent) is equivalent to an empty dict.
87 Returns:
88 A dict with keys ``removal_policy``, ``empty_on_delete``,
89 ``lifecycle`` (with ``keep_tagged`` and ``expire_untagged_days``),
90 and ``replication`` (with ``enabled`` and ``destinations``).
91 """
92 raw = cdk_context or {}
94 removal_policy = raw.get("removal_policy", _IMAGES_DEFAULT_REMOVAL_POLICY)
95 if not isinstance(removal_policy, str) or removal_policy not in _IMAGES_VALID_REMOVAL_POLICIES:
96 raise ValueError(
97 f"images.removal_policy must be 'retain' or 'destroy', got {removal_policy!r}"
98 )
100 empty_on_delete = bool(raw.get("empty_on_delete", _IMAGES_DEFAULT_EMPTY_ON_DELETE))
102 lifecycle_raw = raw.get("lifecycle") or {}
103 if not isinstance(lifecycle_raw, dict):
104 raise ValueError(f"images.lifecycle must be a mapping, got {type(lifecycle_raw).__name__}")
105 keep_tagged = int(lifecycle_raw.get("keep_tagged", _IMAGES_DEFAULT_KEEP_TAGGED))
106 expire_untagged_days = int(
107 lifecycle_raw.get("expire_untagged_days", _IMAGES_DEFAULT_EXPIRE_UNTAGGED_DAYS)
108 )
110 replication_raw = raw.get("replication") or {}
111 if not isinstance(replication_raw, dict):
112 raise ValueError(
113 f"images.replication must be a mapping, got {type(replication_raw).__name__}"
114 )
115 replication_enabled = bool(replication_raw.get("enabled", _IMAGES_DEFAULT_REPLICATION_ENABLED))
116 destinations = replication_raw.get("destinations", _IMAGES_DEFAULT_REPLICATION_DESTINATIONS)
117 if isinstance(destinations, str):
118 if destinations != _IMAGES_DEFAULT_REPLICATION_DESTINATIONS:
119 raise ValueError(
120 "images.replication.destinations must be the string "
121 f"{_IMAGES_DEFAULT_REPLICATION_DESTINATIONS!r} or a list of region names, "
122 f"got {destinations!r}"
123 )
124 elif isinstance(destinations, list):
125 if not all(isinstance(item, str) for item in destinations):
126 raise ValueError(
127 "images.replication.destinations list must contain only region name strings"
128 )
129 else:
130 raise ValueError(
131 "images.replication.destinations must be the string "
132 f"{_IMAGES_DEFAULT_REPLICATION_DESTINATIONS!r} or a list of region names, "
133 f"got {type(destinations).__name__}"
134 )
136 return {
137 "removal_policy": removal_policy,
138 "empty_on_delete": empty_on_delete,
139 "lifecycle": {
140 "keep_tagged": keep_tagged,
141 "expire_untagged_days": expire_untagged_days,
142 },
143 "replication": {
144 "enabled": replication_enabled,
145 "destinations": destinations,
146 },
147 }
150class GCOGlobalStack(Stack):
151 """
152 Global resources stack including AWS Global Accelerator.
154 This stack must be deployed before regional stacks. Regional stacks
155 will register their ALBs with the endpoint groups created here.
157 Attributes:
158 accelerator: The Global Accelerator resource
159 listener: TCP listener for HTTP/HTTPS traffic
160 endpoint_groups: Dict mapping region names to endpoint groups
161 templates_table: DynamoDB table for job templates
162 webhooks_table: DynamoDB table for webhooks
163 missions_table: DynamoDB table for mission session state
164 """
def __init__(
    self, scope: Construct, construct_id: str, config: ConfigLoader, **kwargs: Any
) -> None:
    """Build every global-level resource owned by this stack.

    Args:
        scope: Parent construct.
        construct_id: Identifier of this stack within ``scope``.
        config: Loader that supplies the Global Accelerator settings, the
            project name and the list of regions.
        **kwargs: Forwarded unchanged to ``Stack.__init__``.
    """
    super().__init__(scope, construct_id, **kwargs)

    self.config = config
    self.regional_endpoints: dict[str, str] = {}
    self.endpoint_groups: dict[str, ga.EndpointGroup] = {}

    accelerator_settings = self.config.get_global_accelerator_config()

    # Kept on the instance so other stacks can reference the name.
    self.accelerator_name = accelerator_settings["name"]

    # Data stores first: the backup plan created below selects the tables.
    self._create_dynamodb_tables()
    self._create_model_bucket()

    # Always-on Cluster_Shared_Bucket, its KMS key and SSM parameters.
    # There is no feature toggle: every Regional_Stack consumes them and,
    # when analytics is enabled, so does GCOAnalyticsStack.
    self._create_cluster_shared_kms_key()
    self._create_cluster_shared_bucket()
    self._publish_cluster_shared_bucket_ssm_params()

    self._create_backup_plan()

    # Container image registry: parse the cdk.json ``images`` block, then
    # set up the optional ECR replication rule for ``gco/*`` repos and the
    # lookup-or-create custom resource Lambda used by ``cli images init``.
    # The Lambda is created regardless of replication settings so its ARN
    # is always available for downstream invocations.
    self.images_config = _parse_images_config(self.node.try_get_context("images"))
    self._create_image_replication_rule()
    self._create_image_lookup_lambda()

    self.accelerator = ga.Accelerator(
        self,
        "GCOAccelerator",
        accelerator_name=self.accelerator_name,
        enabled=True,
    )

    # CloudWatch identifies the accelerator by its ID (a UUID), which is the
    # part after the "/" in
    # arn:aws:globalaccelerator::<account>:accelerator/<accelerator-id>.
    # Fn.split / Fn.select resolve it at deploy time.
    arn_segments = Fn.split("/", self.accelerator.accelerator_arn)
    self.accelerator_id = Fn.select(1, arn_segments)

    # One TCP listener serves HTTP (80) and HTTPS (443). Client affinity
    # decides whether a client stays pinned to one endpoint
    # (see ``_resolve_client_affinity``).
    affinity = self._resolve_client_affinity(accelerator_settings)
    listener_ports = [ga.PortRange(from_port=port, to_port=port) for port in (80, 443)]
    self.listener = self.accelerator.add_listener(
        "GCOListener",
        port_ranges=listener_ports,
        protocol=ga.ConnectionProtocol.TCP,
        client_affinity=affinity,
    )

    # One endpoint group per configured region.
    for region in self.config.get_regions():
        self._create_endpoint_group(region)

    # Accelerator outputs for other stacks, then cdk-nag suppressions.
    self._create_outputs()
    self._apply_nag_suppressions()
@staticmethod
def _resolve_client_affinity(ga_config: dict[str, Any]) -> ga.ClientAffinity:
    """Translate the ``client_affinity`` cdk.json setting into the CDK enum.

    The two modes Global Accelerator offers:

    - ``NONE`` (default): any healthy endpoint may receive a new
      connection, which spreads load most evenly.
    - ``SOURCE_IP``: connections from one source IP stick to one endpoint,
      useful when a workload keeps per-client state in a single region.

    ``ConfigLoader._validate_global_accelerator_config`` is expected to
    reject unknown strings before this is called. Anything other than
    ``SOURCE_IP`` (including an omitted key) resolves to ``NONE`` so the
    stack still synthesizes.

    Args:
        ga_config: Global Accelerator settings mapping.

    Returns:
        The matching ``ga.ClientAffinity`` member.
    """
    requested = str(ga_config.get("client_affinity", "NONE")).upper()
    if requested == "SOURCE_IP":
        return ga.ClientAffinity.SOURCE_IP
    # "NONE" and every unrecognised value share the same result.
    return ga.ClientAffinity.NONE
def _create_outputs(self) -> None:
    """Publish the accelerator's DNS name, ARN and listener ARN as exports."""
    export_prefix = self.config.get_project_name()

    # (construct id, value, description, export-name suffix)
    accelerator_exports = (
        (
            "GlobalAcceleratorDnsName",
            self.accelerator.dns_name,
            "Global Accelerator DNS name for global endpoint",
            "global-accelerator-dns",
        ),
        (
            "GlobalAcceleratorArn",
            self.accelerator.accelerator_arn,
            "Global Accelerator ARN",
            "global-accelerator-arn",
        ),
        (
            "GlobalAcceleratorListenerArn",
            self.listener.listener_arn,
            "Global Accelerator Listener ARN",
            "global-accelerator-listener-arn",
        ),
    )

    for output_id, output_value, summary, suffix in accelerator_exports:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=summary,
            export_name=f"{export_prefix}-{suffix}",
        )
def _apply_nag_suppressions(self) -> None:
    """Register the cdk-nag suppressions that belong to the global stack."""
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid an import cycle - confirm.
    from gco.stacks import nag_suppressions

    nag_suppressions.apply_all_suppressions(self, stack_type="global")
def _create_endpoint_group(self, region: str) -> None:
    """Add an endpoint group for ``region`` and publish its ARN.

    The group uses HTTP health checks on port 80 against the path from
    cdk.json, so Global Accelerator verifies that the services behind the
    ALB respond rather than only that the port is open. The group's ARN is
    exported as a CloudFormation output and written to SSM Parameter Store
    so regional stacks can read it across regions.

    Args:
        region: AWS region name (e.g., 'us-east-1')
    """
    project = self.config.get_project_name()
    # e.g. "us-east-1" -> "EndpointGroupUseast1"
    construct_prefix = "EndpointGroup" + region.replace("-", "").title()
    settings = self.config.get_global_accelerator_config()

    # Defaults: probe /api/v1/health (the health-monitor service behind the
    # ALB) every 30 seconds.
    probe_path = settings.get("health_check_path", "/api/v1/health")
    probe_interval = Duration.seconds(settings.get("health_check_interval", 30))

    group = self.listener.add_endpoint_group(
        construct_prefix,
        region=region,
        health_check_port=80,
        health_check_protocol=ga.HealthCheckProtocol.HTTP,
        health_check_path=probe_path,
        health_check_interval=probe_interval,
        health_check_threshold=3,
    )
    self.endpoint_groups[region] = group
    group_arn = group.endpoint_group_arn

    # CloudFormation export for regional stacks.
    CfnOutput(
        self,
        f"{construct_prefix}Arn",
        value=group_arn,
        description=f"Endpoint group ARN for {region}",
        export_name=f"{project}-endpoint-group-{region}-arn",
    )

    # SSM copy of the ARN: regional stacks read it to register their ALBs
    # with Global Accelerator.
    ssm.StringParameter(
        self,
        f"{construct_prefix}ArnParam",
        parameter_name=f"/{project}/endpoint-group-{region}-arn",
        string_value=group_arn,
        description=f"Global Accelerator endpoint group ARN for {region}",
    )
def add_regional_endpoint(self, region: str, alb_arn: str) -> None:
    """Record the ALB ARN that serves ``region``.

    This only stores the ARN for reference. Because of cross-region
    reference limitations in CDK, the ALB is actually registered with
    Global Accelerator by a custom resource (an AwsCustomResource) in the
    regional stack, using the endpoint group ARN exported by this stack.

    Args:
        region: AWS region name the ALB belongs to.
        alb_arn: ARN of the regional ALB.
    """
    # Bookkeeping only - no Global Accelerator resource is touched here.
    self.regional_endpoints.update({region: alb_arn})
def get_accelerator_dns_name(self) -> str:
    """Return the Global Accelerator DNS name as a string."""
    dns_name = self.accelerator.dns_name
    return str(dns_name)
def get_accelerator_arn(self) -> str:
    """Return the Global Accelerator ARN as a string."""
    accelerator_arn = self.accelerator.accelerator_arn
    return str(accelerator_arn)
def get_listener_arn(self) -> str:
    """Return the Global Accelerator listener ARN as a string."""
    listener_arn = self.listener.listener_arn
    return str(listener_arn)
def get_endpoint_group_arn(self, region: str) -> str:
    """Return the endpoint group ARN registered for ``region``.

    Args:
        region: AWS region name to look up.

    Raises:
        ValueError: If no endpoint group was created for ``region``.
    """
    if region not in self.endpoint_groups:
        raise ValueError(f"No endpoint group found for region: {region}")
    group = self.endpoint_groups[region]
    return str(group.endpoint_group_arn)
def _create_dynamodb_tables(self) -> None:
    """Create the five DynamoDB tables and publish their names and ARNs.

    Tables: job templates, webhooks, jobs, inference endpoints and missions.
    Every table is pay-per-request, uses AWS-managed encryption, has
    point-in-time recovery enabled and is destroyed with the stack. Each
    table gets a name output and an ARN output, plus an SSM parameter
    holding its name for cross-region access.
    """
    project = self.config.get_project_name()

    def string_key(attribute: str) -> dynamodb.Attribute:
        # Every key attribute in these tables is a string.
        return dynamodb.Attribute(name=attribute, type=dynamodb.AttributeType.STRING)

    def build_table(
        construct_id: str, name_suffix: str, partition: str, **extra: Any
    ) -> dynamodb.Table:
        # Settings shared by all tables; ``extra`` carries per-table options.
        return dynamodb.Table(
            self,
            construct_id,
            table_name=f"{project}-{name_suffix}",
            partition_key=string_key(partition),
            billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
            removal_policy=RemovalPolicy.DESTROY,
            point_in_time_recovery_specification=dynamodb.PointInTimeRecoverySpecification(
                point_in_time_recovery_enabled=True
            ),
            encryption=dynamodb.TableEncryption.AWS_MANAGED,
            **extra,
        )

    def add_index(
        table: dynamodb.Table, index_name: str, partition: str, sort: str | None = None
    ) -> None:
        # All indexes project every attribute; the sort key is optional.
        index_keys = {"partition_key": string_key(partition)}
        if sort is not None:
            index_keys["sort_key"] = string_key(sort)
        table.add_global_secondary_index(
            index_name=index_name,
            projection_type=dynamodb.ProjectionType.ALL,
            **index_keys,
        )

    def export_table(id_prefix: str, slug: str, purpose: str, table: dynamodb.Table) -> None:
        # Name and ARN outputs consumed by regional stacks.
        CfnOutput(
            self,
            f"{id_prefix}Name",
            value=table.table_name,
            description=f"DynamoDB table name for {purpose}",
            export_name=f"{project}-{slug}-name",
        )
        CfnOutput(
            self,
            f"{id_prefix}Arn",
            value=table.table_arn,
            description=f"DynamoDB table ARN for {purpose}",
            export_name=f"{project}-{slug}-arn",
        )

    # Reusable job templates.
    self.templates_table = build_table("JobTemplatesTable", "job-templates", "template_name")

    # Webhook registrations, queryable by namespace.
    self.webhooks_table = build_table("WebhooksTable", "webhooks", "webhook_id")
    add_index(self.webhooks_table, "namespace-index", "namespace")

    # Centralized job tracking and queue: jobs are submitted globally and
    # picked up regionally. The ``ttl`` attribute expires old completed jobs.
    self.jobs_table = build_table("JobsTable", "jobs", "job_id", time_to_live_attribute="ttl")
    # Regional polling by target region and status.
    add_index(self.jobs_table, "region-status-index", "target_region", "status")
    # Jobs by namespace, ordered by submission time.
    add_index(self.jobs_table, "namespace-index", "namespace", "submitted_at")
    # Jobs by status across all regions, ordered by submission time.
    add_index(self.jobs_table, "status-index", "status", "submitted_at")

    # (output id prefix, export/parameter slug, purpose text, table)
    catalog = [
        ("TemplatesTable", "templates-table", "job templates", self.templates_table),
        ("WebhooksTable", "webhooks-table", "webhooks", self.webhooks_table),
        ("JobsTable", "jobs-table", "centralized job tracking", self.jobs_table),
    ]
    for entry in catalog:
        export_table(*entry)

    # Desired state for inference deployments; the inference_monitor in each
    # regional cluster polls this table.
    self.inference_endpoints_table = build_table(
        "InferenceEndpointsTable", "inference-endpoints", "endpoint_name"
    )
    inference_entry = (
        "InferenceEndpointsTable",
        "inference-endpoints-table",
        "inference endpoint state",
        self.inference_endpoints_table,
    )
    export_table(*inference_entry)
    catalog.append(inference_entry)

    # Goal-directed iteration session state, keyed by session_id. The
    # status-index supports paginated listing by status (e.g. running,
    # completed, terminated, failed), ordered by creation time.
    self.missions_table = build_table("MissionsTable", "missions", "session_id")
    add_index(self.missions_table, "status-index", "status", "created_at")
    missions_entry = (
        "MissionsTable",
        "missions-table",
        "mission session state",
        self.missions_table,
    )
    export_table(*missions_entry)
    catalog.append(missions_entry)

    # Table names in SSM for cross-region access.
    for id_prefix, slug, purpose, table in catalog:
        ssm.StringParameter(
            self,
            f"{id_prefix}NameParam",
            parameter_name=f"/{project}/{slug}-name",
            string_value=table.table_name,
            description=f"DynamoDB table name for {purpose}",
        )
def _create_model_bucket(self) -> None:
    """Create the model-weights bucket, its KMS key and its access-logs bucket.

    The bucket is the central model registry: users upload weights once and
    the inference_monitor's init containers sync them to each region's
    local EFS at pod startup.

    No bucket name is set, so CDK generates one and naming collisions are
    avoided. The name is published through a CfnOutput and an SSM parameter
    for CLI discovery.
    """
    project = self.config.get_project_name()

    # Customer-managed key that encrypts the model bucket.
    self.model_bucket_key = kms.Key(
        self,
        "ModelBucketKey",
        description="KMS key for GCO model weights bucket",
        enable_key_rotation=True,
        removal_policy=RemovalPolicy.DESTROY,
    )

    # Access-log retention comes from the cdk.json context field
    # ``s3_access_logs.retention_days`` (default: 90 days); older logs expire.
    logs_context = self.node.try_get_context("s3_access_logs") or {}
    retention_days = int(logs_context.get("retention_days", 90))
    expire_old_logs = s3.LifecycleRule(
        id="ExpireAccessLogs",
        enabled=True,
        expiration=Duration.days(retention_days),
    )

    # Settings common to both buckets.
    hardening = {
        "block_public_access": s3.BlockPublicAccess.BLOCK_ALL,
        "enforce_ssl": True,
        "versioned": True,
        "removal_policy": RemovalPolicy.DESTROY,
        "auto_delete_objects": True,
    }

    # Access logs bucket (required for compliance).
    self.model_bucket_access_logs = s3.Bucket(
        self,
        "ModelWeightsAccessLogsBucket",
        encryption=s3.BucketEncryption.S3_MANAGED,
        lifecycle_rules=[expire_old_logs],
        **hardening,
    )

    # Model weights bucket, logging into the bucket above.
    self.model_bucket = s3.Bucket(
        self,
        "ModelWeightsBucket",
        encryption=s3.BucketEncryption.KMS,
        encryption_key=self.model_bucket_key,
        bucket_key_enabled=True,
        server_access_logs_bucket=self.model_bucket_access_logs,
        server_access_logs_prefix="model-bucket-logs/",
        **hardening,
    )

    from cdk_nag import NagSuppressions

    # The same rule names recur under each of these compliance packs.
    compliance_packs = ("HIPAA.Security", "NIST.800.53.R5", "PCI.DSS.321")

    # Model bucket: only the replication rule is suppressed.
    replication_reason = (
        "Model weights are user-uploaded artifacts that can be re-uploaded. "
        "Cross-region replication is not required; the inference_monitor "
        "syncs models from S3 to each region's EFS at pod startup."
    )
    NagSuppressions.add_resource_suppressions(
        self.model_bucket,
        [
            {"id": f"{pack}-S3BucketReplicationEnabled", "reason": replication_reason}
            for pack in compliance_packs
        ],
    )

    # Access-logs bucket: logging, replication and KMS rules per pack.
    logs_reason = "This is the server access logs destination bucket."
    per_pack_rules = (
        ("S3BucketLoggingEnabled", logs_reason),
        ("S3BucketReplicationEnabled", "Access logs do not require replication."),
        ("S3DefaultEncryptionKMS", "SSE-S3 is sufficient for access logs."),
    )
    logs_suppressions = [{"id": "AwsSolutions-S1", "reason": logs_reason}]
    logs_suppressions.extend(
        {"id": f"{pack}-{rule}", "reason": why}
        for pack in compliance_packs
        for rule, why in per_pack_rules
    )
    NagSuppressions.add_resource_suppressions(self.model_bucket_access_logs, logs_suppressions)

    # Publish the bucket name and ARN.
    bucket_outputs = (
        (
            "ModelBucketName",
            self.model_bucket.bucket_name,
            "S3 bucket for model weights",
            "model-bucket-name",
        ),
        (
            "ModelBucketArn",
            self.model_bucket.bucket_arn,
            "S3 bucket ARN for model weights",
            "model-bucket-arn",
        ),
    )
    for output_id, output_value, summary, suffix in bucket_outputs:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=summary,
            export_name=f"{project}-{suffix}",
        )

    ssm.StringParameter(
        self,
        "ModelBucketNameParam",
        parameter_name=f"/{project}/model-bucket-name",
        string_value=self.model_bucket.bucket_name,
        description="S3 bucket name for model weights",
    )
def _create_backup_plan(self) -> None:
    """Create the AWS Backup vault, plan and selection for the DynamoDB tables.

    The plan has two rules:
    - Daily backups (cron hour 3, minute 0) retained for 35 days
    - Weekly backups (Sundays, cron hour 4, minute 0) retained for 90 days

    All five DynamoDB tables are added to the backup selection, and the
    plan and vault ARNs are exported.
    """
    # Vault that stores the recovery points.
    vault = backup.BackupVault(
        self,
        "DynamoDBBackupVault",
        removal_policy=RemovalPolicy.DESTROY,
    )
    self.backup_vault = vault

    # The daily rule also requests continuous backup.
    # NOTE(review): the original comment says this enables PITR for
    # DynamoDB - confirm AWS Backup supports that for DynamoDB tables.
    daily_rule = backup.BackupPlanRule(
        rule_name="DailyBackup",
        backup_vault=vault,
        schedule_expression=events.Schedule.cron(hour="3", minute="0"),
        delete_after=Duration.days(35),
        enable_continuous_backup=True,
    )
    weekly_rule = backup.BackupPlanRule(
        rule_name="WeeklyBackup",
        backup_vault=vault,
        schedule_expression=events.Schedule.cron(hour="4", minute="0", week_day="SUN"),
        delete_after=Duration.days(90),
    )

    self.backup_plan = backup.BackupPlan(
        self,
        "DynamoDBBackupPlan",
        backup_plan_rules=[daily_rule, weekly_rule],
    )

    # Every table created by ``_create_dynamodb_tables`` is protected.
    protected_tables = (
        self.templates_table,
        self.webhooks_table,
        self.jobs_table,
        self.inference_endpoints_table,
        self.missions_table,
    )
    self.backup_plan.add_selection(
        "DynamoDBTablesSelection",
        resources=[
            backup.BackupResource.from_dynamo_db_table(table) for table in protected_tables
        ],
    )

    # Export the plan and vault ARNs.
    project = self.config.get_project_name()
    arn_outputs = (
        (
            "BackupPlanArn",
            self.backup_plan.backup_plan_arn,
            "AWS Backup plan ARN for DynamoDB tables",
            "backup-plan-arn",
        ),
        (
            "BackupVaultArn",
            self.backup_vault.backup_vault_arn,
            "AWS Backup vault ARN for DynamoDB backups",
            "backup-vault-arn",
        ),
    )
    for output_id, arn, summary, suffix in arn_outputs:
        CfnOutput(
            self,
            output_id,
            value=arn,
            description=summary,
            export_name=f"{project}-{suffix}",
        )
def _create_cluster_shared_kms_key(self) -> None:
    """Create the always-on customer-managed KMS key for ``Cluster_Shared_Bucket``.

    Key settings:
    - Automatic key rotation is enabled.
    - The pending-deletion window is 7 days, the AWS minimum. This fits the
      destroy-by-default iteration-loop posture of the analytics-environment
      feature while still leaving a safety net against accidental deletion.
    - ``RemovalPolicy.DESTROY`` lets ``cdk destroy gco-global`` remove the
      key without operator intervention.
    - The key policy grants encrypt/decrypt actions to the
      ``s3.amazonaws.com`` and ``logs.<region>.amazonaws.com`` service
      principals, so S3 server-side encryption and CloudWatch access-log
      delivery can use the key without role-side grants.

    The key is stored on ``self.cluster_shared_kms_key`` for tests and for
    ``_create_cluster_shared_bucket``. Role-side usage grants
    (``kms:Decrypt`` / ``kms:GenerateDataKey``) are attached by downstream
    consumers: ``GCORegionalStack`` on the job-pod role (always-on) and
    ``GCOAnalyticsStack`` on the SageMaker execution role (only when the
    analytics toggle is on).
    """
    shared_key = kms.Key(
        self,
        "ClusterSharedKmsKey",
        description=(
            "Customer-managed KMS key for the always-on Cluster_Shared_Bucket "
            "in GCOGlobalStack. Consumed by every regional EKS cluster and by "
            "GCOAnalyticsStack when analytics is enabled."
        ),
        enable_key_rotation=True,
        pending_window=Duration.days(7),
        removal_policy=RemovalPolicy.DESTROY,
    )
    self.cluster_shared_kms_key = shared_key

    # Actions granted to each service principal. They follow the standard
    # service-principal pattern used by cdk's default key policies.
    kms_actions = [
        "kms:Encrypt",
        "kms:Decrypt",
        "kms:ReEncrypt*",
        "kms:GenerateDataKey*",
        "kms:DescribeKey",
    ]

    # (statement sid, service principal): S3 for server-side encryption of
    # the bucket, CloudWatch Logs for access-log delivery.
    service_grants = (
        ("AllowS3ServiceEncryptDecrypt", "s3.amazonaws.com"),
        ("AllowCloudWatchLogsEncryptDecrypt", f"logs.{self.region}.amazonaws.com"),
    )
    for statement_sid, service in service_grants:
        shared_key.add_to_resource_policy(
            iam.PolicyStatement(
                sid=statement_sid,
                effect=iam.Effect.ALLOW,
                principals=[iam.ServicePrincipal(service)],
                actions=kms_actions,
                resources=["*"],
            )
        )
def _create_cluster_shared_bucket(self) -> None:
    """Provision ``Cluster_Shared_Bucket`` together with its access-logs bucket.

    Resources created, in order:

    1. ``self.cluster_shared_access_logs_bucket`` - the server-access-log
       destination for the primary bucket. KMS-encrypted with
       ``self.cluster_shared_kms_key``, public access blocked, SSL enforced,
       versioned, destroyed (objects included) on teardown, and carrying one
       lifecycle rule that expires objects after the configured retention.
    2. ``self.cluster_shared_bucket`` - the primary bucket, named
       ``<CLUSTER_SHARED_BUCKET_NAME_PREFIX>-<account>-<region>``, with the
       same hardening plus ``bucket_key_enabled=True`` and access logs
       written under the ``cluster-shared/`` prefix of bucket (1).

    A ``DenyInsecureTransport`` statement is added to the primary bucket's
    policy in addition to ``enforce_ssl=True`` so the deny appears under a
    known SID. No Allow statements are added to the bucket policy here.

    Finally, cdk-nag suppressions are attached per bucket: replication
    findings on both buckets, and logging findings on the access-logs bucket.

    The retention period is read from the ``s3_access_logs`` CDK context
    entry (key ``retention_days``), defaulting to 90 days.
    """
    # Retention comes from CDK context; a missing/empty entry means defaults.
    context_value = self.node.try_get_context("s3_access_logs")
    log_settings = context_value if context_value else {}
    access_logs_retention_days = int(log_settings.get("retention_days", 90))

    expire_logs_rule = s3.LifecycleRule(
        id="ExpireAccessLogs",
        enabled=True,
        expiration=Duration.days(access_logs_retention_days),
    )

    # Destination for the primary bucket's server access logs.
    # NOTE(review): confirm S3 server access log delivery supports a
    # KMS-encrypted destination bucket with this key's policy.
    self.cluster_shared_access_logs_bucket = s3.Bucket(
        self,
        "ClusterSharedAccessLogsBucket",
        encryption=s3.BucketEncryption.KMS,
        encryption_key=self.cluster_shared_kms_key,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        lifecycle_rules=[expire_logs_rule],
    )

    # Primary bucket. The name is derived from the shared constant prefix so
    # ARN-pattern based IAM policies keep matching it.
    primary_bucket_name = f"{CLUSTER_SHARED_BUCKET_NAME_PREFIX}-{self.account}-{self.region}"
    self.cluster_shared_bucket = s3.Bucket(
        self,
        "ClusterSharedBucket",
        bucket_name=primary_bucket_name,
        encryption=s3.BucketEncryption.KMS,
        encryption_key=self.cluster_shared_kms_key,
        bucket_key_enabled=True,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        server_access_logs_bucket=self.cluster_shared_access_logs_bucket,
        server_access_logs_prefix="cluster-shared/",
    )

    # Explicit deny for plaintext transport under a stable SID, on top of
    # whatever ``enforce_ssl=True`` already contributes.
    primary_arn = self.cluster_shared_bucket.bucket_arn
    insecure_transport_deny = iam.PolicyStatement(
        sid="DenyInsecureTransport",
        effect=iam.Effect.DENY,
        principals=[iam.AnyPrincipal()],
        actions=["s3:*"],
        resources=[primary_arn, f"{primary_arn}/*"],
        conditions={"Bool": {"aws:SecureTransport": "false"}},
    )
    self.cluster_shared_bucket.add_to_resource_policy(insecure_transport_deny)

    # cdk-nag suppressions are attached to the individual buckets, each with
    # an explicit reason string.
    from cdk_nag import NagSuppressions

    shared_replication_reason = (
        "Cluster_Shared_Bucket is a regional scratch sink; cluster jobs "
        "publish to it from a single region, and there is no durability "
        "requirement that warrants cross-region replication. Access logs "
        "do not require replication for the same reason."
    )
    access_logs_is_self_target_reason = (
        "This is the server access logs destination bucket for Cluster_Shared_Bucket."
    )

    replication_rule_ids = (
        "HIPAA.Security-S3BucketReplicationEnabled",
        "NIST.800.53.R5-S3BucketReplicationEnabled",
        "PCI.DSS.321-S3BucketReplicationEnabled",
    )
    logging_rule_ids = (
        "AwsSolutions-S1",
        "HIPAA.Security-S3BucketLoggingEnabled",
        "NIST.800.53.R5-S3BucketLoggingEnabled",
        "PCI.DSS.321-S3BucketLoggingEnabled",
    )

    # Primary bucket: replication findings only.
    NagSuppressions.add_resource_suppressions(
        self.cluster_shared_bucket,
        [
            {"id": rule_id, "reason": shared_replication_reason}
            for rule_id in replication_rule_ids
        ],
    )

    # Access-logs bucket: logging findings first, then replication findings.
    access_logs_waivers = [
        {"id": rule_id, "reason": access_logs_is_self_target_reason}
        for rule_id in logging_rule_ids
    ]
    access_logs_waivers.extend(
        {"id": rule_id, "reason": shared_replication_reason}
        for rule_id in replication_rule_ids
    )
    NagSuppressions.add_resource_suppressions(
        self.cluster_shared_access_logs_bucket,
        access_logs_waivers,
    )
def _publish_cluster_shared_bucket_ssm_params(self) -> None:
    """Publish the cluster-shared bucket's identity via SSM and stack outputs.

    Three SSM string parameters are written under
    ``CLUSTER_SHARED_SSM_PARAMETER_PREFIX``:

    - ``.../name``   - bucket name
    - ``.../arn``    - bucket ARN
    - ``.../region`` - this stack's region

    Four ``CfnOutput`` values are emitted as well: the same three values plus
    the ARN of ``self.cluster_shared_kms_key``. Each is exported as
    ``{project_name}-cluster-shared-...``.

    According to the original design notes, the SSM parameters are the
    cross-region contract read by the regional and analytics stacks.
    """
    project_name = self.config.get_project_name()
    shared_bucket = self.cluster_shared_bucket

    # (construct id, parameter suffix, value, description)
    ssm_entries = (
        (
            "ClusterSharedBucketNameParam",
            "name",
            shared_bucket.bucket_name,
            "Name of the always-on Cluster_Shared_Bucket (owned by GCOGlobalStack).",
        ),
        (
            "ClusterSharedBucketArnParam",
            "arn",
            shared_bucket.bucket_arn,
            "ARN of the always-on Cluster_Shared_Bucket (owned by GCOGlobalStack).",
        ),
        (
            "ClusterSharedBucketRegionParam",
            "region",
            self.region,
            "Home region of the always-on Cluster_Shared_Bucket (the global region).",
        ),
    )
    for construct_id, suffix, param_value, param_description in ssm_entries:
        ssm.StringParameter(
            self,
            construct_id,
            parameter_name=f"{CLUSTER_SHARED_SSM_PARAMETER_PREFIX}/{suffix}",
            string_value=param_value,
            description=param_description,
        )

    # (construct id, value, description, export-name suffix)
    output_entries = (
        (
            "ClusterSharedBucketName",
            shared_bucket.bucket_name,
            "Name of the always-on Cluster_Shared_Bucket.",
            "cluster-shared-bucket-name",
        ),
        (
            "ClusterSharedBucketArn",
            shared_bucket.bucket_arn,
            "ARN of the always-on Cluster_Shared_Bucket.",
            "cluster-shared-bucket-arn",
        ),
        (
            "ClusterSharedBucketRegion",
            self.region,
            "Home region of the always-on Cluster_Shared_Bucket.",
            "cluster-shared-bucket-region",
        ),
        (
            "ClusterSharedKmsKeyArn",
            self.cluster_shared_kms_key.key_arn,
            "ARN of the always-on KMS key encrypting Cluster_Shared_Bucket.",
            "cluster-shared-kms-key-arn",
        ),
    )
    for output_id, output_value, output_description, export_suffix in output_entries:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=output_description,
            export_name=f"{project_name}-{export_suffix}",
        )
1197 def _resolve_replication_destinations(self, destinations: str | list[str]) -> list[str]:
1198 """Resolve the configured replication destinations into a region list.
1200 When ``destinations`` is the literal ``"all_deployed_regions"``, the
1201 list comes from ``self.config.get_regions()`` (the same source the
1202 rest of the stack uses for cross-region wiring). When it is an
1203 explicit list, it is returned as-is. The source region (the global
1204 stack's deploy region) is excluded — ECR replication is point-to-point
1205 and a self-referential destination is rejected by the API.
1206 """
1207 if isinstance(destinations, str): 1207 ↛ 1210line 1207 didn't jump to line 1210 because the condition on line 1207 was always true
1208 candidate_regions = list(self.config.get_regions())
1209 else:
1210 candidate_regions = list(destinations)
1211 return [region for region in candidate_regions if region != self.region]
1213 def _create_image_replication_rule(self) -> None:
1214 """Provision the ECR replication rule for ``gco/*`` repositories.
1216 When ``images.replication.enabled`` is True and at least one
1217 non-source destination resolves, creates one
1218 ``aws_ecr.CfnReplicationConfiguration`` rule with a single
1219 ``PREFIX_MATCH`` filter on ``gco/`` and one destination per resolved
1220 region. When replication is disabled or the destination list is
1221 empty (e.g. single-region deploy), no replication resource is
1222 provisioned and the method becomes a no-op.
1223 """
1224 if not self.images_config["replication"]["enabled"]: 1224 ↛ 1225line 1224 didn't jump to line 1225 because the condition on line 1224 was never true
1225 return
1227 destinations = self._resolve_replication_destinations(
1228 self.images_config["replication"]["destinations"]
1229 )
1230 if not destinations: 1230 ↛ 1231line 1230 didn't jump to line 1231 because the condition on line 1230 was never true
1231 return
1233 ecr.CfnReplicationConfiguration(
1234 self,
1235 "GcoImageReplicationConfig",
1236 replication_configuration=ecr.CfnReplicationConfiguration.ReplicationConfigurationProperty(
1237 rules=[
1238 ecr.CfnReplicationConfiguration.ReplicationRuleProperty(
1239 destinations=[
1240 ecr.CfnReplicationConfiguration.ReplicationDestinationProperty(
1241 region=region,
1242 registry_id=self.account,
1243 )
1244 for region in destinations
1245 ],
1246 repository_filters=[
1247 ecr.CfnReplicationConfiguration.RepositoryFilterProperty(
1248 filter="gco/",
1249 filter_type="PREFIX_MATCH",
1250 )
1251 ],
1252 )
1253 ]
1254 ),
1255 )
def _create_image_lookup_lambda(self) -> None:
    """Provision the lookup-or-create Lambda for ``gco/*`` ECR repositories.

    Steps performed:

    1. Create ``self.image_lookup_lambda`` from the ``lambda/image-lookup``
       asset (handler ``handler.lambda_handler``, 5-minute timeout, runtime
       selected by ``LAMBDA_PYTHON_RUNTIME``).
    2. Allow the function's role a fixed set of ECR repository actions on
       ``arn:aws:ecr:*:<account>:repository/gco/*``.
    3. Export the function ARN as
       ``{project_name}-image-lookup-function-arn``.
    4. Attach a scoped ``AwsSolutions-IAM5`` cdk-nag suppression to the role
       (and its children) for that one ARN pattern.

    Per the original design notes, the function backs a ``CustomResource``
    used when ``cli images init`` registers a repository; that wiring is not
    part of this method.
    """
    project_name = self.config.get_project_name()

    # Every ECR permission below is limited to repositories under ``gco/``
    # in this account, in any region.
    repo_arn = f"arn:aws:ecr:*:{self.account}:repository/gco/*"

    function_description = (
        "Lookup-or-create custom resource handler for ECR "
        "repositories under the project's gco/* prefix."
    )
    self.image_lookup_lambda = lambda_.Function(
        self,
        "ImageLookupFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/image-lookup"),
        timeout=Duration.minutes(5),
        description=function_description,
    )

    # ``role`` is typed as optional; narrow it before use.
    lookup_role = self.image_lookup_lambda.role
    assert lookup_role is not None

    ecr_repository_actions = [
        "ecr:DescribeRepositories",
        "ecr:CreateRepository",
        "ecr:DeleteRepository",
        "ecr:PutLifecyclePolicy",
        "ecr:GetLifecyclePolicy",
        "ecr:TagResource",
        "ecr:ListTagsForResource",
        "ecr:BatchDeleteImage",
        "ecr:DescribeImages",
        "ecr:ListImages",
    ]
    lookup_role.add_to_principal_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=ecr_repository_actions,
            resources=[repo_arn],
        )
    )

    output_description = (
        "Lambda ARN for the lookup-or-create custom resource that "
        "manages ECR repositories under the gco/* prefix."
    )
    CfnOutput(
        self,
        "ImageLookupFunctionArn",
        value=self.image_lookup_lambda.function_arn,
        description=output_description,
        export_name=f"{project_name}-image-lookup-function-arn",
    )

    # cdk-nag reports the trailing ``*`` in the repository ARN as
    # ``AwsSolutions-IAM5``. The suppression is pinned to that exact ARN
    # pattern via ``appliesTo`` instead of waiving the rule wholesale.
    #
    # The ``appliesTo`` entry spells the account as the literal
    # ``<AWS::AccountId>`` rather than interpolating ``self.account``: per
    # the original notes, that literal is the form cdk-nag prints in its
    # finding id, and the entry has to match it exactly.
    from cdk_nag import NagSuppressions

    wildcard_reason = (
        "The ImageLookupFunction's contract is to look up "
        "or create any ECR repository under the project's "
        "``gco/*`` prefix. The ARN pattern "
        "``arn:aws:ecr:*:<account>:repository/gco/*`` is "
        "the documented IAM way to express that scope: it "
        "covers exactly the repositories the function is "
        "allowed to touch and nothing else."
    )
    iam5_waiver = {
        "id": "AwsSolutions-IAM5",
        "reason": wildcard_reason,
        "appliesTo": [
            "Resource::arn:aws:ecr:*:<AWS::AccountId>:repository/gco/*",
        ],
    }
    NagSuppressions.add_resource_suppressions(
        lookup_role,
        [iam5_waiver],
        apply_to_children=True,
    )