Coverage for gco/stacks/regional_stack.py: 94%
482 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
1"""
2Regional stack for GCO (Global Capacity Orchestrator on AWS) - EKS cluster and ALB per region.
4This is the largest stack in the project (~3200 lines) and creates all regional
5resources for a single AWS region. One instance is deployed per region defined
6in cdk.json.
8Resources Created:
9 VPC & Networking:
10 - VPC with 3 AZs, public subnets (ALB), private subnets (EKS nodes)
11 - 2 NAT Gateways for high availability
12 - VPC endpoints for ECR, S3, STS, Secrets Manager, SSM, CloudWatch
13 - VPC Flow Logs (CloudWatch Logs, 30-day retention)
15 EKS Cluster (Auto Mode):
16 - Managed control plane with full logging (API, Audit, Authenticator, Controller Manager, Scheduler)
17 - NodePools: system, general-purpose, gpu-x86, gpu-arm, inference, gpu-efa, neuron, cpu-general
18 - IRSA roles for service accounts (Secrets Manager, SQS, DynamoDB, CloudWatch, S3, EFS)
20 Load Balancing:
21 - ALB (created by Ingress via AWS Load Balancer Controller)
22 - Internal NLB for regional API Gateway VPC Link
23 - Global Accelerator endpoint registration (via ga-registration Lambda)
25 Storage:
26 - EFS with dynamic provisioning (CSI driver, access points, encryption at rest + in transit)
27 - FSx for Lustre (optional, toggled via cdk.json)
28 - Valkey Serverless cache (optional)
29 - Aurora Serverless v2 with pgvector (optional)
31 Lambda Functions:
32 - kubectl-applier: applies K8s manifests during deployment
33 - helm-installer: installs Helm charts (KEDA, Volcano, KubeRay, GPU Operator, etc.)
34 - ga-registration: registers ALB with Global Accelerator
35 - regional-api-proxy: proxies regional API Gateway to internal ALB
37 Container Images:
38 - ECR repositories + Docker image builds for health-monitor, manifest-processor,
39 inference-monitor, queue-processor
41 SQS:
42 - Regional job queue + dead letter queue (for gco jobs submit-sqs)
44Key Design Decisions:
45 - EKS Auto Mode handles node provisioning — no managed node groups or Karpenter provisioners
46 - NodePools use WhenEmpty consolidation for inference to avoid disrupting long-running pods
47 - IRSA (IAM Roles for Service Accounts) for least-privilege pod-level AWS access
48 - All optional features (FSx, Valkey, Aurora) are toggled via cdk.json context variables
49 - Template variables in K8s manifests ({{PLACEHOLDER}}) are replaced at deploy time
51Dependencies:
52 - GCOGlobalStack (for Global Accelerator endpoint group ARN, DynamoDB table names, S3 bucket)
53 - GCOApiGatewayGlobalStack (for auth secret ARN)
55Modification Guide:
56 - To add a new NodePool: add a YAML manifest in lambda/kubectl-applier-simple/manifests/ (40-49 range)
57 - To add a new service: add ECR image build here, Dockerfile in dockerfiles/, manifest in manifests/
58 - To add a new optional feature: add a cdk.json context toggle, guard with if/else in this file
59 - To change EKS version: update KUBERNETES_VERSION in constants.py
60"""
62from __future__ import annotations
64import time
65from dataclasses import dataclass
66from typing import Any
68import aws_cdk.aws_eks_v2 as eks
69from aws_cdk import (
70 CfnJson,
71 CfnOutput,
72 CfnTag,
73 CustomResource,
74 Duration,
75 Fn,
76 RemovalPolicy,
77 Stack,
78)
79from aws_cdk import aws_ec2 as ec2
80from aws_cdk import aws_ecr as ecr
81from aws_cdk import aws_ecr_assets as ecr_assets
82from aws_cdk import aws_efs as efs
83from aws_cdk import aws_eks as eks_l1 # L1 constructs (CfnPodIdentityAssociation)
84from aws_cdk import aws_events as events
85from aws_cdk import aws_events_targets as events_targets
86from aws_cdk import aws_fsx as fsx
87from aws_cdk import aws_iam as iam
88from aws_cdk import aws_kms as kms
89from aws_cdk import aws_lambda as lambda_
90from aws_cdk import aws_logs as logs
91from aws_cdk import aws_sns as sns
92from aws_cdk import aws_sqs as sqs
93from aws_cdk import aws_ssm as ssm
94from aws_cdk import custom_resources as cr
95from constructs import Construct
97from gco.config.config_loader import ConfigLoader
98from gco.stacks.constants import (
99 AURORA_POSTGRES_VERSION,
100 CLUSTER_SHARED_SSM_PARAMETER_PREFIX,
101 EKS_ADDON_CLOUDWATCH_OBSERVABILITY,
102 EKS_ADDON_EFS_CSI_DRIVER,
103 EKS_ADDON_FSX_CSI_DRIVER,
104 EKS_ADDON_METRICS_SERVER,
105 EKS_ADDON_POD_IDENTITY_AGENT,
106 LAMBDA_PYTHON_RUNTIME,
107)
109# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit
110# Flowchart(s) generated from this file:
111# * ``GCORegionalStack.__init__`` -> ``diagrams/code_diagrams/gco/stacks/regional_stack.GCORegionalStack___init__.html``
112# (PNG: ``diagrams/code_diagrams/gco/stacks/regional_stack.GCORegionalStack___init__.png``)
113# Regenerate with ``python diagrams/code_diagrams/generate.py``.
114# <pyflowchart-code-diagram> END
@dataclass(frozen=True)
class SharedBucketIdentity:
    """Name, ARN, and region of the always-on ``Cluster_Shared_Bucket``.

    The bucket is owned by ``GCOGlobalStack``, which publishes its identity
    under the SSM parameters ``/gco/cluster-shared-bucket/{name,arn,region}``
    in the global region. Each regional stack resolves those three values
    and uses them to:

    * grant the regional job-pod role IAM permissions on the bucket, and
    * populate the ``gco-cluster-shared-bucket`` ConfigMap that is applied
      to every regional EKS cluster.

    Instances are frozen so a single identity can be passed between helper
    methods without any risk of accidental mutation.
    """

    # Bucket name.
    name: str
    # Bucket ARN.
    arn: str
    # Region the bucket lives in.
    region: str
135def _compute_kubectl_cluster_shared_replacements(
136 shared: SharedBucketIdentity,
137) -> dict[str, str]:
138 """Build the ``{{CLUSTER_SHARED_BUCKET*}}`` kubectl-applier replacements.
140 Pure helper kept at module scope so property and presence tests can
141 inspect the output without synthesizing a full regional stack. The
142 three keys are always populated — there is no feature toggle — because
143 the ``gco-cluster-shared-bucket`` ConfigMap is applied unconditionally
144 on every regional cluster.
145 """
146 return {
147 "{{CLUSTER_SHARED_BUCKET}}": shared.name,
148 "{{CLUSTER_SHARED_BUCKET_ARN}}": shared.arn,
149 "{{CLUSTER_SHARED_BUCKET_REGION}}": shared.region,
150 }
153def _augment_trusted_registries_with_project_ecr(
154 base: list[str],
155 *,
156 account: str,
157 regions: list[str],
158 global_region: str,
159) -> list[str]:
160 """Return the configured trusted registries plus the project's own ECR.
162 The new ``gco images build`` flow pushes images to a per-account ECR
163 registry under ``<account>.dkr.ecr.<region>.amazonaws.com/gco/<name>``.
164 Without this augmentation the queue/manifest validators would treat
165 those URIs as untrusted and reject every job that uses one — which
166 defeats the whole point of the image registry feature.
168 Returns the unique union of the operator-configured ``base`` list
169 plus the per-region project ECR hostnames (one per deployed region,
170 plus the global region where ``gco-global`` provisions the source
171 repo). Order is stable so the rendered ConfigMap doesn't churn
172 between deploys.
173 """
174 augmented: list[str] = list(base)
175 seen = set(augmented)
176 targets = list(dict.fromkeys([global_region, *regions]))
177 if account:
178 for region in targets:
179 host = f"{account}.dkr.ecr.{region}.amazonaws.com"
180 if host not in seen:
181 augmented.append(host)
182 seen.add(host)
183 return augmented
186class GCORegionalStack(Stack):
187 """
188 Regional resources stack for a single AWS region.
190 Creates EKS cluster, load balancers, and supporting infrastructure
191 for running GCO services in a specific region.
193 Attributes:
194 vpc: VPC with public/private subnets
195 cluster: EKS Auto Mode cluster
196 """
@staticmethod
def _create_irsa_role(
    scope: GCORegionalStack,
    id: str,
    oidc_provider_arn: str,
    oidc_issuer_url: str,
    service_account_names: list[str],
    namespaces: list[str],
) -> iam.Role:
    """Build an IAM role assumable via IRSA (OIDC) and via EKS Pod Identity.

    IRSA is the primary credential path: a projected service-account token
    is exchanged through the cluster's OIDC provider for temporary AWS
    credentials. A trust statement for the Pod Identity service principal
    is appended as a secondary path.

    Args:
        scope: Stack that owns the role and the helper ``CfnJson``.
        id: Construct id of the role; also prefixes the ``CfnJson`` id.
        oidc_provider_arn: ARN of the cluster's IAM OIDC provider.
        oidc_issuer_url: ``https://`` issuer URL of the cluster.
        service_account_names: Service accounts allowed to assume the role.
        namespaces: Namespaces those service accounts may live in. Every
            namespace/service-account combination is trusted.

    Returns:
        The created role.
    """
    # Everything after "//" — i.e. the issuer URL without its scheme.
    issuer_host = Fn.select(1, Fn.split("//", oidc_issuer_url))

    allowed_subjects = [
        f"system:serviceaccount:{namespace}:{account_name}"
        for namespace in namespaces
        for account_name in service_account_names
    ]

    # The issuer is a CloudFormation token, which cannot serve as a plain
    # Python dict key at synth time; CfnJson defers resolving the condition
    # keys until deploy time.
    oidc_conditions = CfnJson(
        scope,
        f"{id}OidcConditions",
        value={
            Fn.join("", [issuer_host, ":aud"]): "sts.amazonaws.com",
            Fn.join("", [issuer_host, ":sub"]): allowed_subjects,
        },
    )

    web_identity_principal = iam.FederatedPrincipal(
        federated=oidc_provider_arn,
        conditions={"StringEquals": oidc_conditions},
        assume_role_action="sts:AssumeRoleWithWebIdentity",
    )
    role = iam.Role(scope, id, assumed_by=web_identity_principal)

    # Secondary trust path: EKS Pod Identity.
    trust_policy = role.assume_role_policy
    assert trust_policy is not None  # guaranteed because assumed_by was supplied
    pod_identity_trust = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        principals=[iam.ServicePrincipal("pods.eks.amazonaws.com")],
        actions=["sts:AssumeRole", "sts:TagSession"],
    )
    trust_policy.add_statements(pod_identity_trust)
    return role
def __init__(
    self,
    scope: Construct,
    construct_id: str,
    config: ConfigLoader,
    region: str,
    auth_secret_arn: str,
    **kwargs: Any,
) -> None:
    """Provision every regional resource for ``region``.

    Args:
        scope: Parent construct.
        construct_id: Id of this stack.
        config: Loaded project configuration.
        region: AWS region this stack instance is deployed to.
        auth_secret_arn: ARN of the auth secret shared with this stack.
        **kwargs: Forwarded unchanged to ``Stack.__init__``.

    The helper calls below are order-sensitive: later steps consume
    attributes set by earlier ones.
    """
    super().__init__(scope, construct_id, **kwargs)

    self.config = config
    self.deployment_region = region
    self.auth_secret_arn = auth_secret_arn
    self.alb_arn: str | None = None

    # Per-region cluster settings.
    self.cluster_config = self.config.get_cluster_config(region)

    # Networking. vpc_name is intentionally omitted so CDK generates a
    # unique name; two NAT gateways are used for high availability.
    public_subnets = ec2.SubnetConfiguration(
        name="PublicSubnet", subnet_type=ec2.SubnetType.PUBLIC, cidr_mask=24
    )
    private_subnets = ec2.SubnetConfiguration(
        name="PrivateSubnet",
        subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,
        cidr_mask=24,
    )
    self.vpc = ec2.Vpc(
        self,
        "GCOVpc",
        max_azs=3,
        nat_gateways=2,
        subnet_configuration=[public_subnets, private_subnets],
    )

    # Flow logs for network traffic analysis and security monitoring.
    self._create_vpc_flow_logs()

    # Job-ingestion queue plus its dead-letter queue.
    self._create_sqs_queue()

    # ECR repositories and Docker image builds.
    self._create_container_images()

    # One long-lived execution role shared by every cr.AwsCustomResource,
    # created before any of them. Consumers pass
    # role=self.aws_custom_resource_role instead of policy=, which avoids
    # the IAM propagation race described in
    # _create_aws_custom_resource_role.
    self._create_aws_custom_resource_role()

    # The EKS cluster itself.
    self._create_eks_cluster(self.cluster_config)

    # Cluster_Shared_Bucket identity (owned by GCOGlobalStack, read from
    # SSM) and the matching grants on the job-pod role. Always runs — there
    # is no toggle. It must come after the cluster step, which sets up the
    # service-account role, and before _apply_kubernetes_manifests, which
    # consumes the resulting replacements.
    self.cluster_shared_identity = self._resolve_cluster_shared_bucket_from_ssm()
    self._grant_cluster_shared_bucket_to_job_role(self.cluster_shared_identity)

    # Storage and data services. FSx, Valkey, and Aurora are optional and
    # only created when enabled.
    self._create_efs()
    self._create_fsx_lustre()
    self._create_valkey_cache()
    self._create_aurora_pgvector()

    # Lambda that registers the Ingress-created ALB with Global Accelerator.
    self._create_ga_registration_lambda()

    # Lambda that installs KEDA and the other Helm-based components.
    self._create_helm_installer_lambda()

    # Kubernetes manifests — after EFS so its ids are available.
    self._apply_kubernetes_manifests()

    # Daily CloudFormation drift detection with SNS alerts.
    self._create_drift_detection()

    # Dedicated IAM role for the MCP server.
    self._create_mcp_role()

    # Stack outputs describing the cluster.
    self._create_outputs()

    # cdk-nag suppressions for this stack.
    self._apply_nag_suppressions()
def _create_vpc_flow_logs(self) -> None:
    """Turn on VPC Flow Logs, delivered to a CloudWatch log group.

    Flow logs record the IP traffic going to and from the network
    interfaces of ``self.vpc``, which supports security monitoring and
    compliance needs (HIPAA, SOC2, etc.). All traffic is captured and the
    log group keeps it for one month.
    """
    # log_group_name intentionally omitted - let CDK generate unique name
    log_group = logs.LogGroup(
        self,
        "VpcFlowLogGroup",
        retention=logs.RetentionDays.ONE_MONTH,
        removal_policy=RemovalPolicy.DESTROY,
    )

    # Role the flow-logs service assumes to write into the group.
    delivery_role = iam.Role(
        self,
        "VpcFlowLogRole",
        assumed_by=iam.ServicePrincipal("vpc-flow-logs.amazonaws.com"),
    )

    group_arn = log_group.log_group_arn
    write_access = iam.PolicyStatement(
        actions=[
            "logs:CreateLogStream",
            "logs:PutLogEvents",
            "logs:DescribeLogGroups",
            "logs:DescribeLogStreams",
        ],
        # The group itself plus everything beneath it (its log streams).
        resources=[group_arn, f"{group_arn}:*"],
    )
    delivery_role.add_to_policy(write_access)

    ec2.FlowLog(
        self,
        "VpcFlowLog",
        resource_type=ec2.FlowLogResourceType.from_vpc(self.vpc),
        destination=ec2.FlowLogDestination.to_cloud_watch_logs(log_group, delivery_role),
        traffic_type=ec2.FlowLogTrafficType.ALL,
    )
def _apply_nag_suppressions(self) -> None:
    """Register the cdk-nag suppressions that apply to a regional stack."""
    # Imported here rather than at module level, as in the original code.
    from gco.stacks.nag_suppressions import apply_all_suppressions

    deployed_regions = self.config.get_regions()
    home_region = self.config.get_global_region()
    apply_all_suppressions(
        self,
        stack_type="regional",
        regions=deployed_regions,
        global_region=home_region,
    )
def _create_sqs_queue(self) -> None:
    """Create the regional job queue, its dead-letter queue, and outputs.

    The main queue is the default job-ingestion point for this region. A
    message that fails three receives is moved to the dead-letter queue.
    Both queues enforce SSL and use SQS-managed server-side encryption.

    Sets ``self.job_dlq`` and ``self.job_queue`` and exports the queue
    URL, queue ARN, and DLQ URL as stack outputs.
    """
    project_name = self.config.get_project_name()
    region = self.deployment_region

    # Dead-letter queue first: the main queue's redrive policy refers to it.
    self.job_dlq = sqs.Queue(
        self,
        "JobDeadLetterQueue",
        queue_name=f"{project_name}-jobs-dlq-{region}",
        retention_period=Duration.days(14),
        removal_policy=RemovalPolicy.DESTROY,
        enforce_ssl=True,
        encryption=sqs.QueueEncryption.SQS_MANAGED,
    )

    redrive = sqs.DeadLetterQueue(
        max_receive_count=3,  # Move to DLQ after 3 failed attempts
        queue=self.job_dlq,
    )
    self.job_queue = sqs.Queue(
        self,
        "JobQueue",
        queue_name=f"{project_name}-jobs-{region}",
        visibility_timeout=Duration.minutes(5),  # Match Lambda timeout
        retention_period=Duration.days(7),
        dead_letter_queue=redrive,
        removal_policy=RemovalPolicy.DESTROY,
        enforce_ssl=True,
        encryption=sqs.QueueEncryption.SQS_MANAGED,
    )

    # (construct id, value, description, export name) for each output.
    queue_outputs = [
        (
            "JobQueueUrl",
            self.job_queue.queue_url,
            f"SQS Job Queue URL for {region}",
            f"{project_name}-job-queue-url-{region}",
        ),
        (
            "JobQueueArn",
            self.job_queue.queue_arn,
            f"SQS Job Queue ARN for {region}",
            f"{project_name}-job-queue-arn-{region}",
        ),
        (
            "JobDlqUrl",
            self.job_dlq.queue_url,
            f"SQS Dead Letter Queue URL for {region}",
            f"{project_name}-job-dlq-url-{region}",
        ),
    ]
    for output_id, output_value, description, export_name in queue_outputs:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=description,
            export_name=export_name,
        )
def _create_aws_custom_resource_role(self) -> None:
    """Create the execution role shared by every ``cr.AwsCustomResource``.

    Why pre-create: by default ``cr.AwsCustomResource`` derives a Lambda
    execution role from its ``policy=`` argument, and CDK merges the
    statements of all such resources onto the role of one singleton
    provider Lambda at stack-create time. On a cold deploy CloudFormation
    can invoke that Lambda seconds after a statement is attached — before
    IAM has propagated it — which surfaces as an
    ``iam:PassRole NOT authorized`` failure on whichever addon update runs
    next.

    Creating the role up front and passing
    ``role=self.aws_custom_resource_role`` (instead of ``policy=``) to
    every ``AwsCustomResource`` gives its inline policy time to replicate
    before any custom resource fires.

    Only the statements that need no cluster reference are attached here:

    * ``eks:UpdateAddon`` / ``eks:DescribeAddon`` on this cluster's addons
    * ``ssm:GetParameter`` on this project's parameters in the global region

    ``iam:PassRole`` for the individual addon roles (EFS CSI, FSx CSI,
    CloudWatch Observability) is appended by the ``_create_*_addon``
    methods once each role exists, so those resource lists stay
    wildcard-free.

    Sets ``self.aws_custom_resource_role``.
    """
    project_name = self.config.get_project_name()
    global_region = self.config.get_global_region()

    shared_role = iam.Role(
        self,
        "AwsCustomResourceRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        description=(
            "Shared execution role for every cr.AwsCustomResource in this "
            "stack. Pre-created to avoid the IAM policy propagation race "
            "that occurs when CDK auto-generates per-CR roles and the "
            "singleton provider Lambda fires before the freshly-attached "
            "policy has replicated globally."
        ),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSLambdaBasicExecutionRole"
            ),
        ],
    )
    self.aws_custom_resource_role = shared_role

    # Used by the three updateAddon custom resources (EFS CSI, FSx CSI,
    # CloudWatch Observability); scoped by ARN to this cluster's addons.
    cluster_addons_arn = (
        f"arn:aws:eks:{self.deployment_region}:{self.account}"
        f":addon/{self.cluster_config.cluster_name}/*"
    )
    shared_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["eks:UpdateAddon", "eks:DescribeAddon"],
            resources=[cluster_addons_arn],
        )
    )

    # Used by the GetEndpointGroupArn custom resource in
    # _create_ga_registration_lambda to read the Global Accelerator
    # endpoint group ARN that the global stack publishes during its deploy.
    project_parameters_arn = (
        f"arn:aws:ssm:{global_region}:{self.account}:parameter/{project_name}/*"
    )
    shared_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["ssm:GetParameter"],
            resources=[project_parameters_arn],
        )
    )

    # cdk-nag: both wildcard ARNs above are deliberate.
    #
    # - addon/<cluster>/* lets one shared role serve all three updateAddon
    #   custom resources. It is limited to one cluster in one region in
    #   one account, so it cannot reach addons of any other cluster.
    # - parameter/<project>/* is limited to parameters under this
    #   project's prefix; the exact name (endpoint-group-<region>-arn) is
    #   only known at Global Accelerator registration time.
    from cdk_nag import NagSuppressions

    iam5_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "Scoped to a single EKS cluster's addons "
            "(addon/<cluster>/*) and this project's SSM "
            "parameters (parameter/<project>/*). Both wildcards "
            "are as tight as AWS IAM permits: addon names and "
            "parameter names are not known at stack synthesis "
            "time because the addons are created later in the "
            "same stack and the GA endpoint group ARN is "
            "published by a separate stack during deploy. The "
            "shared role pattern itself is deliberate — see "
            "_create_aws_custom_resource_role docstring for why "
            "we pre-create instead of letting CDK auto-generate "
            "per-CR roles."
        ),
        "appliesTo": [
            f"Resource::arn:aws:eks:{self.deployment_region}"
            f":<AWS::AccountId>:addon/{self.cluster_config.cluster_name}/*",
            f"Resource::arn:aws:ssm:{global_region}"
            f":<AWS::AccountId>:parameter/{project_name}/*",
        ],
    }
    NagSuppressions.add_resource_suppressions(
        self.aws_custom_resource_role,
        [iam5_suppression],
        apply_to_children=True,
    )
def _create_container_images(self) -> None:
    """Create the ECR repositories and build the service Docker images.

    Images are built for health-monitor, manifest-processor, and
    inference-monitor, plus queue-processor when it is enabled. Every
    image URI is exported as a stack output.

    Sets ``self.health_monitor_repo``, ``self.health_monitor_image``,
    ``self.manifest_processor_repo``, ``self.manifest_processor_image``,
    ``self.inference_monitor_image``, ``self.queue_processor_enabled``
    and, only when enabled, ``self.queue_processor_image``.
    """

    def new_repository(construct_id: str) -> ecr.Repository:
        # repository_name intentionally omitted - let CDK generate unique name
        return ecr.Repository(
            self,
            construct_id,
            removal_policy=RemovalPolicy.DESTROY,  # For dev/test; use RETAIN for production
            empty_on_delete=True,  # Clean up images on stack deletion
            image_scan_on_push=True,  # Enable vulnerability scanning on push
        )

    def build_image(construct_id: str, dockerfile: str) -> ecr_assets.DockerImageAsset:
        # Every image is built from the repository root and targets AMD64
        # (x86_64) to match EKS Auto Mode's default system nodepool.
        return ecr_assets.DockerImageAsset(
            self,
            construct_id,
            directory=".",
            file=dockerfile,
            platform=ecr_assets.Platform.LINUX_AMD64,
        )

    def export_image_uri(
        construct_id: str, image: ecr_assets.DockerImageAsset, description: str
    ) -> None:
        CfnOutput(self, construct_id, value=image.image_uri, description=description)

    # Health monitor.
    self.health_monitor_repo = new_repository("HealthMonitorRepo")
    self.health_monitor_image = build_image(
        "HealthMonitorImage", "dockerfiles/health-monitor-dockerfile"
    )

    # Manifest processor.
    self.manifest_processor_repo = new_repository("ManifestProcessorRepo")
    self.manifest_processor_image = build_image(
        "ManifestProcessorImage", "dockerfiles/manifest-processor-dockerfile"
    )

    export_image_uri(
        "HealthMonitorImageUri",
        self.health_monitor_image,
        "Health Monitor Docker image URI",
    )
    export_image_uri(
        "ManifestProcessorImageUri",
        self.manifest_processor_image,
        "Manifest Processor Docker image URI",
    )

    # Inference monitor.
    self.inference_monitor_image = build_image(
        "InferenceMonitorImage", "dockerfiles/inference-monitor-dockerfile"
    )
    export_image_uri(
        "InferenceMonitorImageUri",
        self.inference_monitor_image,
        "Inference Monitor Docker image URI",
    )

    # Queue processor (optional, enabled by default). It is a KEDA
    # ScaledJob that consumes manifests from the regional SQS queue and
    # can be switched off in cdk.json by users who bring their own
    # consumer. When it is off, post-helm-sqs-consumer.yaml is skipped
    # because its template variables stay unreplaced.
    queue_processor_settings = self.node.try_get_context("queue_processor") or {}
    self.queue_processor_enabled = queue_processor_settings.get("enabled", True)
    if not self.queue_processor_enabled:
        return

    self.queue_processor_image = build_image(
        "QueueProcessorImage", "dockerfiles/queue-processor-dockerfile"
    )
    export_image_uri(
        "QueueProcessorImageUri",
        self.queue_processor_image,
        "Queue Processor Docker image URI",
    )
def _create_eks_cluster(self, cluster_config: Any) -> None:
    """Create the EKS Auto Mode cluster with its add-ons and supporting roles.

    GPU and accelerator capacity is NOT created here: it comes from
    NodePool manifests applied later by the kubectl Lambda custom resource.

    Args:
        cluster_config: Regional cluster settings; ``cluster_name`` and
            ``kubernetes_version`` are read from it.

    Sets ``self.eks_encryption_key``, ``self.cluster`` and
    ``self.oidc_provider``, then creates the add-ons, the service-account
    role, and the kubectl Lambda in that order.
    """
    # Control-plane role.
    # role_name intentionally omitted - let CDK generate unique name
    cluster_admin_role = iam.Role(
        self,
        "ClusterAdminRole",
        assumed_by=iam.ServicePrincipal("eks.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSClusterPolicy")
        ],
    )

    # Node role. Added to the stack but not referenced again in this method.
    # role_name intentionally omitted - let CDK generate unique name
    iam.Role(
        self,
        "NodeGroupRole",
        assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSWorkerNodePolicy"),
            iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKS_CNI_Policy"),
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "AmazonEC2ContainerRegistryReadOnly"
            ),
        ],
    )

    # Endpoint access comes from cdk.json (eks_cluster.endpoint_access) and
    # defaults to PRIVATE. The value is normalised before comparison so a
    # lower-case or padded "private" is not silently downgraded to the less
    # secure PUBLIC_AND_PRIVATE. Any other value still maps to
    # PUBLIC_AND_PRIVATE, as before.
    eks_config = self.config.get_eks_cluster_config()
    endpoint_access_mode = str(eks_config.get("endpoint_access", "PRIVATE")).strip().upper()
    endpoint_access = (
        eks.EndpointAccess.PRIVATE
        if endpoint_access_mode == "PRIVATE"
        else eks.EndpointAccess.PUBLIC_AND_PRIVATE
    )

    # KMS key for envelope encryption of Kubernetes secrets. Retained on
    # stack deletion.
    self.eks_encryption_key = kms.Key(
        self,
        "EksSecretsEncryptionKey",
        description="KMS key for EKS Kubernetes secrets encryption",
        enable_key_rotation=True,
        removal_policy=RemovalPolicy.RETAIN,
    )

    # Prefer the CDK enum member (e.g. V1_31); fall back to a custom
    # version when the enum does not list the configured version yet.
    k8s_version_str = cluster_config.kubernetes_version
    try:
        k8s_version = getattr(eks.KubernetesVersion, f"V{k8s_version_str.replace('.', '_')}")
    except AttributeError:
        k8s_version = eks.KubernetesVersion.of(k8s_version_str)

    # Auto Mode manages compute automatically and ships with essential
    # addons pre-configured:
    # - AWS Load Balancer Controller (for ALB/NLB integration)
    # - CoreDNS, kube-proxy, VPC CNI (standard Kubernetes components)
    self.cluster = eks.Cluster(
        self,
        "GCOEksCluster",
        cluster_name=cluster_config.cluster_name,
        version=k8s_version,  # Use configured version for Auto Mode with DRA support
        vpc=self.vpc,
        compute=eks.ComputeConfig(
            # Enable both built-in node pools - Auto Mode manages these automatically
            node_pools=["system", "general-purpose"]
        ),
        # SECURITY: Endpoint access controlled via cdk.json eks_cluster.endpoint_access
        # PRIVATE (default): EKS API accessible only from within VPC - most secure
        #   Job submission works via API Gateway → Lambda (in VPC) or SQS
        #   For kubectl access, use a bastion host, VPN, or AWS SSM Session Manager
        # PUBLIC_AND_PRIVATE: EKS API accessible from internet and VPC
        #   Allows direct kubectl access but less secure
        endpoint_access=endpoint_access,
        role=cluster_admin_role,
        vpc_subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)],
        # Enable all control plane logging for security and compliance
        cluster_logging=[
            eks.ClusterLoggingTypes.API,
            eks.ClusterLoggingTypes.AUDIT,
            eks.ClusterLoggingTypes.AUTHENTICATOR,
            eks.ClusterLoggingTypes.CONTROLLER_MANAGER,
            eks.ClusterLoggingTypes.SCHEDULER,
        ],
        # SECURITY: Enable envelope encryption for Kubernetes secrets using KMS
        secrets_encryption_key=self.eks_encryption_key,
    )

    # OIDC provider for IRSA — the primary credential injection mechanism.
    # IRSA exchanges projected service-account tokens via the OIDC provider
    # for temporary AWS credentials.
    self.oidc_provider = eks.OidcProviderNative(
        self,
        "OidcProvider",
        url=self.cluster.cluster_open_id_connect_issuer_url,
    )

    # Pod Identity Agent add-on — kept as a secondary credential path. On
    # Auto Mode the DaemonSet schedules 0 pods (the agent is built into the
    # node), but the registration is still needed for the control-plane
    # webhook.
    self._create_pod_identity_agent_addon()

    # Metrics Server add-on for HPA and resource monitoring.
    self._create_metrics_server_addon()

    # EFS CSI Driver add-on for shared storage.
    self._create_efs_csi_driver_addon()

    # CloudWatch Observability add-on for Container Insights metrics.
    self._create_cloudwatch_observability_addon()

    # NOTE: GPU compute is configured via NodePools (not managed node groups).
    # NodePool manifests are located in lambda/kubectl-applier-simple/manifests/:
    # - 40-nodepool-gpu-x86.yaml: x86_64 GPU instances (g4dn, g5, g6, g6e, p3)
    # - 41-nodepool-gpu-arm.yaml: ARM64 GPU instances (g5g)
    # - 42-nodepool-inference.yaml: inference-optimized GPU instances
    # - 43-nodepool-efa.yaml: EFA-enabled instances (p4d, p5/p5e/p5en, p6-b200/p6-b300/p6e-gb200)
    # - 44-nodepool-neuron.yaml: Trainium/Inferentia instances
    # These are applied by the kubectl Lambda custom resource created below.

    # IRSA role that lets the service account access secrets.
    self._create_service_account_role()

    # kubectl Lambda that applies the Kubernetes manifests.
    self._create_kubectl_lambda()
# ── Shared toleration config for EKS add-ons ──────────────────────────
# GCO nodepools taint their nodes (nvidia.com/gpu, aws.amazon.com/neuron,
# vpc.amazonaws.com/efa), which keeps DaemonSet pods from scheduling there.
# Every add-on that runs a DaemonSet (or may land on a tainted node) is
# given these tolerations so storage drivers, metrics agents, and other
# infrastructure components work on every node type.
# One "Exists" / "NoSchedule" toleration is generated per taint key.
_ADDON_NODE_TOLERATIONS = [
    {"key": taint_key, "operator": "Exists", "effect": "NoSchedule"}
    for taint_key in (
        "nvidia.com/gpu",
        "aws.amazon.com/neuron",
        "vpc.amazonaws.com/efa",
    )
]
def _create_pod_identity_agent_addon(self) -> None:
    """Register the ``eks-pod-identity-agent`` add-on on the cluster.

    Per the design notes for this stack: on Auto Mode the agent is part of
    the node runtime, so the add-on's DaemonSet schedules 0 pods. The
    add-on is registered anyway because the control-plane admission
    webhook that injects Pod Identity tokens depends on it.
    """
    # The shared tolerations let the add-on's pods land on tainted nodes.
    addon_settings = {"tolerations": self._ADDON_NODE_TOLERATIONS}

    eks.Addon(
        self,
        "PodIdentityAgentAddon",
        cluster=self.cluster,  # type: ignore[arg-type]
        addon_name="eks-pod-identity-agent",
        addon_version=EKS_ADDON_POD_IDENTITY_AGENT,
        preserve_on_delete=False,
        configuration_values=addon_settings,
    )
def _create_metrics_server_addon(self) -> None:
    """Register the ``metrics-server`` add-on on the cluster.

    Metrics Server gathers resource metrics from kubelets and serves them
    through the Kubernetes API server. Consumers include:
        - Horizontal Pod Autoscaler (HPA)
        - Vertical Pod Autoscaler (VPA)
        - ``kubectl top``
        - Resource monitoring dashboards

    No IAM role is created here: the add-on only needs in-cluster
    permissions, which its own service account provides.
    """
    # The shared tolerations let the add-on's pods land on tainted nodes.
    addon_settings = {"tolerations": self._ADDON_NODE_TOLERATIONS}

    eks.Addon(
        self,
        "MetricsServerAddon",
        cluster=self.cluster,  # type: ignore[arg-type]
        addon_name="metrics-server",
        addon_version=EKS_ADDON_METRICS_SERVER,
        preserve_on_delete=False,
        configuration_values=addon_settings,
    )
def _create_efs_csi_driver_addon(self) -> None:
    """Install the EFS CSI driver add-on and bind its IAM role.

    The EFS CSI driver lets pods mount EFS file systems as persistent
    volumes (the shared-storage feature). This method, in order:

    1. Creates the driver's IAM role through ``_create_irsa_role`` for the
       ``efs-csi-controller-sa`` service account in ``kube-system`` and
       attaches the AWS-managed ``AmazonEFSCSIDriverPolicy``.
    2. Creates the ``aws-efs-csi-driver`` add-on, applying the shared
       tolerations to both its node and controller components.
    3. Lets the shared AwsCustomResource execution role pass the driver
       role (``iam:PassRole``).
    4. Issues an ``EKS:updateAddon`` call through a custom resource to set
       the add-on's ``serviceAccountRoleArn``.

    Side effects: sets ``self.efs_csi_role`` and
    ``self._efs_csi_addon_role_update``.
    """
    addon_name = "aws-efs-csi-driver"
    tolerations = self._ADDON_NODE_TOLERATIONS

    # IAM role for the driver (IRSA + Pod Identity trust).
    self.efs_csi_role = GCORegionalStack._create_irsa_role(
        self,
        "EfsCsiDriverRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=["efs-csi-controller-sa"],
        namespaces=["kube-system"],
    )
    driver_policy = iam.ManagedPolicy.from_aws_managed_policy_name(
        "service-role/AmazonEFSCSIDriverPolicy"
    )
    self.efs_csi_role.add_managed_policy(driver_policy)

    # The add-on itself; node and controller get the same tolerations.
    efs_addon = eks.Addon(
        self,
        "EfsCsiDriverAddon",
        cluster=self.cluster,  # type: ignore[arg-type]
        addon_name=addon_name,
        addon_version=EKS_ADDON_EFS_CSI_DRIVER,
        preserve_on_delete=False,
        configuration_values={
            "node": {"tolerations": tolerations},
            "controller": {"tolerations": tolerations},
        },
    )

    # The shared AwsCustomResource execution role must be allowed to pass
    # the driver role. The role is pre-created and the statement attached
    # up-front rather than letting CDK auto-generate per-CR roles; the
    # rationale lives with _create_aws_custom_resource_role.
    pass_role = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=["iam:PassRole"],
        resources=[self.efs_csi_role.role_arn],
    )
    self.aws_custom_resource_role.add_to_policy(pass_role)

    # The eks v2 alpha Addon has no service_account_role option, so the
    # role ARN is set afterwards with an updateAddon SDK call. Create and
    # update issue the same request; only create fixes the physical id.
    update_params = {
        "clusterName": self.cluster.cluster_name,
        "addonName": addon_name,
        "serviceAccountRoleArn": self.efs_csi_role.role_arn,
    }
    update_addon = cr.AwsCustomResource(
        self,
        "UpdateEfsCsiAddonRole",
        on_create=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_params,
            physical_resource_id=cr.PhysicalResourceId.of(
                f"{self.cluster.cluster_name}-efs-csi-role-update"
            ),
        ),
        on_update=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_params,
        ),
        role=self.aws_custom_resource_role,
    )

    # Run the update only once the add-on and the driver role exist, and
    # after the shared execution role (with its inline policy) is in place
    # so the custom resource's Lambda does not fire too early.
    update_addon.node.add_dependency(
        efs_addon,
        self.efs_csi_role,
        self.aws_custom_resource_role,
    )

    # Kept on the instance so _apply_kubernetes_manifests can make the
    # kubectl Lambda wait for the role ARN to be attached before it
    # rollout-restarts the efs-csi-controller. Without that ordering the
    # restarted pods could come up without credentials and every EFS
    # CreateAccessPoint would fail with a 401 from IMDS.
    self._efs_csi_addon_role_update = update_addon
def _create_cloudwatch_observability_addon(self) -> None:
    """Install the CloudWatch Observability add-on and bind its IAM role.

    The add-on enables Container Insights for the cluster (cluster, node,
    pod and container metrics, plus application logs), which the
    monitoring dashboard uses for health and utilization views.

    This method, in order:

    1. Creates the agent's IAM role through ``_create_irsa_role`` for the
       ``cloudwatch-agent`` service account in ``amazon-cloudwatch`` and
       attaches ``CloudWatchAgentServerPolicy`` and
       ``AWSXrayWriteOnlyAccess``.
    2. Creates the ``amazon-cloudwatch-observability`` add-on with the
       shared tolerations and container log collection enabled.
    3. Lets the shared AwsCustomResource execution role pass the agent
       role (``iam:PassRole``).
    4. Issues an ``EKS:updateAddon`` call through a custom resource to set
       the add-on's ``serviceAccountRoleArn``.

    Side effects: sets ``self.cloudwatch_role`` and
    ``self._cloudwatch_addon_role_update``.
    """
    addon_name = "amazon-cloudwatch-observability"

    # IAM role for the CloudWatch agent (IRSA + Pod Identity trust).
    self.cloudwatch_role = GCORegionalStack._create_irsa_role(
        self,
        "CloudWatchObservabilityRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=["cloudwatch-agent"],
        namespaces=["amazon-cloudwatch"],
    )
    for managed_policy_name in ("CloudWatchAgentServerPolicy", "AWSXrayWriteOnlyAccess"):
        self.cloudwatch_role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(managed_policy_name)
        )

    # The add-on itself. Application logs go to
    # /aws/containerinsights/{cluster}/application.
    cw_addon = eks.Addon(
        self,
        "CloudWatchObservabilityAddon",
        cluster=self.cluster,  # type: ignore[arg-type]
        addon_name=addon_name,
        addon_version=EKS_ADDON_CLOUDWATCH_OBSERVABILITY,
        preserve_on_delete=False,
        configuration_values={
            "tolerations": self._ADDON_NODE_TOLERATIONS,
            "containerLogs": {"enabled": True},
        },
    )

    # The shared AwsCustomResource execution role must be allowed to pass
    # the agent role; see _create_aws_custom_resource_role for why the
    # role is shared and pre-created.
    pass_role = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=["iam:PassRole"],
        resources=[self.cloudwatch_role.role_arn],
    )
    self.aws_custom_resource_role.add_to_policy(pass_role)

    # Point the add-on at the role with an updateAddon SDK call. Create
    # and update issue the same request; only create fixes the physical id.
    update_params = {
        "clusterName": self.cluster.cluster_name,
        "addonName": addon_name,
        "serviceAccountRoleArn": self.cloudwatch_role.role_arn,
    }
    update_cw_addon = cr.AwsCustomResource(
        self,
        "UpdateCloudWatchAddonRole",
        on_create=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_params,
            physical_resource_id=cr.PhysicalResourceId.of(
                f"{self.cluster.cluster_name}-cw-obs-role-update"
            ),
        ),
        on_update=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_params,
        ),
        role=self.aws_custom_resource_role,
    )

    # Run the update only once the add-on and the agent role exist, and
    # after the shared execution role (with its inline policy) is in
    # place. No CR→CR dependency chain is needed: pre-creating the shared
    # role removed the race such a chain used to serialize against.
    update_cw_addon.node.add_dependency(
        cw_addon,
        self.cloudwatch_role,
        self.aws_custom_resource_role,
    )

    # Kept on the instance so _apply_kubernetes_manifests can make the
    # kubectl Lambda wait for the role ARN to be attached before it
    # rollout-restarts the cloudwatch-agent DaemonSet (same race and same
    # fix as for the EFS CSI add-on).
    self._cloudwatch_addon_role_update = update_cw_addon
def _create_service_account_role(self) -> None:
    """Create the IAM role shared by the GCO Kubernetes service accounts.

    The role is built by ``_create_irsa_role`` from the cluster's OIDC
    provider (IRSA trust + Pod Identity trust) and is assumable by the
    listed service accounts in these namespaces:
        - gco-system (system services such as health-monitor and
          manifest-processor)
        - gco-jobs (user jobs that need SQS access for KEDA scaling)
        - gco-inference (inference endpoints)

    Permissions attached here, in order: the auth secret, AWS Load
    Balancer Controller actions (EC2, ELB, service-linked role, WAFv2,
    Shield, ACM, Cognito), the job queue and its DLQ, CloudWatch custom
    metrics, the global DynamoDB tables, project S3 buckets, and KMS via
    S3.

    Finally the KEDA operator role and the Pod Identity associations are
    created, so this method must run after the add-on roles
    (``efs_csi_role``, ``cloudwatch_role``) and the SQS queues exist.

    Side effects: sets ``self.service_account_role``.
    """
    from cdk_nag import NagSuppressions

    # The trust policy's `sub` condition must list every ServiceAccount
    # that assumes this role. Keep in sync with:
    #   - lambda/kubectl-applier-simple/manifests/01-serviceaccounts.yaml
    #     (gco-service-account)
    #   - lambda/kubectl-applier-simple/manifests/02-rbac.yaml
    #     (gco-health-monitor-sa, gco-manifest-processor-sa,
    #     gco-inference-monitor-sa)
    #   - lambda/kubectl-applier-simple/manifests/04a-jobs-serviceaccount.yaml
    #     (gco-service-account in gco-jobs)
    role = GCORegionalStack._create_irsa_role(
        self,
        "ServiceAccountRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=[
            "gco-service-account",
            "gco-health-monitor-sa",
            "gco-manifest-processor-sa",
            "gco-inference-monitor-sa",
        ],
        namespaces=["gco-system", "gco-jobs", "gco-inference"],
    )
    self.service_account_role = role

    def allow(actions, resources, conditions=None):
        """Append one Allow statement to the role's policy."""
        role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=actions,
                resources=resources,
                conditions=conditions,
            )
        )

    # Read access to the auth secret. An explicit statement with a
    # trailing * is used instead of grant_read() because:
    #   1. The secret lives in a different region (API Gateway region).
    #   2. grant_read() emits a ?????? suffix that demands exactly six
    #      characters, while the SDK may call GetSecretValue with either
    #      the full ARN (with suffix) or the partial ARN (without it).
    #   3. * matches both forms.
    allow(
        ["secretsmanager:GetSecretValue", "secretsmanager:DescribeSecret"],
        [f"{self.auth_secret_arn}*"],  # Wildcard to match with or without suffix
    )

    # cdk-nag: the trailing * on the auth secret ARN is intentional and
    # not a broad wildcard. Secrets Manager appends a random 6-character
    # suffix to every secret ARN (arn:...:secret:my-secret-AbC123); the
    # secret is defined in api_gateway_global_stack and arrives here as a
    # cross-stack token, so the suffix is unknown at synth time.
    NagSuppressions.add_resource_suppressions(
        role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": (
                    "The trailing ``*`` matches the 6-character "
                    "random suffix Secrets Manager appends to secret "
                    "ARNs. The secret is created in a different stack "
                    "(api_gateway_global_stack) and referenced here "
                    "via a cross-stack token, so the actual suffix "
                    "isn't known at synth time. The wildcard is "
                    "bounded to a single secret — it does not grant "
                    "access to any other secret in the account."
                ),
                "appliesTo": [
                    {"regex": "/^Resource::<GCOAuthSecret.*>\\*$/"},
                ],
            },
        ],
        apply_to_children=True,
    )

    # cdk-nag: ec2:Describe* and elasticloadbalancing:Describe* (granted
    # below for the AWS Load Balancer Controller) do not support
    # resource-level scoping, so Resource: * is the only valid form.
    NagSuppressions.add_resource_suppressions(
        role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": (
                    "The ServiceAccountRole grants ec2:Describe* and "
                    "elasticloadbalancing:Describe* for the AWS Load Balancer "
                    "Controller. These AWS APIs do not support resource-level "
                    "IAM scoping — Resource: * is the only valid form. See "
                    "https://docs.aws.amazon.com/service-authorization/latest/"
                    "reference/list_amazonec2.html"
                ),
                "appliesTo": ["Resource::*"],
            },
        ],
        apply_to_children=True,
    )

    any_resource = ["*"]

    # ── AWS Load Balancer Controller ─────────────────────────────────
    # Read-only discovery of networking and load-balancer state.
    allow(
        [
            "ec2:DescribeAccountAttributes",
            "ec2:DescribeAddresses",
            "ec2:DescribeAvailabilityZones",
            "ec2:DescribeInternetGateways",
            "ec2:DescribeVpcs",
            "ec2:DescribeVpcPeeringConnections",
            "ec2:DescribeSubnets",
            "ec2:DescribeSecurityGroups",
            "ec2:DescribeInstances",
            "ec2:DescribeNetworkInterfaces",
            "ec2:DescribeTags",
            "ec2:GetCoipPoolUsage",
            "ec2:DescribeCoipPools",
            "elasticloadbalancing:DescribeLoadBalancers",
            "elasticloadbalancing:DescribeLoadBalancerAttributes",
            "elasticloadbalancing:DescribeListeners",
            "elasticloadbalancing:DescribeListenerCertificates",
            "elasticloadbalancing:DescribeSSLPolicies",
            "elasticloadbalancing:DescribeRules",
            "elasticloadbalancing:DescribeTargetGroups",
            "elasticloadbalancing:DescribeTargetGroupAttributes",
            "elasticloadbalancing:DescribeTargetHealth",
            "elasticloadbalancing:DescribeTags",
        ],
        any_resource,
    )

    # Load balancer, listener and target group lifecycle.
    allow(
        [
            "elasticloadbalancing:CreateLoadBalancer",
            "elasticloadbalancing:CreateTargetGroup",
            "elasticloadbalancing:CreateListener",
            "elasticloadbalancing:DeleteLoadBalancer",
            "elasticloadbalancing:DeleteTargetGroup",
            "elasticloadbalancing:DeleteListener",
            "elasticloadbalancing:ModifyLoadBalancerAttributes",
            "elasticloadbalancing:ModifyTargetGroup",
            "elasticloadbalancing:ModifyTargetGroupAttributes",
            "elasticloadbalancing:ModifyListener",
            "elasticloadbalancing:RegisterTargets",
            "elasticloadbalancing:DeregisterTargets",
            "elasticloadbalancing:SetWebAcl",
            "elasticloadbalancing:SetSecurityGroups",
            "elasticloadbalancing:SetSubnets",
            "elasticloadbalancing:AddTags",
            "elasticloadbalancing:RemoveTags",
        ],
        any_resource,
    )

    # Security group and tag management.
    allow(
        [
            "ec2:CreateSecurityGroup",
            "ec2:CreateTags",
            "ec2:DeleteTags",
            "ec2:AuthorizeSecurityGroupIngress",
            "ec2:RevokeSecurityGroupIngress",
            "ec2:DeleteSecurityGroup",
        ],
        any_resource,
    )

    # Service-linked role creation, limited to the ELB service.
    allow(
        ["iam:CreateServiceLinkedRole"],
        any_resource,
        {"StringEquals": {"iam:AWSServiceName": "elasticloadbalancing.amazonaws.com"}},
    )

    # WAFv2 web ACL lookup and association.
    allow(
        [
            "wafv2:GetWebACL",
            "wafv2:GetWebACLForResource",
            "wafv2:AssociateWebACL",
            "wafv2:DisassociateWebACL",
        ],
        any_resource,
    )

    # Shield protection management.
    allow(
        [
            "shield:GetSubscriptionState",
            "shield:DescribeProtection",
            "shield:CreateProtection",
            "shield:DeleteProtection",
        ],
        any_resource,
    )

    # Certificate discovery.
    allow(["acm:ListCertificates", "acm:DescribeCertificate"], any_resource)

    # Cognito user pool client lookup.
    allow(["cognito-idp:DescribeUserPoolClient"], any_resource)

    # ── SQS: lets KEDA scale on queue depth ──────────────────────────
    allow(
        [
            "sqs:GetQueueAttributes",
            "sqs:GetQueueUrl",
            "sqs:ReceiveMessage",
            "sqs:DeleteMessage",
            "sqs:SendMessage",
        ],
        [self.job_queue.queue_arn, self.job_dlq.queue_arn],
    )

    # ── CloudWatch: custom metrics published by health-monitor and
    # manifest-processor, restricted to their namespaces ──────────────
    allow(
        ["cloudwatch:PutMetricData"],
        any_resource,
        {
            "StringEquals": {
                "cloudwatch:namespace": [
                    "GCO/HealthMonitor",
                    "GCO/ManifestProcessor",
                ]
            }
        },
    )

    # ── DynamoDB: templates, webhooks, jobs and inference endpoints.
    # The tables are created in the global stack and reached from every
    # region, hence the global region in the ARNs ─────────────────────
    project_name = self.config.get_project_name()
    global_region = self.config.get_global_region()

    table_arn_prefix = f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-"
    # Each table contributes its own ARN followed by its index wildcard.
    table_resources = [
        f"{table_arn_prefix}{table}{arn_tail}"
        for table in ("job-templates", "webhooks", "jobs", "inference-endpoints")
        for arn_tail in ("", "/index/*")
    ]
    allow(
        [
            "dynamodb:GetItem",
            "dynamodb:PutItem",
            "dynamodb:UpdateItem",
            "dynamodb:DeleteItem",
            "dynamodb:Query",
            "dynamodb:Scan",
        ],
        table_resources,
    )

    # ── S3: model weights bucket (used by inference init containers);
    # the resource pattern matches buckets prefixed with the project
    # name ────────────────────────────────────────────────────────────
    allow(
        ["s3:GetObject", "s3:ListBucket"],
        [
            f"arn:aws:s3:::{project_name}-*",
            f"arn:aws:s3:::{project_name}-*/*",
        ],
    )

    # ── KMS: keys in this account, only when used through S3 ─────────
    allow(
        ["kms:Decrypt", "kms:GenerateDataKey"],
        [f"arn:aws:kms:*:{self.account}:key/*"],
        {"StringLike": {"kms:ViaService": "s3.*.amazonaws.com"}},
    )

    # KEDA operator IAM role for SQS access.
    self._create_keda_operator_role()

    # Pod Identity associations for all service accounts.
    self._create_pod_identity_associations()
def _create_keda_operator_role(self) -> None:
    """Create the IAM role assumed by the KEDA operator.

    The role is built by ``_create_irsa_role`` (IRSA/OIDC trust + Pod
    Identity trust) for the ``keda-operator`` service account in the
    ``keda`` namespace. It only allows reading queue metadata for the job
    queue and its DLQ, which KEDA uses to scale on queue depth.

    Side effects: sets ``self.keda_operator_role``.
    """
    self.keda_operator_role = GCORegionalStack._create_irsa_role(
        self,
        "KedaOperatorRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=["keda-operator"],
        namespaces=["keda"],
    )

    # Read-only queue metrics; no message-level access is granted here.
    queue_arns = [self.job_queue.queue_arn, self.job_dlq.queue_arn]
    read_queue_metrics = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=["sqs:GetQueueAttributes", "sqs:GetQueueUrl"],
        resources=queue_arns,
    )
    self.keda_operator_role.add_to_policy(read_queue_metrics)
def _create_pod_identity_associations(self) -> None:
    """Create the EKS Pod Identity associations for the stack's roles.

    Each association binds one IAM role to one Kubernetes service account
    in one namespace of this cluster.

    The created associations are stored, in creation order, in
    ``self._pod_identity_associations`` so the kubectl-applier custom
    resource can declare an explicit dependency on them and workloads do
    not start before their credentials are available.
    """
    # (construct id, namespace, service account, role ARN), in creation order.
    bindings = [
        # GCO service account — used by health-monitor, manifest-processor,
        # inference-monitor; one association per GCO namespace.
        *(
            (
                f"PodIdentity-gco-sa-{namespace}",
                namespace,
                "gco-service-account",
                self.service_account_role.role_arn,
            )
            for namespace in ("gco-system", "gco-jobs", "gco-inference")
        ),
        # KEDA operator — needs SQS access for queue-based scaling.
        (
            "PodIdentity-keda-operator",
            "keda",
            "keda-operator",
            self.keda_operator_role.role_arn,
        ),
        # EFS CSI driver — needs EFS access for shared storage.
        (
            "PodIdentity-efs-csi",
            "kube-system",
            "efs-csi-controller-sa",
            self.efs_csi_role.role_arn,
        ),
        # CloudWatch agent — needs CloudWatch access for observability.
        (
            "PodIdentity-cloudwatch",
            "amazon-cloudwatch",
            "cloudwatch-agent",
            self.cloudwatch_role.role_arn,
        ),
    ]

    self._pod_identity_associations: list[Any] = [
        eks_l1.CfnPodIdentityAssociation(
            self,
            construct_id,
            cluster_name=self.cluster.cluster_name,
            namespace=namespace,
            service_account=service_account,
            role_arn=role_arn,
        )
        for construct_id, namespace, service_account, role_arn in bindings
    ]

    # FSx CSI driver — only when FSx is enabled (created later in _create_fsx_lustre)
    # The FSx Pod Identity association is added in _create_fsx_lustre instead
def _resolve_cluster_shared_bucket_from_ssm(self) -> SharedBucketIdentity:
    """Read the ``Cluster_Shared_Bucket`` identity from SSM in the global region.

    ``GCOGlobalStack`` publishes three ``ssm.StringParameter``s in the
    global region under ``/gco/cluster-shared-bucket/{name,arn,region}``.
    For each of them this method creates one ``cr.AwsCustomResource`` that
    calls ``SSM:getParameter`` with ``region=<global-region>`` — the same
    cross-region read pattern ``_create_ga_registration_lambda`` uses for
    the Global Accelerator endpoint group ARN.

    The method has no feature toggle or conditional guard of its own. Its
    result feeds ``_grant_cluster_shared_bucket_to_job_role`` (IAM) and
    the ``image_replacements`` dict (ConfigMap) downstream.

    Returns:
        :class:`SharedBucketIdentity` whose ``name``, ``arn`` and
        ``region`` are CDK tokens that resolve at deploy time.
    """
    from cdk_nag import NagSuppressions

    global_region = self.config.get_global_region()

    def read_parameter(suffix: str) -> str:
        """Create the reader for one parameter and return its value token."""
        parameter_name = f"{CLUSTER_SHARED_SSM_PARAMETER_PREFIX}/{suffix}"

        # Create and update issue the identical request, so one call
        # description serves both hooks.
        get_parameter = cr.AwsSdkCall(
            service="SSM",
            action="getParameter",
            parameters={"Name": parameter_name},
            region=global_region,
            physical_resource_id=cr.PhysicalResourceId.of(f"cluster-shared-{suffix}"),
        )
        read_cr = cr.AwsCustomResource(
            self,
            f"ReadClusterSharedBucket{suffix.capitalize()}",
            on_create=get_parameter,
            on_update=get_parameter,
            # Cross-region SSM GetParameter doesn't support resource-level
            # scoping cleanly — the calling principal lives in this
            # stack's region while the parameter lives in the global
            # region. ANY_RESOURCE is the escape hatch used here; the
            # resulting AwsSolutions-IAM5 finding is suppressed just below.
            policy=cr.AwsCustomResourcePolicy.from_sdk_calls(
                resources=cr.AwsCustomResourcePolicy.ANY_RESOURCE
            ),
        )

        # Scoped suppression for the Resource::* above. The action is
        # fixed to ssm:GetParameter and the parameter Name is a literal,
        # so the wildcard can only ever read this one parameter.
        NagSuppressions.add_resource_suppressions(
            read_cr,
            [
                {
                    "id": "AwsSolutions-IAM5",
                    "reason": (
                        "Cross-region ssm:GetParameter for "
                        f"{parameter_name} in the global region. The "
                        "AwsCustomResource SDK-call policy is scoped to "
                        "a single fixed action (ssm:GetParameter) with "
                        "a fixed parameter Name — the Resource: * is "
                        "the CDK-documented escape hatch because the "
                        "parameter ARN is not known to the calling "
                        "principal's region. Effective blast radius: "
                        "one parameter."
                    ),
                    "appliesTo": ["Resource::*"],
                },
            ],
            apply_to_children=True,
        )

        return read_cr.get_response_field("Parameter.Value")

    # Readers are created in name → arn → region order.
    resolved: dict[str, str] = {
        suffix: read_parameter(suffix) for suffix in ("name", "arn", "region")
    }
    return SharedBucketIdentity(**resolved)
def _grant_cluster_shared_bucket_to_job_role(self, shared: SharedBucketIdentity) -> None:
    """Grant ``self.service_account_role`` read/write access to ``Cluster_Shared_Bucket``.

    Two ``iam.PolicyStatement``s are appended to the role:

    1. S3 object and bucket actions (``GetObject``, ``PutObject``,
       ``DeleteObject``, ``ListBucket``, ``GetBucketLocation``) on
       ``<shared.arn>`` and ``<shared.arn>/*``.
    2. KMS ``Decrypt`` / ``GenerateDataKey`` on ``resources=["*"]``,
       conditioned on
       ``kms:ViaService == s3.<shared.region>.amazonaws.com``. The key
       ARN is not known to this stack (the key lives in the global
       region), so the condition — KMS use through S3 in the bucket's
       region — is the only restriction on this statement.

    A cdk-nag suppression is then registered for the ``<arn>/*``
    object-key wildcard. The method contains no feature toggle: the grant
    is added whenever it is called, whether or not analytics is enabled.

    Args:
        shared: Identity of the cluster-shared bucket; ``arn`` and
            ``region`` are read here.
    """
    from cdk_nag import NagSuppressions

    job_role = self.service_account_role

    # 1. Read/write on the bucket and every object key inside it.
    bucket_rw = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=[
            "s3:GetObject",
            "s3:PutObject",
            "s3:DeleteObject",
            "s3:ListBucket",
            "s3:GetBucketLocation",
        ],
        resources=[shared.arn, f"{shared.arn}/*"],
    )
    job_role.add_to_policy(bucket_rw)

    # 2. KMS usage, allowed only when the request comes through S3 in the
    #    shared bucket's region.
    kms_via_s3 = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=["kms:Decrypt", "kms:GenerateDataKey"],
        resources=["*"],
        conditions={
            "StringEquals": {
                "kms:ViaService": f"s3.{shared.region}.amazonaws.com",
            }
        },
    )
    job_role.add_to_policy(kms_via_s3)

    # cdk-nag flags the <arn>/* resource as a wildcard. The ARN is the
    # literal Cluster_Shared_Bucket ARN resolved from SSM, and /* covers
    # the object keys of that single bucket, which is the intended scope
    # of the RW grant.
    object_key_wildcard = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "The Cluster_Shared_Bucket RW grant uses an <arn>/* "
            "object-key wildcard on the literal cluster-shared "
            "bucket ARN resolved from SSM. The wildcard covers "
            "object keys within a single bucket (the always-on "
            "gco-cluster-shared-<account>-<region> bucket) — "
            "this is the standard shape for a bucket-scoped "
            "RW grant and is what the allow-list assertion "
            "is written against."
        ),
        "appliesTo": [
            {"regex": r"/^Resource::<ReadClusterSharedBucketArn.*>\/\*$/"},
        ],
    }
    NagSuppressions.add_resource_suppressions(
        job_role,
        [object_key_wildcard],
        apply_to_children=True,
    )
def _create_kubectl_lambda(self) -> None:
    """Provision the kubectl-applier Lambda and the provider that fronts it.

    Creates, in this order: the Lambda execution role and its inline
    policies, a dedicated security group (plus a TCP/443 ingress rule on
    the EKS cluster security group), the Python Lambda function, an EKS
    access entry binding the role to ``AmazonEKSClusterAdminPolicy``, a
    log group, and a ``cr.Provider`` wrapping the function.

    No custom resource is created here. The provider is stored so that
    ``_apply_kubernetes_manifests()`` can create the custom resources later.

    Sets:
        self.kubectl_lambda_function_name: Plain-string function name.
        self.kubectl_lambda: The Lambda function construct.
        self.kubectl_provider: Custom resource provider for the function.
    """
    from cdk_nag import NagSuppressions

    project_name = self.config.get_project_name()
    region = self.deployment_region

    # Execution role with the AWS-managed VPC-access and basic-execution
    # policies attached.
    managed_policy_names = (
        "service-role/AWSLambdaVPCAccessExecutionRole",
        "service-role/AWSLambdaBasicExecutionRole",
    )
    applier_role = iam.Role(
        self,
        "KubectlLambdaRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(policy_name)
            for policy_name in managed_policy_names
        ],
    )

    # EKS read actions, scoped to this stack's cluster ARN.
    eks_read_statement = iam.PolicyStatement(
        actions=[
            "eks:DescribeCluster",
            "eks:ListClusters",
        ],
        resources=[self.cluster.cluster_arn],
    )
    applier_role.add_to_policy(eks_read_statement)

    # Lets the function assume the cluster admin role; not resource-scoped.
    assume_role_statement = iam.PolicyStatement(
        actions=["sts:AssumeRole"],
        resources=["*"],
    )
    applier_role.add_to_policy(assume_role_statement)

    # Dedicated security group so the EKS ingress rule below can name this
    # Lambda as its peer. Outbound is open because the function must reach
    # the EKS API.
    applier_sg = ec2.SecurityGroup(
        self,
        "KubectlLambdaSG",
        vpc=self.vpc,
        description="Security group for kubectl Lambda to access EKS cluster",
        security_group_name=f"{project_name}-kubectl-lambda-sg-{region}",
        allow_all_outbound=True,
    )

    # Open the EKS-created cluster security group to the Lambda on 443.
    self.cluster.cluster_security_group.add_ingress_rule(
        peer=applier_sg,
        connection=ec2.Port.tcp(443),
        description="Allow kubectl Lambda to access EKS API",
    )

    # The function name is kept as a plain string attribute for cross-stack
    # references; this sidesteps CDK cross-environment resolution issues when
    # the account is unresolved.
    applier_function_name = f"{project_name}-kubectl-{region}"
    self.kubectl_lambda_function_name = applier_function_name

    private_subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
    handler_environment = {
        "CLUSTER_NAME": self.cluster.cluster_name,
        "REGION": region,
    }

    # Pure-Python asset bundle (no Docker image).
    self.kubectl_lambda = lambda_.Function(
        self,
        "KubectlApplierFunction",
        function_name=applier_function_name,
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/kubectl-applier-simple-build"),
        timeout=Duration.minutes(15),  # Lambda's maximum timeout
        memory_size=512,
        role=applier_role,
        vpc=self.vpc,
        vpc_subnets=private_subnets,
        security_groups=[applier_sg],
        environment=handler_environment,
        tracing=lambda_.Tracing.ACTIVE,
    )

    # Access entry: lets the Lambda role authenticate to the cluster with
    # cluster-wide admin permissions.
    cluster_admin_policy = eks.AccessPolicy.from_access_policy_name(
        "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
    )
    eks.AccessEntry(
        self,
        "KubectlLambdaAccessEntry",
        cluster=self.cluster,  # type: ignore[arg-type]
        principal=applier_role.role_arn,
        access_policies=[cluster_admin_policy],
    )

    # Log group for the provider framework; one-week retention, removed
    # together with the stack.
    provider_log_group = logs.LogGroup(
        self,
        "KubectlProviderLogGroup",
        retention=logs.RetentionDays.ONE_WEEK,
        removal_policy=RemovalPolicy.DESTROY,
    )

    # Stored for the custom resources created in _apply_kubernetes_manifests.
    self.kubectl_provider = cr.Provider(
        self,
        "KubectlProvider",
        on_event_handler=self.kubectl_lambda,
        log_group=provider_log_group,
    )

    # cdk-nag: acknowledge the Resource::* wildcard on the role's policies.
    wildcard_reason = (
        "The kubectl-applier Lambda requires broad EKS and Kubernetes API "
        "access to apply arbitrary manifests (RBAC, ServiceAccounts, "
        "Deployments, Jobs, NetworkPolicies) across multiple namespaces. "
        "Resource: * is required because the set of Kubernetes resources "
        "is dynamic and not known at synth time."
    )
    NagSuppressions.add_resource_suppressions(
        applier_role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": wildcard_reason,
                "appliesTo": ["Resource::*"],
            },
        ],
        apply_to_children=True,
    )
def _apply_kubernetes_manifests(self) -> None:
    """Apply Kubernetes manifests using the kubectl Lambda custom resource.

    This is called after ALB security group and EFS are created.
    The Ingress will use the security group ID to create the ALB.

    The method first assembles ``image_replacements``, a mapping from
    ``{{PLACEHOLDER}}`` tokens to concrete values (image URIs, ARNs,
    thresholds, quotas, policy toggles, optional storage endpoints). It
    then creates four custom resources:

    1. ``KubectlApplyManifests`` - base manifests; depends on the cluster,
       EFS, optional FSx, any addon role-update custom resources that
       exist on ``self``, and every Pod Identity association.
    2. ``HelmInstallCharts`` - depends on (1).
    3. ``KubectlApplyPostHelmManifests`` - same provider as (1) with
       ``PostHelm: "true"``; depends on (2).
    4. ``GaRegistration`` - depends on (1).

    Each resource carries a ``DeploymentTimestamp`` property taken at
    synth time, so its properties differ on every deployment.
    """

    # Get public subnet IDs for Ingress annotation (currently unused but kept for future use)
    # public_subnet_ids = ",".join([subnet.subnet_id for subnet in self.vpc.public_subnets])

    # Apply manifests using custom resource
    # Build image replacements dict
    # Include a deployment timestamp to force pod rollouts when code changes
    from datetime import UTC, datetime

    # ISO-8601 UTC string; shared by the two kubectl resources and the Helm
    # resource below.
    deployment_timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Get resource thresholds from config
    thresholds = self.config.get_resource_thresholds()

    # Get manifest processor resource quotas.
    # Resource quotas and the security/image policy now live under the
    # shared job_validation_policy section because both the REST
    # manifest_processor and the SQS queue_processor read them. Service-
    # specific knobs (replicas, validation_enabled, max_request_body_bytes,
    # etc.) stay under manifest_processor.
    mp_config = self.node.try_get_context("manifest_processor") or {}
    job_policy = self.node.try_get_context("job_validation_policy") or {}
    job_quotas = job_policy.get("resource_quotas", {})

    image_replacements = {
        "{{HEALTH_MONITOR_IMAGE}}": self.health_monitor_image.image_uri,
        "{{MANIFEST_PROCESSOR_IMAGE}}": self.manifest_processor_image.image_uri,
        "{{INFERENCE_MONITOR_IMAGE}}": self.inference_monitor_image.image_uri,
        "{{CLUSTER_NAME}}": self.cluster.cluster_name,
        "{{REGION}}": self.deployment_region,
        "{{AUTH_SECRET_ARN}}": self.auth_secret_arn,
        "{{SERVICE_ACCOUNT_ROLE_ARN}}": self.service_account_role.role_arn,
        "{{EFS_FILE_SYSTEM_ID}}": self.efs_file_system.file_system_id,
        "{{EFS_ACCESS_POINT_ID}}": self.efs_access_point.access_point_id,
        "{{JOB_QUEUE_URL}}": self.job_queue.queue_url,
        "{{JOB_QUEUE_ARN}}": self.job_queue.queue_arn,
        "{{DEPLOYMENT_TIMESTAMP}}": deployment_timestamp,
        # Resource thresholds
        "{{CPU_THRESHOLD}}": str(thresholds.cpu_threshold),
        "{{MEMORY_THRESHOLD}}": str(thresholds.memory_threshold),
        "{{GPU_THRESHOLD}}": str(thresholds.gpu_threshold),
        "{{PENDING_PODS_THRESHOLD}}": str(thresholds.pending_pods_threshold),
        "{{PENDING_REQUESTED_CPU_VCPUS}}": str(thresholds.pending_requested_cpu_vcpus),
        "{{PENDING_REQUESTED_MEMORY_GB}}": str(thresholds.pending_requested_memory_gb),
        "{{PENDING_REQUESTED_GPUS}}": str(thresholds.pending_requested_gpus),
        # DynamoDB table names (from global stack)
        "{{TEMPLATES_TABLE_NAME}}": f"{self.config.get_project_name()}-job-templates",
        "{{WEBHOOKS_TABLE_NAME}}": f"{self.config.get_project_name()}-webhooks",
        "{{JOBS_TABLE_NAME}}": f"{self.config.get_project_name()}-jobs",
        # DynamoDB region (global stack region, may differ from cluster region)
        "{{DYNAMODB_REGION}}": self.config.get_global_region(),
        # Manifest processor resource quotas (sourced from shared policy).
        "{{MP_MAX_CPU_PER_MANIFEST}}": str(job_quotas.get("max_cpu_per_manifest", "10")),
        "{{MP_MAX_MEMORY_PER_MANIFEST}}": str(
            job_quotas.get("max_memory_per_manifest", "32Gi")
        ),
        "{{MP_MAX_GPU_PER_MANIFEST}}": str(job_quotas.get("max_gpu_per_manifest", 4)),
        # Manifest processor namespace allowlist (sourced from shared policy).
        # Both the REST manifest processor and the SQS queue processor
        # read from job_validation_policy.allowed_namespaces so a single
        # edit takes effect on both submission paths at the next deploy.
        "{{MP_ALLOWED_NAMESPACES}}": ",".join(
            job_policy.get("allowed_namespaces", ["default", "gco-jobs"])
        ),
        # Manifest processor image registry allowlist (sourced from shared
        # policy). Augmented with the project's own ECR registry hostnames
        # so jobs built via ``gco images build`` aren't rejected by the
        # REST submission path. Identical augmentation runs on the SQS
        # path below — see ``{{QP_TRUSTED_REGISTRIES}}``.
        "{{MP_TRUSTED_REGISTRIES}}": ",".join(
            _augment_trusted_registries_with_project_ecr(
                job_policy.get("trusted_registries", []),
                account=self.account,
                regions=self.config.get_regions(),
                global_region=self.config.get_global_region(),
            )
        ),
        "{{MP_TRUSTED_DOCKERHUB_ORGS}}": ",".join(job_policy.get("trusted_dockerhub_orgs", [])),
        # Manifest processor request body size cap (HTTP 413 middleware).
        # Lives at cdk.json::manifest_processor.max_request_body_bytes.
        "{{MP_MAX_REQUEST_BODY_BYTES}}": str(
            mp_config.get("max_request_body_bytes", 1_048_576)
        ),
    }

    # Always-on Cluster_Shared_Bucket replacements. Populated from the
    # SharedBucketIdentity resolved in __init__ via cross-region SSM
    # read from GCOGlobalStack. Never gated on the analytics toggle —
    # the gco-cluster-shared-bucket ConfigMap is applied to every
    # regional cluster.
    image_replacements.update(
        _compute_kubectl_cluster_shared_replacements(self.cluster_shared_identity)
    )

    # Queue processor settings; only consumed further down, inside the
    # ``self.queue_processor_enabled`` branch.
    qp_config = self.node.try_get_context("queue_processor") or {}

    # Add VPC endpoint CIDR replacements for network policy restrictions
    # Generates a YAML block of ipBlock entries from the vpc_endpoint_cidrs array.
    # The placeholder {{VPC_ENDPOINT_CIDR_BLOCKS}} sits at 8-space indentation in
    # the manifest, so the first entry needs no leading indent (the manifest provides
    # it) and subsequent entries are indented to align.
    # NOTE(review): the description above calls for 8-space alignment, but the
    # ``prefix`` literal and the indent before ``cidr:`` below each contain a
    # single space. Confirm the whitespace inside these string literals
    # matches the manifest's indentation.
    vpc_endpoint_cidrs = self.node.try_get_context("vpc_endpoint_cidrs") or ["10.0.0.0/16"]
    cidr_lines = []
    for i, cidr in enumerate(vpc_endpoint_cidrs):
        prefix = "" if i == 0 else " "
        cidr_lines.append(f'{prefix}- ipBlock:\n cidr: "{cidr}"')
    image_replacements["{{VPC_ENDPOINT_CIDR_BLOCKS}}"] = "\n".join(cidr_lines)

    # Resource governance for gco-jobs namespace: ResourceQuota caps aggregate
    # resource consumption across the namespace, LimitRange caps per-container
    # maxima. Values come from cdk.json `resource_quota` context with defaults
    # sized for a modest multi-tenant dev cluster.
    resource_quota = self.node.try_get_context("resource_quota") or {}
    image_replacements["{{QUOTA_MAX_CPU}}"] = str(resource_quota.get("max_cpu", "100"))
    image_replacements["{{QUOTA_MAX_MEMORY}}"] = str(resource_quota.get("max_memory", "512Gi"))
    image_replacements["{{QUOTA_MAX_GPU}}"] = str(resource_quota.get("max_gpu", "32"))
    image_replacements["{{QUOTA_MAX_PODS}}"] = str(resource_quota.get("max_pods", "50"))
    image_replacements["{{LIMIT_MAX_CPU}}"] = str(resource_quota.get("container_max_cpu", "10"))
    image_replacements["{{LIMIT_MAX_MEMORY}}"] = str(
        resource_quota.get("container_max_memory", "64Gi")
    )
    image_replacements["{{LIMIT_MAX_GPU}}"] = str(resource_quota.get("container_max_gpu", "4"))

    # Queue processor placeholders are only populated when the queue
    # processor is enabled for this stack.
    if self.queue_processor_enabled:
        image_replacements["{{QUEUE_PROCESSOR_IMAGE}}"] = self.queue_processor_image.image_uri
        image_replacements["{{QP_POLLING_INTERVAL}}"] = str(
            qp_config.get("polling_interval", 10)
        )
        image_replacements["{{QP_MAX_CONCURRENT_JOBS}}"] = str(
            qp_config.get("max_concurrent_jobs", 10)
        )
        image_replacements["{{QP_MESSAGES_PER_JOB}}"] = str(
            qp_config.get("messages_per_job", 1)
        )
        image_replacements["{{QP_SUCCESSFUL_JOBS_HISTORY}}"] = str(
            qp_config.get("successful_jobs_history", 20)
        )
        image_replacements["{{QP_FAILED_JOBS_HISTORY}}"] = str(
            qp_config.get("failed_jobs_history", 10)
        )
        image_replacements["{{QP_ALLOWED_NAMESPACES}}"] = ",".join(
            job_policy.get("allowed_namespaces", ["default", "gco-jobs"])
        )
        # Resource caps, image allowlist, and security policy are shared
        # with the REST manifest processor. Source them from the
        # job_validation_policy section so a single change in cdk.json
        # takes effect on both submission paths at the next deploy.
        image_replacements["{{QP_MAX_GPU_PER_MANIFEST}}"] = str(
            job_quotas.get("max_gpu_per_manifest", 4)
        )
        image_replacements["{{QP_MAX_CPU_PER_MANIFEST}}"] = str(
            job_quotas.get("max_cpu_per_manifest", "10")
        )
        image_replacements["{{QP_MAX_MEMORY_PER_MANIFEST}}"] = str(
            job_quotas.get("max_memory_per_manifest", "32Gi")
        )
        image_replacements["{{QP_TRUSTED_REGISTRIES}}"] = ",".join(
            _augment_trusted_registries_with_project_ecr(
                job_policy.get("trusted_registries", []),
                account=self.account,
                regions=self.config.get_regions(),
                global_region=self.config.get_global_region(),
            )
        )
        image_replacements["{{QP_TRUSTED_DOCKERHUB_ORGS}}"] = ",".join(
            job_policy.get("trusted_dockerhub_orgs", [])
        )

        # Security policy toggles — shared with the REST manifest_processor.
        # Both services read the same cdk.json section so a single policy
        # flip (e.g. block_run_as_root: true) takes effect on both paths.
        security_policy = job_policy.get("manifest_security_policy", {})

        def _policy_str(v: object) -> str:
            # Render any truthy/falsy value as the lowercase string
            # "true" or "false".
            return "true" if v else "false"

        # Every toggle defaults to True except block_run_as_root, which
        # defaults to False.
        image_replacements["{{QP_BLOCK_PRIVILEGED}}"] = _policy_str(
            security_policy.get("block_privileged", True)
        )
        image_replacements["{{QP_BLOCK_PRIVILEGE_ESCALATION}}"] = _policy_str(
            security_policy.get("block_privilege_escalation", True)
        )
        image_replacements["{{QP_BLOCK_HOST_NETWORK}}"] = _policy_str(
            security_policy.get("block_host_network", True)
        )
        image_replacements["{{QP_BLOCK_HOST_PID}}"] = _policy_str(
            security_policy.get("block_host_pid", True)
        )
        image_replacements["{{QP_BLOCK_HOST_IPC}}"] = _policy_str(
            security_policy.get("block_host_ipc", True)
        )
        image_replacements["{{QP_BLOCK_HOST_PATH}}"] = _policy_str(
            security_policy.get("block_host_path", True)
        )
        image_replacements["{{QP_BLOCK_ADDED_CAPABILITIES}}"] = _policy_str(
            security_policy.get("block_added_capabilities", True)
        )
        image_replacements["{{QP_BLOCK_RUN_AS_ROOT}}"] = _policy_str(
            security_policy.get("block_run_as_root", False)
        )

    # Add Valkey endpoint if enabled
    # (hasattr guard: the attribute may not exist on this stack at all)
    if hasattr(self, "valkey_cache") and self.valkey_cache:
        image_replacements["{{VALKEY_ENDPOINT}}"] = self.valkey_cache.attr_endpoint_address
        image_replacements["{{VALKEY_PORT}}"] = self.valkey_cache.attr_endpoint_port

    # Add Aurora pgvector endpoint if enabled
    if hasattr(self, "aurora_cluster") and self.aurora_cluster:
        image_replacements["{{AURORA_PGVECTOR_ENDPOINT}}"] = (
            self.aurora_cluster.cluster_endpoint.hostname
        )
        image_replacements["{{AURORA_PGVECTOR_READER_ENDPOINT}}"] = (
            self.aurora_cluster.cluster_read_endpoint.hostname
        )
        image_replacements["{{AURORA_PGVECTOR_PORT}}"] = str(
            self.aurora_cluster.cluster_endpoint.port
        )
        # The secret ARN placeholder is only set when the cluster has a secret.
        if self.aurora_cluster.secret:
            image_replacements["{{AURORA_PGVECTOR_SECRET_ARN}}"] = (
                self.aurora_cluster.secret.secret_arn
            )

    # Add FSx replacements if enabled
    if self.fsx_file_system:
        image_replacements["{{FSX_FILE_SYSTEM_ID}}"] = self.fsx_file_system.ref
        image_replacements["{{FSX_DNS_NAME}}"] = self.fsx_file_system.attr_dns_name
        image_replacements["{{FSX_MOUNT_NAME}}"] = self.fsx_file_system.attr_lustre_mount_name
        # Uses the first private subnet of the VPC.
        image_replacements["{{PRIVATE_SUBNET_ID}}"] = self.vpc.private_subnets[0].subnet_id
        image_replacements["{{FSX_SECURITY_GROUP_ID}}"] = (
            self.fsx_security_group.security_group_id
        )

    # First kubectl pass: base manifests.
    kubectl_apply = CustomResource(
        self,
        "KubectlApplyManifests",
        service_token=self.kubectl_provider.service_token,
        properties={
            "ClusterName": self.cluster.cluster_name,
            "Region": self.deployment_region,
            "SkipDeletionOnStackDelete": "true",  # Don't delete resources on stack deletion
            "ImageReplacements": image_replacements,
            # Include FSx file system ID directly to force update when FSx changes
            "FsxFileSystemId": self.fsx_file_system.ref if self.fsx_file_system else "none",
            # Force update on each deployment to trigger pod rollouts
            "DeploymentTimestamp": deployment_timestamp,
        },
    )

    # Ensure manifests are applied after cluster, EFS, and FSx are ready
    # Note: ALB is created by EKS Auto Mode when Ingress is applied
    kubectl_apply.node.add_dependency(self.cluster)
    kubectl_apply.node.add_dependency(self.efs_file_system)
    if self.fsx_file_system:
        kubectl_apply.node.add_dependency(self.fsx_file_system)

    # Wait for EKS to have patched the IRSA role ARN onto each managed
    # addon's service account before the kubectl Lambda rollout-restarts
    # the controllers at the end of this invocation. Otherwise the
    # restart sees the old (annotation-less) SA, the mutating webhook
    # can't inject AWS_ROLE_ARN, and the new pods are just as
    # credential-less as the ones they replaced. The symptom is
    # controller pods silently failing with "no EC2 IMDS role found" —
    # for EFS/FSx that manifests as PVCs stuck Pending forever, for
    # CloudWatch as missing Container Insights metrics. See the
    # UpdateEfsCsiAddonRole custom resource in _create_efs_csi_driver_addon
    # for the full rationale.
    # getattr with a None default: attributes that were never set on this
    # stack are skipped.
    for attr in (
        "_efs_csi_addon_role_update",
        "_fsx_csi_addon_role_update",
        "_cloudwatch_addon_role_update",
    ):
        update_cr = getattr(self, attr, None)
        if update_cr is not None:
            kubectl_apply.node.add_dependency(update_cr)

    # Ensure Pod Identity associations exist before workloads start,
    # so pods get IAM credentials on first launch
    for assoc in self._pod_identity_associations:
        kubectl_apply.node.add_dependency(assoc)

    # Install Helm charts (KEDA, etc.) after base manifests are applied
    # This ensures namespaces and RBAC are in place before Helm installations
    helm_install = CustomResource(
        self,
        "HelmInstallCharts",
        service_token=self.helm_installer_provider.service_token,
        properties={
            "ClusterName": self.cluster.cluster_name,
            "Region": self.deployment_region,
            # Enable core AI/ML infrastructure charts by default
            # NVIDIA Network Operator toggled via cdk.json nvidia_network_operator.enabled
            "EnabledCharts": self._get_enabled_helm_charts(),
            # Override chart values if needed
            "Charts": {},
            # Pass IAM role ARNs for service account annotations
            "KedaOperatorRoleArn": self.keda_operator_role.role_arn,
            # Force re-invocation on every deployment to pick up charts.yaml changes
            "DeploymentTimestamp": deployment_timestamp,
        },
    )

    # Helm charts depend on kubectl manifests being applied first
    helm_install.node.add_dependency(kubectl_apply)

    # Apply CRD-dependent manifests after Helm installs the CRDs.
    # KEDA ScaledJob/ScaledObject require the KEDA CRDs to exist first.
    # This second kubectl pass runs after Helm and applies only those resources.
    kubectl_apply_post_helm = CustomResource(
        self,
        "KubectlApplyPostHelmManifests",
        service_token=self.kubectl_provider.service_token,
        properties={
            "ClusterName": self.cluster.cluster_name,
            "Region": self.deployment_region,
            "SkipDeletionOnStackDelete": "true",
            "ImageReplacements": image_replacements,
            "FsxFileSystemId": self.fsx_file_system.ref if self.fsx_file_system else "none",
            "DeploymentTimestamp": deployment_timestamp,
            # PostHelm: "true" tells the handler to apply only post-helm-* manifests
            "PostHelm": "true",
        },
    )

    # Must run after Helm has installed the CRDs
    kubectl_apply_post_helm.node.add_dependency(helm_install)

    # Create GA registration custom resource AFTER manifests are applied
    # This waits for the Ingress to create the ALB and registers it with GA
    #
    # IMPORTANT: We include a deployment timestamp to force CloudFormation to
    # re-invoke the Lambda on every deployment. This ensures the ALB is always
    # registered with the Global Accelerator, even if other properties haven't changed.
    # Without this, CloudFormation may skip the custom resource if it thinks
    # nothing has changed, leaving the ALB unregistered after GA recreation.
    #
    # NOTE: this rebinds ``deployment_timestamp`` from the ISO-8601 string used
    # above to an epoch-seconds string; only the GaRegistration resource below
    # sees the new value. ``time`` is not imported inside this method, so it is
    # presumably a module-level import - verify.
    deployment_timestamp = str(int(time.time()))

    ga_registration = CustomResource(
        self,
        "GaRegistration",
        service_token=self.ga_registration_provider.service_token,
        properties={
            "ClusterName": self.cluster.cluster_name,
            "Region": self.deployment_region,
            "EndpointGroupArn": self.endpoint_group_arn,
            "IngressName": "gco-ingress",
            "Namespace": "gco-system",
            # Pass global region and project name for SSM storage
            "GlobalRegion": self.config.get_global_region(),
            "ProjectName": self.config.get_project_name(),
            # Force re-invocation on every deployment
            "DeploymentTimestamp": deployment_timestamp,
        },
    )

    # GA registration must happen after manifests are applied
    ga_registration.node.add_dependency(kubectl_apply)
def _create_ga_registration_lambda(self) -> None:
    """Provision the Lambda that registers the Ingress-created ALB with Global Accelerator.

    The ALB is created by the AWS Load Balancer Controller rather than by
    CDK, so its ARN cannot be referenced directly. The Lambda instead:

    1. Waits for the Ingress to get an ALB address
    2. Gets the ALB ARN from the address
    3. Registers that ALB with Global Accelerator

    This method only builds the function, its permissions, the SSM lookup
    of the endpoint group ARN, and the provider. The custom resource that
    invokes the provider is created after the manifests are applied.

    Sets:
        self.ga_registration_provider: Provider wrapping the Lambda.
        self.endpoint_group_arn: ``Parameter.Value`` response field of the
            SSM lookup.
    """
    from cdk_nag import NagSuppressions

    project_name = self.config.get_project_name()
    global_region = self.config.get_global_region()
    region = self.deployment_region

    # Handler code comes from an external asset directory.
    registration_fn = lambda_.Function(
        self,
        "GaRegistrationFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/ga-registration"),
        timeout=Duration.minutes(15),  # Max Lambda timeout; handler uses 14 min budget
        memory_size=256,
        vpc=self.vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
        environment={
            "CLUSTER_NAME": self.cluster.cluster_name,
            "REGION": region,
        },
        tracing=lambda_.Tracing.ACTIVE,
    )

    # Inline permissions, attached in this order.
    role_statements = [
        # Describe this stack's EKS cluster.
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["eks:DescribeCluster"],
            resources=[self.cluster.cluster_arn],
        ),
        # Discover the ALB.
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "elasticloadbalancing:DescribeLoadBalancers",
                "elasticloadbalancing:DescribeTags",  # Required for tag-based ALB detection
            ],
            resources=["*"],
        ),
        # Manage Global Accelerator endpoints.
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "globalaccelerator:AddEndpoints",
                "globalaccelerator:RemoveEndpoints",
                "globalaccelerator:UpdateEndpointGroup",
                "globalaccelerator:DescribeEndpointGroup",
            ],
            resources=["*"],
        ),
        # Read/write the project's SSM parameters in the global region.
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["ssm:GetParameter", "ssm:PutParameter", "ssm:DeleteParameter"],
            resources=[
                f"arn:aws:ssm:{global_region}:{self.account}:parameter/{project_name}/*"
            ],
        ),
    ]
    for statement in role_statements:
        registration_fn.add_to_role_policy(statement)

    # EKS access entry for the function's role (cluster-wide admin policy).
    fn_role = registration_fn.role
    if fn_role is not None:
        eks.AccessEntry(
            self,
            "GaRegistrationLambdaAccessEntry",
            cluster=self.cluster,  # type: ignore[arg-type]
            principal=fn_role.role_arn,
            access_policies=[
                eks.AccessPolicy.from_access_policy_name(
                    "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
                )
            ],
        )

    # Open the cluster security group on 443 to the whole VPC CIDR so the
    # Lambda can reach the EKS API.
    self.cluster.cluster_security_group.add_ingress_rule(
        peer=ec2.Peer.ipv4(self.vpc.vpc_cidr_block),
        connection=ec2.Port.tcp(443),
        description="Allow GA registration Lambda to access EKS API",
    )

    # Look up the endpoint group ARN in SSM (global region). The lookup
    # runs under the shared AwsCustomResource execution role (pre-created
    # in _create_aws_custom_resource_role), which already carries the
    # SSM GetParameter statement, so the Lambda never hits an IAM
    # propagation race on cold deploys.
    endpoint_group_param = f"/{project_name}/endpoint-group-{region}-arn"
    create_call = cr.AwsSdkCall(
        service="SSM",
        action="getParameter",
        parameters={"Name": endpoint_group_param},
        region=global_region,
        physical_resource_id=cr.PhysicalResourceId.of(
            f"{project_name}-get-endpoint-group-arn-{region}"
        ),
    )
    update_call = cr.AwsSdkCall(
        service="SSM",
        action="getParameter",
        parameters={"Name": endpoint_group_param},
        region=global_region,
    )
    endpoint_group_lookup = cr.AwsCustomResource(
        self,
        "GetEndpointGroupArn",
        on_create=create_call,
        on_update=update_call,
        role=self.aws_custom_resource_role,
    )
    endpoint_group_lookup.node.add_dependency(self.aws_custom_resource_role)

    resolved_endpoint_group_arn = endpoint_group_lookup.get_response_field("Parameter.Value")

    # Log group for the provider framework; one-week retention, removed
    # together with the stack.
    provider_log_group = logs.LogGroup(
        self,
        "GaRegistrationProviderLogGroup",
        retention=logs.RetentionDays.ONE_WEEK,
        removal_policy=RemovalPolicy.DESTROY,
    )

    registration_provider = cr.Provider(
        self,
        "GaRegistrationProvider",
        on_event_handler=registration_fn,
        log_group=provider_log_group,
    )

    # Kept on the stack for the custom resource created after kubectl apply.
    self.ga_registration_provider = registration_provider
    self.endpoint_group_arn = resolved_endpoint_group_arn

    # cdk-nag: acknowledge the Resource::* wildcards on the function's role.
    wildcard_reason = (
        "The GA registration Lambda needs elasticloadbalancing:Describe* "
        "and globalaccelerator:* to discover the Ingress-created ALB and "
        "register it with Global Accelerator. These APIs do not support "
        "resource-level IAM scoping — Resource: * is the only valid form."
    )
    NagSuppressions.add_resource_suppressions(
        registration_fn,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": wildcard_reason,
                "appliesTo": ["Resource::*"],
            },
        ],
        apply_to_children=True,
    )
def _get_enabled_helm_charts(self) -> list[str]:
    """Return the list of Helm charts to install based on cdk.json helm config.

    Reads the ``helm`` section from the CDK context. Each key in that
    section maps to one or more Helm chart names. A chart group is
    installed unless its entry sets ``enabled`` to a falsy value; a key
    that is missing, or present with a ``null`` value, counts as enabled.

    Returns:
        Chart names in dependency order, with Kueue last (its webhook
        intercepts all Job/Deployment mutations).
    """
    helm_config = self.node.try_get_context("helm") or {}

    # Mapping from cdk.json helm key → Helm chart name(s) in charts.yaml
    # Order matters: dependencies first, Kueue last
    chart_map: list[tuple[str, list[str]]] = [
        ("keda", ["keda"]),
        ("nvidia_gpu_operator", ["nvidia-gpu-operator"]),
        ("nvidia_dra_driver", ["nvidia-dra-driver"]),
        ("nvidia_network_operator", ["nvidia-network-operator"]),
        ("aws_efa_device_plugin", ["aws-efa-device-plugin"]),
        ("aws_neuron_device_plugin", ["aws-neuron-device-plugin"]),
        ("volcano", ["volcano"]),
        ("kuberay", ["kuberay-operator"]),
        ("cert_manager", ["cert-manager"]),
        ("slurm", ["slinky-slurm-operator", "slinky-slurm"]),
        ("yunikorn", ["yunikorn"]),
        ("kueue", ["kueue"]),  # Must be last
    ]

    enabled_charts: list[str] = []
    for config_key, chart_names in chart_map:
        chart_config = helm_config.get(config_key)
        # An explicit JSON null (``"keda": null``) comes back as None; treat
        # it like a missing key instead of failing on ``None.get``. Any other
        # non-mapping value still raises, so a malformed entry stays loud.
        if chart_config is None or chart_config.get("enabled", True):
            enabled_charts.extend(chart_names)

    return enabled_charts
def _create_helm_installer_lambda(self) -> None:
    """Provision the Docker-image Lambda that installs Helm charts, plus its provider.

    Helm is used for charts that need complex setup (TLS certificates,
    CRDs, etc.) that is difficult to manage via raw manifests. Which
    charts get installed is not decided here: the custom resource that
    invokes this provider passes the list.

    Creates, in this order: the execution role, a dedicated security
    group (plus a TCP/443 ingress rule on the EKS cluster security group),
    the Docker image asset, the Lambda function, an EKS access entry
    binding the role to ``AmazonEKSClusterAdminPolicy``, a log group, and
    the ``cr.Provider``.

    Sets:
        self.helm_installer_lambda_function_name: Plain-string function name.
        self.helm_installer_lambda: The Docker-image Lambda construct.
        self.helm_installer_provider: Custom resource provider for the function.
    """
    from cdk_nag import NagSuppressions

    project_name = self.config.get_project_name()
    region = self.deployment_region
    image_directory = "lambda/helm-installer-build"

    # Execution role with the AWS-managed VPC-access and basic-execution
    # policies attached.
    managed_policy_names = (
        "service-role/AWSLambdaVPCAccessExecutionRole",
        "service-role/AWSLambdaBasicExecutionRole",
    )
    installer_role = iam.Role(
        self,
        "HelmInstallerLambdaRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(policy_name)
            for policy_name in managed_policy_names
        ],
    )

    # EKS read actions, scoped to this stack's cluster ARN.
    eks_read_statement = iam.PolicyStatement(
        actions=["eks:DescribeCluster", "eks:ListClusters"],
        resources=[self.cluster.cluster_arn],
    )
    installer_role.add_to_policy(eks_read_statement)

    # Dedicated security group so the EKS ingress rule below can name this
    # Lambda as its peer.
    installer_sg = ec2.SecurityGroup(
        self,
        "HelmInstallerLambdaSG",
        vpc=self.vpc,
        description="Security group for Helm installer Lambda to access EKS cluster",
        security_group_name=f"{project_name}-helm-lambda-sg-{region}",
        allow_all_outbound=True,
    )

    # Open the cluster security group to the Lambda on 443 (EKS API).
    self.cluster.cluster_security_group.add_ingress_rule(
        peer=installer_sg,
        connection=ec2.Port.tcp(443),
        description="Allow Helm installer Lambda to access EKS API",
    )

    # Docker image for the installer. The directory is rebuilt fresh on
    # every deploy by _build_helm_installer_lambda() in cli/stacks.py.
    # NOTE(review): the asset constructed here is not referenced again in
    # this method; the function below builds its code from the same
    # directory via from_image_asset. Confirm whether this standalone
    # asset is still needed.
    ecr_assets.DockerImageAsset(
        self,
        "HelmInstallerImage",
        directory=image_directory,
        platform=ecr_assets.Platform.LINUX_AMD64,
    )

    # The function name is kept as a plain string attribute for cross-stack
    # references; this sidesteps CDK cross-environment resolution issues when
    # the account is unresolved.
    installer_function_name = f"{project_name}-helm-{region}"
    self.helm_installer_lambda_function_name = installer_function_name

    installer_code = lambda_.DockerImageCode.from_image_asset(
        directory=image_directory,
        platform=ecr_assets.Platform.LINUX_AMD64,
    )
    handler_environment = {
        "CLUSTER_NAME": self.cluster.cluster_name,
        "REGION": region,
    }
    self.helm_installer_lambda = lambda_.DockerImageFunction(
        self,
        "HelmInstallerFunction",
        function_name=installer_function_name,
        code=installer_code,
        timeout=Duration.minutes(15),
        memory_size=1024,
        architecture=lambda_.Architecture.X86_64,
        role=installer_role,
        vpc=self.vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
        security_groups=[installer_sg],
        environment=handler_environment,
        tracing=lambda_.Tracing.ACTIVE,
    )

    # EKS access entry for the Lambda role (cluster-wide admin policy).
    cluster_admin_policy = eks.AccessPolicy.from_access_policy_name(
        "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
    )
    eks.AccessEntry(
        self,
        "HelmInstallerLambdaAccessEntry",
        cluster=self.cluster,  # type: ignore[arg-type]
        principal=installer_role.role_arn,
        access_policies=[cluster_admin_policy],
    )

    # Log group for the provider framework; one-week retention, removed
    # together with the stack.
    provider_log_group = logs.LogGroup(
        self,
        "HelmInstallerProviderLogGroup",
        retention=logs.RetentionDays.ONE_WEEK,
        removal_policy=RemovalPolicy.DESTROY,
    )

    self.helm_installer_provider = cr.Provider(
        self,
        "HelmInstallerProvider",
        on_event_handler=self.helm_installer_lambda,
        log_group=provider_log_group,
    )

    # cdk-nag: acknowledge the Resource::* wildcard on the role's policies.
    wildcard_reason = (
        "The Helm installer Lambda requires broad EKS and Kubernetes API "
        "access to install Helm charts (KEDA, NVIDIA DRA, etc.) that create "
        "CRDs, RBAC rules, and workloads across multiple namespaces. "
        "Resource: * is required because the set of Kubernetes resources "
        "is dynamic and not known at synth time."
    )
    NagSuppressions.add_resource_suppressions(
        installer_role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": wildcard_reason,
                "appliesTo": ["Resource::*"],
            },
        ],
        apply_to_children=True,
    )
def _create_efs(self) -> None:
    """Provision the shared EFS file system used for job storage.

    Builds, in order:

    - a dedicated security group (no outbound rules) that admits NFS
      traffic on TCP 2049 from the EKS cluster security group only;
    - an encrypted file system in the PRIVATE_WITH_EGRESS subnets using
      General Purpose performance mode, Bursting throughput mode and
      automatic backups;
    - a file system policy that lets any principal mount, write and use
      root access as long as it connects through a mount target;
    - an access point rooted at ``/gco-jobs`` owned by uid/gid 1000;
    - CloudFormation outputs for the file system and access point IDs.

    The Kubernetes-side objects (StorageClass, PV, PVC) are not created
    here; they come from manifests.
    """
    region = self.deployment_region
    project_name = self.config.get_project_name()

    # Security group: EFS never initiates connections, so no egress.
    efs_sg = ec2.SecurityGroup(
        self,
        "EfsSecurityGroup",
        vpc=self.vpc,
        description=f"Security group for {project_name} EFS in {region}",
        security_group_name=f"{project_name}-efs-sg-{region}",
        allow_all_outbound=False,
    )
    self.efs_security_group = efs_sg

    # NFS (2049) is reachable only from the EKS cluster security group.
    efs_sg.add_ingress_rule(
        peer=self.cluster.cluster_security_group,
        connection=ec2.Port.tcp(2049),
        description="Allow NFS from EKS cluster",
    )

    private_subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
    file_system = efs.FileSystem(
        self,
        "GCOEfs",
        vpc=self.vpc,
        file_system_name=f"{project_name}-efs-{region}",
        security_group=efs_sg,
        vpc_subnets=private_subnets,
        encrypted=True,
        performance_mode=efs.PerformanceMode.GENERAL_PURPOSE,
        throughput_mode=efs.ThroughputMode.BURSTING,
        # DESTROY suits dev/test; production deployments should use RETAIN.
        removal_policy=RemovalPolicy.DESTROY,
        enable_automatic_backups=True,
    )
    self.efs_file_system = file_system

    # Mounting is authorised by network reachability rather than IAM:
    # any client that can reach a mount target may mount the file system.
    mount_target_access = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        principals=[iam.AnyPrincipal()],
        actions=[
            "elasticfilesystem:ClientMount",
            "elasticfilesystem:ClientWrite",
            "elasticfilesystem:ClientRootAccess",
        ],
        conditions={"Bool": {"elasticfilesystem:AccessedViaMountTarget": "true"}},
    )
    file_system.add_to_resource_policy(mount_target_access)

    # Access point that pins clients to the gco-jobs directory.
    access_point = file_system.add_access_point(
        "JobsAccessPoint",
        path="/gco-jobs",
        create_acl=efs.Acl(owner_uid="1000", owner_gid="1000", permissions="755"),
        posix_user=efs.PosixUser(uid="1000", gid="1000"),
    )
    self.efs_access_point = access_point

    # Publish the identifiers as stack outputs.
    stack_outputs = (
        (
            "EfsFileSystemId",
            file_system.file_system_id,
            "EFS File System ID for shared job storage",
        ),
        (
            "EfsAccessPointId",
            access_point.access_point_id,
            "EFS Access Point ID for job outputs",
        ),
    )
    for output_id, output_value, output_description in stack_outputs:
        CfnOutput(self, output_id, value=output_value, description=output_description)
def _create_fsx_lustre(self) -> None:
    """Create the optional FSx for Lustre file system for high-performance storage.

    Driven by the FSx Lustre configuration for this deployment region. When
    ``enabled`` is false (the default), ``self.fsx_file_system`` is set to
    ``None`` and no resources are created.

    Configuration keys read (with defaults):

    - ``deployment_type`` (``SCRATCH_2``): SCRATCH_1, SCRATCH_2,
      PERSISTENT_1 or PERSISTENT_2.
    - ``storage_capacity_gib`` (1200).
    - ``data_compression_type`` (``LZ4``).
    - ``per_unit_storage_throughput``: only applied to PERSISTENT types.
      Defaults to 200 for PERSISTENT_1 and 250 for PERSISTENT_2, because
      the two types accept different throughput tiers.
    - ``import_path`` / ``export_path``: optional S3 data repository paths;
      ``auto_import_policy`` (``NEW_CHANGED_DELETED``) applies only when
      ``import_path`` is set.
    - ``file_system_type_version`` (``2.15``).

    Also creates the FSx security group, the FSx CSI driver add-on (via
    ``_create_fsx_csi_driver_addon``) and outputs for the file system ID,
    DNS name and mount name.
    """
    fsx_config = self.config.get_fsx_lustre_config(self.deployment_region)

    if not fsx_config.get("enabled", False):
        self.fsx_file_system = None
        return

    project_name = self.config.get_project_name()

    # Create security group for FSx
    self.fsx_security_group = ec2.SecurityGroup(
        self,
        "FsxSecurityGroup",
        vpc=self.vpc,
        description=f"Security group for {project_name} FSx Lustre in {self.deployment_region}",
        security_group_name=f"{project_name}-fsx-sg-{self.deployment_region}",
        allow_all_outbound=False,
    )

    # Allow Lustre traffic from EKS cluster security group
    # Lustre uses ports 988 (control) and 1021-1023 (data)
    self.fsx_security_group.add_ingress_rule(
        peer=self.cluster.cluster_security_group,
        connection=ec2.Port.tcp(988),
        description="Allow Lustre control traffic from EKS cluster",
    )
    self.fsx_security_group.add_ingress_rule(
        peer=self.cluster.cluster_security_group,
        connection=ec2.Port.tcp_range(1021, 1023),
        description="Allow Lustre data traffic from EKS cluster",
    )

    # Allow self-referencing traffic for FSx Lustre internal communication
    # FSx Lustre nodes need to communicate with each other on port 988
    self.fsx_security_group.add_ingress_rule(
        peer=self.fsx_security_group,
        connection=ec2.Port.tcp(988),
        description="Allow Lustre internal traffic on port 988",
    )
    self.fsx_security_group.add_ingress_rule(
        peer=self.fsx_security_group,
        connection=ec2.Port.tcp_range(1021, 1023),
        description="Allow Lustre internal traffic on ports 1021-1023",
    )

    deployment_type = fsx_config.get("deployment_type", "SCRATCH_2")
    storage_capacity = fsx_config.get("storage_capacity_gib", 1200)

    # Build Lustre configuration based on deployment type
    lustre_config = {
        "deploymentType": deployment_type,
        "dataCompressionType": fsx_config.get("data_compression_type", "LZ4"),
    }

    # Throughput is only configurable for PERSISTENT types, and the valid
    # tiers differ: PERSISTENT_1 (SSD) accepts 50/100/200 MB/s/TiB while
    # PERSISTENT_2 accepts 125/250/500/1000. A single default of 200 would
    # be rejected for PERSISTENT_2, so pick a default valid for each type.
    if deployment_type.startswith("PERSISTENT"):
        default_throughput = 250 if deployment_type == "PERSISTENT_2" else 200
        lustre_config["perUnitStorageThroughput"] = fsx_config.get(
            "per_unit_storage_throughput", default_throughput
        )

    # Add S3 import/export if configured.
    # NOTE(review): FSx may reject importPath/exportPath on PERSISTENT_2
    # (data repository associations are used there instead) — confirm
    # before combining these settings.
    import_path = fsx_config.get("import_path")
    export_path = fsx_config.get("export_path")

    if import_path:
        lustre_config["importPath"] = import_path
        lustre_config["autoImportPolicy"] = fsx_config.get(
            "auto_import_policy", "NEW_CHANGED_DELETED"
        )

    if export_path:
        lustre_config["exportPath"] = export_path

    # Get file system type version (default to 2.15 for kernel 6.x compatibility)
    # IMPORTANT: Lustre 2.10 is NOT compatible with kernel 6.x (AL2023, Bottlerocket 1.19+)
    # See: https://docs.aws.amazon.com/fsx/latest/LustreGuide/lustre-client-matrix.html
    file_system_type_version = fsx_config.get("file_system_type_version", "2.15")

    # Create FSx for Lustre file system (single subnet: first private subnet)
    self.fsx_file_system = fsx.CfnFileSystem(
        self,
        "GCOFsxLustre",
        file_system_type="LUSTRE",
        file_system_type_version=file_system_type_version,
        storage_capacity=storage_capacity,
        subnet_ids=[self.vpc.private_subnets[0].subnet_id],
        security_group_ids=[self.fsx_security_group.security_group_id],
        lustre_configuration=lustre_config,
        tags=[
            {"key": "Name", "value": f"{project_name}-fsx-{self.deployment_region}"},
            {"key": "Project", "value": project_name},
        ],
    )

    # Ensure FSx file system waits for security group ingress rules to be created
    # This prevents "security group does not permit Lustre LNET traffic" errors
    self.fsx_file_system.node.add_dependency(self.fsx_security_group)

    # Create FSx CSI Driver add-on for Kubernetes integration
    self._create_fsx_csi_driver_addon()

    # Output FSx information
    CfnOutput(
        self,
        "FsxFileSystemId",
        value=self.fsx_file_system.ref,
        description="FSx for Lustre File System ID",
    )

    CfnOutput(
        self,
        "FsxDnsName",
        value=self.fsx_file_system.attr_dns_name,
        description="FSx for Lustre DNS Name",
    )

    CfnOutput(
        self,
        "FsxMountName",
        value=self.fsx_file_system.attr_lustre_mount_name,
        description="FSx for Lustre Mount Name",
    )
def _create_valkey_cache(self) -> None:
    """Create the optional ElastiCache Serverless Valkey cache.

    Does nothing unless the Valkey configuration has ``enabled`` set.
    When enabled it creates:

    - a security group (no outbound rules) that accepts TCP 6379 from the
      whole VPC CIDR block;
    - a Valkey 8 serverless cache spread across the VPC private subnets,
      with storage capped by ``max_data_storage_gb`` (default 5, minimum 1
      GB), ECPU capped by ``max_ecpu_per_second`` (default 5000, minimum
      1000) and ``snapshot_retention_limit`` (default 1);
    - stack outputs for the endpoint address and port;
    - an SSM parameter holding the endpoint address so workloads can
      discover it.
    """
    valkey_config = self.config.get_valkey_config()
    enabled = valkey_config.get("enabled", False)
    if not enabled:
        return

    # Imported lazily so the module is only loaded when the cache is enabled.
    from aws_cdk import aws_elasticache as elasticache

    region = self.deployment_region
    serverless = elasticache.CfnServerlessCache

    # Ingress is opened to the entire VPC CIDR on the Valkey port.
    cache_sg = ec2.SecurityGroup(
        self,
        "ValkeySG",
        vpc=self.vpc,
        description="Security group for Valkey Serverless cache",
        allow_all_outbound=False,
    )
    cache_sg.add_ingress_rule(
        peer=ec2.Peer.ipv4(self.vpc.vpc_cidr_block),
        connection=ec2.Port.tcp(6379),
        description="Allow Valkey access from VPC",
    )

    subnet_ids = [subnet.subnet_id for subnet in self.vpc.private_subnets]

    # Upper bounds come from config; lower bounds are fixed.
    storage_limit = serverless.DataStorageProperty(
        maximum=valkey_config.get("max_data_storage_gb", 5),
        minimum=1,
        unit="GB",
    )
    ecpu_limit = serverless.ECPUPerSecondProperty(
        maximum=valkey_config.get("max_ecpu_per_second", 5000),
        minimum=1000,
    )

    cache = serverless(
        self,
        "ValkeyCache",
        engine="valkey",
        serverless_cache_name=f"gco-{region}",
        description=f"GCO K/V cache for {region}",
        major_engine_version="8",
        security_group_ids=[cache_sg.security_group_id],
        subnet_ids=subnet_ids,
        cache_usage_limits=serverless.CacheUsageLimitsProperty(
            data_storage=storage_limit,
            ecpu_per_second=ecpu_limit,
        ),
        snapshot_retention_limit=valkey_config.get("snapshot_retention_limit", 1),
        tags=[
            CfnTag(key="Project", value="gco"),
            CfnTag(key="Region", value=region),
        ],
    )
    self.valkey_cache = cache

    CfnOutput(
        self,
        "ValkeyEndpoint",
        value=cache.attr_endpoint_address,
        description="Valkey Serverless cache endpoint",
    )
    CfnOutput(
        self,
        "ValkeyPort",
        value=cache.attr_endpoint_port,
        description="Valkey Serverless cache port",
    )

    # Publish the endpoint to SSM for discovery by pods.
    project_name = self.config.get_project_name()
    ssm.StringParameter(
        self,
        "ValkeyEndpointParam",
        parameter_name=f"/{project_name}/valkey-endpoint-{region}",
        string_value=cache.attr_endpoint_address,
        description=f"Valkey endpoint for {region}",
    )
def _create_aurora_pgvector(self) -> None:
    """Create the optional Aurora Serverless v2 PostgreSQL (pgvector) cluster.

    Does nothing unless the Aurora pgvector configuration has ``enabled``
    set. When enabled it creates:

    - a security group (no outbound rules) accepting TCP 5432 from the EKS
      cluster security group only;
    - a subnet group over the PRIVATE_WITH_EGRESS subnets;
    - a serverless v2 cluster with one writer and one reader, storage
      encryption, IAM authentication, PostgreSQL log export and 60-second
      monitoring. Capacity (``min_acu`` / ``max_acu``), backup retention
      (``backup_retention_days``) and ``deletion_protection`` come from
      config; the engine version comes from ``AURORA_POSTGRES_VERSION``;
    - cdk-nag suppressions for the cluster and its children;
    - stack outputs for the writer/reader endpoints, port and secret ARN;
    - an SSM parameter holding the writer endpoint hostname.

    If the cluster has a generated secret, ``self.service_account_role`` is
    granted read access to it.

    See: https://aws.amazon.com/blogs/database/accelerate-generative-ai-workloads-on-amazon-aurora-with-optimized-reads-and-pgvector/
    """
    aurora_config = self.config.get_aurora_pgvector_config()
    enabled = aurora_config.get("enabled", False)
    if not enabled:
        return

    # Lazy imports: only needed when the feature is switched on.
    from aws_cdk import aws_rds as rds

    project_name = self.config.get_project_name()
    region = self.deployment_region
    private_subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)

    # PostgreSQL is reachable from the EKS cluster security group only.
    db_sg = ec2.SecurityGroup(
        self,
        "AuroraPgvectorSG",
        vpc=self.vpc,
        description="Security group for Aurora Serverless v2 pgvector",
        allow_all_outbound=False,
    )
    db_sg.add_ingress_rule(
        peer=self.cluster.cluster_security_group,
        connection=ec2.Port.tcp(5432),
        description="Allow PostgreSQL access from EKS cluster",
    )

    # Private subnets only.
    db_subnet_group = rds.SubnetGroup(
        self,
        "AuroraPgvectorSubnetGroup",
        description=f"Subnet group for GCO Aurora pgvector in {region}",
        vpc=self.vpc,
        vpc_subnets=private_subnets,
    )

    engine_version = getattr(rds.AuroraPostgresEngineVersion, AURORA_POSTGRES_VERSION)
    writer_instance = rds.ClusterInstance.serverless_v2(
        "Writer",
        auto_minor_version_upgrade=True,
    )
    reader_instance = rds.ClusterInstance.serverless_v2(
        "Reader",
        auto_minor_version_upgrade=True,
        scale_with_writer=True,
    )
    backup_window = rds.BackupProps(
        retention=Duration.days(aurora_config.get("backup_retention_days", 7)),
    )

    db_cluster = rds.DatabaseCluster(
        self,
        "AuroraPgvectorCluster",
        engine=rds.DatabaseClusterEngine.aurora_postgres(version=engine_version),
        serverless_v2_min_capacity=aurora_config.get("min_acu", 0),
        serverless_v2_max_capacity=aurora_config.get("max_acu", 16),
        writer=writer_instance,
        readers=[reader_instance],
        vpc=self.vpc,
        vpc_subnets=private_subnets,
        subnet_group=db_subnet_group,
        security_groups=[db_sg],
        default_database_name="gco_vectors",
        backup=backup_window,
        deletion_protection=aurora_config.get("deletion_protection", False),
        removal_policy=RemovalPolicy.DESTROY,
        storage_encrypted=True,
        iam_authentication=True,
        cloudwatch_logs_exports=["postgresql"],
        monitoring_interval=Duration.seconds(60),
        cluster_identifier=f"{project_name}-pgvector-{region}",
    )
    self.aurora_cluster = db_cluster

    # Construct-level cdk-nag suppressions for Aurora pgvector.
    from cdk_nag import NagPackSuppression, NagSuppressions

    # Three rule packs flag the same deletion-protection setting.
    deletion_protection_reason = (
        "Deletion protection is intentionally disabled for dev/test deployments. "
        "Production deployments should set aurora_pgvector.deletion_protection=true "
        "in cdk.json."
    )
    suppression_specs = (
        ("AwsSolutions-RDS10", deletion_protection_reason),
        (
            "AwsSolutions-SMG4",
            "Aurora manages credential rotation via the RDS integration with Secrets "
            "Manager. Manual Secrets Manager rotation is not required. "
            "See: https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/rds-secrets-manager.html",
        ),
        ("HIPAA.Security-RDSInstanceDeletionProtectionEnabled", deletion_protection_reason),
        ("NIST.800.53.R5-RDSInstanceDeletionProtectionEnabled", deletion_protection_reason),
        (
            "PCI.DSS.321-SecretsManagerUsingKMSKey",
            "Aurora Serverless v2 credentials in Secrets Manager are encrypted with "
            "AWS-managed keys by default. Customer-managed KMS can be enabled if "
            "required for PCI compliance.",
        ),
    )
    NagSuppressions.add_resource_suppressions(
        db_cluster,
        [NagPackSuppression(id=rule_id, reason=reason) for rule_id, reason in suppression_specs],
        apply_to_children=True,
    )

    # Outputs
    writer_endpoint = db_cluster.cluster_endpoint
    credentials = db_cluster.secret
    CfnOutput(
        self,
        "AuroraPgvectorEndpoint",
        value=writer_endpoint.hostname,
        description="Aurora pgvector cluster writer endpoint",
    )
    CfnOutput(
        self,
        "AuroraPgvectorReaderEndpoint",
        value=db_cluster.cluster_read_endpoint.hostname,
        description="Aurora pgvector cluster reader endpoint",
    )
    CfnOutput(
        self,
        "AuroraPgvectorPort",
        value=str(writer_endpoint.port),
        description="Aurora pgvector cluster port",
    )
    CfnOutput(
        self,
        "AuroraPgvectorSecretArn",
        value=credentials.secret_arn if credentials else "",
        description="Aurora pgvector credentials secret ARN",
    )

    # Store the writer endpoint in SSM for discovery by pods and external tools.
    ssm.StringParameter(
        self,
        "AuroraPgvectorEndpointParam",
        parameter_name=f"/{project_name}/aurora-pgvector-endpoint-{region}",
        string_value=writer_endpoint.hostname,
        description=f"Aurora pgvector endpoint for {region}",
    )

    # Let the service account role read the credentials secret, if one exists.
    if credentials:
        credentials.grant_read(self.service_account_role)
def _create_fsx_csi_driver_addon(self) -> None:
    """Install the FSx CSI driver EKS add-on and wire up its IAM role.

    Steps performed:

    1. Create an IRSA role for the ``fsx-csi-controller-sa`` service
       account in ``kube-system`` and attach FSx and EC2 describe
       permissions.
    2. Register the ``aws-fsx-csi-driver`` add-on with tolerations from
       ``self._ADDON_NODE_TOLERATIONS`` for both node and controller.
    3. Allow the shared AwsCustomResource execution role to pass the CSI
       role, then call ``EKS:updateAddon`` (on create and on update) to
       attach the role to the add-on.
    4. Create a Pod Identity association for the same service account.

    The update-addon custom resource is kept on
    ``self._fsx_csi_addon_role_update``.
    """
    cluster_name = self.cluster.cluster_name
    addon_name = "aws-fsx-csi-driver"
    controller_sa = "fsx-csi-controller-sa"
    controller_namespace = "kube-system"

    # IAM role for the FSx CSI driver (IRSA + Pod Identity).
    csi_role = GCORegionalStack._create_irsa_role(
        self,
        "FsxCsiDriverRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=[controller_sa],
        namespaces=[controller_namespace],
    )
    self.fsx_csi_role = csi_role
    csi_role_arn = csi_role.role_arn

    # Permissions: FSx volume management first, then EC2 discovery.
    fsx_actions = [
        "fsx:DescribeFileSystems",
        "fsx:DescribeVolumes",
        "fsx:CreateVolume",
        "fsx:DeleteVolume",
        "fsx:TagResource",
    ]
    ec2_actions = [
        "ec2:DescribeInstances",
        "ec2:DescribeVolumes",
        "ec2:DescribeVpcs",
        "ec2:DescribeSubnets",
        "ec2:DescribeSecurityGroups",
    ]
    for action_group in (fsx_actions, ec2_actions):
        csi_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=action_group,
                resources=["*"],
            )
        )

    # cdk-nag suppression for the wildcard resources granted above.
    from cdk_nag import NagSuppressions

    wildcard_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "The FSx CSI driver role grants ec2:Describe* for volume "
            "and network discovery. These AWS APIs do not support "
            "resource-level IAM scoping — Resource: * is the only "
            "valid form."
        ),
        "appliesTo": ["Resource::*"],
    }
    NagSuppressions.add_resource_suppressions(
        csi_role,
        [wildcard_suppression],
        apply_to_children=True,
    )

    # The add-on itself; node and controller share the same tolerations.
    csi_addon = eks.Addon(
        self,
        "FsxCsiDriverAddon",
        cluster=self.cluster,  # type: ignore[arg-type]
        addon_name=addon_name,
        addon_version=EKS_ADDON_FSX_CSI_DRIVER,
        preserve_on_delete=False,
        configuration_values={
            "node": {
                "tolerations": self._ADDON_NODE_TOLERATIONS,
            },
            "controller": {
                "tolerations": self._ADDON_NODE_TOLERATIONS,
            },
        },
    )

    # The shared AwsCustomResource execution role must be able to pass the
    # CSI role. See _create_aws_custom_resource_role for the rationale.
    self.aws_custom_resource_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["iam:PassRole"],
            resources=[csi_role_arn],
        )
    )

    # Attach the IRSA role to the add-on; create and update issue the
    # same updateAddon call.
    update_addon_params = {
        "clusterName": cluster_name,
        "addonName": addon_name,
        "serviceAccountRoleArn": csi_role_arn,
    }
    role_update = cr.AwsCustomResource(
        self,
        "UpdateFsxCsiAddonRole",
        on_create=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_addon_params,
            physical_resource_id=cr.PhysicalResourceId.of(
                f"{cluster_name}-fsx-csi-role-update"
            ),
        ),
        on_update=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_addon_params,
        ),
        role=self.aws_custom_resource_role,
    )

    # The update may only run once the add-on and both roles exist.
    for prerequisite in (csi_addon, csi_role, self.aws_custom_resource_role):
        role_update.node.add_dependency(prerequisite)

    # Exposed so _apply_kubernetes_manifests can make the kubectl Lambda
    # wait for the role update before it rollout-restarts the
    # fsx-csi-controller (otherwise PVCs can stay Pending with
    # "no EC2 IMDS role found").
    self._fsx_csi_addon_role_update = role_update

    # Pod Identity association for the FSx CSI controller service account.
    eks_l1.CfnPodIdentityAssociation(
        self,
        "PodIdentity-fsx-csi",
        cluster_name=cluster_name,
        namespace=controller_namespace,
        service_account=controller_sa,
        role_arn=csi_role_arn,
    )
def _create_drift_detection(self) -> None:
    """Create scheduled CloudFormation drift detection for this stack.

    Reads the ``drift_detection`` context object from cdk.json:

    - ``enabled`` (default ``True``): when false, nothing is created.
    - ``schedule_hours`` (default 24): interval between detection runs.

    Creates:
      - a customer-managed KMS key and an SNS topic encrypted with it for
        drift alerts (exposed as ``self.drift_detection_topic``);
      - an IAM role and Lambda function (code from
        ``lambda/drift-detection``) that receives the stack name, topic ARN
        and region through environment variables;
      - an SQS dead-letter queue for failed EventBridge deliveries;
      - an EventBridge rate rule that invokes the Lambda;
      - a stack output with the topic ARN.

    CloudFormation permissions are scoped to this stack's ARN for every
    action that supports stack-level resources; only
    ``DescribeStackDriftDetectionStatus`` is granted on ``*``.
    """
    drift_config = self.node.try_get_context("drift_detection") or {}
    if not drift_config.get("enabled", True):
        return

    from cdk_nag import NagSuppressions

    schedule_hours = int(drift_config.get("schedule_hours", 24))

    # KMS key for SNS topic encryption. SNS with AWS-managed keys doesn't
    # allow CloudFormation/Lambda to publish, so we use a customer-managed
    # key we can grant publish access on.
    drift_topic_key = kms.Key(
        self,
        "DriftDetectionTopicKey",
        description="KMS key for GCO drift detection SNS topic",
        enable_key_rotation=True,
        removal_policy=RemovalPolicy.DESTROY,
    )

    self.drift_detection_topic = sns.Topic(
        self,
        "DriftDetectionTopic",
        display_name="GCO CloudFormation Drift Alerts",
        master_key=drift_topic_key,
    )

    # IAM role for the drift detection Lambda
    drift_lambda_role = iam.Role(
        self,
        "DriftDetectionLambdaRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSLambdaBasicExecutionRole"
            ),
        ],
    )

    # Stack-level drift actions accept a stack ARN as the IAM resource, so
    # restrict them to this stack only.
    drift_lambda_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "cloudformation:DetectStackDrift",
                "cloudformation:DescribeStackResourceDrifts",
                "cloudformation:DescribeStackResource",
                "cloudformation:DescribeStackResources",
            ],
            resources=[self.stack_id],
        )
    )

    # DescribeStackDriftDetectionStatus is keyed by a detection ID rather
    # than a stack and has no resource-level scoping; "*" is required.
    drift_lambda_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["cloudformation:DescribeStackDriftDetectionStatus"],
            resources=["*"],
        )
    )

    self.drift_detection_topic.grant_publish(drift_lambda_role)

    # Lambda function — one per stack; stack name is baked into env vars
    drift_lambda = lambda_.Function(
        self,
        "DriftDetectionFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/drift-detection"),
        timeout=Duration.minutes(14),  # Leave headroom under Lambda 15-min cap
        memory_size=256,
        role=drift_lambda_role,
        environment={
            "STACK_NAME": self.stack_name,
            "SNS_TOPIC_ARN": self.drift_detection_topic.topic_arn,
            "REGION": self.deployment_region,
        },
        tracing=lambda_.Tracing.ACTIVE,
    )

    # Dead-letter queue for EventBridge → Lambda target failures.
    # Captures events that fail to reach the Lambda (e.g. due to
    # throttling or permission issues) so operators can retry or
    # investigate. Required by Serverless-EventBusDLQ cdk-nag rule.
    drift_rule_dlq = sqs.Queue(
        self,
        "DriftDetectionRuleDlq",
        retention_period=Duration.days(14),
        enforce_ssl=True,
        encryption=sqs.QueueEncryption.SQS_MANAGED,
        removal_policy=RemovalPolicy.DESTROY,
    )

    # DLQs themselves are terminal — they don't need their own DLQ.
    # Suppress the circular AwsSolutions-SQS3 nag finding.
    NagSuppressions.add_resource_suppressions(
        drift_rule_dlq,
        [
            {
                "id": "AwsSolutions-SQS3",
                "reason": (
                    "This queue IS the dead-letter queue for the "
                    "DriftDetectionSchedule EventBridge rule. A DLQ for a "
                    "DLQ is circular; if events fail to reach this queue "
                    "they are captured by EventBridge's own retry metrics "
                    "(CloudWatch FailedInvocations)."
                ),
            },
        ],
    )

    # EventBridge rule — runs every ``schedule_hours`` hours (daily by default)
    events.Rule(
        self,
        "DriftDetectionSchedule",
        description=(f"Daily CloudFormation drift detection for {self.stack_name}"),
        schedule=events.Schedule.rate(Duration.hours(schedule_hours)),
        targets=[
            events_targets.LambdaFunction(
                drift_lambda,
                dead_letter_queue=drift_rule_dlq,
                retry_attempts=2,
            )
        ],
    )

    # Outputs for operators to subscribe to the topic
    CfnOutput(
        self,
        "DriftDetectionTopicArn",
        value=self.drift_detection_topic.topic_arn,
        description=(
            f"SNS topic ARN for CloudFormation drift alerts in "
            f"{self.deployment_region}. Subscribe an endpoint (email, "
            f"Slack, PagerDuty) to receive drift notifications."
        ),
    )

    # cdk-nag suppressions for the Lambda role
    NagSuppressions.add_resource_suppressions(
        drift_lambda_role,
        [
            {
                "id": "AwsSolutions-IAM4",
                "reason": (
                    "AWSLambdaBasicExecutionRole provides standard "
                    "CloudWatch Logs permissions required for Lambda "
                    "logging. This is the AWS-recommended managed policy."
                ),
            },
            {
                "id": "AwsSolutions-IAM5",
                "reason": (
                    "cloudformation:DescribeStackDriftDetectionStatus is "
                    "keyed by a drift detection ID and does not support "
                    "resource-level IAM scoping, so it requires a wildcard "
                    "resource. All stack-level drift actions "
                    "(DetectStackDrift, DescribeStackResourceDrifts, "
                    "DescribeStackResource, DescribeStackResources) are "
                    "scoped to this stack's ARN."
                ),
            },
        ],
        apply_to_children=True,
    )
3353 def _create_mcp_role(self) -> None:
3354 """Create dedicated IAM role for the MCP server.
3356 The MCP server exposes GCO CLI tools to LLM agents. Without a dedicated
3357 role, the server would inherit the full ambient credentials of the user
3358 who launches it (often an administrator). This method creates a
3359 least-privilege role that the MCP server can assume at startup via
3360 ``GCO_MCP_ROLE_ARN``.
3362 Permissions are scoped to the minimum needed by the tools exposed:
3364 - ``eks:DescribeCluster`` on this regional EKS cluster ARN only.
3365 - ``s3:GetObject`` on model weights buckets. The model bucket lives in
3366 the global stack, so we scope to the same name pattern used by the
3367 service account role (``{project_name}-*``). This is a deliberate
3368 compromise: a precise cross-stack ARN export would force a tight
3369 dependency on the global stack, and cdk-nag will flag it anyway
3370 because the bucket name is auto-generated.
3371 - ``cloudwatch:GetMetricData`` / ``cloudwatch:ListMetrics``. These APIs
3372 do not support resource-level IAM, so wildcard is required. Read-only.
3373 - ``sqs:SendMessage`` scoped to this region's job queue ARN only.
3375 The trust policy uses ``AccountRootPrincipal`` so any IAM user/role in
3376 the account can assume it (gated by an explicit sts:AssumeRole
3377 permission on the caller — standard AWS behavior). Operators who want
3378 to restrict assumption further should add an external-id or principal
3379 condition to the trust policy after deployment.
3381 Operators can disable this component entirely by setting
3382 ``mcp_server.enabled`` to ``false`` in cdk.json.
3383 """
3384 mcp_config = self.node.try_get_context("mcp_server") or {}
3385 if not mcp_config.get("enabled", True):
3386 return
3388 project_name = self.config.get_project_name()
3390 self.mcp_server_role = iam.Role(
3391 self,
3392 "McpServerRole",
3393 assumed_by=iam.AccountRootPrincipal(),
3394 description=(
3395 "Least-privilege role assumed by the GCO MCP server at startup. "
3396 "Grants only the permissions needed by MCP tools: eks:DescribeCluster, "
3397 "s3:GetObject on model buckets, cloudwatch read-only metrics, and "
3398 "sqs:SendMessage to the regional job queue."
3399 ),
3400 max_session_duration=Duration.hours(12),
3401 )
3403 # eks:DescribeCluster on this region's cluster only
3404 self.mcp_server_role.add_to_policy(
3405 iam.PolicyStatement(
3406 effect=iam.Effect.ALLOW,
3407 actions=["eks:DescribeCluster"],
3408 resources=[self.cluster.cluster_arn],
3409 )
3410 )
3412 # s3:GetObject on model weights buckets. Bucket name is auto-generated
3413 # in the global stack, so we match the same prefix pattern used by the
3414 # service account role.
3415 self.mcp_server_role.add_to_policy(
3416 iam.PolicyStatement(
3417 effect=iam.Effect.ALLOW,
3418 actions=["s3:GetObject", "s3:ListBucket"],
3419 resources=[
3420 f"arn:aws:s3:::{project_name}-*",
3421 f"arn:aws:s3:::{project_name}-*/*",
3422 ],
3423 )
3424 )
3426 # CloudWatch read-only metrics APIs. These APIs do not support
3427 # resource-level IAM so wildcard is required.
3428 self.mcp_server_role.add_to_policy(
3429 iam.PolicyStatement(
3430 effect=iam.Effect.ALLOW,
3431 actions=[
3432 "cloudwatch:GetMetricData",
3433 "cloudwatch:GetMetricStatistics",
3434 "cloudwatch:ListMetrics",
3435 ],
3436 resources=["*"],
3437 )
3438 )
3440 # sqs:SendMessage scoped to the regional job queue only
3441 self.mcp_server_role.add_to_policy(
3442 iam.PolicyStatement(
3443 effect=iam.Effect.ALLOW,
3444 actions=["sqs:SendMessage", "sqs:GetQueueUrl", "sqs:GetQueueAttributes"],
3445 resources=[self.job_queue.queue_arn],
3446 )
3447 )
3449 # Export the role ARN so operators can set GCO_MCP_ROLE_ARN in their
3450 # MCP server environment.
3451 CfnOutput(
3452 self,
3453 "McpServerRoleArn",
3454 value=self.mcp_server_role.role_arn,
3455 description=(
3456 "IAM role ARN for the GCO MCP server. Set GCO_MCP_ROLE_ARN to "
3457 "this value when launching the MCP server so it assumes a "
3458 "least-privilege role instead of ambient credentials."
3459 ),
3460 export_name=f"{project_name}-mcp-server-role-arn-{self.deployment_region}",
3461 )
3463 # cdk-nag suppressions: CloudWatch metrics APIs cannot be scoped.
3464 from cdk_nag import NagSuppressions
3466 NagSuppressions.add_resource_suppressions(
3467 self.mcp_server_role,
3468 [
3469 {
3470 "id": "AwsSolutions-IAM5",
3471 "reason": (
3472 "The CloudWatch metrics APIs (GetMetricData, "
3473 "GetMetricStatistics, ListMetrics) do not support "
3474 "resource-level IAM; wildcard resource is required. "
3475 "The S3 permissions use the {project_name}-* prefix "
3476 "pattern because the model weights bucket name is "
3477 "auto-generated by CDK in the global stack and a "
3478 "cross-stack ARN export would create tight stack "
3479 "coupling. All actions are read-only or scoped "
3480 "send-only (SQS)."
3481 ),
3482 },
3483 ],
3484 apply_to_children=True,
3485 )
def _create_outputs(self) -> None:
    """Publish cluster and network identifiers as CloudFormation outputs.

    Emits six ``CfnOutput`` constructs, in this order: the EKS cluster
    name, ARN, endpoint and cluster security group ID, the VPC ID, and a
    comma-joined list of the VPC's public subnet IDs. Each output is also
    exported as ``{project_name}-<slug>-{deployment_region}``.
    """
    name_prefix = self.config.get_project_name()
    region = self.deployment_region

    def publish(construct_id: str, value: str, description: str, slug: str) -> None:
        # Single place that applies the shared export-name convention.
        CfnOutput(
            self,
            construct_id,
            value=value,
            description=description,
            export_name=f"{name_prefix}-{slug}-{region}",
        )

    # Cluster identity and connectivity
    publish(
        "ClusterName",
        self.cluster.cluster_name,
        f"EKS cluster name for {region}",
        "cluster-name",
    )
    publish(
        "ClusterArn",
        self.cluster.cluster_arn,
        f"EKS cluster ARN for {region}",
        "cluster-arn",
    )
    publish(
        "ClusterEndpoint",
        self.cluster.cluster_endpoint,
        f"EKS cluster endpoint for {region}",
        "cluster-endpoint",
    )
    publish(
        "ClusterSecurityGroupId",
        self.cluster.cluster_security_group_id,
        f"EKS cluster security group ID for {region}",
        "cluster-sg",
    )

    # Networking
    publish(
        "VpcId",
        self.vpc.vpc_id,
        f"VPC ID for {region}",
        "vpc-id",
    )

    # Public subnet IDs (for the ALB), flattened into one comma-separated value
    alb_subnet_ids = [net.subnet_id for net in self.vpc.public_subnets]
    publish(
        "PublicSubnetIds",
        Fn.join(",", alb_subnet_ids),
        f"Public subnet IDs for ALB in {region}",
        "public-subnets",
    )

    # Note: ALB is created by AWS Load Balancer Controller via Ingress
    # The ALB ARN is registered with Global Accelerator by the GA registration Lambda
def get_cluster(self) -> eks.Cluster:
    """Return the EKS cluster object held in ``self.cluster``."""
    cluster = self.cluster
    return cluster
def get_vpc(self) -> ec2.Vpc:
    """Return the VPC object held in ``self.vpc``."""
    vpc = self.vpc
    return vpc