Coverage for gco / stacks / global_stack.py: 100%
101 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
1"""
2Global stack for GCO (Global Capacity Orchestrator on AWS) - AWS Global Accelerator configuration.
4This stack creates the global-level resources that span all regions:
5- AWS Global Accelerator with TCP listeners on ports 80 and 443
6- Endpoint groups for each configured region
7- SSM parameters for cross-region endpoint group ARN sharing
8- DynamoDB tables for templates and webhooks (global, replicated)
10The Global Accelerator provides:
11- Single global endpoint for all regions
12- Automatic health-based routing to nearest healthy region
13- DDoS protection via AWS Shield Standard
14- Reduced latency through AWS global network
16Architecture:
17 Global Accelerator → Listener (80, 443) → Endpoint Groups (per region)
18 ↓
19 Regional ALBs (registered separately)
20"""
22from typing import Any
24from aws_cdk import (
25 CfnOutput,
26 Duration,
27 Fn,
28 RemovalPolicy,
29 Stack,
30)
31from aws_cdk import aws_backup as backup
32from aws_cdk import aws_dynamodb as dynamodb
33from aws_cdk import aws_events as events
34from aws_cdk import aws_globalaccelerator as ga
35from aws_cdk import aws_s3 as s3
36from aws_cdk import aws_ssm as ssm
37from constructs import Construct
39from gco.config.config_loader import ConfigLoader
class GCOGlobalStack(Stack):
    """
    Global resources stack including AWS Global Accelerator.

    This stack must be deployed before regional stacks. Regional stacks
    will register their ALBs with the endpoint groups created here.

    Attributes:
        accelerator: The Global Accelerator resource
        accelerator_id: Accelerator ID (UUID) extracted from the ARN at deploy
            time (CloudWatch metrics are keyed by this ID)
        accelerator_name: Accelerator name taken from configuration
        listener: TCP listener for HTTP/HTTPS traffic
        endpoint_groups: Dict mapping region names to endpoint groups
        regional_endpoints: Dict mapping region names to ALB ARNs recorded via
            add_regional_endpoint() (bookkeeping only)
        templates_table: DynamoDB table for job templates
        webhooks_table: DynamoDB table for webhooks
        jobs_table: DynamoDB table for centralized job tracking
        inference_endpoints_table: DynamoDB table for inference endpoint state
        model_bucket: S3 bucket for model weights
        model_bucket_access_logs: S3 bucket receiving the model bucket's access logs
        backup_vault: AWS Backup vault for DynamoDB backups
        backup_plan: AWS Backup plan covering all DynamoDB tables
    """
    def __init__(
        self, scope: Construct, construct_id: str, config: ConfigLoader, **kwargs: Any
    ) -> None:
        """Create all global resources.

        Creation order matters: the DynamoDB tables are created before the
        backup plan (which selects them), and endpoint groups are created
        after the listener they attach to.

        Args:
            scope: Parent construct (typically the CDK App).
            construct_id: Logical ID of this stack.
            config: Project configuration loader (regions, GA settings, names).
            **kwargs: Forwarded to aws_cdk.Stack (env, description, ...).
        """
        super().__init__(scope, construct_id, **kwargs)

        self.config = config
        # ALB ARNs recorded via add_regional_endpoint(); informational only —
        # actual endpoint registration happens in the regional stacks.
        self.regional_endpoints: dict[str, str] = {}
        self.endpoint_groups: dict[str, ga.EndpointGroup] = {}

        ga_config = self.config.get_global_accelerator_config()

        # Store the accelerator name for reference by other stacks
        self.accelerator_name = ga_config["name"]

        # Create DynamoDB tables for templates and webhooks
        self._create_dynamodb_tables()

        # Create S3 bucket for model weights
        self._create_model_bucket()

        # Create AWS Backup plan for DynamoDB tables (must come after the
        # tables so the backup selection can reference them)
        self._create_backup_plan()

        # Create Global Accelerator with TCP protocol for HTTP/HTTPS traffic
        self.accelerator = ga.Accelerator(
            self, "GCOAccelerator", accelerator_name=self.accelerator_name, enabled=True
        )

        # Store the accelerator ID for CloudWatch metrics.
        # CloudWatch uses the accelerator ID (UUID), not the name or ARN.
        # ARN format: arn:aws:globalaccelerator::<account>:accelerator/<accelerator-id>
        # Use Fn.select and Fn.split to extract the ID at deploy time.
        self.accelerator_id = Fn.select(1, Fn.split("/", self.accelerator.accelerator_arn))

        # Create listener for both HTTP (80) and HTTPS (443) traffic.
        # ClientAffinity.NONE means GA does not pin client flows to a region.
        self.listener = self.accelerator.add_listener(
            "GCOListener",
            port_ranges=[
                ga.PortRange(from_port=80, to_port=80),
                ga.PortRange(from_port=443, to_port=443),
            ],
            protocol=ga.ConnectionProtocol.TCP,
            client_affinity=ga.ClientAffinity.NONE,
        )

        # Create endpoint groups for each configured region
        for region in self.config.get_regions():
            self._create_endpoint_group(region)

        # Export Global Accelerator outputs for other stacks
        self._create_outputs()

        # Apply cdk-nag suppressions
        self._apply_nag_suppressions()
112 def _create_outputs(self) -> None:
113 """Create CloudFormation outputs for cross-stack references."""
114 project_name = self.config.get_project_name()
116 CfnOutput(
117 self,
118 "GlobalAcceleratorDnsName",
119 value=self.accelerator.dns_name,
120 description="Global Accelerator DNS name for global endpoint",
121 export_name=f"{project_name}-global-accelerator-dns",
122 )
124 CfnOutput(
125 self,
126 "GlobalAcceleratorArn",
127 value=self.accelerator.accelerator_arn,
128 description="Global Accelerator ARN",
129 export_name=f"{project_name}-global-accelerator-arn",
130 )
132 CfnOutput(
133 self,
134 "GlobalAcceleratorListenerArn",
135 value=self.listener.listener_arn,
136 description="Global Accelerator Listener ARN",
137 export_name=f"{project_name}-global-accelerator-listener-arn",
138 )
140 def _apply_nag_suppressions(self) -> None:
141 """Apply cdk-nag suppressions for this stack."""
142 from gco.stacks.nag_suppressions import apply_all_suppressions
144 apply_all_suppressions(self, stack_type="global")
    def _create_endpoint_group(self, region: str) -> None:
        """
        Create an endpoint group for a specific region.

        Configures HTTP health checks using the path from cdk.json so
        Global Accelerator can verify the ALB's backend services are
        actually healthy (not just that the port is open).

        Also stores the endpoint group ARN in SSM Parameter Store for
        cross-region access by regional stacks.

        Args:
            region: AWS region name (e.g., 'us-east-1')
        """
        project_name = self.config.get_project_name()
        # e.g. 'us-east-1' -> 'Useast1'; used to build per-region construct IDs.
        region_id = region.replace("-", "").title()
        ga_config = self.config.get_global_accelerator_config()

        # Use HTTP health checks so GA validates the backend services are
        # actually responding, not just that the ALB port is open.
        # The health_check_path from cdk.json (default: /api/v1/health)
        # hits the health-monitor service behind the ALB.
        endpoint_group = self.listener.add_endpoint_group(
            f"EndpointGroup{region_id}",
            region=region,
            health_check_port=80,
            health_check_protocol=ga.HealthCheckProtocol.HTTP,
            health_check_path=ga_config.get("health_check_path", "/api/v1/health"),
            health_check_interval=Duration.seconds(ga_config.get("health_check_interval", 30)),
            # Number of consecutive checks GA requires before flipping an
            # endpoint's health state.
            health_check_threshold=3,
        )

        self.endpoint_groups[region] = endpoint_group

        # Export endpoint group ARN for regional stacks
        CfnOutput(
            self,
            f"EndpointGroup{region_id}Arn",
            value=endpoint_group.endpoint_group_arn,
            description=f"Endpoint group ARN for {region}",
            export_name=f"{project_name}-endpoint-group-{region}-arn",
        )

        # Store endpoint group ARN in SSM Parameter Store for cross-region access.
        # Regional stacks read this to register their ALBs with Global Accelerator
        # (CloudFormation exports are not readable cross-region; SSM is).
        ssm.StringParameter(
            self,
            f"EndpointGroup{region_id}ArnParam",
            parameter_name=f"/{project_name}/endpoint-group-{region}-arn",
            string_value=endpoint_group.endpoint_group_arn,
            description=f"Global Accelerator endpoint group ARN for {region}",
        )
199 def add_regional_endpoint(self, region: str, alb_arn: str) -> None:
200 """Add a regional ALB endpoint to the Global Accelerator.
202 Note: Due to cross-region reference limitations in CDK, the actual endpoint
203 registration is handled by a custom resource in the regional stack.
204 This method stores the ARN for reference but doesn't directly register it.
206 The regional stack should use the endpoint group ARN exported by this stack
207 to register its ALB via an AwsCustomResource.
208 """
209 self.regional_endpoints[region] = alb_arn
210 # Actual registration happens in regional stack via custom resource
212 def get_accelerator_dns_name(self) -> str:
213 """Get the Global Accelerator DNS name"""
214 return str(self.accelerator.dns_name)
216 def get_accelerator_arn(self) -> str:
217 """Get the Global Accelerator ARN"""
218 return str(self.accelerator.accelerator_arn)
220 def get_listener_arn(self) -> str:
221 """Get the Global Accelerator Listener ARN"""
222 return str(self.listener.listener_arn)
224 def get_endpoint_group_arn(self, region: str) -> str:
225 """Get the endpoint group ARN for a specific region"""
226 if region in self.endpoint_groups:
227 return str(self.endpoint_groups[region].endpoint_group_arn)
228 raise ValueError(f"No endpoint group found for region: {region}")
230 def _create_dynamodb_tables(self) -> None:
231 """Create DynamoDB tables for templates, webhooks, and jobs."""
232 project_name = self.config.get_project_name()
234 # Job Templates table - stores reusable job templates
235 self.templates_table = dynamodb.Table(
236 self,
237 "JobTemplatesTable",
238 table_name=f"{project_name}-job-templates",
239 partition_key=dynamodb.Attribute(
240 name="template_name",
241 type=dynamodb.AttributeType.STRING,
242 ),
243 billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
244 removal_policy=RemovalPolicy.DESTROY,
245 point_in_time_recovery_specification=dynamodb.PointInTimeRecoverySpecification(
246 point_in_time_recovery_enabled=True
247 ),
248 encryption=dynamodb.TableEncryption.AWS_MANAGED,
249 )
251 # Webhooks table - stores webhook registrations
252 self.webhooks_table = dynamodb.Table(
253 self,
254 "WebhooksTable",
255 table_name=f"{project_name}-webhooks",
256 partition_key=dynamodb.Attribute(
257 name="webhook_id",
258 type=dynamodb.AttributeType.STRING,
259 ),
260 billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
261 removal_policy=RemovalPolicy.DESTROY,
262 point_in_time_recovery_specification=dynamodb.PointInTimeRecoverySpecification(
263 point_in_time_recovery_enabled=True
264 ),
265 encryption=dynamodb.TableEncryption.AWS_MANAGED,
266 )
268 # Add GSI for querying webhooks by namespace
269 self.webhooks_table.add_global_secondary_index(
270 index_name="namespace-index",
271 partition_key=dynamodb.Attribute(
272 name="namespace",
273 type=dynamodb.AttributeType.STRING,
274 ),
275 projection_type=dynamodb.ProjectionType.ALL,
276 )
278 # Jobs table - centralized job tracking and queue
279 # This enables global job submission with regional pickup
280 self.jobs_table = dynamodb.Table(
281 self,
282 "JobsTable",
283 table_name=f"{project_name}-jobs",
284 partition_key=dynamodb.Attribute(
285 name="job_id",
286 type=dynamodb.AttributeType.STRING,
287 ),
288 billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
289 removal_policy=RemovalPolicy.DESTROY,
290 point_in_time_recovery_specification=dynamodb.PointInTimeRecoverySpecification(
291 point_in_time_recovery_enabled=True
292 ),
293 encryption=dynamodb.TableEncryption.AWS_MANAGED,
294 time_to_live_attribute="ttl", # Auto-cleanup old completed jobs
295 )
297 # GSI for querying jobs by region and status (for regional polling)
298 self.jobs_table.add_global_secondary_index(
299 index_name="region-status-index",
300 partition_key=dynamodb.Attribute(
301 name="target_region",
302 type=dynamodb.AttributeType.STRING,
303 ),
304 sort_key=dynamodb.Attribute(
305 name="status",
306 type=dynamodb.AttributeType.STRING,
307 ),
308 projection_type=dynamodb.ProjectionType.ALL,
309 )
311 # GSI for querying jobs by namespace
312 self.jobs_table.add_global_secondary_index(
313 index_name="namespace-index",
314 partition_key=dynamodb.Attribute(
315 name="namespace",
316 type=dynamodb.AttributeType.STRING,
317 ),
318 sort_key=dynamodb.Attribute(
319 name="submitted_at",
320 type=dynamodb.AttributeType.STRING,
321 ),
322 projection_type=dynamodb.ProjectionType.ALL,
323 )
325 # GSI for querying jobs by status globally
326 self.jobs_table.add_global_secondary_index(
327 index_name="status-index",
328 partition_key=dynamodb.Attribute(
329 name="status",
330 type=dynamodb.AttributeType.STRING,
331 ),
332 sort_key=dynamodb.Attribute(
333 name="submitted_at",
334 type=dynamodb.AttributeType.STRING,
335 ),
336 projection_type=dynamodb.ProjectionType.ALL,
337 )
339 # Export table names and ARNs for regional stacks
340 CfnOutput(
341 self,
342 "TemplatesTableName",
343 value=self.templates_table.table_name,
344 description="DynamoDB table name for job templates",
345 export_name=f"{project_name}-templates-table-name",
346 )
348 CfnOutput(
349 self,
350 "TemplatesTableArn",
351 value=self.templates_table.table_arn,
352 description="DynamoDB table ARN for job templates",
353 export_name=f"{project_name}-templates-table-arn",
354 )
356 CfnOutput(
357 self,
358 "WebhooksTableName",
359 value=self.webhooks_table.table_name,
360 description="DynamoDB table name for webhooks",
361 export_name=f"{project_name}-webhooks-table-name",
362 )
364 CfnOutput(
365 self,
366 "WebhooksTableArn",
367 value=self.webhooks_table.table_arn,
368 description="DynamoDB table ARN for webhooks",
369 export_name=f"{project_name}-webhooks-table-arn",
370 )
372 CfnOutput(
373 self,
374 "JobsTableName",
375 value=self.jobs_table.table_name,
376 description="DynamoDB table name for centralized job tracking",
377 export_name=f"{project_name}-jobs-table-name",
378 )
380 CfnOutput(
381 self,
382 "JobsTableArn",
383 value=self.jobs_table.table_arn,
384 description="DynamoDB table ARN for centralized job tracking",
385 export_name=f"{project_name}-jobs-table-arn",
386 )
388 # Inference Endpoints table - stores desired state for inference deployments
389 # The inference_monitor in each regional cluster polls this table
390 self.inference_endpoints_table = dynamodb.Table(
391 self,
392 "InferenceEndpointsTable",
393 table_name=f"{project_name}-inference-endpoints",
394 partition_key=dynamodb.Attribute(
395 name="endpoint_name",
396 type=dynamodb.AttributeType.STRING,
397 ),
398 billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
399 removal_policy=RemovalPolicy.DESTROY,
400 point_in_time_recovery_specification=dynamodb.PointInTimeRecoverySpecification(
401 point_in_time_recovery_enabled=True
402 ),
403 encryption=dynamodb.TableEncryption.AWS_MANAGED,
404 )
406 CfnOutput(
407 self,
408 "InferenceEndpointsTableName",
409 value=self.inference_endpoints_table.table_name,
410 description="DynamoDB table name for inference endpoint state",
411 export_name=f"{project_name}-inference-endpoints-table-name",
412 )
414 CfnOutput(
415 self,
416 "InferenceEndpointsTableArn",
417 value=self.inference_endpoints_table.table_arn,
418 description="DynamoDB table ARN for inference endpoint state",
419 export_name=f"{project_name}-inference-endpoints-table-arn",
420 )
422 # Store table names in SSM for cross-region access
423 ssm.StringParameter(
424 self,
425 "TemplatesTableNameParam",
426 parameter_name=f"/{project_name}/templates-table-name",
427 string_value=self.templates_table.table_name,
428 description="DynamoDB table name for job templates",
429 )
431 ssm.StringParameter(
432 self,
433 "WebhooksTableNameParam",
434 parameter_name=f"/{project_name}/webhooks-table-name",
435 string_value=self.webhooks_table.table_name,
436 description="DynamoDB table name for webhooks",
437 )
439 ssm.StringParameter(
440 self,
441 "JobsTableNameParam",
442 parameter_name=f"/{project_name}/jobs-table-name",
443 string_value=self.jobs_table.table_name,
444 description="DynamoDB table name for centralized job tracking",
445 )
447 ssm.StringParameter(
448 self,
449 "InferenceEndpointsTableNameParam",
450 parameter_name=f"/{project_name}/inference-endpoints-table-name",
451 string_value=self.inference_endpoints_table.table_name,
452 description="DynamoDB table name for inference endpoint state",
453 )
    def _create_model_bucket(self) -> None:
        """Create S3 bucket for model weights.

        This bucket serves as the central model registry. Users upload model
        weights here once, and the inference_monitor's init containers sync
        them to each region's local EFS at pod startup.

        The bucket name is auto-generated by CDK to avoid naming collisions.
        It's exported via CfnOutput and SSM for CLI discovery.

        Side effects: sets self.model_bucket_key, self.model_bucket_access_logs,
        and self.model_bucket.
        """
        project_name = self.config.get_project_name()
        # Local import: kms is only used by this method.
        from aws_cdk import aws_kms as kms

        # KMS key for model bucket encryption (rotated yearly via key rotation).
        self.model_bucket_key = kms.Key(
            self,
            "ModelBucketKey",
            description="KMS key for GCO model weights bucket",
            enable_key_rotation=True,
            removal_policy=RemovalPolicy.DESTROY,
        )

        # Access logs bucket (required for compliance).
        # Retention is configurable via cdk.json context field
        # `s3_access_logs.retention_days` (default: 90 days). Logs older than
        # the configured retention are expired by the lifecycle rule below.
        s3_access_logs_ctx = self.node.try_get_context("s3_access_logs") or {}
        access_logs_retention_days = int(s3_access_logs_ctx.get("retention_days", 90))

        self.model_bucket_access_logs = s3.Bucket(
            self,
            "ModelWeightsAccessLogsBucket",
            encryption=s3.BucketEncryption.S3_MANAGED,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            enforce_ssl=True,
            versioned=True,
            removal_policy=RemovalPolicy.DESTROY,
            auto_delete_objects=True,
            lifecycle_rules=[
                s3.LifecycleRule(
                    id="ExpireAccessLogs",
                    enabled=True,
                    expiration=Duration.days(access_logs_retention_days),
                )
            ],
        )

        # Model weights bucket: KMS-encrypted (with S3 bucket keys to cut KMS
        # request costs), SSL-only, versioned, logging into the bucket above.
        self.model_bucket = s3.Bucket(
            self,
            "ModelWeightsBucket",
            encryption=s3.BucketEncryption.KMS,
            encryption_key=self.model_bucket_key,
            bucket_key_enabled=True,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            enforce_ssl=True,
            versioned=True,
            removal_policy=RemovalPolicy.DESTROY,
            auto_delete_objects=True,
            server_access_logs_bucket=self.model_bucket_access_logs,
            server_access_logs_prefix="model-bucket-logs/",
        )

        # CDK-nag suppressions — only replication (not needed for model weights).
        from cdk_nag import NagSuppressions

        replication_reason = (
            "Model weights are user-uploaded artifacts that can be re-uploaded. "
            "Cross-region replication is not required; the inference_monitor "
            "syncs models from S3 to each region's EFS at pod startup."
        )

        NagSuppressions.add_resource_suppressions(
            self.model_bucket,
            [
                {
                    "id": "HIPAA.Security-S3BucketReplicationEnabled",
                    "reason": replication_reason,
                },
                {
                    "id": "NIST.800.53.R5-S3BucketReplicationEnabled",
                    "reason": replication_reason,
                },
                {
                    "id": "PCI.DSS.321-S3BucketReplicationEnabled",
                    "reason": replication_reason,
                },
            ],
        )

        # The access-logs bucket cannot log to itself and keeps SSE-S3, so
        # suppress the logging/replication/KMS findings for it across packs.
        logs_reason = "This is the server access logs destination bucket."
        NagSuppressions.add_resource_suppressions(
            self.model_bucket_access_logs,
            [
                {"id": "AwsSolutions-S1", "reason": logs_reason},
                {"id": "HIPAA.Security-S3BucketLoggingEnabled", "reason": logs_reason},
                {
                    "id": "HIPAA.Security-S3BucketReplicationEnabled",
                    "reason": "Access logs do not require replication.",
                },
                {
                    "id": "HIPAA.Security-S3DefaultEncryptionKMS",
                    "reason": "SSE-S3 is sufficient for access logs.",
                },
                {"id": "NIST.800.53.R5-S3BucketLoggingEnabled", "reason": logs_reason},
                {
                    "id": "NIST.800.53.R5-S3BucketReplicationEnabled",
                    "reason": "Access logs do not require replication.",
                },
                {
                    "id": "NIST.800.53.R5-S3DefaultEncryptionKMS",
                    "reason": "SSE-S3 is sufficient for access logs.",
                },
                {"id": "PCI.DSS.321-S3BucketLoggingEnabled", "reason": logs_reason},
                {
                    "id": "PCI.DSS.321-S3BucketReplicationEnabled",
                    "reason": "Access logs do not require replication.",
                },
                {
                    "id": "PCI.DSS.321-S3DefaultEncryptionKMS",
                    "reason": "SSE-S3 is sufficient for access logs.",
                },
            ],
        )

        # Export the bucket name/ARN for same-region stacks ...
        CfnOutput(
            self,
            "ModelBucketName",
            value=self.model_bucket.bucket_name,
            description="S3 bucket for model weights",
            export_name=f"{project_name}-model-bucket-name",
        )

        CfnOutput(
            self,
            "ModelBucketArn",
            value=self.model_bucket.bucket_arn,
            description="S3 bucket ARN for model weights",
            export_name=f"{project_name}-model-bucket-arn",
        )

        # ... and the name via SSM for cross-region/CLI discovery.
        ssm.StringParameter(
            self,
            "ModelBucketNameParam",
            parameter_name=f"/{project_name}/model-bucket-name",
            string_value=self.model_bucket.bucket_name,
            description="S3 bucket name for model weights",
        )
    def _create_backup_plan(self) -> None:
        """Create AWS Backup plan for DynamoDB tables.

        Creates a backup plan with:
        - Daily backups (03:00) retained for 35 days, with continuous backup
          (PITR) enabled
        - Weekly backups (Sundays 04:00) retained for 90 days
        - All DynamoDB tables added to the backup selection

        Must run after _create_dynamodb_tables(), which defines the tables
        referenced by the backup selection.
        """
        # Create backup vault for storing backups
        self.backup_vault = backup.BackupVault(
            self,
            "DynamoDBBackupVault",
            removal_policy=RemovalPolicy.DESTROY,
        )

        # Create backup plan with daily and weekly rules
        self.backup_plan = backup.BackupPlan(
            self,
            "DynamoDBBackupPlan",
            backup_plan_rules=[
                # Daily backup - retained for 35 days.
                # cron with only hour/minute set fires every day at 03:00.
                backup.BackupPlanRule(
                    rule_name="DailyBackup",
                    backup_vault=self.backup_vault,
                    schedule_expression=events.Schedule.cron(
                        hour="3",
                        minute="0",
                    ),
                    delete_after=Duration.days(35),
                    enable_continuous_backup=True,  # Enable PITR for DynamoDB
                ),
                # Weekly backup - retained for 90 days, Sundays at 04:00
                backup.BackupPlanRule(
                    rule_name="WeeklyBackup",
                    backup_vault=self.backup_vault,
                    schedule_expression=events.Schedule.cron(
                        hour="4",
                        minute="0",
                        week_day="SUN",
                    ),
                    delete_after=Duration.days(90),
                ),
            ],
        )

        # Add all DynamoDB tables to the backup selection
        self.backup_plan.add_selection(
            "DynamoDBTablesSelection",
            resources=[
                backup.BackupResource.from_dynamo_db_table(self.templates_table),
                backup.BackupResource.from_dynamo_db_table(self.webhooks_table),
                backup.BackupResource.from_dynamo_db_table(self.jobs_table),
                backup.BackupResource.from_dynamo_db_table(self.inference_endpoints_table),
            ],
        )

        # Export backup plan ARN
        project_name = self.config.get_project_name()
        CfnOutput(
            self,
            "BackupPlanArn",
            value=self.backup_plan.backup_plan_arn,
            description="AWS Backup plan ARN for DynamoDB tables",
            export_name=f"{project_name}-backup-plan-arn",
        )

        CfnOutput(
            self,
            "BackupVaultArn",
            value=self.backup_vault.backup_vault_arn,
            description="AWS Backup vault ARN for DynamoDB backups",
            export_name=f"{project_name}-backup-vault-arn",
        )