Coverage for gco/stacks/global

1"""

2Global stack for GCO (Global Capacity Orchestrator on AWS) - AWS Global Accelerator configuration.

4This stack creates the global-level resources that span all regions:

5- AWS Global Accelerator with TCP listeners on ports 80 and 443

6- Endpoint groups for each configured region

7- SSM parameters for cross-region endpoint group ARN sharing

8- DynamoDB tables for job templates, webhooks, jobs, and inference endpoints (shared across regions)

10The Global Accelerator provides:

11- Single global endpoint for all regions

12- Automatic health-based routing to nearest healthy region

13- DDoS protection via AWS Shield Standard

14- Reduced latency through AWS global network

16Architecture:

17 Global Accelerator → Listener (80, 443) → Endpoint Groups (per region)

18 ↓

19 Regional ALBs (registered separately)

20"""

22from typing import Any

24from aws_cdk import (

25 CfnOutput,

26 Duration,

27 Fn,

28 RemovalPolicy,

29 Stack,

30)

31from aws_cdk import aws_backup as backup

32from aws_cdk import aws_dynamodb as dynamodb

33from aws_cdk import aws_events as events

34from aws_cdk import aws_globalaccelerator as ga

35from aws_cdk import aws_s3 as s3

36from aws_cdk import aws_ssm as ssm

37from constructs import Construct

39from gco.config.config_loader import ConfigLoader

42class GCOGlobalStack(Stack):

43 """

44 Global resources stack including AWS Global Accelerator.

46 This stack must be deployed before regional stacks. Regional stacks

47 will register their ALBs with the endpoint groups created here.

49 Attributes:

50 accelerator: The Global Accelerator resource

51 listener: TCP listener for HTTP/HTTPS traffic

52 endpoint_groups: Dict mapping region names to endpoint groups

53 templates_table: DynamoDB table for job templates

54 webhooks_table: DynamoDB table for webhooks

55 """

def __init__(
    self, scope: Construct, construct_id: str, config: ConfigLoader, **kwargs: Any
) -> None:
    """Build every global-level resource of the deployment.

    Resources are created in this order: DynamoDB tables, model-weights
    bucket, AWS Backup plan, Global Accelerator with its TCP listener, one
    endpoint group per configured region, CloudFormation outputs, and
    finally the cdk-nag suppressions.

    Args:
        scope: Parent construct this stack is attached to.
        construct_id: Identifier of this stack within ``scope``.
        config: Loader supplying the accelerator settings, the region list
            and the project name.
        **kwargs: Forwarded unchanged to ``Stack.__init__``.
    """
    super().__init__(scope, construct_id, **kwargs)

    self.config = config
    self.regional_endpoints: dict[str, str] = {}
    self.endpoint_groups: dict[str, ga.EndpointGroup] = {}

    # The accelerator name is kept on the instance so other stacks can
    # refer to it.
    accelerator_settings = self.config.get_global_accelerator_config()
    self.accelerator_name = accelerator_settings["name"]

    # The backup plan selects the tables, so the tables must exist first.
    self._create_dynamodb_tables()
    self._create_model_bucket()
    self._create_backup_plan()

    self.accelerator = ga.Accelerator(
        self,
        "GCOAccelerator",
        accelerator_name=self.accelerator_name,
        enabled=True,
    )

    # CloudWatch metrics are keyed by the accelerator ID rather than its
    # name or ARN. The ARN has the form
    #   arn:aws:globalaccelerator::<account>:accelerator/<accelerator-id>
    # so the ID is the piece after the "/", resolved at deploy time.
    arn_pieces = Fn.split("/", self.accelerator.accelerator_arn)
    self.accelerator_id = Fn.select(1, arn_pieces)

    # One TCP listener serving both HTTP (80) and HTTPS (443).
    listened_ports = [
        ga.PortRange(from_port=port, to_port=port) for port in (80, 443)
    ]
    self.listener = self.accelerator.add_listener(
        "GCOListener",
        port_ranges=listened_ports,
        protocol=ga.ConnectionProtocol.TCP,
        client_affinity=ga.ClientAffinity.NONE,
    )

    for region_name in self.config.get_regions():
        self._create_endpoint_group(region_name)

    # Outputs come after the accelerator and listener they reference.
    self._create_outputs()

    self._apply_nag_suppressions()

111

def _create_outputs(self) -> None:
    """Export the accelerator's DNS name, ARN and listener ARN.

    Each value becomes a CloudFormation output whose export name is
    prefixed with the project name, so other stacks can import it.
    """
    prefix = self.config.get_project_name()

    # (construct id, value, description, export-name suffix), in the order
    # the outputs are added to the stack.
    exported = (
        (
            "GlobalAcceleratorDnsName",
            self.accelerator.dns_name,
            "Global Accelerator DNS name for global endpoint",
            "global-accelerator-dns",
        ),
        (
            "GlobalAcceleratorArn",
            self.accelerator.accelerator_arn,
            "Global Accelerator ARN",
            "global-accelerator-arn",
        ),
        (
            "GlobalAcceleratorListenerArn",
            self.listener.listener_arn,
            "Global Accelerator Listener ARN",
            "global-accelerator-listener-arn",
        ),
    )

    for output_id, output_value, summary, suffix in exported:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=summary,
            export_name=f"{prefix}-{suffix}",
        )

139

def _apply_nag_suppressions(self) -> None:
    """Register the cdk-nag suppressions that belong to the global stack."""
    # Imported at call time, as in the rest of this module's optional
    # helpers, rather than at module import.
    from gco.stacks.nag_suppressions import apply_all_suppressions as suppress_all

    suppress_all(self, stack_type="global")

145

def _create_endpoint_group(self, region: str) -> None:
    """Add the endpoint group for ``region`` and publish its ARN.

    The group is attached to the stack's listener with an HTTP health
    check on port 80, so Global Accelerator verifies that the services
    behind the ALB respond, not merely that the port is open. The path and
    interval come from the Global Accelerator configuration and fall back
    to ``/api/v1/health`` and 30 seconds.

    The group's ARN is recorded in ``self.endpoint_groups``, exported as a
    CloudFormation output, and written to SSM Parameter Store so regional
    stacks can read it when registering their ALBs.

    Args:
        region: AWS region name (e.g., 'us-east-1').
    """
    project = self.config.get_project_name()
    # Construct-id fragment: dashes removed, then title-cased
    # (e.g. "us-east-1" -> "Useast1").
    id_fragment = region.replace("-", "").title()
    accelerator_settings = self.config.get_global_accelerator_config()

    probe_path = accelerator_settings.get("health_check_path", "/api/v1/health")
    probe_interval = Duration.seconds(
        accelerator_settings.get("health_check_interval", 30)
    )

    group = self.listener.add_endpoint_group(
        f"EndpointGroup{id_fragment}",
        region=region,
        health_check_port=80,
        health_check_protocol=ga.HealthCheckProtocol.HTTP,
        health_check_path=probe_path,
        health_check_interval=probe_interval,
        health_check_threshold=3,
    )
    self.endpoint_groups[region] = group

    group_arn = group.endpoint_group_arn

    # CloudFormation export for regional stacks.
    CfnOutput(
        self,
        f"EndpointGroup{id_fragment}Arn",
        value=group_arn,
        description=f"Endpoint group ARN for {region}",
        export_name=f"{project}-endpoint-group-{region}-arn",
    )

    # Same ARN in SSM Parameter Store for cross-region lookups.
    ssm.StringParameter(
        self,
        f"EndpointGroup{id_fragment}ArnParam",
        parameter_name=f"/{project}/endpoint-group-{region}-arn",
        string_value=group_arn,
        description=f"Global Accelerator endpoint group ARN for {region}",
    )

198

def add_regional_endpoint(self, region: str, alb_arn: str) -> None:
    """Remember the ALB ARN that serves ``region``.

    This only records the ARN in ``self.regional_endpoints``; it does not
    register the ALB with the Global Accelerator. Because of cross-region
    reference limitations in CDK, registration is performed by a custom
    resource (AwsCustomResource) in the regional stack, using the endpoint
    group ARN exported by this stack.

    Args:
        region: AWS region name the ALB belongs to.
        alb_arn: ARN of that region's ALB.
    """
    # Bookkeeping only; a later call for the same region replaces the ARN.
    self.regional_endpoints.update({region: alb_arn})

211

def get_accelerator_dns_name(self) -> str:
    """Return the Global Accelerator DNS name, coerced to ``str``."""
    dns_name = self.accelerator.dns_name
    return str(dns_name)

215

def get_accelerator_arn(self) -> str:
    """Return the Global Accelerator ARN, coerced to ``str``."""
    arn = self.accelerator.accelerator_arn
    return str(arn)

219

def get_listener_arn(self) -> str:
    """Return the Global Accelerator listener ARN, coerced to ``str``."""
    arn = self.listener.listener_arn
    return str(arn)

223

def get_endpoint_group_arn(self, region: str) -> str:
    """Return the endpoint group ARN recorded for ``region``.

    Args:
        region: AWS region name used as the key in ``self.endpoint_groups``.

    Returns:
        The endpoint group's ARN, coerced to ``str``.

    Raises:
        ValueError: If no endpoint group was created for ``region``.
    """
    if region not in self.endpoint_groups:
        raise ValueError(f"No endpoint group found for region: {region}")

    group = self.endpoint_groups[region]
    return str(group.endpoint_group_arn)

229

def _create_dynamodb_tables(self) -> None:
    """Create the shared DynamoDB tables and publish their identifiers.

    Four tables are created: job templates, webhooks, jobs and inference
    endpoints. All of them use on-demand billing, AWS-managed encryption,
    point-in-time recovery and ``RemovalPolicy.DESTROY``, and each has a
    single string partition key. The jobs table additionally sets ``ttl``
    as its time-to-live attribute.

    For every table the name and ARN are exported as CloudFormation
    outputs, and the name is also written to SSM Parameter Store for
    cross-region access.
    """
    prefix = self.config.get_project_name()

    def text_attr(attr_name: str) -> dynamodb.Attribute:
        # Every key attribute in these tables is a string.
        return dynamodb.Attribute(name=attr_name, type=dynamodb.AttributeType.STRING)

    def make_table(
        construct_id: str, name_suffix: str, key_name: str, **extra: Any
    ) -> dynamodb.Table:
        # Settings shared by all four tables; ``extra`` carries per-table options.
        return dynamodb.Table(
            self,
            construct_id,
            table_name=f"{prefix}-{name_suffix}",
            partition_key=text_attr(key_name),
            billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
            removal_policy=RemovalPolicy.DESTROY,
            point_in_time_recovery_specification=dynamodb.PointInTimeRecoverySpecification(
                point_in_time_recovery_enabled=True
            ),
            encryption=dynamodb.TableEncryption.AWS_MANAGED,
            **extra,
        )

    def export_table(
        id_stem: str, table: dynamodb.Table, purpose: str, export_stem: str
    ) -> None:
        # Name output first, then ARN output, for one table.
        CfnOutput(
            self,
            f"{id_stem}Name",
            value=table.table_name,
            description=f"DynamoDB table name for {purpose}",
            export_name=f"{prefix}-{export_stem}-name",
        )
        CfnOutput(
            self,
            f"{id_stem}Arn",
            value=table.table_arn,
            description=f"DynamoDB table ARN for {purpose}",
            export_name=f"{prefix}-{export_stem}-arn",
        )

    # Job templates: reusable job definitions keyed by template name.
    self.templates_table = make_table(
        "JobTemplatesTable", "job-templates", "template_name"
    )

    # Webhooks: webhook registrations, also queryable by namespace.
    self.webhooks_table = make_table("WebhooksTable", "webhooks", "webhook_id")
    self.webhooks_table.add_global_secondary_index(
        index_name="namespace-index",
        partition_key=text_attr("namespace"),
        projection_type=dynamodb.ProjectionType.ALL,
    )

    # Jobs: centralized job tracking and queue, enabling global submission
    # with regional pickup. Items carrying a ``ttl`` attribute are removed
    # automatically, which cleans up old completed jobs.
    self.jobs_table = make_table(
        "JobsTable", "jobs", "job_id", time_to_live_attribute="ttl"
    )

    # (index name, partition key, sort key):
    #   region-status-index - regional polling by target region and status
    #   namespace-index     - jobs of one namespace ordered by submission time
    #   status-index        - jobs of one status, globally, by submission time
    jobs_indexes = (
        ("region-status-index", "target_region", "status"),
        ("namespace-index", "namespace", "submitted_at"),
        ("status-index", "status", "submitted_at"),
    )
    for index_name, hash_attr, range_attr in jobs_indexes:
        self.jobs_table.add_global_secondary_index(
            index_name=index_name,
            partition_key=text_attr(hash_attr),
            sort_key=text_attr(range_attr),
            projection_type=dynamodb.ProjectionType.ALL,
        )

    # Names and ARNs for regional stacks.
    export_table("TemplatesTable", self.templates_table, "job templates", "templates-table")
    export_table("WebhooksTable", self.webhooks_table, "webhooks", "webhooks-table")
    export_table(
        "JobsTable", self.jobs_table, "centralized job tracking", "jobs-table"
    )

    # Inference endpoints: desired state for inference deployments, polled
    # by the inference_monitor in each regional cluster.
    self.inference_endpoints_table = make_table(
        "InferenceEndpointsTable", "inference-endpoints", "endpoint_name"
    )
    export_table(
        "InferenceEndpointsTable",
        self.inference_endpoints_table,
        "inference endpoint state",
        "inference-endpoints-table",
    )

    # Table names in SSM Parameter Store for cross-region access.
    # (construct-id stem, parameter-name stem, table, purpose)
    ssm_entries = (
        ("TemplatesTable", "templates-table", self.templates_table, "job templates"),
        ("WebhooksTable", "webhooks-table", self.webhooks_table, "webhooks"),
        ("JobsTable", "jobs-table", self.jobs_table, "centralized job tracking"),
        (
            "InferenceEndpointsTable",
            "inference-endpoints-table",
            self.inference_endpoints_table,
            "inference endpoint state",
        ),
    )
    for id_stem, param_stem, table, purpose in ssm_entries:
        ssm.StringParameter(
            self,
            f"{id_stem}NameParam",
            parameter_name=f"/{prefix}/{param_stem}-name",
            string_value=table.table_name,
            description=f"DynamoDB table name for {purpose}",
        )

454

def _create_model_bucket(self) -> None:
    """Create the S3 bucket for model weights and its access-log bucket.

    The model bucket serves as the central model registry. Users upload
    model weights here once, and the inference_monitor's init containers
    sync them to each region's local EFS at pod startup.

    Resources created:
    - a KMS key (rotation enabled) that encrypts the model bucket;
    - an SSE-S3 encrypted, versioned access-log bucket whose lifecycle
      rule expires logs after ``s3_access_logs.retention_days`` days
      (cdk.json context, default 90) and also removes the noncurrent
      versions left behind by that expiration;
    - the versioned, KMS-encrypted model bucket, logging to the bucket
      above under ``model-bucket-logs/``.

    Bucket names are auto-generated by CDK to avoid naming collisions. The
    model bucket's name and ARN are exported via CfnOutput, and its name is
    written to SSM for CLI discovery.

    Raises:
        ValueError: If ``s3_access_logs.retention_days`` is less than 1.
    """
    project_name = self.config.get_project_name()
    from aws_cdk import aws_kms as kms

    # KMS key for model bucket encryption
    self.model_bucket_key = kms.Key(
        self,
        "ModelBucketKey",
        description="KMS key for GCO model weights bucket",
        enable_key_rotation=True,
        removal_policy=RemovalPolicy.DESTROY,
    )

    # Access logs bucket (required for compliance).
    # Retention is configurable via the cdk.json context field
    # `s3_access_logs.retention_days` (default: 90 days).
    s3_access_logs_ctx = self.node.try_get_context("s3_access_logs") or {}
    access_logs_retention_days = int(s3_access_logs_ctx.get("retention_days", 90))
    if access_logs_retention_days < 1:
        # Fail at synth time with a clear message instead of emitting an
        # unusable lifecycle rule.
        raise ValueError(
            "s3_access_logs.retention_days must be at least 1, "
            f"got {access_logs_retention_days}"
        )

    self.model_bucket_access_logs = s3.Bucket(
        self,
        "ModelWeightsAccessLogsBucket",
        encryption=s3.BucketEncryption.S3_MANAGED,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        lifecycle_rules=[
            s3.LifecycleRule(
                id="ExpireAccessLogs",
                enabled=True,
                expiration=Duration.days(access_logs_retention_days),
                # The bucket is versioned, so `expiration` alone only adds
                # a delete marker and keeps the log as a noncurrent
                # version. Without this setting expired logs would never
                # be permanently removed.
                noncurrent_version_expiration=Duration.days(1),
            )
        ],
    )

    # Model weights bucket
    self.model_bucket = s3.Bucket(
        self,
        "ModelWeightsBucket",
        encryption=s3.BucketEncryption.KMS,
        encryption_key=self.model_bucket_key,
        bucket_key_enabled=True,
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        enforce_ssl=True,
        versioned=True,
        removal_policy=RemovalPolicy.DESTROY,
        auto_delete_objects=True,
        server_access_logs_bucket=self.model_bucket_access_logs,
        server_access_logs_prefix="model-bucket-logs/",
    )

    # CDK-nag suppressions. The same rules exist in each compliance pack,
    # so rule ids are built as "<pack>-<rule>" from these prefixes.
    from cdk_nag import NagSuppressions

    compliance_packs = ("HIPAA.Security", "NIST.800.53.R5", "PCI.DSS.321")

    # Model bucket: only replication is suppressed (not needed for model weights).
    replication_reason = (
        "Model weights are user-uploaded artifacts that can be re-uploaded. "
        "Cross-region replication is not required; the inference_monitor "
        "syncs models from S3 to each region's EFS at pod startup."
    )
    NagSuppressions.add_resource_suppressions(
        self.model_bucket,
        [
            {"id": f"{pack}-S3BucketReplicationEnabled", "reason": replication_reason}
            for pack in compliance_packs
        ],
    )

    # Access-log bucket: it is itself the logging destination, needs no
    # replication, and uses SSE-S3 instead of KMS.
    logs_reason = "This is the server access logs destination bucket."
    log_bucket_suppressions = [{"id": "AwsSolutions-S1", "reason": logs_reason}]
    for pack in compliance_packs:
        log_bucket_suppressions += [
            {"id": f"{pack}-S3BucketLoggingEnabled", "reason": logs_reason},
            {
                "id": f"{pack}-S3BucketReplicationEnabled",
                "reason": "Access logs do not require replication.",
            },
            {
                "id": f"{pack}-S3DefaultEncryptionKMS",
                "reason": "SSE-S3 is sufficient for access logs.",
            },
        ]
    NagSuppressions.add_resource_suppressions(
        self.model_bucket_access_logs,
        log_bucket_suppressions,
    )

    CfnOutput(
        self,
        "ModelBucketName",
        value=self.model_bucket.bucket_name,
        description="S3 bucket for model weights",
        export_name=f"{project_name}-model-bucket-name",
    )

    CfnOutput(
        self,
        "ModelBucketArn",
        value=self.model_bucket.bucket_arn,
        description="S3 bucket ARN for model weights",
        export_name=f"{project_name}-model-bucket-arn",
    )

    ssm.StringParameter(
        self,
        "ModelBucketNameParam",
        parameter_name=f"/{project_name}/model-bucket-name",
        string_value=self.model_bucket.bucket_name,
        description="S3 bucket name for model weights",
    )

602

def _create_backup_plan(self) -> None:
    """Create the AWS Backup vault, plan and selection for the DynamoDB tables.

    The plan has two rules writing to the same vault:
    - ``DailyBackup``: cron at hour 3, minute 0; recovery points deleted
      after 35 days.
    - ``WeeklyBackup``: cron at hour 4, minute 0 on Sundays; recovery
      points deleted after 90 days.

    The selection covers the templates, webhooks, jobs and inference
    endpoints tables, so ``_create_dynamodb_tables`` must have run first.
    The plan and vault ARNs are exported as CloudFormation outputs.
    """
    # Vault that stores the recovery points.
    self.backup_vault = backup.BackupVault(
        self,
        "DynamoDBBackupVault",
        removal_policy=RemovalPolicy.DESTROY,
    )

    daily_rule = backup.BackupPlanRule(
        rule_name="DailyBackup",
        backup_vault=self.backup_vault,
        schedule_expression=events.Schedule.cron(hour="3", minute="0"),
        delete_after=Duration.days(35),
        # Requests continuous (point-in-time) backup for this rule.
        # NOTE(review): confirm AWS Backup continuous backup applies to
        # DynamoDB; the tables already enable native PITR themselves.
        enable_continuous_backup=True,
    )
    weekly_rule = backup.BackupPlanRule(
        rule_name="WeeklyBackup",
        backup_vault=self.backup_vault,
        schedule_expression=events.Schedule.cron(
            hour="4", minute="0", week_day="SUN"
        ),
        delete_after=Duration.days(90),
    )

    self.backup_plan = backup.BackupPlan(
        self,
        "DynamoDBBackupPlan",
        backup_plan_rules=[daily_rule, weekly_rule],
    )

    # Every DynamoDB table of this stack is protected by the plan.
    protected_tables = (
        self.templates_table,
        self.webhooks_table,
        self.jobs_table,
        self.inference_endpoints_table,
    )
    self.backup_plan.add_selection(
        "DynamoDBTablesSelection",
        resources=[
            backup.BackupResource.from_dynamo_db_table(table)
            for table in protected_tables
        ],
    )

    # Export the plan and vault ARNs.
    prefix = self.config.get_project_name()
    arn_exports = (
        (
            "BackupPlanArn",
            self.backup_plan.backup_plan_arn,
            "AWS Backup plan ARN for DynamoDB tables",
            "backup-plan-arn",
        ),
        (
            "BackupVaultArn",
            self.backup_vault.backup_vault_arn,
            "AWS Backup vault ARN for DynamoDB backups",
            "backup-vault-arn",
        ),
    )
    for output_id, arn, summary, suffix in arn_exports:
        CfnOutput(
            self,
            output_id,
            value=arn,
            description=summary,
            export_name=f"{prefix}-{suffix}",
        )

Coverage for gco / stacks / global_stack.py: 100%

101 statements