Coverage for gco/stacks/monitoring_stack.py: 98%

314 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1""" 

2Monitoring stack for GCO (Global Capacity Orchestrator on AWS) - Cross-region monitoring and observability. 

3 

4This stack creates centralized monitoring resources for all GCO deployments: 

5- CloudWatch Dashboard with comprehensive widgets for all regions 

6- SNS topic for alerting 

7- CloudWatch Alarms for critical metrics 

8- Log groups for application logs 

9- Anomaly detection for traffic patterns 

10- Composite alarms for better signal-to-noise 

11 

12Dashboard Sections: 

13- Global Accelerator: Flow counts, processed bytes 

14- API Gateway: Request counts, latency, error rates 

15- Lambda Functions: Invocations, errors, duration, throttles 

16- SQS Queues: Message counts, age, dead letter queue depth 

17- DynamoDB Tables: Capacity, latency, throttles, errors 

18- EKS Clusters: CPU/memory utilization per region 

19- FSx for Lustre (when enabled): Throughput, IOPS, free storage 

20- Valkey Serverless (when enabled): ECPU, hit rate, latency, bytes used 

21- Aurora pgvector (when enabled): ACU utilization, connections, latency, CPU 

22- ALBs: Request counts, response times, healthy hosts 

23- Applications: Custom metrics from health monitor and manifest processor 

24 

25Cross-Region Metrics: 

26 CloudWatch metrics are region-specific. This stack handles cross-region 

27 monitoring by specifying the `region` parameter on metrics: 

28 - Global Accelerator metrics: Always in us-west-2 

29 - DynamoDB metrics: In the global region (where tables are deployed) 

30 - Regional metrics: In each cluster's region 

31 

32Alarms: 

33- High CPU/memory utilization on EKS clusters 

34- Unhealthy hosts in ALB target groups 

35- High response times 

36- Manifest processing failures 

37- Lambda errors and throttles 

38- SQS message age (stuck jobs) 

39- DynamoDB throttling and system errors 

40- API Gateway 5XX errors 

41- Secret rotation failures 

42""" 

43 

44from typing import TYPE_CHECKING, Any 

45 

46from aws_cdk import ( 

47 CfnOutput, 

48 Duration, 

49 RemovalPolicy, 

50 Stack, 

51) 

52from aws_cdk import aws_cloudwatch as cloudwatch 

53from aws_cdk import aws_cloudwatch_actions as cw_actions 

54from aws_cdk import aws_logs as logs 

55from aws_cdk import aws_sns as sns 

56from constructs import Construct 

57 

58from gco.config.config_loader import ConfigLoader 

59 

60# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit 

61# Flowchart(s) generated from this file: 

62# * ``GCOMonitoringStack.__init__`` -> ``diagrams/code_diagrams/gco/stacks/monitoring_stack.GCOMonitoringStack___init__.html`` 

63# (PNG: ``diagrams/code_diagrams/gco/stacks/monitoring_stack.GCOMonitoringStack___init__.png``) 

64# Regenerate with ``python diagrams/code_diagrams/generate.py``. 

65# <pyflowchart-code-diagram> END 

66 

67 

68if TYPE_CHECKING: 

69 from gco.stacks.api_gateway_global_stack import GCOApiGatewayGlobalStack 

70 from gco.stacks.global_stack import GCOGlobalStack 

71 from gco.stacks.regional_stack import GCORegionalStack 

72 

73 

74class GCOMonitoringStack(Stack): 

75 """ 

76 Cross-region monitoring and observability stack. 

77 

78 Creates a centralized CloudWatch dashboard and alarms that aggregate 

79 metrics from all regional deployments. 

80 

81 Attributes: 

82 alert_topic: SNS topic for alarm notifications 

83 dashboard: CloudWatch dashboard with all monitoring widgets 

84 """ 

85 

def __init__(
    self,
    scope: Construct,
    construct_id: str,
    config: ConfigLoader,
    global_stack: GCOGlobalStack,
    regional_stacks: list[GCORegionalStack],
    api_gateway_stack: GCOApiGatewayGlobalStack | None = None,
    **kwargs: Any,
) -> None:
    """Assemble the monitoring stack: alert topic, dashboard, alarms, outputs.

    Args:
        scope: Parent construct this stack is attached to.
        construct_id: Logical ID of the stack within ``scope``.
        config: Loader queried for the project name and the region list.
        global_stack: Global stack whose resources are monitored.
        regional_stacks: One regional stack per deployed region.
        api_gateway_stack: Optional API Gateway stack; stored as ``None``
            when it is not supplied.
        **kwargs: Forwarded to ``Stack.__init__``. ``cross_region_references``
            defaults to ``True`` unless the caller sets it explicitly.
    """
    # Cross-region references are switched on by default because this stack
    # lives in the monitoring region (by default us-east-2) yet needs resource
    # identifiers from the regional stacks for dashboard dimensions -- notably
    # the auto-generated FSx file system IDs, which are unknown until deploy
    # time.
    #
    # CDK implements the feature with a small Lambda-backed custom resource in
    # each source stack that writes the value to an SSM parameter in the
    # target region, plus a reader custom resource in the target stack. The
    # cost is negligible (the Lambdas run once per deploy) and this is the
    # documented remedy for ``CrossRegionReferencesNotEnabled`` errors.
    if "cross_region_references" not in kwargs:
        kwargs["cross_region_references"] = True
    super().__init__(scope, construct_id, **kwargs)

    # Keep the collaborators around; the widget and alarm builders read them.
    self.config = config
    self.global_stack = global_stack
    self.regional_stacks = regional_stacks
    self.api_gateway_stack = api_gateway_stack
    self.project_name = config.get_project_name()
    self.regions = config.get_regions()

    # The SNS topic is created before the dashboard and the alarms.
    self.alert_topic = self._create_alert_topic()
    self.dashboard = self._create_dashboard()

    # Remaining build steps return nothing and run strictly in this order.
    build_steps = (
        self._create_alarms,
        self._create_composite_alarms,
        self._create_custom_metrics,
        self._create_outputs,  # export monitoring resources
        self._apply_nag_suppressions,  # cdk-nag suppressions go last
    )
    for run_step in build_steps:
        run_step()

138 

def _apply_nag_suppressions(self) -> None:
    """Register the cdk-nag suppressions for the monitoring stack.

    The suppression helper is imported when this method runs rather than at
    module import time.
    """
    from gco.stacks.nag_suppressions import apply_all_suppressions

    deployed_regions = self.config.get_regions()
    home_region = self.config.get_global_region()
    apply_all_suppressions(
        self,
        stack_type="monitoring",
        regions=deployed_regions,
        global_region=home_region,
    )

149 

def _create_alert_topic(self) -> sns.Topic:
    """Create and return the SNS topic used for monitoring alerts.

    The topic is created with ``enforce_ssl=True``.
    """
    return sns.Topic(
        self,
        "GCOAlertTopic",
        display_name="GCO (Global Capacity Orchestrator on AWS) Monitoring Alerts",
        enforce_ssl=True,
    )

159 

def _create_dashboard(self) -> cloudwatch.Dashboard:
    """Create the CloudWatch dashboard and populate it section by section.

    Each section builder returns a list of widgets that is handed to one
    ``add_widgets`` call; the tuple below fixes the order of those calls.
    """
    board = cloudwatch.Dashboard(
        self,
        "GCODashboard",
        period_override=cloudwatch.PeriodOverride.AUTO,
    )

    # Sections in their logical display order.
    section_builders = (
        self._create_global_accelerator_widgets,
        self._create_api_gateway_widgets,
        self._create_lambda_widgets,
        self._create_sqs_widgets,
        self._create_dynamodb_widgets,
        self._create_eks_widgets,
        self._create_gpu_widgets,
        self._create_fsx_widgets,
        self._create_valkey_widgets,
        self._create_aurora_pgvector_widgets,
        self._create_alb_widgets,
        self._create_application_widgets,
    )
    for build_section in section_builders:
        board.add_widgets(*build_section())

    return board

183 

def _create_global_accelerator_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the Global Accelerator dashboard section.

    Global Accelerator publishes its metrics only in us-west-2, wherever the
    accelerator endpoints are located, so both the metrics and the widgets
    are pinned to that region. CloudWatch identifies the accelerator by its
    ID (a UUID), not by its name.

    Returns:
        A section header, a new-flows graph and a processed-bytes graph.
    """
    # CloudWatch dimension value: the accelerator ID from the global stack.
    accel_id = self.global_stack.accelerator_id
    metrics_region = "us-west-2"

    def ga_sum(metric_name: str) -> cloudwatch.Metric:
        """Return a 5-minute Sum metric for this accelerator."""
        return cloudwatch.Metric(
            namespace="AWS/GlobalAccelerator",
            metric_name=metric_name,
            dimensions_map={"Accelerator": accel_id},
            statistic="Sum",
            period=Duration.minutes(5),
            region=metrics_region,
        )

    header = cloudwatch.TextWidget(
        markdown="# Global Accelerator\nTraffic distribution and connectivity metrics",
        width=24,
        height=1,
    )

    # New flows opened through the accelerator.
    new_flows = cloudwatch.GraphWidget(
        title="Global Accelerator - New Flows",
        left=[ga_sum("NewFlowCount")],
        width=12,
        height=6,
        region=metrics_region,
    )

    # Bytes processed in each direction.
    processed_bytes = cloudwatch.GraphWidget(
        title="Global Accelerator - Processed Bytes",
        left=[ga_sum("ProcessedBytesIn"), ga_sum("ProcessedBytesOut")],
        width=12,
        height=6,
        region=metrics_region,
    )

    return [header, new_flows, processed_bytes]

255 

def _create_api_gateway_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the API Gateway dashboard section.

    The ``ApiName`` dimension comes from the API Gateway stack when one was
    supplied and otherwise falls back to ``"gco-global-api"``. Metrics and
    widgets use the region reported by ``config.get_api_gateway_region()``.

    Returns:
        A section header, a requests/latency graph and an error-rate graph.
    """
    gateway = self.api_gateway_stack
    if gateway:
        api_name = gateway.api.rest_api_name
    else:
        api_name = "gco-global-api"

    # API Gateway metrics live in the region where the API is deployed.
    gw_region = self.config.get_api_gateway_region()

    def api_metric(metric_name: str, statistic: str, **extra: Any) -> cloudwatch.Metric:
        """Return a 5-minute API Gateway metric scoped to ``api_name``."""
        return cloudwatch.Metric(
            namespace="AWS/ApiGateway",
            metric_name=metric_name,
            dimensions_map={"ApiName": api_name},
            statistic=statistic,
            period=Duration.minutes(5),
            region=gw_region,
            **extra,
        )

    header = cloudwatch.TextWidget(
        markdown="# API Gateway\nRequest metrics, latency, and error rates",
        width=24,
        height=1,
    )

    # Request volume on the left axis, average and p99 latency on the right.
    requests_and_latency = cloudwatch.GraphWidget(
        title="API Gateway - Requests & Latency",
        left=[api_metric("Count", "Sum")],
        right=[
            api_metric("Latency", "Average"),
            api_metric("Latency", "p99"),
        ],
        width=12,
        height=6,
        region=gw_region,
    )

    # Client (4XX) and server (5XX) error counts.
    error_rates = cloudwatch.GraphWidget(
        title="API Gateway - Error Rates",
        left=[
            api_metric("4XXError", "Sum", color="#ff7f0e"),
            api_metric("5XXError", "Sum", color="#d62728"),
        ],
        width=12,
        height=6,
        region=gw_region,
    )

    return [header, requests_and_latency, error_rates]

344 

def _create_lambda_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the Lambda dashboard section.

    Charts the API Gateway proxy and secret-rotation functions (when an API
    Gateway stack was supplied) followed by the kubectl-applier and
    helm-installer functions of every regional stack.

    Returns:
        A section header plus invocation, error, duration and
        throttle/concurrency graphs.
    """
    header = cloudwatch.TextWidget(
        markdown="# Lambda Functions\nProxy, rotation, and regional Lambda metrics",
        width=24,
        height=1,
    )

    # The global (API Gateway) functions run in the API Gateway region.
    gw_region = self.config.get_api_gateway_region()

    # Each entry is (function_name, legend label, region).
    targets: list[tuple[str, str, str]] = []

    gateway = self.api_gateway_stack
    if gateway:
        targets.append((gateway.proxy_lambda.function_name, "API Gateway Proxy", gw_region))
        targets.append((gateway.rotation_lambda.function_name, "Secret Rotation", gw_region))

    for stack in self.regional_stacks:
        stack_region = stack.deployment_region
        targets.append(
            (
                stack.kubectl_lambda_function_name,
                f"Kubectl Applier ({stack_region})",
                stack_region,
            )
        )
        targets.append(
            (
                stack.helm_installer_lambda_function_name,
                f"Helm Installer ({stack_region})",
                stack_region,
            )
        )

    # NOTE(review): only the first five (three for the throttle panel)
    # entries are charted, so functions of later regional stacks drop off
    # once the list grows past that -- confirm the cap is intentional.
    charted = targets[:5]
    charted_throttle = targets[:3]

    def series(
        metric_name: str,
        statistic: str,
        functions: list[tuple[str, str, str]],
        label_suffix: str = "",
        **extra: Any,
    ) -> list[cloudwatch.Metric]:
        """Return one 5-minute Lambda metric per entry of ``functions``."""
        return [
            cloudwatch.Metric(
                namespace="AWS/Lambda",
                metric_name=metric_name,
                dimensions_map={"FunctionName": fn_name},
                statistic=statistic,
                period=Duration.minutes(5),
                label=f"{fn_label}{label_suffix}",
                region=fn_region,
                **extra,
            )
            for fn_name, fn_label, fn_region in functions
        ]

    invocations = cloudwatch.GraphWidget(
        title="Lambda - Invocations",
        left=series("Invocations", "Sum", charted),
        width=12,
        height=6,
    )

    errors = cloudwatch.GraphWidget(
        title="Lambda - Errors",
        left=series("Errors", "Sum", charted, color="#d62728"),
        width=12,
        height=6,
    )

    duration = cloudwatch.GraphWidget(
        title="Lambda - Duration (ms)",
        left=series("Duration", "Average", charted),
        width=12,
        height=6,
    )

    # Throttles on the left axis, peak concurrency on the right.
    throttles = cloudwatch.GraphWidget(
        title="Lambda - Throttles & Concurrent Executions",
        left=series("Throttles", "Sum", charted_throttle, " Throttles"),
        right=series("ConcurrentExecutions", "Maximum", charted_throttle, " Concurrent"),
        width=12,
        height=6,
    )

    return [header, invocations, errors, duration, throttles]

492 

def _create_sqs_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the SQS dashboard section from every regional stack's queues.

    Returns:
        A section header plus graphs for visible/in-flight messages, age of
        the oldest message, dead-letter-queue depth and send/delete
        throughput.
    """
    header = cloudwatch.TextWidget(
        markdown="# SQS Queues\nJob submission queue metrics and dead letter queue",
        width=24,
        height=1,
    )

    # One (job queue name, dead letter queue name, region) triple per stack.
    queues = [
        (stack.job_queue.queue_name, stack.job_dlq.queue_name, stack.deployment_region)
        for stack in self.regional_stacks
    ]

    def sqs_metric(
        queue: str,
        queue_region: str,
        metric_name: str,
        statistic: str,
        minutes: int,
        label: str,
        **extra: Any,
    ) -> cloudwatch.Metric:
        """Return an AWS/SQS metric for ``queue`` in ``queue_region``."""
        return cloudwatch.Metric(
            namespace="AWS/SQS",
            metric_name=metric_name,
            dimensions_map={"QueueName": queue},
            statistic=statistic,
            period=Duration.minutes(minutes),
            label=label,
            region=queue_region,
            **extra,
        )

    # Backlog (visible) on the left axis, in-flight on the right.
    backlog = cloudwatch.GraphWidget(
        title="SQS - Messages (Visible & In-Flight)",
        left=[
            sqs_metric(
                job_q, rgn, "ApproximateNumberOfMessagesVisible", "Average", 1, f"{rgn} Visible"
            )
            for job_q, _, rgn in queues
        ],
        right=[
            sqs_metric(
                job_q,
                rgn,
                "ApproximateNumberOfMessagesNotVisible",
                "Average",
                1,
                f"{rgn} In-Flight",
            )
            for job_q, _, rgn in queues
        ],
        width=12,
        height=6,
    )

    # Oldest-message age is the key signal for stuck jobs.
    oldest_age = cloudwatch.GraphWidget(
        title="SQS - Age of Oldest Message (seconds)",
        left=[
            sqs_metric(job_q, rgn, "ApproximateAgeOfOldestMessage", "Maximum", 1, rgn)
            for job_q, _, rgn in queues
        ],
        width=12,
        height=6,
    )

    # Depth of each region's dead letter queue.
    dlq_depth = cloudwatch.GraphWidget(
        title="SQS - Dead Letter Queue Depth",
        left=[
            sqs_metric(
                dead_q,
                rgn,
                "ApproximateNumberOfMessagesVisible",
                "Average",
                1,
                f"{rgn} DLQ",
                color="#d62728",
            )
            for _, dead_q, rgn in queues
        ],
        width=12,
        height=6,
    )

    # Messages sent (left axis) versus deleted, shown as "Processed" (right).
    throughput = cloudwatch.GraphWidget(
        title="SQS - Throughput",
        left=[
            sqs_metric(job_q, rgn, "NumberOfMessagesSent", "Sum", 5, f"{rgn} Sent")
            for job_q, _, rgn in queues
        ],
        right=[
            sqs_metric(job_q, rgn, "NumberOfMessagesDeleted", "Sum", 5, f"{rgn} Processed")
            for job_q, _, rgn in queues
        ],
        width=12,
        height=6,
    )

    return [header, backlog, oldest_age, dlq_depth, throughput]

622 

def _create_dynamodb_widgets(self) -> list[cloudwatch.IWidget]:
    """Create DynamoDB monitoring widgets for job queue, templates, and webhooks tables.

    All metrics and widgets are pinned to the global region, where the
    tables are deployed. The capacity, throttle and system-error panels
    chart all three tables; the latency panel charts GetItem, PutItem and
    Query on the jobs table only.

    Returns:
        A section header plus consumed-capacity, latency, throttled-request
        and system-error graphs.
    """
    # Table names come from the global stack.
    templates_table = self.global_stack.templates_table.table_name
    webhooks_table = self.global_stack.webhooks_table.table_name
    jobs_table = self.global_stack.jobs_table.table_name

    # DynamoDB tables are in the global region.
    global_region = self.config.get_global_region()

    def table_metric(
        metric_name: str,
        table_name: str,
        label: str,
        *,
        statistic: str = "Sum",
        operation: str | None = None,
        color: str | None = None,
    ) -> cloudwatch.Metric:
        """Return a 5-minute AWS/DynamoDB metric for one table."""
        dimensions = {"TableName": table_name}
        if operation is not None:
            dimensions["Operation"] = operation
        # Only pass ``color`` when one was requested.
        extra: dict[str, Any] = {} if color is None else {"color": color}
        return cloudwatch.Metric(
            namespace="AWS/DynamoDB",
            metric_name=metric_name,
            dimensions_map=dimensions,
            statistic=statistic,
            period=Duration.minutes(5),
            label=label,
            region=global_region,
            **extra,
        )

    widgets: list[cloudwatch.IWidget] = [
        cloudwatch.TextWidget(
            markdown="# DynamoDB Tables\nJob queue, templates, and webhooks storage metrics",
            width=24,
            height=1,
        )
    ]

    # Consumed read capacity (left axis) and write capacity (right axis).
    # The webhooks table is charted on both axes, matching the read side.
    widgets.append(
        cloudwatch.GraphWidget(
            title="DynamoDB - Consumed Capacity",
            left=[
                table_metric("ConsumedReadCapacityUnits", jobs_table, "Jobs Read"),
                table_metric("ConsumedReadCapacityUnits", templates_table, "Templates Read"),
                table_metric("ConsumedReadCapacityUnits", webhooks_table, "Webhooks Read"),
            ],
            right=[
                table_metric("ConsumedWriteCapacityUnits", jobs_table, "Jobs Write"),
                table_metric("ConsumedWriteCapacityUnits", templates_table, "Templates Write"),
                table_metric("ConsumedWriteCapacityUnits", webhooks_table, "Webhooks Write"),
            ],
            width=12,
            height=6,
            region=global_region,
        )
    )

    # Average latency of successful requests against the jobs table.
    widgets.append(
        cloudwatch.GraphWidget(
            title="DynamoDB - Latency (ms)",
            left=[
                table_metric(
                    "SuccessfulRequestLatency",
                    jobs_table,
                    f"Jobs {operation}",
                    statistic="Average",
                    operation=operation,
                )
                for operation in ("GetItem", "PutItem", "Query")
            ],
            width=12,
            height=6,
            region=global_region,
        )
    )

    # Throttled requests per table.
    widgets.append(
        cloudwatch.GraphWidget(
            title="DynamoDB - Throttled Requests",
            left=[
                table_metric("ThrottledRequests", jobs_table, "Jobs", color="#d62728"),
                table_metric("ThrottledRequests", templates_table, "Templates", color="#ff7f0e"),
                table_metric("ThrottledRequests", webhooks_table, "Webhooks", color="#9467bd"),
            ],
            width=12,
            height=6,
            region=global_region,
        )
    )

    # System errors per table, using the same colours as the throttle panel.
    widgets.append(
        cloudwatch.GraphWidget(
            title="DynamoDB - System Errors",
            left=[
                table_metric("SystemErrors", jobs_table, "Jobs", color="#d62728"),
                table_metric("SystemErrors", templates_table, "Templates", color="#ff7f0e"),
                table_metric("SystemErrors", webhooks_table, "Webhooks", color="#9467bd"),
            ],
            width=12,
            height=6,
            region=global_region,
        )
    )

    return widgets

813 

def _create_eks_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the EKS dashboard section, one series per regional cluster.

    Every metric carries its cluster's region so the dashboard can show
    cross-region data.

    Returns:
        A section header, a failed-requests single-value widget and graphs
        for node CPU, node memory and pod capacity versus running pods.
    """
    header = cloudwatch.TextWidget(
        markdown="# EKS Clusters\nCluster resource utilization and node metrics",
        width=24,
        height=1,
    )

    # One (cluster name, region) pair per regional stack.
    clusters = [
        (stack.cluster.cluster_name, stack.deployment_region)
        for stack in self.regional_stacks
    ]

    def insights(metric_name: str, statistic: str, label_suffix: str = "") -> list[cloudwatch.Metric]:
        """Return one 5-minute ContainerInsights metric per cluster."""
        return [
            cloudwatch.Metric(
                namespace="ContainerInsights",
                metric_name=metric_name,
                dimensions_map={"ClusterName": name},
                statistic=statistic,
                period=Duration.minutes(5),
                label=f"{rgn}{label_suffix}",
                region=rgn,
            )
            for name, rgn in clusters
        ]

    # Failed request count per cluster.
    # NOTE(review): confirm that ``cluster_failed_request_count`` is published
    # in the AWS/EKS namespace with a ``cluster_name`` dimension; the other
    # panels read ContainerInsights with ``ClusterName``.
    failed_requests = cloudwatch.SingleValueWidget(
        title="EKS Clusters - Failed Requests",
        metrics=[
            cloudwatch.Metric(
                namespace="AWS/EKS",
                metric_name="cluster_failed_request_count",
                dimensions_map={"cluster_name": name},
                statistic="Sum",
                period=Duration.minutes(5),
                region=rgn,
            )
            for name, rgn in clusters
        ],
        width=12,
        height=6,
    )

    # Node CPU utilization, averaged over each cluster's nodes.
    node_cpu = cloudwatch.GraphWidget(
        title="EKS Clusters - Node CPU Utilization (%)",
        left=insights("node_cpu_utilization", "Average"),
        width=12,
        height=6,
    )

    # Node memory utilization, averaged over each cluster's nodes.
    node_memory = cloudwatch.GraphWidget(
        title="EKS Clusters - Node Memory Utilization (%)",
        left=insights("node_memory_utilization", "Average"),
        width=12,
        height=6,
    )

    # Pod capacity (left axis) against running pods (right axis).
    pod_capacity = cloudwatch.GraphWidget(
        title="EKS Clusters - Node Pod Capacity",
        left=insights("node_status_capacity_pods", "Sum", " Capacity"),
        right=insights("node_number_of_running_pods", "Sum", " Running"),
        width=12,
        height=6,
    )

    return [header, failed_requests, node_cpu, node_memory, pod_capacity]

926 

def _create_gpu_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the GPU dashboard section from ContainerInsights node GPU metrics.

    The section header attributes the data to the DCGM Exporter. Each panel
    plots one series per regional cluster, labelled with the region.

    Returns:
        A section header plus utilization, memory, temperature and GPU-count
        graphs.
    """
    section: list[cloudwatch.IWidget] = [
        cloudwatch.TextWidget(
            markdown="# GPU Metrics\nGPU utilization, memory, and temperature from DCGM Exporter",
            width=24,
            height=1,
        )
    ]

    # One (cluster name, region) pair per regional stack.
    clusters = [
        (stack.cluster.cluster_name, stack.deployment_region)
        for stack in self.regional_stacks
    ]

    # (panel title, ContainerInsights metric, statistic), in display order.
    panels = (
        ("GPU Utilization (%)", "node_gpu_utilization", "Average"),
        ("GPU Memory Utilization (%)", "node_gpu_memory_utilization", "Average"),
        ("GPU Temperature (°C)", "node_gpu_temperature", "Maximum"),
        # The "active GPU" count is charted from the node GPU limit.
        ("Active GPU Count", "node_gpu_limit", "Sum"),
    )

    for panel_title, metric_name, statistic in panels:
        per_cluster = [
            cloudwatch.Metric(
                namespace="ContainerInsights",
                metric_name=metric_name,
                dimensions_map={"ClusterName": name},
                statistic=statistic,
                period=Duration.minutes(5),
                label=rgn,
                region=rgn,
            )
            for name, rgn in clusters
        ]
        section.append(
            cloudwatch.GraphWidget(
                title=panel_title,
                left=per_cluster,
                width=12,
                height=6,
            )
        )

    return section

1025 

1026 def _create_fsx_widgets(self) -> list[cloudwatch.IWidget]: 

1027 """Create FSx for Lustre monitoring widgets. 

1028 

1029 Only emits widgets for regions where the FSx file system is actually 

1030 provisioned (``regional_stack.fsx_file_system`` is non-None). The 

1031 dimension ``FileSystemId`` is the CDK-generated CloudFormation ref 

1032 from each regional stack — CDK's ``cross_region_references=True`` 

1033 (enabled on this stack's constructor) plumbs the value across 

1034 regions via SSM + custom resources. 

1035 

1036 Returns an empty list if no region has FSx enabled — the dashboard 

1037 skips the section entirely. 

1038 """ 

1039 # Collect (file_system_id, region) tuples for regions that have FSx on. 

1040 # fsx_file_system is either a CfnFileSystem or None; the local 

1041 # assignment + is-not-None check lets mypy narrow the type so 

1042 # ``.ref`` access typechecks cleanly (a list comprehension with 

1043 # the guard in the ``if`` clause does not narrow the value clause). 

1044 fsx_info: list[tuple[str, str]] = [] 

1045 for regional_stack in self.regional_stacks: 

1046 fsx = getattr(regional_stack, "fsx_file_system", None) 

1047 if fsx is None: 

1048 continue 

1049 fsx_info.append((fsx.ref, regional_stack.deployment_region)) 

1050 if not fsx_info: 

1051 return [] 

1052 

1053 widgets: list[cloudwatch.IWidget] = [] 

1054 

1055 # Section header 

1056 widgets.append( 

1057 cloudwatch.TextWidget( 

1058 markdown=( 

1059 "# FSx for Lustre\n" 

1060 "Parallel file system throughput, IOPS, and free storage " 

1061 "capacity. Each line below is scoped to the exact GCO " 

1062 "file system in its region — so unrelated FSx file " 

1063 "systems in the same account do not appear on the " 

1064 "dashboard." 

1065 ), 

1066 width=24, 

1067 height=1, 

1068 ) 

1069 ) 

1070 

1071 # Throughput: bytes read vs written 

1072 throughput_widget = cloudwatch.GraphWidget( 

1073 title="FSx - Throughput (Bytes/sec)", 

1074 left=[ 

1075 cloudwatch.Metric( 

1076 namespace="AWS/FSx", 

1077 metric_name="DataReadBytes", 

1078 dimensions_map={"FileSystemId": fs_id}, 

1079 statistic="Sum", 

1080 period=Duration.minutes(1), 

1081 label=f"{region} Read", 

1082 region=region, 

1083 ) 

1084 for fs_id, region in fsx_info 

1085 ], 

1086 right=[ 

1087 cloudwatch.Metric( 

1088 namespace="AWS/FSx", 

1089 metric_name="DataWriteBytes", 

1090 dimensions_map={"FileSystemId": fs_id}, 

1091 statistic="Sum", 

1092 period=Duration.minutes(1), 

1093 label=f"{region} Write", 

1094 region=region, 

1095 ) 

1096 for fs_id, region in fsx_info 

1097 ], 

1098 width=12, 

1099 height=6, 

1100 ) 

1101 widgets.append(throughput_widget) 

1102 

1103 # IOPS: read vs write operations 

1104 iops_widget = cloudwatch.GraphWidget( 

1105 title="FSx - IOPS", 

1106 left=[ 

1107 cloudwatch.Metric( 

1108 namespace="AWS/FSx", 

1109 metric_name="DataReadOperations", 

1110 dimensions_map={"FileSystemId": fs_id}, 

1111 statistic="Sum", 

1112 period=Duration.minutes(1), 

1113 label=f"{region} Read", 

1114 region=region, 

1115 ) 

1116 for fs_id, region in fsx_info 

1117 ], 

1118 right=[ 

1119 cloudwatch.Metric( 

1120 namespace="AWS/FSx", 

1121 metric_name="DataWriteOperations", 

1122 dimensions_map={"FileSystemId": fs_id}, 

1123 statistic="Sum", 

1124 period=Duration.minutes(1), 

1125 label=f"{region} Write", 

1126 region=region, 

1127 ) 

1128 for fs_id, region in fsx_info 

1129 ], 

1130 width=12, 

1131 height=6, 

1132 ) 

1133 widgets.append(iops_widget) 

1134 

1135 # Free storage capacity — the classic "running out of space" signal. 

1136 # FreeDataStorageCapacity is emitted in bytes. 

1137 free_storage_widget = cloudwatch.GraphWidget( 

1138 title="FSx - Free Storage Capacity (Bytes)", 

1139 left=[ 

1140 cloudwatch.Metric( 

1141 namespace="AWS/FSx", 

1142 metric_name="FreeDataStorageCapacity", 

1143 dimensions_map={"FileSystemId": fs_id}, 

1144 statistic="Minimum", 

1145 period=Duration.minutes(5), 

1146 label=region, 

1147 region=region, 

1148 ) 

1149 for fs_id, region in fsx_info 

1150 ], 

1151 width=24, 

1152 height=6, 

1153 ) 

1154 widgets.append(free_storage_widget) 

1155 

1156 return widgets 

1157 

1158 def _create_valkey_widgets(self) -> list[cloudwatch.IWidget]: 

1159 """Create Valkey (ElastiCache Serverless) monitoring widgets. 

1160 

1161 Uses explicit ``clusterId`` dimension values (camelCase — the 

1162 ElastiCache Serverless variant; distinct from the node-based 

1163 ``CacheClusterId``). The regional stack names its cache 

1164 deterministically as ``gco-{deployment_region}``, so we reproduce 

1165 that name here and pin each widget to the exact cache in its 

1166 region. No SEARCH expression, so the dashboard ignores every 

1167 unrelated ElastiCache cluster in the account. 

1168 """ 

1169 valkey_enabled = self.config.get_valkey_config().get("enabled", False) 

1170 if not valkey_enabled or not self.regions: 

1171 return [] 

1172 

1173 widgets: list[cloudwatch.IWidget] = [] 

1174 

1175 widgets.append( 

1176 cloudwatch.TextWidget( 

1177 markdown=( 

1178 "# Valkey Serverless Cache\n" 

1179 "ECPU consumption, storage, hit rate, and request " 

1180 "latency — scoped to each region's ``gco-{region}`` " 

1181 "cache exactly (no SEARCH)." 

1182 ), 

1183 width=24, 

1184 height=1, 

1185 ) 

1186 ) 

1187 

1188 # Build (cache_name, region) pairs. cache_name is the literal 

1189 # ``serverless_cache_name`` the regional stack passes to the 

1190 # CfnServerlessCache. 

1191 cache_info = [(f"gco-{region}", region) for region in self.regions] 

1192 

1193 # ECPU consumption and cache size per region 

1194 for cache_name, region in cache_info: 

1195 widgets.append( 

1196 cloudwatch.GraphWidget( 

1197 title=f"Valkey - ECPU & Cache Size ({region})", 

1198 left=[ 

1199 cloudwatch.Metric( 

1200 namespace="AWS/ElastiCache", 

1201 metric_name="ElastiCacheProcessingUnits", 

1202 dimensions_map={"clusterId": cache_name}, 

1203 statistic="Sum", 

1204 period=Duration.minutes(1), 

1205 label="ECPUs", 

1206 region=region, 

1207 ), 

1208 ], 

1209 right=[ 

1210 cloudwatch.Metric( 

1211 namespace="AWS/ElastiCache", 

1212 metric_name="BytesUsedForCache", 

1213 dimensions_map={"clusterId": cache_name}, 

1214 statistic="Average", 

1215 period=Duration.minutes(5), 

1216 label="Bytes", 

1217 region=region, 

1218 ), 

1219 ], 

1220 width=12, 

1221 height=6, 

1222 region=region, 

1223 ) 

1224 ) 

1225 

1226 # Hit rate and p99 read/write latency per region 

1227 for cache_name, region in cache_info: 

1228 widgets.append( 

1229 cloudwatch.GraphWidget( 

1230 title=f"Valkey - Hit Rate & Latency ({region})", 

1231 left=[ 

1232 cloudwatch.Metric( 

1233 namespace="AWS/ElastiCache", 

1234 metric_name="CacheHitRate", 

1235 dimensions_map={"clusterId": cache_name}, 

1236 statistic="Average", 

1237 period=Duration.minutes(5), 

1238 label="Hit Rate %", 

1239 region=region, 

1240 ), 

1241 ], 

1242 right=[ 

1243 cloudwatch.Metric( 

1244 namespace="AWS/ElastiCache", 

1245 metric_name="SuccessfulReadRequestLatency", 

1246 dimensions_map={"clusterId": cache_name}, 

1247 statistic="p99", 

1248 period=Duration.minutes(1), 

1249 label="Read p99 µs", 

1250 region=region, 

1251 ), 

1252 cloudwatch.Metric( 

1253 namespace="AWS/ElastiCache", 

1254 metric_name="SuccessfulWriteRequestLatency", 

1255 dimensions_map={"clusterId": cache_name}, 

1256 statistic="p99", 

1257 period=Duration.minutes(1), 

1258 label="Write p99 µs", 

1259 region=region, 

1260 ), 

1261 ], 

1262 width=12, 

1263 height=6, 

1264 region=region, 

1265 ) 

1266 ) 

1267 

1268 return widgets 

1269 

1270 def _create_aurora_pgvector_widgets(self) -> list[cloudwatch.IWidget]: 

1271 """Create Aurora Serverless v2 (pgvector) monitoring widgets. 

1272 

1273 Pins each widget to the exact Aurora cluster provisioned by the 

1274 regional stack via ``regional_stack.aurora_cluster.cluster_identifier``. 

1275 CDK-generated cluster IDs are CloudFormation tokens; the 

1276 ``cross_region_references=True`` flag on this stack handles 

1277 plumbing them from each regional stack into the monitoring stack 

1278 (us-east-2 by default) through SSM + custom resources. 

1279 

1280 Returns an empty list when every region has Aurora pgvector 

1281 disabled so the dashboard skips the section entirely. 

1282 """ 

1283 # (cluster_identifier, region) pairs for regions with Aurora on. 

1284 # Use a guarded loop (not a comprehension) so mypy can narrow the 

1285 # Optional[DatabaseCluster] to a real cluster before dereferencing. 

1286 aurora_info: list[tuple[str, str]] = [] 

1287 for regional_stack in self.regional_stacks: 

1288 aurora = getattr(regional_stack, "aurora_cluster", None) 

1289 if aurora is None: 

1290 continue 

1291 aurora_info.append((aurora.cluster_identifier, regional_stack.deployment_region)) 

1292 if not aurora_info: 

1293 return [] 

1294 

1295 widgets: list[cloudwatch.IWidget] = [] 

1296 

1297 widgets.append( 

1298 cloudwatch.TextWidget( 

1299 markdown=( 

1300 "# Aurora pgvector (Serverless v2)\n" 

1301 "ACU utilization, database connections, query latency, " 

1302 "and CPU utilization — pinned to each regional GCO " 

1303 "Aurora cluster by ID. ACU utilization is the primary " 

1304 "scale/cost signal for Serverless v2." 

1305 ), 

1306 width=24, 

1307 height=1, 

1308 ) 

1309 ) 

1310 

1311 # ACU utilization and capacity 

1312 for cluster_id, region in aurora_info: 

1313 widgets.append( 

1314 cloudwatch.GraphWidget( 

1315 title=f"Aurora - ACU Utilization & Capacity ({region})", 

1316 left=[ 

1317 cloudwatch.Metric( 

1318 namespace="AWS/RDS", 

1319 metric_name="ACUUtilization", 

1320 dimensions_map={"DBClusterIdentifier": cluster_id}, 

1321 statistic="Average", 

1322 period=Duration.minutes(1), 

1323 label="ACU %", 

1324 region=region, 

1325 ), 

1326 ], 

1327 right=[ 

1328 cloudwatch.Metric( 

1329 namespace="AWS/RDS", 

1330 metric_name="ServerlessDatabaseCapacity", 

1331 dimensions_map={"DBClusterIdentifier": cluster_id}, 

1332 statistic="Average", 

1333 period=Duration.minutes(1), 

1334 label="ACUs", 

1335 region=region, 

1336 ), 

1337 ], 

1338 width=12, 

1339 height=6, 

1340 region=region, 

1341 ) 

1342 ) 

1343 

1344 # Database connections and CPU utilization 

1345 for cluster_id, region in aurora_info: 

1346 widgets.append( 

1347 cloudwatch.GraphWidget( 

1348 title=f"Aurora - Connections & CPU ({region})", 

1349 left=[ 

1350 cloudwatch.Metric( 

1351 namespace="AWS/RDS", 

1352 metric_name="DatabaseConnections", 

1353 dimensions_map={"DBClusterIdentifier": cluster_id}, 

1354 statistic="Average", 

1355 period=Duration.minutes(1), 

1356 label="Connections", 

1357 region=region, 

1358 ), 

1359 ], 

1360 right=[ 

1361 cloudwatch.Metric( 

1362 namespace="AWS/RDS", 

1363 metric_name="CPUUtilization", 

1364 dimensions_map={"DBClusterIdentifier": cluster_id}, 

1365 statistic="Average", 

1366 period=Duration.minutes(1), 

1367 label="CPU %", 

1368 region=region, 

1369 ), 

1370 ], 

1371 width=12, 

1372 height=6, 

1373 region=region, 

1374 ) 

1375 ) 

1376 

1377 # Read and write latency p99 

1378 for cluster_id, region in aurora_info: 

1379 widgets.append( 

1380 cloudwatch.GraphWidget( 

1381 title=f"Aurora - Query Latency p99 ({region})", 

1382 left=[ 

1383 cloudwatch.Metric( 

1384 namespace="AWS/RDS", 

1385 metric_name="ReadLatency", 

1386 dimensions_map={"DBClusterIdentifier": cluster_id}, 

1387 statistic="p99", 

1388 period=Duration.minutes(1), 

1389 label="Read p99", 

1390 region=region, 

1391 ), 

1392 ], 

1393 right=[ 

1394 cloudwatch.Metric( 

1395 namespace="AWS/RDS", 

1396 metric_name="WriteLatency", 

1397 dimensions_map={"DBClusterIdentifier": cluster_id}, 

1398 statistic="p99", 

1399 period=Duration.minutes(1), 

1400 label="Write p99", 

1401 region=region, 

1402 ), 

1403 ], 

1404 width=24, 

1405 height=6, 

1406 region=region, 

1407 ) 

1408 ) 

1409 

1410 return widgets 

1411 

def _create_alb_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the ALB dashboard section, scoped to the GCO platform ALB.

    The platform ALB is created at runtime by the AWS Load Balancer
    Controller from an Ingress resource, not by CDK, so its exact name is
    unknown at synth time. An earlier approach read the ARN from the
    regional stack's ``GaRegistration`` custom resource through
    ``cross_region_references=True``, but that raced the custom-resource
    response pipeline: CDK's cross-region ``ExportsWriter`` evaluated
    ``Fn::GetAtt: [GaRegistration, AlbArn]`` before CloudFormation had
    stored the updated response data and failed with "Vendor response
    doesn't contain AlbArn attribute".

    The graphs therefore use a SEARCH expression with a composite-token
    filter. The controller names the platform ALB ``k8s-gco-<hash>`` (the
    namespace is shortened to fit the controller's 32-char name limit) and
    CloudWatch's ``LoadBalancer`` dimension is the ARN suffix
    ``app/<name>/<hash>``. The unquoted filter ``LoadBalancer=app/k8s-gco-``
    matches when the tokens ``app``, ``k``, ``8``, ``s``, ``gco`` appear
    consecutively in the dimension value. A double-quoted filter would be
    an exact match and return nothing, because no ALB's dimension value is
    literally ``app/k8s-gco-``.

    Returns:
        A header followed by, for each panel type in turn (request count,
        response time, HTTP errors, active connections), one graph per
        region in ``self.regions``.
    """

    def search(metric_name: str, statistic: str, label: str) -> cloudwatch.MathExpression:
        """Return the platform-ALB SEARCH expression for one metric/statistic."""
        return cloudwatch.MathExpression(
            expression=(
                "SEARCH('{AWS/ApplicationELB,LoadBalancer} "
                f'MetricName="{metric_name}" '
                f'LoadBalancer=app/k8s-gco-\', "{statistic}", 300)'
            ),
            label=label,
            period=Duration.minutes(5),
        )

    widgets: list[cloudwatch.IWidget] = [
        cloudwatch.TextWidget(
            markdown=(
                "# Application Load Balancers\n"
                "Request metrics, response time, HTTP errors, and "
                "connection counts — scoped via SEARCH composite-token "
                "match to ALBs named ``app/k8s-gco-*`` so only the GCO "
                "platform ALB in each region appears. Inference ALBs "
                "(named per endpoint) and unrelated ALBs in the "
                "account are excluded."
            ),
            width=24,
            height=1,
        )
    ]

    # (title prefix, left-axis series, right-axis series); each series is
    # (metric name, statistic, label).
    panels = (
        (
            "ALB - Request Count",
            (("RequestCount", "Sum", "Request Count"),),
            (),
        ),
        (
            "ALB - Response Time",
            (
                ("TargetResponseTime", "Average", "Avg Response Time"),
                ("TargetResponseTime", "p99", "p99 Response Time"),
            ),
            (),
        ),
        (
            "ALB - HTTP Errors",
            (("HTTPCode_Target_4XX_Count", "Sum", "4XX Errors"),),
            (("HTTPCode_Target_5XX_Count", "Sum", "5XX Errors"),),
        ),
        (
            "ALB - Active Connections",
            (("ActiveConnectionCount", "Sum", "Active Connections"),),
            (),
        ),
    )
    for heading, left_series, right_series in panels:
        for region in self.regions:
            widgets.append(
                cloudwatch.GraphWidget(
                    title=f"{heading} ({region})",
                    left=[search(*series) for series in left_series],
                    # ``None`` (not an empty list) when the panel has no
                    # right axis, matching a widget built without ``right``.
                    right=[search(*series) for series in right_series] or None,
                    width=12,
                    height=6,
                    # The widget's region decides where the SEARCH runs.
                    region=region,
                )
            )

    return widgets

1564 

def _create_application_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the custom application metrics dashboard section.

    Returns:
        A header, the health-monitor utilization graph, the manifest
        processor graph, the Container Insights pod-restart graph, and
        finally either the secret-rotation Lambda graph or, when
        ``self.api_gateway_stack`` is falsy, a text placeholder.
    """
    widgets: list[cloudwatch.IWidget] = [
        cloudwatch.TextWidget(
            markdown="# Application Metrics\n"
            "Health monitor and manifest processor metrics. "
            "Application logs are available in Container Insights at "
            "`/aws/containerinsights/<cluster>/application`.",
            width=24,
            height=1,
        )
    ]

    # (cluster_name, region) for every regional stack.
    clusters = [
        (stack.cluster.cluster_name, stack.deployment_region)
        for stack in self.regional_stacks
    ]

    def gco_series(
        namespace: str,
        metric_name: str,
        statistic: str,
        suffix: str,
        color: str | None = None,
    ) -> list[cloudwatch.IMetric]:
        """Return one custom GCO metric per cluster, keyed by ClusterName + Region."""
        return [
            cloudwatch.Metric(
                namespace=namespace,
                metric_name=metric_name,
                dimensions_map={
                    "ClusterName": cluster_name,
                    "Region": region,
                },
                statistic=statistic,
                period=Duration.minutes(5),
                label=f"{region} {suffix}",
                color=color,
                region=region,
            )
            for cluster_name, region in clusters
        ]

    # Health monitor: CPU on the left axis, memory on the right.
    widgets.append(
        cloudwatch.GraphWidget(
            title="Health Monitor - Resource Utilization",
            left=gco_series("GCO/HealthMonitor", "ClusterCpuUtilization", "Average", "CPU"),
            right=gco_series(
                "GCO/HealthMonitor", "ClusterMemoryUtilization", "Average", "Memory"
            ),
            width=12,
            height=6,
        )
    )

    # Manifest processor: submissions on the left, failures (red) on the right.
    widgets.append(
        cloudwatch.GraphWidget(
            title="Manifest Processor - Submissions",
            left=gco_series(
                "GCO/ManifestProcessor", "ManifestSubmissions", "Sum", "Submissions"
            ),
            right=gco_series(
                "GCO/ManifestProcessor",
                "ManifestFailures",
                "Sum",
                "Failures",
                color="#d62728",
            ),
            width=12,
            height=6,
        )
    )

    # Container Insights pod restarts (indicates application issues).
    widgets.append(
        cloudwatch.GraphWidget(
            title="Container Insights - Pod Restarts",
            left=[
                cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="pod_number_of_container_restarts",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Sum",
                    period=Duration.minutes(5),
                    label=f"{region}",
                    region=region,
                )
                for cluster_name, region in clusters
            ],
            width=12,
            height=6,
        )
    )

    gateway = self.api_gateway_stack
    if not gateway:
        # No API Gateway stack: show a placeholder of the same size.
        widgets.append(
            cloudwatch.TextWidget(
                markdown="**Secret Rotation:** API Gateway stack not configured. "
                "Rotation Lambda metrics unavailable.",
                width=12,
                height=6,
            )
        )
        return widgets

    # Secrets Manager doesn't publish rotation metrics, so the rotation
    # Lambda function is monitored instead.
    rotation_function_name = gateway.rotation_lambda.function_name
    api_gw_region = self.config.get_api_gateway_region()

    def rotation_metric(metric_name: str, color: str) -> cloudwatch.Metric:
        """Return an hourly Sum of a rotation-Lambda metric; label equals the name."""
        return cloudwatch.Metric(
            namespace="AWS/Lambda",
            metric_name=metric_name,
            dimensions_map={"FunctionName": rotation_function_name},
            statistic="Sum",
            period=Duration.hours(1),
            label=metric_name,
            color=color,
            region=api_gw_region,
        )

    widgets.append(
        cloudwatch.GraphWidget(
            title="Secret Rotation Lambda - Invocations & Errors",
            left=[rotation_metric("Invocations", "#2ca02c")],
            right=[rotation_metric("Errors", "#d62728")],
            width=12,
            height=6,
        )
    )
    return widgets

1731 

1732 def _create_alarms(self) -> None: 

1733 """Create CloudWatch alarms""" 

1734 self._create_global_accelerator_alarms() 

1735 self._create_api_gateway_alarms() 

1736 self._create_lambda_alarms() 

1737 self._create_sqs_alarms() 

1738 self._create_dynamodb_alarms() 

1739 self._create_eks_alarms() 

1740 self._create_alb_alarms() 

1741 self._create_application_alarms() 

1742 

1743 def _create_global_accelerator_alarms(self) -> None: 

1744 """Create Global Accelerator alarms. 

1745 

1746 Note: Global Accelerator metrics are only available in us-west-2. 

1747 CloudWatch Alarms must be in the same region as the metrics they monitor. 

1748 Since this monitoring stack may be deployed in a different region, 

1749 we skip GA alarms here. To monitor GA, either: 

1750 1. Create alarms manually in us-west-2 

1751 2. Use CloudWatch cross-region dashboard widgets (which we do) 

1752 3. Deploy a separate alarm stack in us-west-2 

1753 """ 

1754 # GA alarms skipped - metrics only available in us-west-2 

1755 # Dashboard widgets use region parameter to display GA metrics correctly 

1756 pass 

1757 

def _create_api_gateway_alarms(self) -> None:
    """Create the API Gateway 5XX-error and p99-latency alarms.

    Both alarms watch five-minute AWS/ApiGateway metrics for the REST API
    named by ``self.api_gateway_stack`` (falling back to
    ``"gco-global-api"`` when no API Gateway stack is set), treat missing
    data as not breaching, and notify ``self.alert_topic``.
    """
    if self.api_gateway_stack:
        api_name = self.api_gateway_stack.api.rest_api_name
    else:
        api_name = "gco-global-api"

    def add_alarm(
        construct_id: str,
        description: str,
        metric_name: str,
        statistic: str,
        threshold: int,
        evaluation_periods: int,
    ) -> None:
        """Create one greater-than-threshold alarm needing 2 breaching datapoints."""
        alarm = cloudwatch.Alarm(
            self,
            construct_id,
            alarm_description=description,
            metric=cloudwatch.Metric(
                namespace="AWS/ApiGateway",
                metric_name=metric_name,
                dimensions_map={"ApiName": api_name},
                statistic=statistic,
                period=Duration.minutes(5),
            ),
            threshold=threshold,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=evaluation_periods,
            datapoints_to_alarm=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

    # More than 10 5XX responses in each of two consecutive periods.
    add_alarm(
        "ApiGateway5xxAlarm",
        "API Gateway has high 5XX error rate",
        "5XXError",
        "Sum",
        10,
        2,
    )

    # p99 latency above 10000 (10 seconds) in 2 of the last 3 periods.
    add_alarm(
        "ApiGatewayHighLatencyAlarm",
        "API Gateway has high latency",
        "Latency",
        "p99",
        10000,
        3,
    )

1804 

1805 def _create_lambda_alarms(self) -> None: 

1806 """Create Lambda function alarms""" 

1807 # Get Lambda function names from api_gateway_stack if available 

1808 if self.api_gateway_stack: 1808 ↛ exitline 1808 didn't return from function '_create_lambda_alarms' because the condition on line 1808 was always true

1809 proxy_function_name = self.api_gateway_stack.proxy_lambda.function_name 

1810 rotation_function_name = self.api_gateway_stack.rotation_lambda.function_name 

1811 

1812 # API Gateway Proxy Lambda errors 

1813 proxy_errors_alarm = cloudwatch.Alarm( 

1814 self, 

1815 "ProxyLambdaErrorsAlarm", 

1816 alarm_description="API Gateway proxy Lambda has errors", 

1817 metric=cloudwatch.Metric( 

1818 namespace="AWS/Lambda", 

1819 metric_name="Errors", 

1820 dimensions_map={"FunctionName": proxy_function_name}, 

1821 statistic="Sum", 

1822 period=Duration.minutes(5), 

1823 ), 

1824 threshold=5, 

1825 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, 

1826 evaluation_periods=2, 

1827 datapoints_to_alarm=2, 

1828 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING, 

1829 ) 

1830 proxy_errors_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic)) 

1831 

1832 # Proxy Lambda throttles 

1833 proxy_throttles_alarm = cloudwatch.Alarm( 

1834 self, 

1835 "ProxyLambdaThrottlesAlarm", 

1836 alarm_description="API Gateway proxy Lambda is being throttled", 

1837 metric=cloudwatch.Metric( 

1838 namespace="AWS/Lambda", 

1839 metric_name="Throttles", 

1840 dimensions_map={"FunctionName": proxy_function_name}, 

1841 statistic="Sum", 

1842 period=Duration.minutes(5), 

1843 ), 

1844 threshold=1, 

1845 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD, 

1846 evaluation_periods=2, 

1847 datapoints_to_alarm=2, 

1848 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING, 

1849 ) 

1850 proxy_throttles_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic)) 

1851 

1852 # Secret rotation Lambda errors 

1853 rotation_errors_alarm = cloudwatch.Alarm( 

1854 self, 

1855 "RotationLambdaErrorsAlarm", 

1856 alarm_description="Secret rotation Lambda has errors", 

1857 metric=cloudwatch.Metric( 

1858 namespace="AWS/Lambda", 

1859 metric_name="Errors", 

1860 dimensions_map={"FunctionName": rotation_function_name}, 

1861 statistic="Sum", 

1862 period=Duration.hours(1), 

1863 ), 

1864 threshold=1, 

1865 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD, 

1866 evaluation_periods=1, 

1867 datapoints_to_alarm=1, 

1868 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING, 

1869 ) 

1870 rotation_errors_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic)) 

1871 

1872 def _create_sqs_alarms(self) -> None: 

1873 """Create SQS queue alarms""" 

1874 for regional_stack in self.regional_stacks: 

1875 region = regional_stack.deployment_region 

1876 queue_name = regional_stack.job_queue.queue_name 

1877 dlq_name = regional_stack.job_dlq.queue_name 

1878 region_id = region.replace("-", "").title() 

1879 

1880 # Old message alarm (stuck jobs) 

1881 old_message_alarm = cloudwatch.Alarm( 

1882 self, 

1883 f"SqsOldMessageAlarm{region_id}", 

1884 alarm_description=f"SQS queue in {region} has old messages (potential stuck jobs)", 

1885 metric=cloudwatch.Metric( 

1886 namespace="AWS/SQS", 

1887 metric_name="ApproximateAgeOfOldestMessage", 

1888 dimensions_map={"QueueName": queue_name}, 

1889 statistic="Maximum", 

1890 period=Duration.minutes(5), 

1891 ), 

1892 threshold=3600, # 1 hour 

1893 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, 

1894 evaluation_periods=2, 

1895 datapoints_to_alarm=2, 

1896 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING, 

1897 ) 

1898 old_message_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic)) 

1899 

1900 # Dead letter queue alarm 

1901 dlq_alarm = cloudwatch.Alarm( 

1902 self, 

1903 f"SqsDlqAlarm{region_id}", 

1904 alarm_description=f"SQS dead letter queue in {region} has messages", 

1905 metric=cloudwatch.Metric( 

1906 namespace="AWS/SQS", 

1907 metric_name="ApproximateNumberOfMessagesVisible", 

1908 dimensions_map={"QueueName": dlq_name}, 

1909 statistic="Sum", 

1910 period=Duration.minutes(5), 

1911 ), 

1912 threshold=1, 

1913 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD, 

1914 evaluation_periods=1, 

1915 datapoints_to_alarm=1, 

1916 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING, 

1917 ) 

1918 dlq_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic)) 

1919 

def _create_dynamodb_alarms(self) -> None:
    """Create throttling and system-error alarms for the DynamoDB jobs table.

    Only ``self.global_stack.jobs_table`` is covered; no alarms are created
    for any other table here. Metrics are read from the global region
    (``self.config.get_global_region()``), five-minute ``Sum``; both alarms
    fire at one or more events, treat missing data as not breaching, and
    notify ``self.alert_topic``.
    """
    jobs_table = self.global_stack.jobs_table.table_name

    # DynamoDB tables are in the global region.
    global_region = self.config.get_global_region()

    # (construct id, description, metric name, periods). ``periods`` is
    # both the evaluation window and the datapoints required to alarm:
    # throttling must persist for two periods, a system error fires at once.
    specs = (
        (
            "DynamoDBJobsThrottleAlarm",
            "DynamoDB jobs table is being throttled",
            "ThrottledRequests",
            2,
        ),
        (
            "DynamoDBJobsErrorsAlarm",
            "DynamoDB jobs table has system errors",
            "SystemErrors",
            1,
        ),
    )
    for construct_id, description, metric_name, periods in specs:
        alarm = cloudwatch.Alarm(
            self,
            construct_id,
            alarm_description=description,
            metric=cloudwatch.Metric(
                namespace="AWS/DynamoDB",
                metric_name=metric_name,
                dimensions_map={"TableName": jobs_table},
                statistic="Sum",
                period=Duration.minutes(5),
                region=global_region,
            ),
            threshold=1,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=periods,
            datapoints_to_alarm=periods,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

1969 

def _create_eks_alarms(self) -> None:
    """Create node-level CPU and memory alarms for every regional EKS cluster.

    For each entry in ``self.regional_stacks`` two alarms are created on
    ``ContainerInsights`` metrics (5-minute ``Average``, dimensioned by
    ``ClusterName``):

    - ``node_cpu_utilization`` above 80
    - ``node_memory_utilization`` above 85

    Both alarms fire when 2 of the last 3 datapoints breach, treat missing
    data as not breaching, and notify ``self.alert_topic``.

    NOTE(review): the metrics carry no explicit ``region``, so they
    presumably resolve in this stack's own region - confirm that is where
    each cluster's Container Insights data is expected to be read from.
    """
    for stack in self.regional_stacks:
        deployment_region = stack.deployment_region
        eks_cluster = stack.cluster.cluster_name
        # e.g. "us-east-1" -> "Useast1"; keeps construct IDs unique per region.
        suffix = deployment_region.replace("-", "").title()

        # (construct id, description, Container Insights metric, threshold)
        alarm_specs = (
            (
                f"EksHighCpuAlarm{suffix}",
                f"EKS cluster {eks_cluster} has high CPU utilization",
                "node_cpu_utilization",
                80,
            ),
            (
                f"EksHighMemoryAlarm{suffix}",
                f"EKS cluster {eks_cluster} has high memory utilization",
                "node_memory_utilization",
                85,
            ),
        )

        for construct_id, description, metric_name, limit in alarm_specs:
            node_metric = cloudwatch.Metric(
                namespace="ContainerInsights",
                metric_name=metric_name,
                dimensions_map={"ClusterName": eks_cluster},
                statistic="Average",
                period=Duration.minutes(5),
            )
            utilization_alarm = cloudwatch.Alarm(
                self,
                construct_id,
                alarm_description=description,
                metric=node_metric,
                threshold=limit,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=3,
                datapoints_to_alarm=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            utilization_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

2016 

2017 def _create_alb_alarms(self) -> None: 

2018 """Create ALB alarms. 

2019 

2020 Status: no alarms created yet, even though we now have the ALB 

2021 ARN at deploy time via the GA registration custom resource (which 

2022 also feeds the dashboard widgets). Adding per-ALB alarms here is 

2023 a straightforward enhancement — derive the ``LoadBalancer`` 

2024 dimension the same way ``_create_alb_widgets`` does 

2025 (``Fn.split(":loadbalancer/", alb_arn)[1]``) and wire it into 

2026 ``cloudwatch.Alarm`` constructs. 

2027 

2028 For now we rely on: 

2029 1. Dashboard widgets pinned to each platform ALB (see 

2030 ``_create_alb_widgets``) 

2031 2. EKS Container Insights alarms for pod/node health 

2032 3. API Gateway alarms for request-level monitoring 

2033 """ 

2034 # TODO: Add UnHealthyHostCount / 5XXCount alarms using the ARN 

2035 # returned by regional_stack.ga_registration.get_att_string("AlbArn"). 

2036 # The test suite explicitly documents that the ALB alarm count is 

2037 # currently zero (test_alb_unhealthy_hosts_alarm_skipped); update 

2038 # that test when adding real alarms. 

2039 pass 

2040 

def _create_application_alarms(self) -> None:
    """Create one manifest-failure alarm per regional stack.

    The alarm watches the 5-minute ``Sum`` of ``ManifestFailures`` in the
    ``GCO/ManifestProcessor`` namespace (dimensions ``ClusterName`` and
    ``Region``). It fires when the sum exceeds 10 in two consecutive
    periods, treats missing data as not breaching, and notifies
    ``self.alert_topic``.
    """
    for stack in self.regional_stacks:
        deployment_region = stack.deployment_region
        eks_cluster = stack.cluster.cluster_name
        # e.g. "us-east-1" -> "Useast1"; keeps construct IDs unique per region.
        suffix = deployment_region.replace("-", "").title()

        failures_metric = cloudwatch.Metric(
            namespace="GCO/ManifestProcessor",
            metric_name="ManifestFailures",
            dimensions_map={"ClusterName": eks_cluster, "Region": deployment_region},
            statistic="Sum",
            period=Duration.minutes(5),
        )

        failures_alarm = cloudwatch.Alarm(
            self,
            f"ManifestHighFailureRateAlarm{suffix}",
            alarm_description=f"Manifest processor in {deployment_region} has high failure rate",
            metric=failures_metric,
            threshold=10,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=2,
            datapoints_to_alarm=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        failures_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

2067 

def _create_composite_alarms(self) -> None:
    """Create composite alarms that only fire when several signals agree.

    Two kinds of composite alarm are built:

    - Per region: ``RegionalCriticalAlarm<Region>`` fires when both the
      node CPU and node memory member alarms (each above 90, 5-minute
      ``Average`` from ``ContainerInsights``) are in alarm.
    - Global, only when ``self.api_gateway_stack`` is set:
      ``ApiLambdaCompositeAlarm`` fires when API Gateway ``5XXError``
      (sum above 5) and proxy Lambda ``Errors`` (sum above 3) are both
      in alarm.

    The member alarms get no actions in this method; only the composite
    alarms notify ``self.alert_topic``.
    """

    def member_alarm(
        construct_id: str,
        namespace: str,
        metric_name: str,
        dimensions: dict,
        statistic: str,
        threshold: int,
    ):
        # Every member alarm shares the same shape: 5-minute period,
        # strictly-greater comparison, 2 evaluation periods, and missing
        # data treated as healthy.
        return cloudwatch.Alarm(
            self,
            construct_id,
            metric=cloudwatch.Metric(
                namespace=namespace,
                metric_name=metric_name,
                dimensions_map=dimensions,
                statistic=statistic,
                period=Duration.minutes(5),
            ),
            threshold=threshold,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )

    # Phase 1: build the member alarms for every region, keyed by region.
    members_by_region: dict[str, list[cloudwatch.Alarm]] = {}
    for stack in self.regional_stacks:
        deployment_region = stack.deployment_region
        eks_cluster = stack.cluster.cluster_name
        suffix = deployment_region.replace("-", "").title()

        cpu_member = member_alarm(
            f"CompositeEksCpu{suffix}",
            "ContainerInsights",
            "node_cpu_utilization",
            {"ClusterName": eks_cluster},
            "Average",
            90,
        )
        memory_member = member_alarm(
            f"CompositeEksMemory{suffix}",
            "ContainerInsights",
            "node_memory_utilization",
            {"ClusterName": eks_cluster},
            "Average",
            90,
        )
        members_by_region[deployment_region] = [cpu_member, memory_member]

    # Phase 2: one composite per region, requiring all members to be in alarm.
    for deployment_region, members in members_by_region.items():
        suffix = deployment_region.replace("-", "").title()
        if len(members) < 2:
            # A composite over fewer than two alarms adds no signal.
            continue
        regional_composite = cloudwatch.CompositeAlarm(
            self,
            f"RegionalCriticalAlarm{suffix}",
            alarm_description=f"Critical: Multiple issues detected in {deployment_region}",
            alarm_rule=cloudwatch.AlarmRule.all_of(*members),
        )
        regional_composite.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

    # The API Gateway + Lambda composite needs the API Gateway stack.
    if not self.api_gateway_stack:
        return

    rest_api_name = self.api_gateway_stack.api.rest_api_name
    proxy_name = self.api_gateway_stack.proxy_lambda.function_name

    api_member = member_alarm(
        "CompositeApiErrors",
        "AWS/ApiGateway",
        "5XXError",
        {"ApiName": rest_api_name},
        "Sum",
        5,
    )
    lambda_member = member_alarm(
        "CompositeLambdaErrors",
        "AWS/Lambda",
        "Errors",
        {"FunctionName": proxy_name},
        "Sum",
        3,
    )

    api_composite = cloudwatch.CompositeAlarm(
        self,
        "ApiLambdaCompositeAlarm",
        alarm_description="Critical: Both API Gateway and Lambda proxy have errors",
        alarm_rule=cloudwatch.AlarmRule.all_of(api_member, lambda_member),
    )
    api_composite.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

2172 

def _create_custom_metrics(self) -> None:
    """Create the per-region application log groups.

    For each regional stack this creates a health-monitor log group and a
    manifest-processor log group, both with one-month retention and a
    ``DESTROY`` removal policy. No metric filters are created here.
    """
    for stack in self.regional_stacks:
        # e.g. "us-east-1" -> "Useast1"; keeps construct IDs unique per region.
        suffix = stack.deployment_region.replace("-", "").title()

        log_group_ids = (
            f"HealthMonitorLogGroup{suffix}",
            f"ManifestProcessorLogGroup{suffix}",
        )
        for construct_id in log_group_ids:
            # log_group_name is intentionally omitted so CDK generates a
            # unique name for each group.
            logs.LogGroup(
                self,
                construct_id,
                retention=logs.RetentionDays.ONE_MONTH,
                removal_policy=RemovalPolicy.DESTROY,
            )

2196 

def _create_outputs(self) -> None:
    """Publish the stack's CloudFormation outputs.

    Outputs, in creation order:

    - ``DashboardUrl``: console link built from ``self.region`` and the
      dashboard name
    - ``AlertTopicArn``: ARN of ``self.alert_topic``
    - ``AlarmCount``: a static pointer to the CloudWatch Alarms console
      (not an actual count)
    """
    dashboard_url = (
        f"https://console.aws.amazon.com/cloudwatch/home?region={self.region}"
        f"#dashboards:name={self.dashboard.dashboard_name}"
    )

    # (output id, value, description)
    output_specs = (
        ("DashboardUrl", dashboard_url, "CloudWatch Dashboard URL"),
        ("AlertTopicArn", self.alert_topic.topic_arn, "SNS Topic ARN for monitoring alerts"),
        (
            "AlarmCount",
            "See CloudWatch Alarms console for full list",
            "Monitoring alarms created",
        ),
    )
    for output_id, output_value, output_description in output_specs:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=output_description,
        )