Coverage for gco/stacks/monitoring_stack.py: 98%
314 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
1"""
2Monitoring stack for GCO (Global Capacity Orchestrator on AWS) - Cross-region monitoring and observability.
4This stack creates centralized monitoring resources for all GCO deployments:
5- CloudWatch Dashboard with comprehensive widgets for all regions
6- SNS topic for alerting
7- CloudWatch Alarms for critical metrics
8- Log groups for application logs
9- Anomaly detection for traffic patterns
10- Composite alarms for better signal-to-noise
12Dashboard Sections:
13- Global Accelerator: Flow counts, processed bytes
14- API Gateway: Request counts, latency, error rates
15- Lambda Functions: Invocations, errors, duration, throttles
16- SQS Queues: Message counts, age, dead letter queue depth
17- DynamoDB Tables: Capacity, latency, throttles, errors
18- EKS Clusters: CPU/memory utilization per region
19- FSx for Lustre (when enabled): Throughput, IOPS, free storage
20- Valkey Serverless (when enabled): ECPU, hit rate, latency, bytes used
21- Aurora pgvector (when enabled): ACU utilization, connections, latency, CPU
22- ALBs: Request counts, response times, healthy hosts
23- Applications: Custom metrics from health monitor and manifest processor
25Cross-Region Metrics:
26 CloudWatch metrics are region-specific. This stack handles cross-region
27 monitoring by specifying the `region` parameter on metrics:
28 - Global Accelerator metrics: Always in us-west-2
29 - DynamoDB metrics: In the global region (where tables are deployed)
30 - Regional metrics: In each cluster's region
32Alarms:
33- High CPU/memory utilization on EKS clusters
34- Unhealthy hosts in ALB target groups
35- High response times
36- Manifest processing failures
37- Lambda errors and throttles
38- SQS message age (stuck jobs)
39- DynamoDB throttling and system errors
40- API Gateway 5XX errors
41- Secret rotation failures
42"""
44from typing import TYPE_CHECKING, Any
46from aws_cdk import (
47 CfnOutput,
48 Duration,
49 RemovalPolicy,
50 Stack,
51)
52from aws_cdk import aws_cloudwatch as cloudwatch
53from aws_cdk import aws_cloudwatch_actions as cw_actions
54from aws_cdk import aws_logs as logs
55from aws_cdk import aws_sns as sns
56from constructs import Construct
58from gco.config.config_loader import ConfigLoader
60# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit
61# Flowchart(s) generated from this file:
62# * ``GCOMonitoringStack.__init__`` -> ``diagrams/code_diagrams/gco/stacks/monitoring_stack.GCOMonitoringStack___init__.html``
63# (PNG: ``diagrams/code_diagrams/gco/stacks/monitoring_stack.GCOMonitoringStack___init__.png``)
64# Regenerate with ``python diagrams/code_diagrams/generate.py``.
65# <pyflowchart-code-diagram> END
68if TYPE_CHECKING:
69 from gco.stacks.api_gateway_global_stack import GCOApiGatewayGlobalStack
70 from gco.stacks.global_stack import GCOGlobalStack
71 from gco.stacks.regional_stack import GCORegionalStack
74class GCOMonitoringStack(Stack):
75 """
76 Cross-region monitoring and observability stack.
78 Creates a centralized CloudWatch dashboard and alarms that aggregate
79 metrics from all regional deployments.
81 Attributes:
82 alert_topic: SNS topic for alarm notifications
83 dashboard: CloudWatch dashboard with all monitoring widgets
84 """
def __init__(
    self,
    scope: Construct,
    construct_id: str,
    config: ConfigLoader,
    global_stack: GCOGlobalStack,
    regional_stacks: list[GCORegionalStack],
    api_gateway_stack: GCOApiGatewayGlobalStack | None = None,
    **kwargs: Any,
) -> None:
    """Assemble the alert topic, dashboard, alarms, metrics and outputs.

    Args:
        scope: Parent construct of this stack.
        construct_id: Logical identifier of this stack within ``scope``.
        config: Loaded GCO configuration (project name, regions, ...).
        global_stack: Global stack whose accelerator and DynamoDB tables
            are charted on the dashboard.
        regional_stacks: One stack per deployed region; their queues,
            clusters, Lambda functions and optional FSx file systems
            are charted on the dashboard.
        api_gateway_stack: Optional API Gateway stack. When present its
            REST API and Lambda functions are charted as well.
        **kwargs: Forwarded to ``Stack.__init__``.
            ``cross_region_references`` defaults to ``True`` unless the
            caller passes it explicitly.
    """
    # Dashboard dimensions reference deploy-time values owned by stacks in
    # other regions (notably the auto-generated FSx file system IDs), so
    # CDK's native cross-region reference support has to be switched on.
    # CDK wires this up with a Lambda-backed custom resource in each
    # source stack that writes the value to an SSM parameter in the target
    # region, plus a reader custom resource in this stack. The Lambdas run
    # once per deploy, so the cost is negligible. This is the documented
    # remedy for ``CrossRegionReferencesNotEnabled`` errors.
    # ``setdefault`` keeps an explicit caller-provided value intact.
    kwargs.setdefault("cross_region_references", True)
    super().__init__(scope, construct_id, **kwargs)

    # Collaborating stacks and configuration used by the builder methods.
    self.config = config
    self.global_stack = global_stack
    self.regional_stacks = regional_stacks
    self.api_gateway_stack = api_gateway_stack
    self.project_name = config.get_project_name()
    self.regions = config.get_regions()

    # Notification target for alarms.
    self.alert_topic = self._create_alert_topic()

    # Central dashboard covering every section.
    self.dashboard = self._create_dashboard()

    # Individual alarms first, then the composite alarms.
    self._create_alarms()
    self._create_composite_alarms()

    # Custom metrics.
    self._create_custom_metrics()

    # Stack outputs exposing the monitoring resources.
    self._create_outputs()

    # cdk-nag suppressions are applied last, after all resources exist.
    self._apply_nag_suppressions()
def _apply_nag_suppressions(self) -> None:
    """Register the cdk-nag suppressions for the monitoring stack type."""
    # Function-scope import, as in the original; presumably this avoids an
    # import cycle between the stack modules — confirm before hoisting.
    from gco.stacks.nag_suppressions import apply_all_suppressions

    cfg = self.config
    apply_all_suppressions(
        self,
        stack_type="monitoring",
        regions=cfg.get_regions(),
        global_region=cfg.get_global_region(),
    )
def _create_alert_topic(self) -> sns.Topic:
    """Create and return the SNS topic that receives monitoring alerts.

    The topic is created with ``enforce_ssl=True``.
    """
    return sns.Topic(
        self,
        "GCOAlertTopic",
        display_name="GCO (Global Capacity Orchestrator on AWS) Monitoring Alerts",
        enforce_ssl=True,
    )
def _create_dashboard(self) -> cloudwatch.Dashboard:
    """Create the CloudWatch dashboard and populate it section by section.

    Each section builder returns a list of widgets; the lists are added in
    the order given below, which is the top-to-bottom order of the
    sections on the dashboard. A builder may return an empty list, in
    which case nothing is added for that section.
    """
    board = cloudwatch.Dashboard(
        self,
        "GCODashboard",
        period_override=cloudwatch.PeriodOverride.AUTO,
    )

    # Order matters: it is the visual order of the dashboard sections.
    section_builders = (
        self._create_global_accelerator_widgets,
        self._create_api_gateway_widgets,
        self._create_lambda_widgets,
        self._create_sqs_widgets,
        self._create_dynamodb_widgets,
        self._create_eks_widgets,
        self._create_gpu_widgets,
        self._create_fsx_widgets,
        self._create_valkey_widgets,
        self._create_aurora_pgvector_widgets,
        self._create_alb_widgets,
        self._create_application_widgets,
    )
    for build_section in section_builders:
        board.add_widgets(*build_section())

    return board
def _create_global_accelerator_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the Global Accelerator dashboard section.

    Global Accelerator metrics are only available in us-west-2, regardless
    of where the accelerator endpoints are located, so every metric and
    widget here is pinned to that region. CloudWatch identifies the
    accelerator by its ID (a UUID), not by its name.

    Returns:
        A section header followed by a "new flows" graph and a
        "processed bytes" (in and out) graph.
    """
    metrics_region = "us-west-2"
    # The dimension value must be the accelerator ID, not its name.
    accelerator_id = self.global_stack.accelerator_id

    def accelerator_sum(metric_name: str) -> cloudwatch.Metric:
        # Five-minute Sum of one AWS/GlobalAccelerator metric for this accelerator.
        return cloudwatch.Metric(
            namespace="AWS/GlobalAccelerator",
            metric_name=metric_name,
            dimensions_map={"Accelerator": accelerator_id},
            statistic="Sum",
            period=Duration.minutes(5),
            region=metrics_region,
        )

    header = cloudwatch.TextWidget(
        markdown="# Global Accelerator\nTraffic distribution and connectivity metrics",
        width=24,
        height=1,
    )

    # New flows established through the accelerator.
    new_flows = cloudwatch.GraphWidget(
        title="Global Accelerator - New Flows",
        left=[accelerator_sum("NewFlowCount")],
        width=12,
        height=6,
        region=metrics_region,
    )

    # Bytes processed in each direction.
    processed_bytes = cloudwatch.GraphWidget(
        title="Global Accelerator - Processed Bytes",
        left=[
            accelerator_sum("ProcessedBytesIn"),
            accelerator_sum("ProcessedBytesOut"),
        ],
        width=12,
        height=6,
        region=metrics_region,
    )

    return [header, new_flows, processed_bytes]
def _create_api_gateway_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the API Gateway dashboard section.

    The ``ApiName`` dimension comes from the API Gateway stack when one
    was supplied; otherwise the literal name ``gco-global-api`` is used.
    All metrics are read from the region returned by
    ``config.get_api_gateway_region()``.

    Returns:
        A section header, a requests/latency graph (count on the left
        axis, average and p99 latency on the right) and a 4XX/5XX error
        graph.
    """
    if self.api_gateway_stack:
        api_name = self.api_gateway_stack.api.rest_api_name
    else:
        api_name = "gco-global-api"

    # API Gateway metrics live in the region where the API is deployed.
    api_gw_region = self.config.get_api_gateway_region()

    def api_metric(metric_name: str, statistic: str, **extra: Any) -> cloudwatch.Metric:
        # One five-minute AWS/ApiGateway series for this API.
        return cloudwatch.Metric(
            namespace="AWS/ApiGateway",
            metric_name=metric_name,
            dimensions_map={"ApiName": api_name},
            statistic=statistic,
            period=Duration.minutes(5),
            region=api_gw_region,
            **extra,
        )

    header = cloudwatch.TextWidget(
        markdown="# API Gateway\nRequest metrics, latency, and error rates",
        width=24,
        height=1,
    )

    requests_and_latency = cloudwatch.GraphWidget(
        title="API Gateway - Requests & Latency",
        left=[api_metric("Count", "Sum")],
        right=[
            api_metric("Latency", "Average"),
            api_metric("Latency", "p99"),
        ],
        width=12,
        height=6,
        region=api_gw_region,
    )

    # Client errors in orange, server errors in red.
    error_rates = cloudwatch.GraphWidget(
        title="API Gateway - Error Rates",
        left=[
            api_metric("4XXError", "Sum", color="#ff7f0e"),
            api_metric("5XXError", "Sum", color="#d62728"),
        ],
        width=12,
        height=6,
        region=api_gw_region,
    )

    return [header, requests_and_latency, error_rates]
def _create_lambda_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the Lambda dashboard section.

    The function list is ordered as: API Gateway proxy and secret
    rotation Lambdas (only when an API Gateway stack was supplied),
    followed by the kubectl applier and Helm installer of each regional
    stack in ``self.regional_stacks`` order.

    Only a prefix of that list is graphed: the invocations, errors and
    duration widgets plot the first five functions, and the
    throttles/concurrency widget plots the first three. Functions beyond
    those limits are not shown.

    Returns:
        A section header followed by the invocations, errors, duration
        and throttles/concurrency graphs.
    """
    # Global (API Gateway side) Lambdas report metrics in this region.
    api_gw_region = self.config.get_api_gateway_region()

    # Entries are (function_name, label, region).
    lambda_functions: list[tuple[str, str, str]] = []

    if self.api_gateway_stack:
        lambda_functions += [
            (
                self.api_gateway_stack.proxy_lambda.function_name,
                "API Gateway Proxy",
                api_gw_region,
            ),
            (
                self.api_gateway_stack.rotation_lambda.function_name,
                "Secret Rotation",
                api_gw_region,
            ),
        ]

    for stack in self.regional_stacks:
        stack_region = stack.deployment_region
        lambda_functions.append(
            (
                stack.kubectl_lambda_function_name,
                f"Kubectl Applier ({stack_region})",
                stack_region,
            )
        )
        lambda_functions.append(
            (
                stack.helm_installer_lambda_function_name,
                f"Helm Installer ({stack_region})",
                stack_region,
            )
        )

    def series(
        metric_name: str,
        statistic: str,
        limit: int,
        label_suffix: str = "",
        **extra: Any,
    ) -> list[cloudwatch.Metric]:
        # One five-minute AWS/Lambda series per function, for the first
        # ``limit`` functions only.
        return [
            cloudwatch.Metric(
                namespace="AWS/Lambda",
                metric_name=metric_name,
                dimensions_map={"FunctionName": fn_name},
                statistic=statistic,
                period=Duration.minutes(5),
                label=f"{fn_label}{label_suffix}",
                region=fn_region,
                **extra,
            )
            for fn_name, fn_label, fn_region in lambda_functions[:limit]
        ]

    header = cloudwatch.TextWidget(
        markdown="# Lambda Functions\nProxy, rotation, and regional Lambda metrics",
        width=24,
        height=1,
    )

    invocations = cloudwatch.GraphWidget(
        title="Lambda - Invocations",
        left=series("Invocations", "Sum", 5),
        width=12,
        height=6,
    )

    # Every error series shares the same red colour.
    errors = cloudwatch.GraphWidget(
        title="Lambda - Errors",
        left=series("Errors", "Sum", 5, color="#d62728"),
        width=12,
        height=6,
    )

    duration = cloudwatch.GraphWidget(
        title="Lambda - Duration (ms)",
        left=series("Duration", "Average", 5),
        width=12,
        height=6,
    )

    # Throttles on the left axis, peak concurrency on the right axis.
    throttles = cloudwatch.GraphWidget(
        title="Lambda - Throttles & Concurrent Executions",
        left=series("Throttles", "Sum", 3, " Throttles"),
        right=series("ConcurrentExecutions", "Maximum", 3, " Concurrent"),
        width=12,
        height=6,
    )

    return [header, invocations, errors, duration, throttles]
def _create_sqs_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the SQS dashboard section.

    One series per regional stack is plotted in every graph, read from
    that stack's deployment region. The dead-letter-queue graph uses each
    stack's ``job_dlq``; all other graphs use ``job_queue``.

    Returns:
        A section header followed by the visible/in-flight, oldest message
        age, dead letter queue depth and throughput graphs.
    """
    # Entries are (job queue name, dead letter queue name, region).
    queues = [
        (stack.job_queue.queue_name, stack.job_dlq.queue_name, stack.deployment_region)
        for stack in self.regional_stacks
    ]

    def series(
        metric_name: str,
        statistic: str,
        minutes: int,
        label_suffix: str | None = None,
        dead_letter: bool = False,
        **extra: Any,
    ) -> list[cloudwatch.Metric]:
        # One AWS/SQS series per region. Without a suffix the label is the
        # region itself; ``dead_letter`` selects the DLQ name instead of
        # the job queue name.
        return [
            cloudwatch.Metric(
                namespace="AWS/SQS",
                metric_name=metric_name,
                dimensions_map={"QueueName": dlq_name if dead_letter else main_name},
                statistic=statistic,
                period=Duration.minutes(minutes),
                label=(
                    queue_region
                    if label_suffix is None
                    else f"{queue_region} {label_suffix}"
                ),
                region=queue_region,
                **extra,
            )
            for main_name, dlq_name, queue_region in queues
        ]

    header = cloudwatch.TextWidget(
        markdown="# SQS Queues\nJob submission queue metrics and dead letter queue",
        width=24,
        height=1,
    )

    # Backlog on the left axis, in-flight messages on the right axis.
    backlog = cloudwatch.GraphWidget(
        title="SQS - Messages (Visible & In-Flight)",
        left=series("ApproximateNumberOfMessagesVisible", "Average", 1, "Visible"),
        right=series("ApproximateNumberOfMessagesNotVisible", "Average", 1, "In-Flight"),
        width=12,
        height=6,
    )

    # A growing oldest-message age is the signal for stuck jobs.
    oldest_message = cloudwatch.GraphWidget(
        title="SQS - Age of Oldest Message (seconds)",
        left=series("ApproximateAgeOfOldestMessage", "Maximum", 1),
        width=12,
        height=6,
    )

    dead_letters = cloudwatch.GraphWidget(
        title="SQS - Dead Letter Queue Depth",
        left=series(
            "ApproximateNumberOfMessagesVisible",
            "Average",
            1,
            "DLQ",
            dead_letter=True,
            color="#d62728",
        ),
        width=12,
        height=6,
    )

    # Messages sent on the left axis, messages deleted on the right axis
    # (labelled "Processed").
    throughput = cloudwatch.GraphWidget(
        title="SQS - Throughput",
        left=series("NumberOfMessagesSent", "Sum", 5, "Sent"),
        right=series("NumberOfMessagesDeleted", "Sum", 5, "Processed"),
        width=12,
        height=6,
    )

    return [header, backlog, oldest_message, dead_letters, throughput]
def _create_dynamodb_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the DynamoDB dashboard section for the jobs, templates and webhooks tables.

    All three tables are covered by the consumed-capacity (read and
    write), throttled-request and system-error graphs. Previously the
    webhooks table was missing from the write-capacity and system-error
    graphs although it was present in the others. The latency graph
    covers the jobs table only (GetItem, PutItem and Query).

    Table names come from the global stack, and every metric and widget
    is pinned to the global region where the tables are deployed.

    Returns:
        A section header followed by the capacity, latency, throttling
        and system-error graphs.
    """
    # DynamoDB tables live in the global region.
    global_region = self.config.get_global_region()

    # (label, table name, colour used in the throttle/error graphs).
    tables = [
        ("Jobs", self.global_stack.jobs_table.table_name, "#d62728"),
        ("Templates", self.global_stack.templates_table.table_name, "#ff7f0e"),
        ("Webhooks", self.global_stack.webhooks_table.table_name, "#9467bd"),
    ]
    jobs_table = tables[0][1]

    def table_metric(
        metric_name: str,
        dimensions: dict[str, str],
        statistic: str,
        label: str,
        **extra: Any,
    ) -> cloudwatch.Metric:
        # One five-minute AWS/DynamoDB series in the global region.
        return cloudwatch.Metric(
            namespace="AWS/DynamoDB",
            metric_name=metric_name,
            dimensions_map=dimensions,
            statistic=statistic,
            period=Duration.minutes(5),
            label=label,
            region=global_region,
            **extra,
        )

    def per_table_sum(
        metric_name: str, label_suffix: str = "", coloured: bool = False
    ) -> list[cloudwatch.Metric]:
        # The same Sum metric for every table, optionally with its colour.
        return [
            table_metric(
                metric_name,
                {"TableName": table_name},
                "Sum",
                f"{table_label}{label_suffix}",
                **({"color": colour} if coloured else {}),
            )
            for table_label, table_name, colour in tables
        ]

    header = cloudwatch.TextWidget(
        markdown="# DynamoDB Tables\nJob queue, templates, and webhooks storage metrics",
        width=24,
        height=1,
    )

    # Read capacity on the left axis, write capacity on the right axis.
    capacity = cloudwatch.GraphWidget(
        title="DynamoDB - Consumed Capacity",
        left=per_table_sum("ConsumedReadCapacityUnits", " Read"),
        right=per_table_sum("ConsumedWriteCapacityUnits", " Write"),
        width=12,
        height=6,
        region=global_region,
    )

    # Latency is tracked per operation for the jobs table only.
    latency = cloudwatch.GraphWidget(
        title="DynamoDB - Latency (ms)",
        left=[
            table_metric(
                "SuccessfulRequestLatency",
                {"TableName": jobs_table, "Operation": operation},
                "Average",
                f"Jobs {operation}",
            )
            for operation in ("GetItem", "PutItem", "Query")
        ],
        width=12,
        height=6,
        region=global_region,
    )

    # NOTE(review): the AWS metric reference lists an ``Operation``
    # dimension alongside ``TableName`` for ThrottledRequests and
    # SystemErrors — confirm that these table-only series return data.
    throttling = cloudwatch.GraphWidget(
        title="DynamoDB - Throttled Requests",
        left=per_table_sum("ThrottledRequests", coloured=True),
        width=12,
        height=6,
        region=global_region,
    )

    system_errors = cloudwatch.GraphWidget(
        title="DynamoDB - System Errors",
        left=per_table_sum("SystemErrors", coloured=True),
        width=12,
        height=6,
        region=global_region,
    )

    return [header, capacity, latency, throttling, system_errors]
def _create_eks_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the EKS dashboard section.

    One series per regional stack is plotted in every widget, keyed by the
    stack's cluster name and read from the stack's deployment region (the
    ``region`` argument on each metric is what makes the cross-region
    dashboard work).

    Returns:
        A section header, a single-value "failed requests" widget, and
        graphs for node CPU utilization, node memory utilization and pod
        capacity versus running pods.
    """
    # Entries are (cluster name, region).
    clusters = [
        (stack.cluster.cluster_name, stack.deployment_region)
        for stack in self.regional_stacks
    ]

    def insights_series(
        metric_name: str, statistic: str, label_suffix: str | None = None
    ) -> list[cloudwatch.Metric]:
        # One five-minute ContainerInsights series per cluster, aggregated
        # at cluster level. Without a suffix the label is the region.
        return [
            cloudwatch.Metric(
                namespace="ContainerInsights",
                metric_name=metric_name,
                dimensions_map={"ClusterName": name},
                statistic=statistic,
                period=Duration.minutes(5),
                label=(
                    cluster_region
                    if label_suffix is None
                    else f"{cluster_region} {label_suffix}"
                ),
                region=cluster_region,
            )
            for name, cluster_region in clusters
        ]

    header = cloudwatch.TextWidget(
        markdown="# EKS Clusters\nCluster resource utilization and node metrics",
        width=24,
        height=1,
    )

    # NOTE(review): confirm that the AWS/EKS namespace publishes
    # ``cluster_failed_request_count`` with a ``cluster_name`` dimension;
    # the other widgets here use ContainerInsights with ``ClusterName``.
    failed_requests = cloudwatch.SingleValueWidget(
        title="EKS Clusters - Failed Requests",
        metrics=[
            cloudwatch.Metric(
                namespace="AWS/EKS",
                metric_name="cluster_failed_request_count",
                dimensions_map={"cluster_name": name},
                statistic="Sum",
                period=Duration.minutes(5),
                region=cluster_region,
            )
            for name, cluster_region in clusters
        ],
        width=12,
        height=6,
    )

    node_cpu = cloudwatch.GraphWidget(
        title="EKS Clusters - Node CPU Utilization (%)",
        left=insights_series("node_cpu_utilization", "Average"),
        width=12,
        height=6,
    )

    node_memory = cloudwatch.GraphWidget(
        title="EKS Clusters - Node Memory Utilization (%)",
        left=insights_series("node_memory_utilization", "Average"),
        width=12,
        height=6,
    )

    # Pod capacity on the left axis, running pods on the right axis.
    pod_capacity = cloudwatch.GraphWidget(
        title="EKS Clusters - Node Pod Capacity",
        left=insights_series("node_status_capacity_pods", "Sum", "Capacity"),
        right=insights_series("node_number_of_running_pods", "Sum", "Running"),
        width=12,
        height=6,
    )

    return [header, failed_requests, node_cpu, node_memory, pod_capacity]
def _create_gpu_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the GPU dashboard section from ContainerInsights GPU metrics.

    Every graph plots one five-minute series per regional stack, keyed by
    the stack's cluster name, labelled with and read from the stack's
    deployment region.

    Returns:
        A section header followed by the GPU utilization, GPU memory
        utilization, GPU temperature and active GPU count graphs.
    """
    # Entries are (cluster name, region).
    clusters = [
        (stack.cluster.cluster_name, stack.deployment_region)
        for stack in self.regional_stacks
    ]

    # (widget title, ContainerInsights metric name, statistic), in
    # dashboard order.
    graph_specs = (
        ("GPU Utilization (%)", "node_gpu_utilization", "Average"),
        ("GPU Memory Utilization (%)", "node_gpu_memory_utilization", "Average"),
        ("GPU Temperature (°C)", "node_gpu_temperature", "Maximum"),
        ("Active GPU Count", "node_gpu_limit", "Sum"),
    )

    widgets: list[cloudwatch.IWidget] = [
        cloudwatch.TextWidget(
            markdown="# GPU Metrics\nGPU utilization, memory, and temperature from DCGM Exporter",
            width=24,
            height=1,
        )
    ]

    for title, metric_name, statistic in graph_specs:
        per_cluster = [
            cloudwatch.Metric(
                namespace="ContainerInsights",
                metric_name=metric_name,
                dimensions_map={"ClusterName": name},
                statistic=statistic,
                period=Duration.minutes(5),
                label=cluster_region,
                region=cluster_region,
            )
            for name, cluster_region in clusters
        ]
        widgets.append(
            cloudwatch.GraphWidget(
                title=title,
                left=per_cluster,
                width=12,
                height=6,
            )
        )

    return widgets
def _create_fsx_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the FSx for Lustre dashboard section.

    Widgets are only emitted for regional stacks whose
    ``fsx_file_system`` attribute is present and not ``None``. The
    ``FileSystemId`` dimension is the CloudFormation ref of that file
    system; ``cross_region_references=True`` (set in this stack's
    constructor) lets CDK carry the value across regions.

    The throughput and IOPS graphs plot per-second rates. The underlying
    FSx metrics are per-period totals, so each series is the ``Sum``
    divided by the period length in seconds (``PERIOD(...)`` in metric
    math). Previously the raw ``Sum`` was plotted under "Bytes/sec" and
    "IOPS" titles, overstating the values by the period length.

    Returns:
        An empty list when no regional stack has FSx enabled; otherwise a
        section header followed by the throughput, IOPS and free-storage
        graphs.
    """
    # Collect (file_system_id, region) for the regions with FSx enabled.
    # ``getattr`` with a default tolerates stacks that do not define the
    # attribute at all.
    fsx_info: list[tuple[str, str]] = []
    for regional_stack in self.regional_stacks:
        fsx = getattr(regional_stack, "fsx_file_system", None)
        if fsx is None:
            continue
        fsx_info.append((fsx.ref, regional_stack.deployment_region))
    if not fsx_info:
        return []

    def per_second(
        metric_name: str, id_prefix: str, label_suffix: str
    ) -> list[cloudwatch.IMetric]:
        # One rate series per file system: one-minute Sum / seconds in the
        # period. Metric ids must be unique within a widget, so each one
        # combines a per-metric prefix with the file system's index.
        rates: list[cloudwatch.IMetric] = []
        for index, (fs_id, region) in enumerate(fsx_info):
            metric_id = f"{id_prefix}{index}"
            rates.append(
                cloudwatch.MathExpression(
                    expression=f"{metric_id} / PERIOD({metric_id})",
                    using_metrics={
                        metric_id: cloudwatch.Metric(
                            namespace="AWS/FSx",
                            metric_name=metric_name,
                            dimensions_map={"FileSystemId": fs_id},
                            statistic="Sum",
                            period=Duration.minutes(1),
                            region=region,
                        )
                    },
                    label=f"{region} {label_suffix}",
                    period=Duration.minutes(1),
                )
            )
        return rates

    widgets: list[cloudwatch.IWidget] = []

    # Section header
    widgets.append(
        cloudwatch.TextWidget(
            markdown=(
                "# FSx for Lustre\n"
                "Parallel file system throughput, IOPS, and free storage "
                "capacity. Each line below is scoped to the exact GCO "
                "file system in its region — so unrelated FSx file "
                "systems in the same account do not appear on the "
                "dashboard."
            ),
            width=24,
            height=1,
        )
    )

    # Throughput: bytes read (left axis) vs written (right axis) per second.
    widgets.append(
        cloudwatch.GraphWidget(
            title="FSx - Throughput (Bytes/sec)",
            left=per_second("DataReadBytes", "readBytes", "Read"),
            right=per_second("DataWriteBytes", "writeBytes", "Write"),
            width=12,
            height=6,
        )
    )

    # IOPS: read (left axis) vs write (right axis) operations per second.
    widgets.append(
        cloudwatch.GraphWidget(
            title="FSx - IOPS",
            left=per_second("DataReadOperations", "readOps", "Read"),
            right=per_second("DataWriteOperations", "writeOps", "Write"),
            width=12,
            height=6,
        )
    )

    # Free storage capacity — the classic "running out of space" signal.
    # FreeDataStorageCapacity is emitted in bytes; Minimum shows the
    # lowest value seen in each five-minute period.
    widgets.append(
        cloudwatch.GraphWidget(
            title="FSx - Free Storage Capacity (Bytes)",
            left=[
                cloudwatch.Metric(
                    namespace="AWS/FSx",
                    metric_name="FreeDataStorageCapacity",
                    dimensions_map={"FileSystemId": fs_id},
                    statistic="Minimum",
                    period=Duration.minutes(5),
                    label=region,
                    region=region,
                )
                for fs_id, region in fsx_info
            ],
            width=24,
            height=6,
        )
    )

    return widgets
def _create_valkey_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the dashboard section for the Valkey (ElastiCache Serverless) caches.

    Every metric is pinned to a single cache through the ``clusterId``
    dimension (camelCase — the ElastiCache Serverless variant, distinct
    from the node-based ``CacheClusterId``). The regional stack names its
    cache deterministically as ``gco-{deployment_region}``, so that name
    is rebuilt here for each entry of ``self.regions``. No SEARCH
    expression is used, which keeps unrelated ElastiCache clusters in the
    account off the dashboard.

    Returns:
        A header text widget, then one "ECPU & Cache Size" graph per
        region, then one "Hit Rate & Latency" graph per region. An empty
        list when Valkey is disabled in the config or no regions are set.
    """
    valkey_config = self.config.get_valkey_config()
    if not (valkey_config.get("enabled", False) and self.regions):
        return []

    def cache_metric(
        cache_name: str,
        region: str,
        metric_name: str,
        statistic: str,
        minutes: int,
        label: str,
    ) -> cloudwatch.Metric:
        # Single place that fixes the namespace and the serverless dimension key.
        return cloudwatch.Metric(
            namespace="AWS/ElastiCache",
            metric_name=metric_name,
            dimensions_map={"clusterId": cache_name},
            statistic=statistic,
            period=Duration.minutes(minutes),
            label=label,
            region=region,
        )

    header = cloudwatch.TextWidget(
        markdown=(
            "# Valkey Serverless Cache\n"
            "ECPU consumption, storage, hit rate, and request "
            "latency — scoped to each region's ``gco-{region}`` "
            "cache exactly (no SEARCH)."
        ),
        width=24,
        height=1,
    )

    # (cache_name, region) pairs; cache_name is the literal
    # ``serverless_cache_name`` the regional stack hands to CfnServerlessCache.
    targets = [(f"gco-{region}", region) for region in self.regions]

    # ECPU consumption (left axis) against cache size (right axis).
    ecpu_graphs = [
        cloudwatch.GraphWidget(
            title=f"Valkey - ECPU & Cache Size ({region})",
            left=[
                cache_metric(
                    cache_name, region, "ElastiCacheProcessingUnits", "Sum", 1, "ECPUs"
                ),
            ],
            right=[
                cache_metric(
                    cache_name, region, "BytesUsedForCache", "Average", 5, "Bytes"
                ),
            ],
            width=12,
            height=6,
            region=region,
        )
        for cache_name, region in targets
    ]

    # Hit rate (left axis) against p99 read/write latency (right axis).
    latency_graphs = [
        cloudwatch.GraphWidget(
            title=f"Valkey - Hit Rate & Latency ({region})",
            left=[
                cache_metric(
                    cache_name, region, "CacheHitRate", "Average", 5, "Hit Rate %"
                ),
            ],
            right=[
                cache_metric(
                    cache_name,
                    region,
                    "SuccessfulReadRequestLatency",
                    "p99",
                    1,
                    "Read p99 µs",
                ),
                cache_metric(
                    cache_name,
                    region,
                    "SuccessfulWriteRequestLatency",
                    "p99",
                    1,
                    "Write p99 µs",
                ),
            ],
            width=12,
            height=6,
            region=region,
        )
        for cache_name, region in targets
    ]

    return [header, *ecpu_graphs, *latency_graphs]
def _create_aurora_pgvector_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the dashboard section for the Aurora Serverless v2 (pgvector) clusters.

    Each graph is pinned to the exact cluster a regional stack provisioned,
    using ``regional_stack.aurora_cluster.cluster_identifier`` as the
    ``DBClusterIdentifier`` dimension. CDK-generated cluster IDs are
    CloudFormation tokens; the ``cross_region_references=True`` flag on
    this stack plumbs them from each regional stack into the monitoring
    stack (us-east-2 by default) through SSM + custom resources.

    Returns:
        A header text widget followed by three groups of per-region
        graphs (ACU utilization/capacity, connections/CPU, p99 latency).
        An empty list when no regional stack has an Aurora cluster, so
        the dashboard skips the section entirely.
    """
    # Explicit guarded loop rather than a comprehension so mypy can narrow
    # the Optional[DatabaseCluster] before it is dereferenced.
    targets: list[tuple[str, str]] = []
    for stack in self.regional_stacks:
        cluster = getattr(stack, "aurora_cluster", None)
        if cluster is not None:
            targets.append((cluster.cluster_identifier, stack.deployment_region))
    if not targets:
        return []

    def rds_metric(
        cluster_id: str, region: str, metric_name: str, statistic: str, label: str
    ) -> cloudwatch.Metric:
        # Every Aurora series here uses a one-minute period.
        return cloudwatch.Metric(
            namespace="AWS/RDS",
            metric_name=metric_name,
            dimensions_map={"DBClusterIdentifier": cluster_id},
            statistic=statistic,
            period=Duration.minutes(1),
            label=label,
            region=region,
        )

    widgets: list[cloudwatch.IWidget] = [
        cloudwatch.TextWidget(
            markdown=(
                "# Aurora pgvector (Serverless v2)\n"
                "ACU utilization, database connections, query latency, "
                "and CPU utilization — pinned to each regional GCO "
                "Aurora cluster by ID. ACU utilization is the primary "
                "scale/cost signal for Serverless v2."
            ),
            width=24,
            height=1,
        )
    ]

    # (title prefix, left (metric, stat, label), right (metric, stat, label), width)
    panels = (
        (
            "Aurora - ACU Utilization & Capacity",
            ("ACUUtilization", "Average", "ACU %"),
            ("ServerlessDatabaseCapacity", "Average", "ACUs"),
            12,
        ),
        (
            "Aurora - Connections & CPU",
            ("DatabaseConnections", "Average", "Connections"),
            ("CPUUtilization", "Average", "CPU %"),
            12,
        ),
        (
            "Aurora - Query Latency p99",
            ("ReadLatency", "p99", "Read p99"),
            ("WriteLatency", "p99", "Write p99"),
            24,
        ),
    )

    # Panel-major, region-minor: all regions of one panel type are emitted
    # before the next panel type starts.
    for title, left_spec, right_spec, width in panels:
        for cluster_id, region in targets:
            widgets.append(
                cloudwatch.GraphWidget(
                    title=f"{title} ({region})",
                    left=[rds_metric(cluster_id, region, *left_spec)],
                    right=[rds_metric(cluster_id, region, *right_spec)],
                    width=width,
                    height=6,
                    region=region,
                )
            )

    return widgets
def _create_alb_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the ALB dashboard section, scoped to the GCO platform ALB.

    The ALB is created at runtime by the AWS Load Balancer Controller from
    an Ingress resource (not by CDK), so its exact name is unknown at
    synth time. Reading the ARN off the regional stack's
    ``GaRegistration`` custom resource via ``cross_region_references=True``
    was tried first, but that path races the custom-resource response
    pipeline: CDK's cross-region ``ExportsWriter`` executes
    ``Fn::GetAtt: [GaRegistration, AlbArn]`` before CloudFormation has the
    updated response data stored, and errors with "Vendor response
    doesn't contain AlbArn attribute".

    Instead every series is a SEARCH expression with a composite-token
    filter. The controller names the platform ALB ``k8s-gco-<hash>`` (the
    namespace is shortened because the controller enforces a 32-char
    total name limit), and CloudWatch's ``LoadBalancer`` dimension is the
    ARN suffix ``app/<name>/<hash>``. An unquoted filter
    ``LoadBalancer=app/k8s-gco-`` therefore performs a composite-token
    match (the tokens ``app``, ``k``, ``8``, ``s``, ``gco`` must appear
    consecutively in the dimension value). A double-quoted filter would
    be an exact match and return nothing, because no ALB's dimension
    value is literally ``app/k8s-gco-``.

    Returns:
        A header text widget followed by four groups of per-region graphs:
        request count, response time, HTTP errors, active connections.
    """

    def search(metric_name: str, statistic: str, label: str) -> cloudwatch.MathExpression:
        # The first fragment is a plain (non-f) string on purpose: its
        # braces are literal SEARCH schema delimiters.
        return cloudwatch.MathExpression(
            expression=(
                "SEARCH('{AWS/ApplicationELB,LoadBalancer} "
                f'MetricName="{metric_name}" '
                f'LoadBalancer=app/k8s-gco-\', "{statistic}", 300)'
            ),
            label=label,
            period=Duration.minutes(5),
        )

    widgets: list[cloudwatch.IWidget] = [
        cloudwatch.TextWidget(
            markdown=(
                "# Application Load Balancers\n"
                "Request metrics, response time, HTTP errors, and "
                "connection counts — scoped via SEARCH composite-token "
                "match to ALBs named ``app/k8s-gco-*`` so only the GCO "
                "platform ALB in each region appears. Inference ALBs "
                "(named per endpoint) and unrelated ALBs in the "
                "account are excluded."
            ),
            width=24,
            height=1,
        )
    ]

    # (title prefix, left-axis series, right-axis series or None)
    # Each series is (metric name, statistic, legend label).
    panels = (
        (
            "ALB - Request Count",
            [("RequestCount", "Sum", "Request Count")],
            None,
        ),
        (
            "ALB - Response Time",
            [
                ("TargetResponseTime", "Average", "Avg Response Time"),
                ("TargetResponseTime", "p99", "p99 Response Time"),
            ],
            None,
        ),
        (
            "ALB - HTTP Errors",
            [("HTTPCode_Target_4XX_Count", "Sum", "4XX Errors")],
            [("HTTPCode_Target_5XX_Count", "Sum", "5XX Errors")],
        ),
        (
            "ALB - Active Connections",
            [("ActiveConnectionCount", "Sum", "Active Connections")],
            None,
        ),
    )

    # Panel-major, region-minor ordering; the widget-level ``region`` is
    # what points each SEARCH at the right region.
    for title, left_specs, right_specs in panels:
        for region in self.regions:
            right_series = (
                None if right_specs is None else [search(*spec) for spec in right_specs]
            )
            widgets.append(
                cloudwatch.GraphWidget(
                    title=f"{title} ({region})",
                    left=[search(*spec) for spec in left_specs],
                    right=right_series,
                    width=12,
                    height=6,
                    region=region,
                )
            )

    return widgets
def _create_application_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the "Application Metrics" dashboard section.

    The section contains, in order:

    1. A header text widget.
    2. Health monitor CPU (left axis) and memory (right axis)
       utilization, one series per regional cluster.
    3. Manifest processor submissions (left) and failures (right).
    4. Container Insights pod restart counts per cluster.
    5. Secret rotation Lambda invocations and errors when
       ``self.api_gateway_stack`` is set; otherwise a text widget saying
       the rotation metrics are unavailable.

    Returns:
        The widgets in the order listed above.
    """
    widgets: list[cloudwatch.IWidget] = [
        cloudwatch.TextWidget(
            markdown="# Application Metrics\n"
            "Health monitor and manifest processor metrics. "
            "Application logs are available in Container Insights at "
            "`/aws/containerinsights/<cluster>/application`.",
            width=24,
            height=1,
        )
    ]

    # One (cluster_name, region) pair per regional stack.
    clusters = [
        (stack.cluster.cluster_name, stack.deployment_region)
        for stack in self.regional_stacks
    ]

    def per_cluster_series(
        namespace: str, metric_name: str, statistic: str, suffix: str, **extra: str
    ) -> list[cloudwatch.Metric]:
        # Custom GCO metrics are dimensioned by both ClusterName and Region.
        return [
            cloudwatch.Metric(
                namespace=namespace,
                metric_name=metric_name,
                dimensions_map={
                    "ClusterName": cluster_name,
                    "Region": region,
                },
                statistic=statistic,
                period=Duration.minutes(5),
                label=f"{region} {suffix}",
                region=region,
                **extra,
            )
            for cluster_name, region in clusters
        ]

    # Health monitor: CPU vs memory utilization.
    widgets.append(
        cloudwatch.GraphWidget(
            title="Health Monitor - Resource Utilization",
            left=per_cluster_series(
                "GCO/HealthMonitor", "ClusterCpuUtilization", "Average", "CPU"
            ),
            right=per_cluster_series(
                "GCO/HealthMonitor", "ClusterMemoryUtilization", "Average", "Memory"
            ),
            width=12,
            height=6,
        )
    )

    # Manifest processor: submissions vs failures (failures drawn in red).
    widgets.append(
        cloudwatch.GraphWidget(
            title="Manifest Processor - Submissions",
            left=per_cluster_series(
                "GCO/ManifestProcessor", "ManifestSubmissions", "Sum", "Submissions"
            ),
            right=per_cluster_series(
                "GCO/ManifestProcessor",
                "ManifestFailures",
                "Sum",
                "Failures",
                color="#d62728",
            ),
            width=12,
            height=6,
        )
    )

    # Container Insights pod restarts (an indicator of application issues).
    # This metric is keyed by ClusterName only.
    widgets.append(
        cloudwatch.GraphWidget(
            title="Container Insights - Pod Restarts",
            left=[
                cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="pod_number_of_container_restarts",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Sum",
                    period=Duration.minutes(5),
                    label=f"{region}",
                    region=region,
                )
                for cluster_name, region in clusters
            ],
            width=12,
            height=6,
        )
    )

    gateway = self.api_gateway_stack
    if not gateway:
        # Without the API Gateway stack there is no rotation Lambda to chart.
        widgets.append(
            cloudwatch.TextWidget(
                markdown="**Secret Rotation:** API Gateway stack not configured. "
                "Rotation Lambda metrics unavailable.",
                width=12,
                height=6,
            )
        )
        return widgets

    # Secrets Manager does not publish rotation metrics, so the rotation
    # Lambda function is monitored instead.
    rotation_fn = gateway.rotation_lambda.function_name
    gateway_region = self.config.get_api_gateway_region()

    def rotation_metric(metric_name: str, color: str) -> cloudwatch.Metric:
        # The legend label is the metric name itself.
        return cloudwatch.Metric(
            namespace="AWS/Lambda",
            metric_name=metric_name,
            dimensions_map={"FunctionName": rotation_fn},
            statistic="Sum",
            period=Duration.hours(1),
            label=metric_name,
            color=color,
            region=gateway_region,
        )

    widgets.append(
        cloudwatch.GraphWidget(
            title="Secret Rotation Lambda - Invocations & Errors",
            left=[rotation_metric("Invocations", "#2ca02c")],
            right=[rotation_metric("Errors", "#d62728")],
            width=12,
            height=6,
        )
    )
    return widgets
def _create_alarms(self) -> None:
    """Create every CloudWatch alarm group by running each builder in turn.

    The order below is the order in which the alarm constructs are added
    to the stack.
    """
    alarm_builders = (
        self._create_global_accelerator_alarms,
        self._create_api_gateway_alarms,
        self._create_lambda_alarms,
        self._create_sqs_alarms,
        self._create_dynamodb_alarms,
        self._create_eks_alarms,
        self._create_alb_alarms,
        self._create_application_alarms,
    )
    for build in alarm_builders:
        build()
def _create_global_accelerator_alarms(self) -> None:
    """Intentionally create no Global Accelerator alarms.

    Global Accelerator metrics are only available in us-west-2, and a
    CloudWatch alarm must live in the same region as the metric it
    monitors. This monitoring stack may be deployed in a different
    region, so GA alarms are skipped here. Ways to cover GA instead:

    1. Create the alarms manually in us-west-2.
    2. Use the cross-region dashboard widgets (already done; they pass
       the ``region`` parameter so GA metrics display correctly).
    3. Deploy a separate alarm stack in us-west-2.
    """
    # Deliberate no-op: see the docstring for the reasoning.
    return None
def _create_api_gateway_alarms(self) -> None:
    """Create the API Gateway 5XX and latency alarms.

    The ``ApiName`` dimension comes from ``self.api_gateway_stack`` when
    it is set; otherwise the literal name ``gco-global-api`` is used.
    Both alarms treat missing data as not breaching and notify
    ``self.alert_topic``.

    NOTE(review): the metrics are built without ``region=``, so they are
    presumably evaluated in this stack's own region — confirm that
    matches where the API is deployed.
    """
    if self.api_gateway_stack:
        api_name = self.api_gateway_stack.api.rest_api_name
    else:
        api_name = "gco-global-api"

    # (construct id, description, metric, statistic, threshold, evaluation periods)
    alarm_specs = (
        (
            "ApiGateway5xxAlarm",
            "API Gateway has high 5XX error rate",
            "5XXError",
            "Sum",
            10,
            2,
        ),
        (
            "ApiGatewayHighLatencyAlarm",
            "API Gateway has high latency",
            "Latency",
            "p99",
            10000,  # 10 seconds
            3,
        ),
    )

    for construct_id, description, metric_name, statistic, threshold, periods in alarm_specs:
        alarm = cloudwatch.Alarm(
            self,
            construct_id,
            alarm_description=description,
            metric=cloudwatch.Metric(
                namespace="AWS/ApiGateway",
                metric_name=metric_name,
                dimensions_map={"ApiName": api_name},
                statistic=statistic,
                period=Duration.minutes(5),
            ),
            threshold=threshold,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=periods,
            # Both alarms fire on two breaching datapoints within the window.
            datapoints_to_alarm=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
def _create_lambda_alarms(self) -> None:
    """Create alarms for the API Gateway proxy and secret rotation Lambdas.

    Does nothing when ``self.api_gateway_stack`` is not set, because the
    function names are read from that stack. Otherwise three alarms are
    created — proxy errors, proxy throttles, rotation errors — each
    treating missing data as not breaching and notifying
    ``self.alert_topic``.
    """
    gateway = self.api_gateway_stack
    if not gateway:
        return

    proxy_fn = gateway.proxy_lambda.function_name
    rotation_fn = gateway.rotation_lambda.function_name
    above = cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD
    at_least = cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD

    # (construct id, description, function, metric, period, threshold,
    #  operator, periods). ``periods`` is used for both evaluation_periods
    #  and datapoints_to_alarm, so every evaluated period must breach.
    alarm_specs = (
        (
            "ProxyLambdaErrorsAlarm",
            "API Gateway proxy Lambda has errors",
            proxy_fn,
            "Errors",
            Duration.minutes(5),
            5,
            above,
            2,
        ),
        (
            "ProxyLambdaThrottlesAlarm",
            "API Gateway proxy Lambda is being throttled",
            proxy_fn,
            "Throttles",
            Duration.minutes(5),
            1,
            at_least,
            2,
        ),
        (
            "RotationLambdaErrorsAlarm",
            "Secret rotation Lambda has errors",
            rotation_fn,
            "Errors",
            Duration.hours(1),
            1,
            at_least,
            1,
        ),
    )

    for (
        construct_id,
        description,
        function_name,
        metric_name,
        period,
        threshold,
        operator,
        periods,
    ) in alarm_specs:
        alarm = cloudwatch.Alarm(
            self,
            construct_id,
            alarm_description=description,
            metric=cloudwatch.Metric(
                namespace="AWS/Lambda",
                metric_name=metric_name,
                dimensions_map={"FunctionName": function_name},
                statistic="Sum",
                period=period,
            ),
            threshold=threshold,
            comparison_operator=operator,
            evaluation_periods=periods,
            datapoints_to_alarm=periods,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
def _create_sqs_alarms(self) -> None:
    """Create two SQS alarms for every regional stack.

    * Old-message alarm on the job queue: oldest message older than one
      hour for two consecutive 5-minute periods (potential stuck jobs).
    * Dead-letter-queue alarm: at least one visible message in the DLQ.

    Construct IDs are suffixed with the region with dashes removed and
    title-cased (``us-east-1`` becomes ``Useast1``). Both alarms treat
    missing data as not breaching and notify ``self.alert_topic``.

    NOTE(review): the metrics are built without ``region=``, so they are
    presumably evaluated in this stack's own region — confirm that is
    intended when a regional stack is deployed elsewhere.
    """
    for stack in self.regional_stacks:
        region = stack.deployment_region
        queue_name = stack.job_queue.queue_name
        dlq_name = stack.job_dlq.queue_name
        suffix = region.replace("-", "").title()

        # (construct id, description, queue, metric, statistic, threshold,
        #  operator, periods). ``periods`` feeds both evaluation_periods
        #  and datapoints_to_alarm.
        alarm_specs = (
            (
                f"SqsOldMessageAlarm{suffix}",
                f"SQS queue in {region} has old messages (potential stuck jobs)",
                queue_name,
                "ApproximateAgeOfOldestMessage",
                "Maximum",
                3600,  # 1 hour
                cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                2,
            ),
            (
                f"SqsDlqAlarm{suffix}",
                f"SQS dead letter queue in {region} has messages",
                dlq_name,
                "ApproximateNumberOfMessagesVisible",
                "Sum",
                1,
                cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
                1,
            ),
        )

        for (
            construct_id,
            description,
            target_queue,
            metric_name,
            statistic,
            threshold,
            operator,
            periods,
        ) in alarm_specs:
            alarm = cloudwatch.Alarm(
                self,
                construct_id,
                alarm_description=description,
                metric=cloudwatch.Metric(
                    namespace="AWS/SQS",
                    metric_name=metric_name,
                    dimensions_map={"QueueName": target_queue},
                    statistic=statistic,
                    period=Duration.minutes(5),
                ),
                threshold=threshold,
                comparison_operator=operator,
                evaluation_periods=periods,
                datapoints_to_alarm=periods,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
def _create_dynamodb_alarms(self) -> None:
    """Create throttling and system-error alarms for the DynamoDB jobs table.

    Only ``self.global_stack.jobs_table`` is covered here. The metrics are
    pinned to the global region (from ``self.config.get_global_region()``)
    because that is where the DynamoDB tables are deployed. Both alarms
    treat missing data as not breaching and notify ``self.alert_topic``.

    NOTE(review): earlier documentation also mentioned templates and
    webhooks tables; no alarms for them are created in this method —
    confirm whether that is intentional.
    """
    jobs_table = self.global_stack.jobs_table.table_name
    global_region = self.config.get_global_region()

    # (construct id, description, metric, periods). ``periods`` feeds both
    # evaluation_periods and datapoints_to_alarm.
    alarm_specs = (
        (
            "DynamoDBJobsThrottleAlarm",
            "DynamoDB jobs table is being throttled",
            "ThrottledRequests",
            2,
        ),
        (
            "DynamoDBJobsErrorsAlarm",
            "DynamoDB jobs table has system errors",
            "SystemErrors",
            1,
        ),
    )

    for construct_id, description, metric_name, periods in alarm_specs:
        alarm = cloudwatch.Alarm(
            self,
            construct_id,
            alarm_description=description,
            metric=cloudwatch.Metric(
                namespace="AWS/DynamoDB",
                metric_name=metric_name,
                dimensions_map={"TableName": jobs_table},
                statistic="Sum",
                period=Duration.minutes(5),
                region=global_region,
            ),
            # Any single throttled request / system error counts as a breach.
            threshold=1,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=periods,
            datapoints_to_alarm=periods,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
def _create_eks_alarms(self) -> None:
    """Create node-level CPU and memory alarms for every regional EKS cluster.

    For each regional stack, two Container Insights alarms are created:
    average ``node_cpu_utilization`` above 80 and average
    ``node_memory_utilization`` above 85, each requiring 2 breaching
    datapoints out of 3 five-minute periods. Missing data is treated as
    not breaching and both alarms notify ``self.alert_topic``.

    NOTE(review): the metrics are built without ``region=``, so they are
    presumably evaluated in this stack's own region — confirm that is
    intended when a regional stack is deployed elsewhere.
    """
    # (construct id prefix, wording used in the description, metric, threshold)
    alarm_specs = (
        ("EksHighCpuAlarm", "CPU", "node_cpu_utilization", 80),
        ("EksHighMemoryAlarm", "memory", "node_memory_utilization", 85),
    )

    for stack in self.regional_stacks:
        region = stack.deployment_region
        cluster_name = stack.cluster.cluster_name
        suffix = region.replace("-", "").title()

        for id_prefix, resource, metric_name, threshold in alarm_specs:
            alarm = cloudwatch.Alarm(
                self,
                f"{id_prefix}{suffix}",
                alarm_description=f"EKS cluster {cluster_name} has high {resource} utilization",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name=metric_name,
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=threshold,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=3,
                datapoints_to_alarm=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
def _create_alb_alarms(self) -> None:
    """Placeholder for ALB alarms; currently creates none.

    The platform ALB is created at runtime by the AWS Load Balancer
    Controller, so its name/ARN is not available at synth time. The
    ``_create_alb_widgets`` docstring records that reading the ARN from
    the regional stack's ``GaRegistration`` custom resource
    (``AlbArn`` attribute) via cross-region references raced the
    custom-resource response pipeline, which is why the dashboard uses a
    SEARCH expression rather than an explicit ``LoadBalancer`` dimension.

    Adding per-ALB alarms therefore first needs a reliable way to obtain
    the ``LoadBalancer`` dimension value (for example
    ``Fn.split(":loadbalancer/", alb_arn)[1]`` once an ARN can be read
    safely) before wiring it into ``cloudwatch.Alarm`` constructs.
    NOTE(review): CloudWatch alarms reportedly cannot be based on SEARCH
    expressions, so the widget approach likely cannot be reused as-is —
    confirm against the CloudWatch documentation.

    For now we rely on:
    1. Dashboard widgets scoped to each platform ALB (see
       ``_create_alb_widgets``)
    2. EKS Container Insights alarms for pod/node health
    3. API Gateway alarms for request-level monitoring
    """
    # TODO: Add UnHealthyHostCount / 5XXCount alarms once the ALB ARN can be
    # resolved reliably (the earlier idea was
    # regional_stack.ga_registration.get_att_string("AlbArn"), which the
    # widget code moved away from).
    # The test suite explicitly documents that the ALB alarm count is
    # currently zero (test_alb_unhealthy_hosts_alarm_skipped); update
    # that test when adding real alarms.
    pass
def _create_application_alarms(self) -> None:
    """Create one manifest-failure alarm per regional stack.

    The alarm fires when the ``GCO/ManifestProcessor`` ``ManifestFailures``
    sum exceeds 10 in two consecutive 5-minute periods. Missing data is
    treated as not breaching and the alarm notifies ``self.alert_topic``.

    NOTE(review): the metric is built without ``region=``, so it is
    presumably evaluated in this stack's own region — confirm that is
    intended when a regional stack is deployed elsewhere.
    """
    for stack in self.regional_stacks:
        region = stack.deployment_region
        cluster_name = stack.cluster.cluster_name
        suffix = region.replace("-", "").title()

        # The custom metric is dimensioned by both cluster and region.
        failures_metric = cloudwatch.Metric(
            namespace="GCO/ManifestProcessor",
            metric_name="ManifestFailures",
            dimensions_map={"ClusterName": cluster_name, "Region": region},
            statistic="Sum",
            period=Duration.minutes(5),
        )

        alarm = cloudwatch.Alarm(
            self,
            f"ManifestHighFailureRateAlarm{suffix}",
            alarm_description=f"Manifest processor in {region} has high failure rate",
            metric=failures_metric,
            threshold=10,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=2,
            datapoints_to_alarm=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
def _create_composite_alarms(self) -> None:
    """Create composite alarms that fire only when several signals breach together.

    For each regional stack two underlying ContainerInsights alarms are
    created (average node CPU and average node memory utilization above 90
    over 5-minute periods, two evaluation periods). After all regions have
    been processed, one composite alarm per region ANDs that region's
    alarms and publishes to ``self.alert_topic``.

    When ``self.api_gateway_stack`` is set, a further composite alarm ANDs
    an API Gateway ``5XXError`` alarm with an ``Errors`` alarm on the proxy
    Lambda function.

    Only the composite alarms receive an SNS action here; the underlying
    alarms created in this method have no action attached.
    """

    def build_alarm(
        construct_id: str,
        namespace: str,
        metric_name: str,
        dimensions: dict[str, str],
        statistic: str,
        threshold: int,
    ) -> cloudwatch.Alarm:
        # All underlying alarms share one shape: a 5-minute metric compared
        # with ">" against a threshold for two evaluation periods, with
        # missing data treated as not breaching.
        return cloudwatch.Alarm(
            self,
            construct_id,
            metric=cloudwatch.Metric(
                namespace=namespace,
                metric_name=metric_name,
                dimensions_map=dimensions,
                statistic=statistic,
                period=Duration.minutes(5),
            ),
            threshold=threshold,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )

    # Underlying alarms, grouped by region, for the composite rules below.
    alarms_by_region: dict[str, list[cloudwatch.Alarm]] = {}

    for stack in self.regional_stacks:
        region_name = stack.deployment_region
        eks_cluster = stack.cluster.cluster_name
        suffix = region_name.replace("-", "").title()

        cpu_alarm = build_alarm(
            f"CompositeEksCpu{suffix}",
            "ContainerInsights",
            "node_cpu_utilization",
            {"ClusterName": eks_cluster},
            "Average",
            90,
        )
        memory_alarm = build_alarm(
            f"CompositeEksMemory{suffix}",
            "ContainerInsights",
            "node_memory_utilization",
            {"ClusterName": eks_cluster},
            "Average",
            90,
        )
        alarms_by_region[region_name] = [cpu_alarm, memory_alarm]

    # One "critical" composite per region: every member alarm must be in ALARM.
    for region_name, members in alarms_by_region.items():
        if len(members) < 2:
            # A composite over fewer than two alarms adds no signal.
            continue
        suffix = region_name.replace("-", "").title()
        regional_composite = cloudwatch.CompositeAlarm(
            self,
            f"RegionalCriticalAlarm{suffix}",
            alarm_description=f"Critical: Multiple issues detected in {region_name}",
            alarm_rule=cloudwatch.AlarmRule.all_of(*members),
        )
        regional_composite.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

    # The API Gateway + Lambda composite needs an API Gateway stack.
    gateway_stack = self.api_gateway_stack
    if not gateway_stack:
        return

    rest_api_name = gateway_stack.api.rest_api_name
    proxy_name = gateway_stack.proxy_lambda.function_name

    gateway_5xx_alarm = build_alarm(
        "CompositeApiErrors",
        "AWS/ApiGateway",
        "5XXError",
        {"ApiName": rest_api_name},
        "Sum",
        5,
    )
    proxy_errors_alarm = build_alarm(
        "CompositeLambdaErrors",
        "AWS/Lambda",
        "Errors",
        {"FunctionName": proxy_name},
        "Sum",
        3,
    )

    gateway_proxy_composite = cloudwatch.CompositeAlarm(
        self,
        "ApiLambdaCompositeAlarm",
        alarm_description="Critical: Both API Gateway and Lambda proxy have errors",
        alarm_rule=cloudwatch.AlarmRule.all_of(gateway_5xx_alarm, proxy_errors_alarm),
    )
    gateway_proxy_composite.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
def _create_custom_metrics(self) -> None:
    """Create the per-region application log groups.

    For every regional stack, two log groups are created: one for the
    health monitor and one for the manifest processor. Both are retained
    for one month and destroyed together with the stack. No metric filters
    are attached in this method.
    """
    # Construct-ID prefixes, in creation order (health monitor first).
    log_group_prefixes = ("HealthMonitorLogGroup", "ManifestProcessorLogGroup")

    for stack in self.regional_stacks:
        suffix = stack.deployment_region.replace("-", "").title()

        for prefix in log_group_prefixes:
            # log_group_name intentionally omitted - let CDK generate unique name
            logs.LogGroup(
                self,
                f"{prefix}{suffix}",
                retention=logs.RetentionDays.ONE_MONTH,
                removal_policy=RemovalPolicy.DESTROY,
            )
def _create_outputs(self) -> None:
    """Emit the stack's CloudFormation outputs.

    Three outputs are created, in this order:
    - ``DashboardUrl``: console link built from ``self.region`` and the
      dashboard's name.
    - ``AlertTopicArn``: ARN of ``self.alert_topic``.
    - ``AlarmCount``: a fixed informational string pointing to the
      CloudWatch Alarms console.
    """
    dashboard_url = f"https://console.aws.amazon.com/cloudwatch/home?region={self.region}#dashboards:name={self.dashboard.dashboard_name}"

    # (construct id, value, description) for each output, in creation order.
    output_specs = (
        ("DashboardUrl", dashboard_url, "CloudWatch Dashboard URL"),
        (
            "AlertTopicArn",
            self.alert_topic.topic_arn,
            "SNS Topic ARN for monitoring alerts",
        ),
        (
            "AlarmCount",
            "See CloudWatch Alarms console for full list",
            "Monitoring alarms created",
        ),
    )

    for output_id, output_value, output_description in output_specs:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=output_description,
        )