Coverage for gco / stacks / monitoring_stack.py: 98%
266 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
1"""
2Monitoring stack for GCO (Global Capacity Orchestrator on AWS) - Cross-region monitoring and observability.
4This stack creates centralized monitoring resources for all GCO deployments:
5- CloudWatch Dashboard with comprehensive widgets for all regions
6- SNS topic for alerting
7- CloudWatch Alarms for critical metrics
8- Log groups for application logs
9- Anomaly detection for traffic patterns
10- Composite alarms for better signal-to-noise
12Dashboard Sections:
13- Global Accelerator: Flow counts, processed bytes
14- API Gateway: Request counts, latency, error rates
15- Lambda Functions: Invocations, errors, duration, throttles
16- SQS Queues: Message counts, age, dead letter queue depth
17- DynamoDB Tables: Capacity, latency, throttles, errors
18- EKS Clusters: CPU/memory utilization per region
19- ALBs: Request counts, response times, healthy hosts
20- Applications: Custom metrics from health monitor and manifest processor
22Cross-Region Metrics:
23 CloudWatch metrics are region-specific. This stack handles cross-region
24 monitoring by specifying the `region` parameter on metrics:
25 - Global Accelerator metrics: Always in us-west-2
26 - DynamoDB metrics: In the global region (where tables are deployed)
27 - Regional metrics: In each cluster's region
29Alarms:
30- High CPU/memory utilization on EKS clusters
31- Unhealthy hosts in ALB target groups
32- High response times
33- Manifest processing failures
34- Lambda errors and throttles
35- SQS message age (stuck jobs)
36- DynamoDB throttling and system errors
37- API Gateway 5XX errors
38- Secret rotation failures
39"""
41from typing import TYPE_CHECKING, Any
43from aws_cdk import (
44 CfnOutput,
45 Duration,
46 RemovalPolicy,
47 Stack,
48)
49from aws_cdk import aws_cloudwatch as cloudwatch
50from aws_cdk import aws_cloudwatch_actions as cw_actions
51from aws_cdk import aws_logs as logs
52from aws_cdk import aws_sns as sns
53from constructs import Construct
55from gco.config.config_loader import ConfigLoader
57if TYPE_CHECKING:
58 from gco.stacks.api_gateway_global_stack import GCOApiGatewayGlobalStack
59 from gco.stacks.global_stack import GCOGlobalStack
60 from gco.stacks.regional_stack import GCORegionalStack
63class GCOMonitoringStack(Stack):
64 """
65 Cross-region monitoring and observability stack.
67 Creates a centralized CloudWatch dashboard and alarms that aggregate
68 metrics from all regional deployments.
70 Attributes:
71 alert_topic: SNS topic for alarm notifications
72 dashboard: CloudWatch dashboard with all monitoring widgets
73 """
    def __init__(
        self,
        scope: Construct,
        construct_id: str,
        config: ConfigLoader,
        global_stack: GCOGlobalStack,
        regional_stacks: list[GCORegionalStack],
        api_gateway_stack: GCOApiGatewayGlobalStack | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Wire up every monitoring resource for the deployment.

        Args:
            scope: CDK construct scope.
            construct_id: Logical ID of this stack.
            config: Loader supplying project name, region list, and global region.
            global_stack: Global stack exposing the accelerator ID and DynamoDB tables.
            regional_stacks: One stack per deployment region (EKS cluster, SQS queues,
                regional Lambdas).
            api_gateway_stack: Optional API Gateway stack; when None, dashboard
                widgets fall back to a default API name and its Lambdas are skipped.
            **kwargs: Forwarded to aws_cdk.Stack (env, description, ...).

        NOTE(review): the stack-type annotations above are imported only under
        TYPE_CHECKING; this relies on annotations not being evaluated at runtime —
        confirm the project's Python version / annotation mode supports that.
        """
        super().__init__(scope, construct_id, **kwargs)

        # Keep references used by the widget/alarm builder methods below.
        self.config = config
        self.global_stack = global_stack
        self.regional_stacks = regional_stacks
        self.api_gateway_stack = api_gateway_stack
        self.project_name = config.get_project_name()
        self.regions = config.get_regions()

        # Create SNS topic for alerts first — the alarm builders below
        # presumably attach it as their alarm action (verify in _create_alarms).
        self.alert_topic = self._create_alert_topic()

        # Create CloudWatch dashboard aggregating all sections/regions
        self.dashboard = self._create_dashboard()

        # Create alarms
        self._create_alarms()

        # Create composite alarms (better signal-to-noise per module docstring)
        self._create_composite_alarms()

        # Create custom metrics
        self._create_custom_metrics()

        # Export monitoring resources as stack outputs
        self._create_outputs()

        # Apply cdk-nag suppressions last, once all resources exist
        self._apply_nag_suppressions()
115 def _apply_nag_suppressions(self) -> None:
116 """Apply cdk-nag suppressions for this stack."""
117 from gco.stacks.nag_suppressions import apply_all_suppressions
119 apply_all_suppressions(
120 self,
121 stack_type="monitoring",
122 regions=self.config.get_regions(),
123 global_region=self.config.get_global_region(),
124 )
126 def _create_alert_topic(self) -> sns.Topic:
127 """Create SNS topic for monitoring alerts"""
128 topic = sns.Topic(
129 self,
130 "GCOAlertTopic",
131 display_name="GCO (Global Capacity Orchestrator on AWS) Monitoring Alerts",
132 enforce_ssl=True,
133 )
134 return topic
136 def _create_dashboard(self) -> cloudwatch.Dashboard:
137 """Create comprehensive CloudWatch dashboard for monitoring"""
138 dashboard = cloudwatch.Dashboard(
139 self,
140 "GCODashboard",
141 period_override=cloudwatch.PeriodOverride.AUTO,
142 )
144 # Add widgets in logical order
145 dashboard.add_widgets(*self._create_global_accelerator_widgets())
146 dashboard.add_widgets(*self._create_api_gateway_widgets())
147 dashboard.add_widgets(*self._create_lambda_widgets())
148 dashboard.add_widgets(*self._create_sqs_widgets())
149 dashboard.add_widgets(*self._create_dynamodb_widgets())
150 dashboard.add_widgets(*self._create_eks_widgets())
151 dashboard.add_widgets(*self._create_gpu_widgets())
152 dashboard.add_widgets(*self._create_alb_widgets())
153 dashboard.add_widgets(*self._create_application_widgets())
155 return dashboard
157 def _create_global_accelerator_widgets(self) -> list[cloudwatch.IWidget]:
158 """Create Global Accelerator monitoring widgets.
160 Note: Global Accelerator metrics are only available in us-west-2,
161 regardless of where the accelerator endpoints are located.
162 CloudWatch uses the Accelerator ID (UUID), not the name.
163 """
164 widgets: list[cloudwatch.IWidget] = []
166 # Get the accelerator ID from the global stack (CloudWatch uses ID, not name)
167 accelerator_id = self.global_stack.accelerator_id
169 # Global Accelerator metrics are always in us-west-2
170 ga_metrics_region = "us-west-2"
172 # Section header
173 widgets.append(
174 cloudwatch.TextWidget(
175 markdown="# Global Accelerator\nTraffic distribution and connectivity metrics",
176 width=24,
177 height=1,
178 )
179 )
181 # Flow count with anomaly detection
182 flow_count_widget = cloudwatch.GraphWidget(
183 title="Global Accelerator - New Flows",
184 left=[
185 cloudwatch.Metric(
186 namespace="AWS/GlobalAccelerator",
187 metric_name="NewFlowCount",
188 dimensions_map={"Accelerator": accelerator_id},
189 statistic="Sum",
190 period=Duration.minutes(5),
191 region=ga_metrics_region,
192 )
193 ],
194 width=12,
195 height=6,
196 region=ga_metrics_region,
197 )
198 widgets.append(flow_count_widget)
200 # Processed bytes
201 bytes_widget = cloudwatch.GraphWidget(
202 title="Global Accelerator - Processed Bytes",
203 left=[
204 cloudwatch.Metric(
205 namespace="AWS/GlobalAccelerator",
206 metric_name="ProcessedBytesIn",
207 dimensions_map={"Accelerator": accelerator_id},
208 statistic="Sum",
209 period=Duration.minutes(5),
210 region=ga_metrics_region,
211 ),
212 cloudwatch.Metric(
213 namespace="AWS/GlobalAccelerator",
214 metric_name="ProcessedBytesOut",
215 dimensions_map={"Accelerator": accelerator_id},
216 statistic="Sum",
217 period=Duration.minutes(5),
218 region=ga_metrics_region,
219 ),
220 ],
221 width=12,
222 height=6,
223 region=ga_metrics_region,
224 )
225 widgets.append(bytes_widget)
227 return widgets
229 def _create_api_gateway_widgets(self) -> list[cloudwatch.IWidget]:
230 """Create API Gateway monitoring widgets"""
231 widgets: list[cloudwatch.IWidget] = []
233 # Get the actual API name from the api_gateway_stack
234 api_name = (
235 self.api_gateway_stack.api.rest_api_name if self.api_gateway_stack else "gco-global-api"
236 )
238 # API Gateway metrics are in the region where the API is deployed
239 api_gw_region = self.config.get_api_gateway_region()
241 # Section header
242 widgets.append(
243 cloudwatch.TextWidget(
244 markdown="# API Gateway\nRequest metrics, latency, and error rates",
245 width=24,
246 height=1,
247 )
248 )
250 # Request count and latency
251 request_widget = cloudwatch.GraphWidget(
252 title="API Gateway - Requests & Latency",
253 left=[
254 cloudwatch.Metric(
255 namespace="AWS/ApiGateway",
256 metric_name="Count",
257 dimensions_map={"ApiName": api_name},
258 statistic="Sum",
259 period=Duration.minutes(5),
260 region=api_gw_region,
261 )
262 ],
263 right=[
264 cloudwatch.Metric(
265 namespace="AWS/ApiGateway",
266 metric_name="Latency",
267 dimensions_map={"ApiName": api_name},
268 statistic="Average",
269 period=Duration.minutes(5),
270 region=api_gw_region,
271 ),
272 cloudwatch.Metric(
273 namespace="AWS/ApiGateway",
274 metric_name="Latency",
275 dimensions_map={"ApiName": api_name},
276 statistic="p99",
277 period=Duration.minutes(5),
278 region=api_gw_region,
279 ),
280 ],
281 width=12,
282 height=6,
283 region=api_gw_region,
284 )
285 widgets.append(request_widget)
287 # Error rates (4XX and 5XX)
288 error_widget = cloudwatch.GraphWidget(
289 title="API Gateway - Error Rates",
290 left=[
291 cloudwatch.Metric(
292 namespace="AWS/ApiGateway",
293 metric_name="4XXError",
294 dimensions_map={"ApiName": api_name},
295 statistic="Sum",
296 period=Duration.minutes(5),
297 color="#ff7f0e",
298 region=api_gw_region,
299 ),
300 cloudwatch.Metric(
301 namespace="AWS/ApiGateway",
302 metric_name="5XXError",
303 dimensions_map={"ApiName": api_name},
304 statistic="Sum",
305 period=Duration.minutes(5),
306 color="#d62728",
307 region=api_gw_region,
308 ),
309 ],
310 width=12,
311 height=6,
312 region=api_gw_region,
313 )
314 widgets.append(error_widget)
316 return widgets
318 def _create_lambda_widgets(self) -> list[cloudwatch.IWidget]:
319 """Create Lambda function monitoring widgets"""
320 widgets: list[cloudwatch.IWidget] = []
322 # Section header
323 widgets.append(
324 cloudwatch.TextWidget(
325 markdown="# Lambda Functions\nProxy, rotation, and regional Lambda metrics",
326 width=24,
327 height=1,
328 )
329 )
331 # Get API Gateway region for global Lambda functions
332 api_gw_region = self.config.get_api_gateway_region()
334 # Build Lambda function list: (function_name, label, region)
335 lambda_functions: list[tuple[str, str, str]] = []
337 # Add API Gateway Lambda functions if available
338 if self.api_gateway_stack: 338 ↛ 355line 338 didn't jump to line 355 because the condition on line 338 was always true
339 lambda_functions.append(
340 (
341 self.api_gateway_stack.proxy_lambda.function_name,
342 "API Gateway Proxy",
343 api_gw_region,
344 )
345 )
346 lambda_functions.append(
347 (
348 self.api_gateway_stack.rotation_lambda.function_name,
349 "Secret Rotation",
350 api_gw_region,
351 )
352 )
354 # Add regional Lambda functions from each regional stack
355 for regional_stack in self.regional_stacks:
356 region = regional_stack.deployment_region
357 lambda_functions.extend(
358 [
359 (
360 regional_stack.kubectl_lambda_function_name,
361 f"Kubectl Applier ({region})",
362 region,
363 ),
364 (
365 regional_stack.helm_installer_lambda_function_name,
366 f"Helm Installer ({region})",
367 region,
368 ),
369 ]
370 )
372 # Invocations widget
373 invocations_widget = cloudwatch.GraphWidget(
374 title="Lambda - Invocations",
375 left=[
376 cloudwatch.Metric(
377 namespace="AWS/Lambda",
378 metric_name="Invocations",
379 dimensions_map={"FunctionName": func_name},
380 statistic="Sum",
381 period=Duration.minutes(5),
382 label=label,
383 region=region,
384 )
385 for func_name, label, region in lambda_functions[:5]
386 ],
387 width=12,
388 height=6,
389 )
390 widgets.append(invocations_widget)
392 errors_widget = cloudwatch.GraphWidget(
393 title="Lambda - Errors",
394 left=[
395 cloudwatch.Metric(
396 namespace="AWS/Lambda",
397 metric_name="Errors",
398 dimensions_map={"FunctionName": func_name},
399 statistic="Sum",
400 period=Duration.minutes(5),
401 label=label,
402 color="#d62728",
403 region=region,
404 )
405 for func_name, label, region in lambda_functions[:5]
406 ],
407 width=12,
408 height=6,
409 )
410 widgets.append(errors_widget)
412 # Duration widget
413 duration_widget = cloudwatch.GraphWidget(
414 title="Lambda - Duration (ms)",
415 left=[
416 cloudwatch.Metric(
417 namespace="AWS/Lambda",
418 metric_name="Duration",
419 dimensions_map={"FunctionName": func_name},
420 statistic="Average",
421 period=Duration.minutes(5),
422 label=label,
423 region=region,
424 )
425 for func_name, label, region in lambda_functions[:5]
426 ],
427 width=12,
428 height=6,
429 )
430 widgets.append(duration_widget)
432 # Throttles widget
433 throttles_widget = cloudwatch.GraphWidget(
434 title="Lambda - Throttles & Concurrent Executions",
435 left=[
436 cloudwatch.Metric(
437 namespace="AWS/Lambda",
438 metric_name="Throttles",
439 dimensions_map={"FunctionName": func_name},
440 statistic="Sum",
441 period=Duration.minutes(5),
442 label=f"{label} Throttles",
443 region=region,
444 )
445 for func_name, label, region in lambda_functions[:3]
446 ],
447 right=[
448 cloudwatch.Metric(
449 namespace="AWS/Lambda",
450 metric_name="ConcurrentExecutions",
451 dimensions_map={"FunctionName": func_name},
452 statistic="Maximum",
453 period=Duration.minutes(5),
454 label=f"{label} Concurrent",
455 region=region,
456 )
457 for func_name, label, region in lambda_functions[:3]
458 ],
459 width=12,
460 height=6,
461 )
462 widgets.append(throttles_widget)
464 return widgets
466 def _create_sqs_widgets(self) -> list[cloudwatch.IWidget]:
467 """Create SQS queue monitoring widgets"""
468 widgets: list[cloudwatch.IWidget] = []
470 # Section header
471 widgets.append(
472 cloudwatch.TextWidget(
473 markdown="# SQS Queues\nJob submission queue metrics and dead letter queue",
474 width=24,
475 height=1,
476 )
477 )
479 # Build queue info from regional stacks: (queue_name, dlq_name, region)
480 queue_info = [
481 (
482 regional_stack.job_queue.queue_name,
483 regional_stack.job_dlq.queue_name,
484 regional_stack.deployment_region,
485 )
486 for regional_stack in self.regional_stacks
487 ]
489 # Messages visible and in-flight per region
490 messages_widget = cloudwatch.GraphWidget(
491 title="SQS - Messages (Visible & In-Flight)",
492 left=[
493 cloudwatch.Metric(
494 namespace="AWS/SQS",
495 metric_name="ApproximateNumberOfMessagesVisible",
496 dimensions_map={"QueueName": queue_name},
497 statistic="Average",
498 period=Duration.minutes(1),
499 label=f"{region} Visible",
500 region=region,
501 )
502 for queue_name, _, region in queue_info
503 ],
504 right=[
505 cloudwatch.Metric(
506 namespace="AWS/SQS",
507 metric_name="ApproximateNumberOfMessagesNotVisible",
508 dimensions_map={"QueueName": queue_name},
509 statistic="Average",
510 period=Duration.minutes(1),
511 label=f"{region} In-Flight",
512 region=region,
513 )
514 for queue_name, _, region in queue_info
515 ],
516 width=12,
517 height=6,
518 )
519 widgets.append(messages_widget)
521 # Age of oldest message (critical for detecting stuck jobs)
522 age_widget = cloudwatch.GraphWidget(
523 title="SQS - Age of Oldest Message (seconds)",
524 left=[
525 cloudwatch.Metric(
526 namespace="AWS/SQS",
527 metric_name="ApproximateAgeOfOldestMessage",
528 dimensions_map={"QueueName": queue_name},
529 statistic="Maximum",
530 period=Duration.minutes(1),
531 label=region,
532 region=region,
533 )
534 for queue_name, _, region in queue_info
535 ],
536 width=12,
537 height=6,
538 )
539 widgets.append(age_widget)
541 # Dead letter queue depth
542 dlq_widget = cloudwatch.GraphWidget(
543 title="SQS - Dead Letter Queue Depth",
544 left=[
545 cloudwatch.Metric(
546 namespace="AWS/SQS",
547 metric_name="ApproximateNumberOfMessagesVisible",
548 dimensions_map={"QueueName": dlq_name},
549 statistic="Average",
550 period=Duration.minutes(1),
551 label=f"{region} DLQ",
552 color="#d62728",
553 region=region,
554 )
555 for _, dlq_name, region in queue_info
556 ],
557 width=12,
558 height=6,
559 )
560 widgets.append(dlq_widget)
562 # Messages sent/received/deleted
563 throughput_widget = cloudwatch.GraphWidget(
564 title="SQS - Throughput",
565 left=[
566 cloudwatch.Metric(
567 namespace="AWS/SQS",
568 metric_name="NumberOfMessagesSent",
569 dimensions_map={"QueueName": queue_name},
570 statistic="Sum",
571 period=Duration.minutes(5),
572 label=f"{region} Sent",
573 region=region,
574 )
575 for queue_name, _, region in queue_info
576 ],
577 right=[
578 cloudwatch.Metric(
579 namespace="AWS/SQS",
580 metric_name="NumberOfMessagesDeleted",
581 dimensions_map={"QueueName": queue_name},
582 statistic="Sum",
583 period=Duration.minutes(5),
584 label=f"{region} Processed",
585 region=region,
586 )
587 for queue_name, _, region in queue_info
588 ],
589 width=12,
590 height=6,
591 )
592 widgets.append(throughput_widget)
594 return widgets
596 def _create_dynamodb_widgets(self) -> list[cloudwatch.IWidget]:
597 """Create DynamoDB monitoring widgets for job queue, templates, and webhooks tables."""
598 widgets: list[cloudwatch.IWidget] = []
600 # Get table names from global stack
601 templates_table = self.global_stack.templates_table.table_name
602 webhooks_table = self.global_stack.webhooks_table.table_name
603 jobs_table = self.global_stack.jobs_table.table_name
605 # DynamoDB tables are in the global region
606 global_region = self.config.get_global_region()
608 # Section header
609 widgets.append(
610 cloudwatch.TextWidget(
611 markdown="# DynamoDB Tables\nJob queue, templates, and webhooks storage metrics",
612 width=24,
613 height=1,
614 )
615 )
617 # Read/Write capacity consumed
618 capacity_widget = cloudwatch.GraphWidget(
619 title="DynamoDB - Consumed Capacity",
620 left=[
621 cloudwatch.Metric(
622 namespace="AWS/DynamoDB",
623 metric_name="ConsumedReadCapacityUnits",
624 dimensions_map={"TableName": jobs_table},
625 statistic="Sum",
626 period=Duration.minutes(5),
627 label="Jobs Read",
628 region=global_region,
629 ),
630 cloudwatch.Metric(
631 namespace="AWS/DynamoDB",
632 metric_name="ConsumedReadCapacityUnits",
633 dimensions_map={"TableName": templates_table},
634 statistic="Sum",
635 period=Duration.minutes(5),
636 label="Templates Read",
637 region=global_region,
638 ),
639 cloudwatch.Metric(
640 namespace="AWS/DynamoDB",
641 metric_name="ConsumedReadCapacityUnits",
642 dimensions_map={"TableName": webhooks_table},
643 statistic="Sum",
644 period=Duration.minutes(5),
645 label="Webhooks Read",
646 region=global_region,
647 ),
648 ],
649 right=[
650 cloudwatch.Metric(
651 namespace="AWS/DynamoDB",
652 metric_name="ConsumedWriteCapacityUnits",
653 dimensions_map={"TableName": jobs_table},
654 statistic="Sum",
655 period=Duration.minutes(5),
656 label="Jobs Write",
657 region=global_region,
658 ),
659 cloudwatch.Metric(
660 namespace="AWS/DynamoDB",
661 metric_name="ConsumedWriteCapacityUnits",
662 dimensions_map={"TableName": templates_table},
663 statistic="Sum",
664 period=Duration.minutes(5),
665 label="Templates Write",
666 region=global_region,
667 ),
668 ],
669 width=12,
670 height=6,
671 region=global_region,
672 )
673 widgets.append(capacity_widget)
675 # Latency metrics
676 latency_widget = cloudwatch.GraphWidget(
677 title="DynamoDB - Latency (ms)",
678 left=[
679 cloudwatch.Metric(
680 namespace="AWS/DynamoDB",
681 metric_name="SuccessfulRequestLatency",
682 dimensions_map={"TableName": jobs_table, "Operation": "GetItem"},
683 statistic="Average",
684 period=Duration.minutes(5),
685 label="Jobs GetItem",
686 region=global_region,
687 ),
688 cloudwatch.Metric(
689 namespace="AWS/DynamoDB",
690 metric_name="SuccessfulRequestLatency",
691 dimensions_map={"TableName": jobs_table, "Operation": "PutItem"},
692 statistic="Average",
693 period=Duration.minutes(5),
694 label="Jobs PutItem",
695 region=global_region,
696 ),
697 cloudwatch.Metric(
698 namespace="AWS/DynamoDB",
699 metric_name="SuccessfulRequestLatency",
700 dimensions_map={"TableName": jobs_table, "Operation": "Query"},
701 statistic="Average",
702 period=Duration.minutes(5),
703 label="Jobs Query",
704 region=global_region,
705 ),
706 ],
707 width=12,
708 height=6,
709 region=global_region,
710 )
711 widgets.append(latency_widget)
713 # Throttled requests
714 throttle_widget = cloudwatch.GraphWidget(
715 title="DynamoDB - Throttled Requests",
716 left=[
717 cloudwatch.Metric(
718 namespace="AWS/DynamoDB",
719 metric_name="ThrottledRequests",
720 dimensions_map={"TableName": jobs_table},
721 statistic="Sum",
722 period=Duration.minutes(5),
723 label="Jobs",
724 color="#d62728",
725 region=global_region,
726 ),
727 cloudwatch.Metric(
728 namespace="AWS/DynamoDB",
729 metric_name="ThrottledRequests",
730 dimensions_map={"TableName": templates_table},
731 statistic="Sum",
732 period=Duration.minutes(5),
733 label="Templates",
734 color="#ff7f0e",
735 region=global_region,
736 ),
737 cloudwatch.Metric(
738 namespace="AWS/DynamoDB",
739 metric_name="ThrottledRequests",
740 dimensions_map={"TableName": webhooks_table},
741 statistic="Sum",
742 period=Duration.minutes(5),
743 label="Webhooks",
744 color="#9467bd",
745 region=global_region,
746 ),
747 ],
748 width=12,
749 height=6,
750 region=global_region,
751 )
752 widgets.append(throttle_widget)
754 # System errors
755 errors_widget = cloudwatch.GraphWidget(
756 title="DynamoDB - System Errors",
757 left=[
758 cloudwatch.Metric(
759 namespace="AWS/DynamoDB",
760 metric_name="SystemErrors",
761 dimensions_map={"TableName": jobs_table},
762 statistic="Sum",
763 period=Duration.minutes(5),
764 label="Jobs",
765 color="#d62728",
766 region=global_region,
767 ),
768 cloudwatch.Metric(
769 namespace="AWS/DynamoDB",
770 metric_name="SystemErrors",
771 dimensions_map={"TableName": templates_table},
772 statistic="Sum",
773 period=Duration.minutes(5),
774 label="Templates",
775 color="#ff7f0e",
776 region=global_region,
777 ),
778 ],
779 width=12,
780 height=6,
781 region=global_region,
782 )
783 widgets.append(errors_widget)
785 return widgets
787 def _create_eks_widgets(self) -> list[cloudwatch.IWidget]:
788 """Create EKS cluster monitoring widgets"""
789 widgets: list[cloudwatch.IWidget] = []
791 # Section header
792 widgets.append(
793 cloudwatch.TextWidget(
794 markdown="# EKS Clusters\nCluster resource utilization and node metrics",
795 width=24,
796 height=1,
797 )
798 )
800 # Build cluster info from regional stacks: (cluster_name, region)
801 cluster_info = [
802 (regional_stack.cluster.cluster_name, regional_stack.deployment_region)
803 for regional_stack in self.regional_stacks
804 ]
806 # EKS cluster status
807 cluster_status_widget = cloudwatch.SingleValueWidget(
808 title="EKS Clusters - Failed Requests",
809 metrics=[
810 cloudwatch.Metric(
811 namespace="AWS/EKS",
812 metric_name="cluster_failed_request_count",
813 dimensions_map={"cluster_name": cluster_name},
814 statistic="Sum",
815 period=Duration.minutes(5),
816 region=region,
817 )
818 for cluster_name, region in cluster_info
819 ],
820 width=12,
821 height=6,
822 )
823 widgets.append(cluster_status_widget)
825 # Container Insights - Node CPU utilization (aggregated across all nodes)
826 # Note: region parameter enables cross-region metrics in dashboard
827 cpu_widget = cloudwatch.GraphWidget(
828 title="EKS Clusters - Node CPU Utilization (%)",
829 left=[
830 cloudwatch.Metric(
831 namespace="ContainerInsights",
832 metric_name="node_cpu_utilization",
833 dimensions_map={"ClusterName": cluster_name},
834 statistic="Average",
835 period=Duration.minutes(5),
836 label=region,
837 region=region,
838 )
839 for cluster_name, region in cluster_info
840 ],
841 width=12,
842 height=6,
843 )
844 widgets.append(cpu_widget)
846 # Container Insights - Node Memory utilization (aggregated across all nodes)
847 memory_widget = cloudwatch.GraphWidget(
848 title="EKS Clusters - Node Memory Utilization (%)",
849 left=[
850 cloudwatch.Metric(
851 namespace="ContainerInsights",
852 metric_name="node_memory_utilization",
853 dimensions_map={"ClusterName": cluster_name},
854 statistic="Average",
855 period=Duration.minutes(5),
856 label=region,
857 region=region,
858 )
859 for cluster_name, region in cluster_info
860 ],
861 width=12,
862 height=6,
863 )
864 widgets.append(memory_widget)
866 # Node status - running pods capacity
867 node_widget = cloudwatch.GraphWidget(
868 title="EKS Clusters - Node Pod Capacity",
869 left=[
870 cloudwatch.Metric(
871 namespace="ContainerInsights",
872 metric_name="node_status_capacity_pods",
873 dimensions_map={"ClusterName": cluster_name},
874 statistic="Sum",
875 period=Duration.minutes(5),
876 label=f"{region} Capacity",
877 region=region,
878 )
879 for cluster_name, region in cluster_info
880 ],
881 right=[
882 cloudwatch.Metric(
883 namespace="ContainerInsights",
884 metric_name="node_number_of_running_pods",
885 dimensions_map={"ClusterName": cluster_name},
886 statistic="Sum",
887 period=Duration.minutes(5),
888 label=f"{region} Running",
889 region=region,
890 )
891 for cluster_name, region in cluster_info
892 ],
893 width=12,
894 height=6,
895 )
896 widgets.append(node_widget)
898 return widgets
900 def _create_gpu_widgets(self) -> list[cloudwatch.IWidget]:
901 """Create GPU monitoring widgets using DCGM Exporter metrics via ContainerInsights."""
902 widgets: list[cloudwatch.IWidget] = []
904 widgets.append(
905 cloudwatch.TextWidget(
906 markdown="# GPU Metrics\nGPU utilization, memory, and temperature from DCGM Exporter",
907 width=24,
908 height=1,
909 )
910 )
912 cluster_info = [
913 (regional_stack.cluster.cluster_name, regional_stack.deployment_region)
914 for regional_stack in self.regional_stacks
915 ]
917 # GPU utilization percentage
918 gpu_util_widget = cloudwatch.GraphWidget(
919 title="GPU Utilization (%)",
920 left=[
921 cloudwatch.Metric(
922 namespace="ContainerInsights",
923 metric_name="node_gpu_utilization",
924 dimensions_map={"ClusterName": cluster_name},
925 statistic="Average",
926 period=Duration.minutes(5),
927 label=region,
928 region=region,
929 )
930 for cluster_name, region in cluster_info
931 ],
932 width=12,
933 height=6,
934 )
935 widgets.append(gpu_util_widget)
937 # GPU memory utilization
938 gpu_mem_widget = cloudwatch.GraphWidget(
939 title="GPU Memory Utilization (%)",
940 left=[
941 cloudwatch.Metric(
942 namespace="ContainerInsights",
943 metric_name="node_gpu_memory_utilization",
944 dimensions_map={"ClusterName": cluster_name},
945 statistic="Average",
946 period=Duration.minutes(5),
947 label=region,
948 region=region,
949 )
950 for cluster_name, region in cluster_info
951 ],
952 width=12,
953 height=6,
954 )
955 widgets.append(gpu_mem_widget)
957 # GPU temperature
958 gpu_temp_widget = cloudwatch.GraphWidget(
959 title="GPU Temperature (°C)",
960 left=[
961 cloudwatch.Metric(
962 namespace="ContainerInsights",
963 metric_name="node_gpu_temperature",
964 dimensions_map={"ClusterName": cluster_name},
965 statistic="Maximum",
966 period=Duration.minutes(5),
967 label=region,
968 region=region,
969 )
970 for cluster_name, region in cluster_info
971 ],
972 width=12,
973 height=6,
974 )
975 widgets.append(gpu_temp_widget)
977 # GPU count (active GPUs)
978 gpu_count_widget = cloudwatch.GraphWidget(
979 title="Active GPU Count",
980 left=[
981 cloudwatch.Metric(
982 namespace="ContainerInsights",
983 metric_name="node_gpu_limit",
984 dimensions_map={"ClusterName": cluster_name},
985 statistic="Sum",
986 period=Duration.minutes(5),
987 label=region,
988 region=region,
989 )
990 for cluster_name, region in cluster_info
991 ],
992 width=12,
993 height=6,
994 )
995 widgets.append(gpu_count_widget)
997 return widgets
999 def _create_alb_widgets(self) -> list[cloudwatch.IWidget]:
1000 """Create ALB monitoring widgets.
1002 Note: ALBs are created by the AWS Load Balancer Controller in Kubernetes
1003 via Ingress resources, not by CDK. The controller uses a naming convention:
1004 k8s-<namespace>-<ingress-name>-<hash>
1006 Since we can't know the exact ALB name at CDK synth time (includes a hash),
1007 we use CloudWatch SEARCH expressions to dynamically find ALBs matching
1008 the prefix pattern at dashboard render time.
1009 """
1010 widgets: list[cloudwatch.IWidget] = []
1012 # Section header
1013 widgets.append(
1014 cloudwatch.TextWidget(
1015 markdown="# Application Load Balancers\n"
1016 "Request metrics and health status. "
1017 "Uses CloudWatch SEARCH to dynamically find ALBs created by "
1018 "AWS Load Balancer Controller.",
1019 width=24,
1020 height=1,
1021 )
1022 )
1024 # Create one widget per region for ALB request count
1025 for region in self.regions:
1026 request_count_widget = cloudwatch.GraphWidget(
1027 title=f"ALB - Request Count ({region})",
1028 left=[
1029 cloudwatch.MathExpression(
1030 expression=(
1031 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1032 'MetricName="RequestCount"\', "Sum", 300)'
1033 ),
1034 label="Request Count",
1035 period=Duration.minutes(5),
1036 )
1037 ],
1038 width=12,
1039 height=6,
1040 region=region,
1041 )
1042 widgets.append(request_count_widget)
1044 # Create one widget per region for ALB response time
1045 for region in self.regions:
1046 response_time_widget = cloudwatch.GraphWidget(
1047 title=f"ALB - Response Time ({region})",
1048 left=[
1049 cloudwatch.MathExpression(
1050 expression=(
1051 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1052 'MetricName="TargetResponseTime"\', "Average", 300)'
1053 ),
1054 label="Avg Response Time",
1055 period=Duration.minutes(5),
1056 )
1057 ],
1058 width=12,
1059 height=6,
1060 region=region,
1061 )
1062 widgets.append(response_time_widget)
1064 # Create one widget per region for ALB HTTP errors
1065 for region in self.regions:
1066 http_errors_widget = cloudwatch.GraphWidget(
1067 title=f"ALB - HTTP Errors ({region})",
1068 left=[
1069 cloudwatch.MathExpression(
1070 expression=(
1071 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1072 'MetricName="HTTPCode_Target_4XX_Count"\', "Sum", 300)'
1073 ),
1074 label="4XX Errors",
1075 period=Duration.minutes(5),
1076 )
1077 ],
1078 right=[
1079 cloudwatch.MathExpression(
1080 expression=(
1081 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1082 'MetricName="HTTPCode_Target_5XX_Count"\', "Sum", 300)'
1083 ),
1084 label="5XX Errors",
1085 period=Duration.minutes(5),
1086 )
1087 ],
1088 width=12,
1089 height=6,
1090 region=region,
1091 )
1092 widgets.append(http_errors_widget)
1094 # Create one widget per region for ALB active connections
1095 for region in self.regions:
1096 connections_widget = cloudwatch.GraphWidget(
1097 title=f"ALB - Active Connections ({region})",
1098 left=[
1099 cloudwatch.MathExpression(
1100 expression=(
1101 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1102 'MetricName="ActiveConnectionCount"\', "Sum", 300)'
1103 ),
1104 label="Active Connections",
1105 period=Duration.minutes(5),
1106 )
1107 ],
1108 width=12,
1109 height=6,
1110 region=region,
1111 )
1112 widgets.append(connections_widget)
1114 return widgets
    def _create_application_widgets(self) -> list[cloudwatch.IWidget]:
        """Create custom application monitoring widgets.

        Builds the "Application Metrics" dashboard section in order:
        a text header, health-monitor resource utilization, manifest-processor
        submissions/failures, Container Insights pod restarts, and (when the
        API Gateway stack reference is available) secret-rotation Lambda
        invocations/errors; otherwise a fallback text widget.

        Returns:
            The widgets in dashboard display order.
        """
        widgets: list[cloudwatch.IWidget] = []

        # Section header pointing readers at the raw application logs.
        widgets.append(
            cloudwatch.TextWidget(
                markdown="# Application Metrics\n"
                "Health monitor and manifest processor metrics. "
                "Application logs are available in Container Insights at "
                "`/aws/containerinsights/<cluster>/application`.",
                width=24,
                height=1,
            )
        )

        # Build cluster info from regional stacks: (cluster_name, region).
        # Each pair drives one metric series in the per-cluster widgets below.
        cluster_info = [
            (regional_stack.cluster.cluster_name, regional_stack.deployment_region)
            for regional_stack in self.regional_stacks
        ]

        # Health monitor metrics: CPU on the left axis, memory on the right,
        # one series per cluster. The `region=` parameter makes CloudWatch
        # fetch each series from the cluster's own region (cross-region
        # dashboard support); the "Region" dimension mirrors it in the
        # custom-metric dimensions published under GCO/HealthMonitor.
        health_monitor_widget = cloudwatch.GraphWidget(
            title="Health Monitor - Resource Utilization",
            left=[
                cloudwatch.Metric(
                    namespace="GCO/HealthMonitor",
                    metric_name="ClusterCpuUtilization",
                    dimensions_map={
                        "ClusterName": cluster_name,
                        "Region": region,
                    },
                    statistic="Average",
                    period=Duration.minutes(5),
                    label=f"{region} CPU",
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            right=[
                cloudwatch.Metric(
                    namespace="GCO/HealthMonitor",
                    metric_name="ClusterMemoryUtilization",
                    dimensions_map={
                        "ClusterName": cluster_name,
                        "Region": region,
                    },
                    statistic="Average",
                    period=Duration.minutes(5),
                    label=f"{region} Memory",
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            width=12,
            height=6,
        )
        widgets.append(health_monitor_widget)

        # Manifest processor metrics: submissions (left axis) versus failures
        # (right axis, rendered red) per cluster.
        manifest_processor_widget = cloudwatch.GraphWidget(
            title="Manifest Processor - Submissions",
            left=[
                cloudwatch.Metric(
                    namespace="GCO/ManifestProcessor",
                    metric_name="ManifestSubmissions",
                    dimensions_map={
                        "ClusterName": cluster_name,
                        "Region": region,
                    },
                    statistic="Sum",
                    period=Duration.minutes(5),
                    label=f"{region} Submissions",
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            right=[
                cloudwatch.Metric(
                    namespace="GCO/ManifestProcessor",
                    metric_name="ManifestFailures",
                    dimensions_map={
                        "ClusterName": cluster_name,
                        "Region": region,
                    },
                    statistic="Sum",
                    period=Duration.minutes(5),
                    label=f"{region} Failures",
                    color="#d62728",  # red, to make failures stand out
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            width=12,
            height=6,
        )
        widgets.append(manifest_processor_widget)

        # Container Insights - Pod restarts (indicates application issues).
        # Note: unlike the custom metrics above, only ClusterName is a
        # dimension here; the series label is just the region string.
        pod_restarts_widget = cloudwatch.GraphWidget(
            title="Container Insights - Pod Restarts",
            left=[
                cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="pod_number_of_container_restarts",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Sum",
                    period=Duration.minutes(5),
                    label=f"{region}",
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            width=12,
            height=6,
        )
        widgets.append(pod_restarts_widget)

        # Secret rotation Lambda metrics (Secrets Manager doesn't publish rotation metrics,
        # so we monitor the rotation Lambda function instead). 1-hour periods
        # because rotations are infrequent events.
        if self.api_gateway_stack:
            rotation_function_name = self.api_gateway_stack.rotation_lambda.function_name
            api_gw_region = self.config.get_api_gateway_region()

            rotation_widget = cloudwatch.GraphWidget(
                title="Secret Rotation Lambda - Invocations & Errors",
                left=[
                    cloudwatch.Metric(
                        namespace="AWS/Lambda",
                        metric_name="Invocations",
                        dimensions_map={"FunctionName": rotation_function_name},
                        statistic="Sum",
                        period=Duration.hours(1),
                        label="Invocations",
                        color="#2ca02c",  # green
                        region=api_gw_region,
                    ),
                ],
                right=[
                    cloudwatch.Metric(
                        namespace="AWS/Lambda",
                        metric_name="Errors",
                        dimensions_map={"FunctionName": rotation_function_name},
                        statistic="Sum",
                        period=Duration.hours(1),
                        label="Errors",
                        color="#d62728",  # red
                        region=api_gw_region,
                    ),
                ],
                width=12,
                height=6,
            )
            widgets.append(rotation_widget)
        else:
            # Fallback text widget if api_gateway_stack not available
            fallback_widget = cloudwatch.TextWidget(
                markdown="**Secret Rotation:** API Gateway stack not configured. "
                "Rotation Lambda metrics unavailable.",
                width=12,
                height=6,
            )
            widgets.append(fallback_widget)

        return widgets
1283 def _create_alarms(self) -> None:
1284 """Create CloudWatch alarms"""
1285 self._create_global_accelerator_alarms()
1286 self._create_api_gateway_alarms()
1287 self._create_lambda_alarms()
1288 self._create_sqs_alarms()
1289 self._create_dynamodb_alarms()
1290 self._create_eks_alarms()
1291 self._create_alb_alarms()
1292 self._create_application_alarms()
1294 def _create_global_accelerator_alarms(self) -> None:
1295 """Create Global Accelerator alarms.
1297 Note: Global Accelerator metrics are only available in us-west-2.
1298 CloudWatch Alarms must be in the same region as the metrics they monitor.
1299 Since this monitoring stack may be deployed in a different region,
1300 we skip GA alarms here. To monitor GA, either:
1301 1. Create alarms manually in us-west-2
1302 2. Use CloudWatch cross-region dashboard widgets (which we do)
1303 3. Deploy a separate alarm stack in us-west-2
1304 """
1305 # GA alarms skipped - metrics only available in us-west-2
1306 # Dashboard widgets use region parameter to display GA metrics correctly
1307 pass
1309 def _create_api_gateway_alarms(self) -> None:
1310 """Create API Gateway alarms"""
1311 # Get the actual API name from the api_gateway_stack
1312 api_name = (
1313 self.api_gateway_stack.api.rest_api_name if self.api_gateway_stack else "gco-global-api"
1314 )
1316 # High 5XX error rate
1317 api_5xx_alarm = cloudwatch.Alarm(
1318 self,
1319 "ApiGateway5xxAlarm",
1320 alarm_description="API Gateway has high 5XX error rate",
1321 metric=cloudwatch.Metric(
1322 namespace="AWS/ApiGateway",
1323 metric_name="5XXError",
1324 dimensions_map={"ApiName": api_name},
1325 statistic="Sum",
1326 period=Duration.minutes(5),
1327 ),
1328 threshold=10,
1329 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1330 evaluation_periods=2,
1331 datapoints_to_alarm=2,
1332 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1333 )
1334 api_5xx_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1336 # High latency
1337 api_latency_alarm = cloudwatch.Alarm(
1338 self,
1339 "ApiGatewayHighLatencyAlarm",
1340 alarm_description="API Gateway has high latency",
1341 metric=cloudwatch.Metric(
1342 namespace="AWS/ApiGateway",
1343 metric_name="Latency",
1344 dimensions_map={"ApiName": api_name},
1345 statistic="p99",
1346 period=Duration.minutes(5),
1347 ),
1348 threshold=10000, # 10 seconds
1349 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1350 evaluation_periods=3,
1351 datapoints_to_alarm=2,
1352 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1353 )
1354 api_latency_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1356 def _create_lambda_alarms(self) -> None:
1357 """Create Lambda function alarms"""
1358 # Get Lambda function names from api_gateway_stack if available
1359 if self.api_gateway_stack: 1359 ↛ exitline 1359 didn't return from function '_create_lambda_alarms' because the condition on line 1359 was always true
1360 proxy_function_name = self.api_gateway_stack.proxy_lambda.function_name
1361 rotation_function_name = self.api_gateway_stack.rotation_lambda.function_name
1363 # API Gateway Proxy Lambda errors
1364 proxy_errors_alarm = cloudwatch.Alarm(
1365 self,
1366 "ProxyLambdaErrorsAlarm",
1367 alarm_description="API Gateway proxy Lambda has errors",
1368 metric=cloudwatch.Metric(
1369 namespace="AWS/Lambda",
1370 metric_name="Errors",
1371 dimensions_map={"FunctionName": proxy_function_name},
1372 statistic="Sum",
1373 period=Duration.minutes(5),
1374 ),
1375 threshold=5,
1376 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1377 evaluation_periods=2,
1378 datapoints_to_alarm=2,
1379 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1380 )
1381 proxy_errors_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1383 # Proxy Lambda throttles
1384 proxy_throttles_alarm = cloudwatch.Alarm(
1385 self,
1386 "ProxyLambdaThrottlesAlarm",
1387 alarm_description="API Gateway proxy Lambda is being throttled",
1388 metric=cloudwatch.Metric(
1389 namespace="AWS/Lambda",
1390 metric_name="Throttles",
1391 dimensions_map={"FunctionName": proxy_function_name},
1392 statistic="Sum",
1393 period=Duration.minutes(5),
1394 ),
1395 threshold=1,
1396 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
1397 evaluation_periods=2,
1398 datapoints_to_alarm=2,
1399 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1400 )
1401 proxy_throttles_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1403 # Secret rotation Lambda errors
1404 rotation_errors_alarm = cloudwatch.Alarm(
1405 self,
1406 "RotationLambdaErrorsAlarm",
1407 alarm_description="Secret rotation Lambda has errors",
1408 metric=cloudwatch.Metric(
1409 namespace="AWS/Lambda",
1410 metric_name="Errors",
1411 dimensions_map={"FunctionName": rotation_function_name},
1412 statistic="Sum",
1413 period=Duration.hours(1),
1414 ),
1415 threshold=1,
1416 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
1417 evaluation_periods=1,
1418 datapoints_to_alarm=1,
1419 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1420 )
1421 rotation_errors_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1423 def _create_sqs_alarms(self) -> None:
1424 """Create SQS queue alarms"""
1425 for regional_stack in self.regional_stacks:
1426 region = regional_stack.deployment_region
1427 queue_name = regional_stack.job_queue.queue_name
1428 dlq_name = regional_stack.job_dlq.queue_name
1429 region_id = region.replace("-", "").title()
1431 # Old message alarm (stuck jobs)
1432 old_message_alarm = cloudwatch.Alarm(
1433 self,
1434 f"SqsOldMessageAlarm{region_id}",
1435 alarm_description=f"SQS queue in {region} has old messages (potential stuck jobs)",
1436 metric=cloudwatch.Metric(
1437 namespace="AWS/SQS",
1438 metric_name="ApproximateAgeOfOldestMessage",
1439 dimensions_map={"QueueName": queue_name},
1440 statistic="Maximum",
1441 period=Duration.minutes(5),
1442 ),
1443 threshold=3600, # 1 hour
1444 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1445 evaluation_periods=2,
1446 datapoints_to_alarm=2,
1447 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1448 )
1449 old_message_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1451 # Dead letter queue alarm
1452 dlq_alarm = cloudwatch.Alarm(
1453 self,
1454 f"SqsDlqAlarm{region_id}",
1455 alarm_description=f"SQS dead letter queue in {region} has messages",
1456 metric=cloudwatch.Metric(
1457 namespace="AWS/SQS",
1458 metric_name="ApproximateNumberOfMessagesVisible",
1459 dimensions_map={"QueueName": dlq_name},
1460 statistic="Sum",
1461 period=Duration.minutes(5),
1462 ),
1463 threshold=1,
1464 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
1465 evaluation_periods=1,
1466 datapoints_to_alarm=1,
1467 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1468 )
1469 dlq_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
    def _create_dynamodb_alarms(self) -> None:
        """Create DynamoDB alarms for the jobs table.

        Currently only the jobs table is covered (throttling and system
        errors); the templates and webhooks tables mentioned in the module
        docstring have no alarms here.

        NOTE(review): these alarm metrics set ``region=global_region`` while
        the GA-alarm docstring in this same stack states that CloudWatch
        alarms must live in the metric's region. Confirm these alarms behave
        as intended when the global region differs from the region this
        monitoring stack is deployed in.
        """
        # Get table names from global stack
        jobs_table = self.global_stack.jobs_table.table_name

        # DynamoDB tables are in the global region
        global_region = self.config.get_global_region()

        # Jobs table throttling alarm: any throttled request in 2 consecutive
        # 5-minute windows pages via the alert topic.
        jobs_throttle_alarm = cloudwatch.Alarm(
            self,
            "DynamoDBJobsThrottleAlarm",
            alarm_description="DynamoDB jobs table is being throttled",
            metric=cloudwatch.Metric(
                namespace="AWS/DynamoDB",
                metric_name="ThrottledRequests",
                dimensions_map={"TableName": jobs_table},
                statistic="Sum",
                period=Duration.minutes(5),
                region=global_region,
            ),
            threshold=1,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=2,
            datapoints_to_alarm=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        jobs_throttle_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

        # Jobs table system errors alarm: fires on the first 5XX-class error
        # datapoint (single evaluation period).
        jobs_errors_alarm = cloudwatch.Alarm(
            self,
            "DynamoDBJobsErrorsAlarm",
            alarm_description="DynamoDB jobs table has system errors",
            metric=cloudwatch.Metric(
                namespace="AWS/DynamoDB",
                metric_name="SystemErrors",
                dimensions_map={"TableName": jobs_table},
                statistic="Sum",
                period=Duration.minutes(5),
                region=global_region,
            ),
            threshold=1,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=1,
            datapoints_to_alarm=1,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        jobs_errors_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
    def _create_eks_alarms(self) -> None:
        """Create EKS cluster alarms (node CPU and memory utilization).

        One CPU and one memory alarm per regional cluster, built from
        Container Insights node-level metrics.

        NOTE(review): unlike the dashboard widgets, these alarm metrics carry
        no ``region=`` parameter, so they are evaluated in this stack's own
        region. For clusters deployed in other regions the metric may have no
        data here, and ``NOT_BREACHING`` would keep the alarm permanently
        green — confirm this is intended (see the GA-alarm docstring in this
        stack for the same-region constraint on alarms).
        """
        for regional_stack in self.regional_stacks:
            region = regional_stack.deployment_region
            cluster_name = regional_stack.cluster.cluster_name
            # Region slug usable in construct IDs, e.g. "us-east-1" -> "Useast1"
            region_id = region.replace("-", "").title()

            # High CPU utilization alarm (node-level metric): >80% average
            # in 2 of 3 five-minute windows.
            high_cpu_alarm = cloudwatch.Alarm(
                self,
                f"EksHighCpuAlarm{region_id}",
                alarm_description=f"EKS cluster {cluster_name} has high CPU utilization",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="node_cpu_utilization",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=80,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=3,
                datapoints_to_alarm=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            high_cpu_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

            # High memory utilization alarm (node-level metric): >85% average
            # in 2 of 3 five-minute windows.
            high_memory_alarm = cloudwatch.Alarm(
                self,
                f"EksHighMemoryAlarm{region_id}",
                alarm_description=f"EKS cluster {cluster_name} has high memory utilization",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="node_memory_utilization",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=85,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=3,
                datapoints_to_alarm=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            high_memory_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1568 def _create_alb_alarms(self) -> None:
1569 """Create ALB alarms.
1571 Note: ALBs are created dynamically by the AWS Load Balancer Controller
1572 in Kubernetes via Ingress resources. Since we can't know the exact ALB
1573 name at CDK synth time (it includes a hash), we cannot create alarms
1574 with specific ALB dimensions.
1576 CloudWatch Alarms don't support SEARCH expressions like dashboards do,
1577 so we skip ALB-specific alarms. Instead, rely on:
1578 1. Dashboard widgets with SEARCH expressions for monitoring
1579 2. EKS Container Insights alarms for pod/node health
1580 3. API Gateway alarms for request-level monitoring
1582 If ALB-specific alarms are needed, consider:
1583 - Using a custom resource to discover ALB names at deploy time
1584 - Creating alarms via AWS CLI/SDK after deployment
1585 - Using CloudWatch Anomaly Detection on the namespace level
1586 """
1587 # ALB alarms are skipped because ALB names are not known at synth time
1588 # The AWS Load Balancer Controller creates ALBs with names like:
1589 # k8s-<namespace>-<ingress>-<hash>
1590 pass
1592 def _create_application_alarms(self) -> None:
1593 """Create application-specific alarms"""
1594 for regional_stack in self.regional_stacks:
1595 region = regional_stack.deployment_region
1596 cluster_name = regional_stack.cluster.cluster_name
1597 region_id = region.replace("-", "").title()
1599 # High manifest failure rate alarm
1600 high_failure_rate_alarm = cloudwatch.Alarm(
1601 self,
1602 f"ManifestHighFailureRateAlarm{region_id}",
1603 alarm_description=f"Manifest processor in {region} has high failure rate",
1604 metric=cloudwatch.Metric(
1605 namespace="GCO/ManifestProcessor",
1606 metric_name="ManifestFailures",
1607 dimensions_map={"ClusterName": cluster_name, "Region": region},
1608 statistic="Sum",
1609 period=Duration.minutes(5),
1610 ),
1611 threshold=10,
1612 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1613 evaluation_periods=2,
1614 datapoints_to_alarm=2,
1615 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1616 )
1617 high_failure_rate_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
    def _create_composite_alarms(self) -> None:
        """Create composite alarms for better signal-to-noise ratio.

        Two groups are built:
        - Per region: CPU and memory supporting alarms at 90% (stricter than
          the standalone EKS alarms at 80/85%), AND-ed into one
          "regional critical" composite so it only fires when both breach.
        - Globally: an API Gateway 5XX alarm AND a proxy Lambda error alarm
          combined, signalling a correlated API-path failure.

        Only the composite alarms notify the alert topic; the supporting
        alarms have no actions of their own.
        """

        # Store individual alarms for composite alarm references,
        # keyed by deployment region.
        regional_alarms: dict[str, list[cloudwatch.Alarm]] = {}

        for regional_stack in self.regional_stacks:
            region = regional_stack.deployment_region
            cluster_name = regional_stack.cluster.cluster_name
            # Region slug for construct IDs, e.g. "us-east-1" -> "Useast1"
            region_id = region.replace("-", "").title()
            regional_alarms[region] = []

            # Create regional health composite alarm
            # Triggers when multiple issues occur in the same region
            eks_cpu_alarm = cloudwatch.Alarm(
                self,
                f"CompositeEksCpu{region_id}",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="node_cpu_utilization",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=90,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            regional_alarms[region].append(eks_cpu_alarm)

            eks_memory_alarm = cloudwatch.Alarm(
                self,
                f"CompositeEksMemory{region_id}",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="node_memory_utilization",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=90,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            regional_alarms[region].append(eks_memory_alarm)

        # Create composite alarm for critical regional issues.
        # AlarmRule.all_of requires every supporting alarm to be in ALARM
        # state before the composite fires (logical AND).
        for region, alarms in regional_alarms.items():
            region_id = region.replace("-", "").title()
            if len(alarms) >= 2:
                composite_alarm = cloudwatch.CompositeAlarm(
                    self,
                    f"RegionalCriticalAlarm{region_id}",
                    alarm_description=f"Critical: Multiple issues detected in {region}",
                    alarm_rule=cloudwatch.AlarmRule.all_of(*alarms),
                )
                composite_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

        # API Gateway + Lambda composite alarm (only if api_gateway_stack is available)
        if self.api_gateway_stack:
            api_name = self.api_gateway_stack.api.rest_api_name
            proxy_function_name = self.api_gateway_stack.proxy_lambda.function_name

            # Supporting alarm: API Gateway 5XX errors (threshold 5, vs 10
            # for the standalone ApiGateway5xxAlarm).
            api_error_alarm = cloudwatch.Alarm(
                self,
                "CompositeApiErrors",
                metric=cloudwatch.Metric(
                    namespace="AWS/ApiGateway",
                    metric_name="5XXError",
                    dimensions_map={"ApiName": api_name},
                    statistic="Sum",
                    period=Duration.minutes(5),
                ),
                threshold=5,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )

            # Supporting alarm: proxy Lambda errors.
            lambda_error_alarm = cloudwatch.Alarm(
                self,
                "CompositeLambdaErrors",
                metric=cloudwatch.Metric(
                    namespace="AWS/Lambda",
                    metric_name="Errors",
                    dimensions_map={"FunctionName": proxy_function_name},
                    statistic="Sum",
                    period=Duration.minutes(5),
                ),
                threshold=3,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )

            # Fires only when BOTH the API and the Lambda are erroring.
            api_lambda_composite = cloudwatch.CompositeAlarm(
                self,
                "ApiLambdaCompositeAlarm",
                alarm_description="Critical: Both API Gateway and Lambda proxy have errors",
                alarm_rule=cloudwatch.AlarmRule.all_of(api_error_alarm, lambda_error_alarm),
            )
            api_lambda_composite.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1724 def _create_custom_metrics(self) -> None:
1725 """Create custom metric filters and log groups"""
1726 for regional_stack in self.regional_stacks:
1727 region = regional_stack.deployment_region
1728 region_id = region.replace("-", "").title()
1730 # Health monitor log group
1731 # log_group_name intentionally omitted - let CDK generate unique name
1732 logs.LogGroup(
1733 self,
1734 f"HealthMonitorLogGroup{region_id}",
1735 retention=logs.RetentionDays.ONE_MONTH,
1736 removal_policy=RemovalPolicy.DESTROY,
1737 )
1739 # Manifest processor log group
1740 # log_group_name intentionally omitted - let CDK generate unique name
1741 logs.LogGroup(
1742 self,
1743 f"ManifestProcessorLogGroup{region_id}",
1744 retention=logs.RetentionDays.ONE_MONTH,
1745 removal_policy=RemovalPolicy.DESTROY,
1746 )
1748 def _create_outputs(self) -> None:
1749 """Create CloudFormation outputs"""
1750 CfnOutput(
1751 self,
1752 "DashboardUrl",
1753 value=f"https://console.aws.amazon.com/cloudwatch/home?region={self.region}#dashboards:name={self.dashboard.dashboard_name}",
1754 description="CloudWatch Dashboard URL",
1755 )
1757 CfnOutput(
1758 self,
1759 "AlertTopicArn",
1760 value=self.alert_topic.topic_arn,
1761 description="SNS Topic ARN for monitoring alerts",
1762 )
1764 CfnOutput(
1765 self,
1766 "AlarmCount",
1767 value="See CloudWatch Alarms console for full list",
1768 description="Monitoring alarms created",
1769 )