Coverage for gco / stacks / monitoring_stack.py: 98%
266 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
1"""
2Monitoring stack for GCO (Global Capacity Orchestrator on AWS) - Cross-region monitoring and observability.
4This stack creates centralized monitoring resources for all GCO deployments:
5- CloudWatch Dashboard with comprehensive widgets for all regions
6- SNS topic for alerting
7- CloudWatch Alarms for critical metrics
8- Log groups for application logs
9- Anomaly detection for traffic patterns
10- Composite alarms for better signal-to-noise
12Dashboard Sections:
13- Global Accelerator: Flow counts, processed bytes
14- API Gateway: Request counts, latency, error rates
15- Lambda Functions: Invocations, errors, duration, throttles
16- SQS Queues: Message counts, age, dead letter queue depth
17- DynamoDB Tables: Capacity, latency, throttles, errors
18- EKS Clusters: CPU/memory utilization per region
19- ALBs: Request counts, response times, healthy hosts
20- Applications: Custom metrics from health monitor and manifest processor
22Cross-Region Metrics:
23 CloudWatch metrics are region-specific. This stack handles cross-region
24 monitoring by specifying the `region` parameter on metrics:
25 - Global Accelerator metrics: Always in us-west-2
26 - DynamoDB metrics: In the global region (where tables are deployed)
27 - Regional metrics: In each cluster's region
29Alarms:
30- High CPU/memory utilization on EKS clusters
31- Unhealthy hosts in ALB target groups
32- High response times
33- Manifest processing failures
34- Lambda errors and throttles
35- SQS message age (stuck jobs)
36- DynamoDB throttling and system errors
37- API Gateway 5XX errors
38- Secret rotation failures
39"""
41from typing import TYPE_CHECKING, Any
43from aws_cdk import (
44 CfnOutput,
45 Duration,
46 RemovalPolicy,
47 Stack,
48)
49from aws_cdk import aws_cloudwatch as cloudwatch
50from aws_cdk import aws_cloudwatch_actions as cw_actions
51from aws_cdk import aws_logs as logs
52from aws_cdk import aws_sns as sns
53from constructs import Construct
55from gco.config.config_loader import ConfigLoader
57if TYPE_CHECKING:
58 from gco.stacks.api_gateway_global_stack import GCOApiGatewayGlobalStack
59 from gco.stacks.global_stack import GCOGlobalStack
60 from gco.stacks.regional_stack import GCORegionalStack
63class GCOMonitoringStack(Stack):
64 """
65 Cross-region monitoring and observability stack.
67 Creates a centralized CloudWatch dashboard and alarms that aggregate
68 metrics from all regional deployments.
70 Attributes:
71 alert_topic: SNS topic for alarm notifications
72 dashboard: CloudWatch dashboard with all monitoring widgets
73 """
    def __init__(
        self,
        scope: Construct,
        construct_id: str,
        config: ConfigLoader,
        global_stack: GCOGlobalStack,
        regional_stacks: list[GCORegionalStack],
        api_gateway_stack: GCOApiGatewayGlobalStack | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Wire up every monitoring resource for the deployment.

        Args:
            scope: CDK construct scope.
            construct_id: Logical ID of this stack.
            config: Loader supplying project name, region list, and global region.
            global_stack: Global stack exposing the accelerator ID and DynamoDB tables.
            regional_stacks: One stack per deployment region (EKS cluster, SQS queues,
                regional Lambdas).
            api_gateway_stack: Optional API Gateway stack; when None, dashboard
                widgets fall back to a default API name and its Lambdas are skipped.
            **kwargs: Forwarded to aws_cdk.Stack (env, description, ...).

        NOTE(review): the stack-type annotations above are imported only under
        TYPE_CHECKING; this relies on annotations not being evaluated at runtime —
        confirm the project's Python version / annotation mode supports that.
        """
        super().__init__(scope, construct_id, **kwargs)

        # Keep references used by the widget/alarm builder methods below.
        self.config = config
        self.global_stack = global_stack
        self.regional_stacks = regional_stacks
        self.api_gateway_stack = api_gateway_stack
        self.project_name = config.get_project_name()
        self.regions = config.get_regions()

        # Create SNS topic for alerts first — the alarm builders below
        # presumably attach it as their alarm action (verify in _create_alarms).
        self.alert_topic = self._create_alert_topic()

        # Create CloudWatch dashboard aggregating all sections/regions
        self.dashboard = self._create_dashboard()

        # Create alarms
        self._create_alarms()

        # Create composite alarms (better signal-to-noise per module docstring)
        self._create_composite_alarms()

        # Create custom metrics
        self._create_custom_metrics()

        # Export monitoring resources as stack outputs
        self._create_outputs()

        # Apply cdk-nag suppressions last, once all resources exist
        self._apply_nag_suppressions()
115 def _apply_nag_suppressions(self) -> None:
116 """Apply cdk-nag suppressions for this stack."""
117 from gco.stacks.nag_suppressions import apply_all_suppressions
119 apply_all_suppressions(
120 self,
121 stack_type="monitoring",
122 regions=self.config.get_regions(),
123 global_region=self.config.get_global_region(),
124 )
126 def _create_alert_topic(self) -> sns.Topic:
127 """Create SNS topic for monitoring alerts"""
128 topic = sns.Topic(
129 self,
130 "GCOAlertTopic",
131 display_name="GCO (Global Capacity Orchestrator on AWS) Monitoring Alerts",
132 enforce_ssl=True,
133 )
134 return topic
136 def _create_dashboard(self) -> cloudwatch.Dashboard:
137 """Create comprehensive CloudWatch dashboard for monitoring"""
138 dashboard = cloudwatch.Dashboard(
139 self,
140 "GCODashboard",
141 period_override=cloudwatch.PeriodOverride.AUTO,
142 )
144 # Add widgets in logical order
145 dashboard.add_widgets(*self._create_global_accelerator_widgets())
146 dashboard.add_widgets(*self._create_api_gateway_widgets())
147 dashboard.add_widgets(*self._create_lambda_widgets())
148 dashboard.add_widgets(*self._create_sqs_widgets())
149 dashboard.add_widgets(*self._create_dynamodb_widgets())
150 dashboard.add_widgets(*self._create_eks_widgets())
151 dashboard.add_widgets(*self._create_gpu_widgets())
152 dashboard.add_widgets(*self._create_alb_widgets())
153 dashboard.add_widgets(*self._create_application_widgets())
155 return dashboard
157 def _create_global_accelerator_widgets(self) -> list[cloudwatch.IWidget]:
158 """Create Global Accelerator monitoring widgets.
160 Note: Global Accelerator metrics are only available in us-west-2,
161 regardless of where the accelerator endpoints are located.
162 CloudWatch uses the Accelerator ID (UUID), not the name.
163 """
164 widgets: list[cloudwatch.IWidget] = []
166 # Get the accelerator ID from the global stack (CloudWatch uses ID, not name)
167 accelerator_id = self.global_stack.accelerator_id
169 # Global Accelerator metrics are always in us-west-2
170 ga_metrics_region = "us-west-2"
172 # Section header
173 widgets.append(
174 cloudwatch.TextWidget(
175 markdown="# Global Accelerator\nTraffic distribution and connectivity metrics",
176 width=24,
177 height=1,
178 )
179 )
181 # Flow count with anomaly detection
182 flow_count_widget = cloudwatch.GraphWidget(
183 title="Global Accelerator - New Flows",
184 left=[
185 cloudwatch.Metric(
186 namespace="AWS/GlobalAccelerator",
187 metric_name="NewFlowCount",
188 dimensions_map={"Accelerator": accelerator_id},
189 statistic="Sum",
190 period=Duration.minutes(5),
191 region=ga_metrics_region,
192 )
193 ],
194 width=12,
195 height=6,
196 region=ga_metrics_region,
197 )
198 widgets.append(flow_count_widget)
200 # Processed bytes
201 bytes_widget = cloudwatch.GraphWidget(
202 title="Global Accelerator - Processed Bytes",
203 left=[
204 cloudwatch.Metric(
205 namespace="AWS/GlobalAccelerator",
206 metric_name="ProcessedBytesIn",
207 dimensions_map={"Accelerator": accelerator_id},
208 statistic="Sum",
209 period=Duration.minutes(5),
210 region=ga_metrics_region,
211 ),
212 cloudwatch.Metric(
213 namespace="AWS/GlobalAccelerator",
214 metric_name="ProcessedBytesOut",
215 dimensions_map={"Accelerator": accelerator_id},
216 statistic="Sum",
217 period=Duration.minutes(5),
218 region=ga_metrics_region,
219 ),
220 ],
221 width=12,
222 height=6,
223 region=ga_metrics_region,
224 )
225 widgets.append(bytes_widget)
227 return widgets
229 def _create_api_gateway_widgets(self) -> list[cloudwatch.IWidget]:
230 """Create API Gateway monitoring widgets"""
231 widgets: list[cloudwatch.IWidget] = []
233 # Get the actual API name from the api_gateway_stack
234 api_name = (
235 self.api_gateway_stack.api.rest_api_name if self.api_gateway_stack else "gco-global-api"
236 )
238 # API Gateway metrics are in the region where the API is deployed
239 api_gw_region = self.config.get_api_gateway_region()
241 # Section header
242 widgets.append(
243 cloudwatch.TextWidget(
244 markdown="# API Gateway\nRequest metrics, latency, and error rates",
245 width=24,
246 height=1,
247 )
248 )
250 # Request count and latency
251 request_widget = cloudwatch.GraphWidget(
252 title="API Gateway - Requests & Latency",
253 left=[
254 cloudwatch.Metric(
255 namespace="AWS/ApiGateway",
256 metric_name="Count",
257 dimensions_map={"ApiName": api_name},
258 statistic="Sum",
259 period=Duration.minutes(5),
260 region=api_gw_region,
261 )
262 ],
263 right=[
264 cloudwatch.Metric(
265 namespace="AWS/ApiGateway",
266 metric_name="Latency",
267 dimensions_map={"ApiName": api_name},
268 statistic="Average",
269 period=Duration.minutes(5),
270 region=api_gw_region,
271 ),
272 cloudwatch.Metric(
273 namespace="AWS/ApiGateway",
274 metric_name="Latency",
275 dimensions_map={"ApiName": api_name},
276 statistic="p99",
277 period=Duration.minutes(5),
278 region=api_gw_region,
279 ),
280 ],
281 width=12,
282 height=6,
283 region=api_gw_region,
284 )
285 widgets.append(request_widget)
287 # Error rates (4XX and 5XX)
288 error_widget = cloudwatch.GraphWidget(
289 title="API Gateway - Error Rates",
290 left=[
291 cloudwatch.Metric(
292 namespace="AWS/ApiGateway",
293 metric_name="4XXError",
294 dimensions_map={"ApiName": api_name},
295 statistic="Sum",
296 period=Duration.minutes(5),
297 color="#ff7f0e",
298 region=api_gw_region,
299 ),
300 cloudwatch.Metric(
301 namespace="AWS/ApiGateway",
302 metric_name="5XXError",
303 dimensions_map={"ApiName": api_name},
304 statistic="Sum",
305 period=Duration.minutes(5),
306 color="#d62728",
307 region=api_gw_region,
308 ),
309 ],
310 width=12,
311 height=6,
312 region=api_gw_region,
313 )
314 widgets.append(error_widget)
316 return widgets
318 def _create_lambda_widgets(self) -> list[cloudwatch.IWidget]:
319 """Create Lambda function monitoring widgets"""
320 widgets: list[cloudwatch.IWidget] = []
322 # Section header
323 widgets.append(
324 cloudwatch.TextWidget(
325 markdown="# Lambda Functions\nProxy, rotation, and regional Lambda metrics",
326 width=24,
327 height=1,
328 )
329 )
331 # Get API Gateway region for global Lambda functions
332 api_gw_region = self.config.get_api_gateway_region()
334 # Build Lambda function list: (function_name, label, region)
335 lambda_functions: list[tuple[str, str, str]] = []
337 # Add API Gateway Lambda functions if available
338 if self.api_gateway_stack: 338 ↛ 355line 338 didn't jump to line 355 because the condition on line 338 was always true
339 lambda_functions.append(
340 (
341 self.api_gateway_stack.proxy_lambda.function_name,
342 "API Gateway Proxy",
343 api_gw_region,
344 )
345 )
346 lambda_functions.append(
347 (
348 self.api_gateway_stack.rotation_lambda.function_name,
349 "Secret Rotation",
350 api_gw_region,
351 )
352 )
354 # Add regional Lambda functions from each regional stack
355 for regional_stack in self.regional_stacks:
356 region = regional_stack.deployment_region
357 lambda_functions.extend(
358 [
359 (
360 regional_stack.kubectl_lambda_function_name,
361 f"Kubectl Applier ({region})",
362 region,
363 ),
364 (
365 regional_stack.helm_installer_lambda_function_name,
366 f"Helm Installer ({region})",
367 region,
368 ),
369 ]
370 )
372 # Invocations widget
373 invocations_widget = cloudwatch.GraphWidget(
374 title="Lambda - Invocations",
375 left=[
376 cloudwatch.Metric(
377 namespace="AWS/Lambda",
378 metric_name="Invocations",
379 dimensions_map={"FunctionName": func_name},
380 statistic="Sum",
381 period=Duration.minutes(5),
382 label=label,
383 region=region,
384 )
385 for func_name, label, region in lambda_functions[:5]
386 ],
387 width=12,
388 height=6,
389 )
390 widgets.append(invocations_widget)
392 errors_widget = cloudwatch.GraphWidget(
393 title="Lambda - Errors",
394 left=[
395 cloudwatch.Metric(
396 namespace="AWS/Lambda",
397 metric_name="Errors",
398 dimensions_map={"FunctionName": func_name},
399 statistic="Sum",
400 period=Duration.minutes(5),
401 label=label,
402 color="#d62728",
403 region=region,
404 )
405 for func_name, label, region in lambda_functions[:5]
406 ],
407 width=12,
408 height=6,
409 )
410 widgets.append(errors_widget)
412 # Duration widget
413 duration_widget = cloudwatch.GraphWidget(
414 title="Lambda - Duration (ms)",
415 left=[
416 cloudwatch.Metric(
417 namespace="AWS/Lambda",
418 metric_name="Duration",
419 dimensions_map={"FunctionName": func_name},
420 statistic="Average",
421 period=Duration.minutes(5),
422 label=label,
423 region=region,
424 )
425 for func_name, label, region in lambda_functions[:5]
426 ],
427 width=12,
428 height=6,
429 )
430 widgets.append(duration_widget)
432 # Throttles widget
433 throttles_widget = cloudwatch.GraphWidget(
434 title="Lambda - Throttles & Concurrent Executions",
435 left=[
436 cloudwatch.Metric(
437 namespace="AWS/Lambda",
438 metric_name="Throttles",
439 dimensions_map={"FunctionName": func_name},
440 statistic="Sum",
441 period=Duration.minutes(5),
442 label=f"{label} Throttles",
443 region=region,
444 )
445 for func_name, label, region in lambda_functions[:3]
446 ],
447 right=[
448 cloudwatch.Metric(
449 namespace="AWS/Lambda",
450 metric_name="ConcurrentExecutions",
451 dimensions_map={"FunctionName": func_name},
452 statistic="Maximum",
453 period=Duration.minutes(5),
454 label=f"{label} Concurrent",
455 region=region,
456 )
457 for func_name, label, region in lambda_functions[:3]
458 ],
459 width=12,
460 height=6,
461 )
462 widgets.append(throttles_widget)
464 return widgets
466 def _create_sqs_widgets(self) -> list[cloudwatch.IWidget]:
467 """Create SQS queue monitoring widgets"""
468 widgets: list[cloudwatch.IWidget] = []
470 # Section header
471 widgets.append(
472 cloudwatch.TextWidget(
473 markdown="# SQS Queues\nJob submission queue metrics and dead letter queue",
474 width=24,
475 height=1,
476 )
477 )
479 # Build queue info from regional stacks: (queue_name, dlq_name, region)
480 queue_info = [
481 (
482 regional_stack.job_queue.queue_name,
483 regional_stack.job_dlq.queue_name,
484 regional_stack.deployment_region,
485 )
486 for regional_stack in self.regional_stacks
487 ]
489 # Messages visible and in-flight per region
490 messages_widget = cloudwatch.GraphWidget(
491 title="SQS - Messages (Visible & In-Flight)",
492 left=[
493 cloudwatch.Metric(
494 namespace="AWS/SQS",
495 metric_name="ApproximateNumberOfMessagesVisible",
496 dimensions_map={"QueueName": queue_name},
497 statistic="Average",
498 period=Duration.minutes(1),
499 label=f"{region} Visible",
500 region=region,
501 )
502 for queue_name, _, region in queue_info
503 ],
504 right=[
505 cloudwatch.Metric(
506 namespace="AWS/SQS",
507 metric_name="ApproximateNumberOfMessagesNotVisible",
508 dimensions_map={"QueueName": queue_name},
509 statistic="Average",
510 period=Duration.minutes(1),
511 label=f"{region} In-Flight",
512 region=region,
513 )
514 for queue_name, _, region in queue_info
515 ],
516 width=12,
517 height=6,
518 )
519 widgets.append(messages_widget)
521 # Age of oldest message (critical for detecting stuck jobs)
522 age_widget = cloudwatch.GraphWidget(
523 title="SQS - Age of Oldest Message (seconds)",
524 left=[
525 cloudwatch.Metric(
526 namespace="AWS/SQS",
527 metric_name="ApproximateAgeOfOldestMessage",
528 dimensions_map={"QueueName": queue_name},
529 statistic="Maximum",
530 period=Duration.minutes(1),
531 label=region,
532 region=region,
533 )
534 for queue_name, _, region in queue_info
535 ],
536 width=12,
537 height=6,
538 )
539 widgets.append(age_widget)
541 # Dead letter queue depth
542 dlq_widget = cloudwatch.GraphWidget(
543 title="SQS - Dead Letter Queue Depth",
544 left=[
545 cloudwatch.Metric(
546 namespace="AWS/SQS",
547 metric_name="ApproximateNumberOfMessagesVisible",
548 dimensions_map={"QueueName": dlq_name},
549 statistic="Average",
550 period=Duration.minutes(1),
551 label=f"{region} DLQ",
552 color="#d62728",
553 region=region,
554 )
555 for _, dlq_name, region in queue_info
556 ],
557 width=12,
558 height=6,
559 )
560 widgets.append(dlq_widget)
562 # Messages sent/received/deleted
563 throughput_widget = cloudwatch.GraphWidget(
564 title="SQS - Throughput",
565 left=[
566 cloudwatch.Metric(
567 namespace="AWS/SQS",
568 metric_name="NumberOfMessagesSent",
569 dimensions_map={"QueueName": queue_name},
570 statistic="Sum",
571 period=Duration.minutes(5),
572 label=f"{region} Sent",
573 region=region,
574 )
575 for queue_name, _, region in queue_info
576 ],
577 right=[
578 cloudwatch.Metric(
579 namespace="AWS/SQS",
580 metric_name="NumberOfMessagesDeleted",
581 dimensions_map={"QueueName": queue_name},
582 statistic="Sum",
583 period=Duration.minutes(5),
584 label=f"{region} Processed",
585 region=region,
586 )
587 for queue_name, _, region in queue_info
588 ],
589 width=12,
590 height=6,
591 )
592 widgets.append(throughput_widget)
594 return widgets
596 def _create_dynamodb_widgets(self) -> list[cloudwatch.IWidget]:
597 """Create DynamoDB monitoring widgets for job queue, templates, and webhooks tables."""
598 widgets: list[cloudwatch.IWidget] = []
600 # Get table names from global stack
601 templates_table = self.global_stack.templates_table.table_name
602 webhooks_table = self.global_stack.webhooks_table.table_name
603 jobs_table = self.global_stack.jobs_table.table_name
605 # DynamoDB tables are in the global region
606 global_region = self.config.get_global_region()
608 # Section header
609 widgets.append(
610 cloudwatch.TextWidget(
611 markdown="# DynamoDB Tables\nJob queue, templates, and webhooks storage metrics",
612 width=24,
613 height=1,
614 )
615 )
617 # Read/Write capacity consumed
618 capacity_widget = cloudwatch.GraphWidget(
619 title="DynamoDB - Consumed Capacity",
620 left=[
621 cloudwatch.Metric(
622 namespace="AWS/DynamoDB",
623 metric_name="ConsumedReadCapacityUnits",
624 dimensions_map={"TableName": jobs_table},
625 statistic="Sum",
626 period=Duration.minutes(5),
627 label="Jobs Read",
628 region=global_region,
629 ),
630 cloudwatch.Metric(
631 namespace="AWS/DynamoDB",
632 metric_name="ConsumedReadCapacityUnits",
633 dimensions_map={"TableName": templates_table},
634 statistic="Sum",
635 period=Duration.minutes(5),
636 label="Templates Read",
637 region=global_region,
638 ),
639 cloudwatch.Metric(
640 namespace="AWS/DynamoDB",
641 metric_name="ConsumedReadCapacityUnits",
642 dimensions_map={"TableName": webhooks_table},
643 statistic="Sum",
644 period=Duration.minutes(5),
645 label="Webhooks Read",
646 region=global_region,
647 ),
648 ],
649 right=[
650 cloudwatch.Metric(
651 namespace="AWS/DynamoDB",
652 metric_name="ConsumedWriteCapacityUnits",
653 dimensions_map={"TableName": jobs_table},
654 statistic="Sum",
655 period=Duration.minutes(5),
656 label="Jobs Write",
657 region=global_region,
658 ),
659 cloudwatch.Metric(
660 namespace="AWS/DynamoDB",
661 metric_name="ConsumedWriteCapacityUnits",
662 dimensions_map={"TableName": templates_table},
663 statistic="Sum",
664 period=Duration.minutes(5),
665 label="Templates Write",
666 region=global_region,
667 ),
668 ],
669 width=12,
670 height=6,
671 region=global_region,
672 )
673 widgets.append(capacity_widget)
675 # Latency metrics
676 latency_widget = cloudwatch.GraphWidget(
677 title="DynamoDB - Latency (ms)",
678 left=[
679 cloudwatch.Metric(
680 namespace="AWS/DynamoDB",
681 metric_name="SuccessfulRequestLatency",
682 dimensions_map={"TableName": jobs_table, "Operation": "GetItem"},
683 statistic="Average",
684 period=Duration.minutes(5),
685 label="Jobs GetItem",
686 region=global_region,
687 ),
688 cloudwatch.Metric(
689 namespace="AWS/DynamoDB",
690 metric_name="SuccessfulRequestLatency",
691 dimensions_map={"TableName": jobs_table, "Operation": "PutItem"},
692 statistic="Average",
693 period=Duration.minutes(5),
694 label="Jobs PutItem",
695 region=global_region,
696 ),
697 cloudwatch.Metric(
698 namespace="AWS/DynamoDB",
699 metric_name="SuccessfulRequestLatency",
700 dimensions_map={"TableName": jobs_table, "Operation": "Query"},
701 statistic="Average",
702 period=Duration.minutes(5),
703 label="Jobs Query",
704 region=global_region,
705 ),
706 ],
707 width=12,
708 height=6,
709 region=global_region,
710 )
711 widgets.append(latency_widget)
713 # Throttled requests
714 throttle_widget = cloudwatch.GraphWidget(
715 title="DynamoDB - Throttled Requests",
716 left=[
717 cloudwatch.Metric(
718 namespace="AWS/DynamoDB",
719 metric_name="ThrottledRequests",
720 dimensions_map={"TableName": jobs_table},
721 statistic="Sum",
722 period=Duration.minutes(5),
723 label="Jobs",
724 color="#d62728",
725 region=global_region,
726 ),
727 cloudwatch.Metric(
728 namespace="AWS/DynamoDB",
729 metric_name="ThrottledRequests",
730 dimensions_map={"TableName": templates_table},
731 statistic="Sum",
732 period=Duration.minutes(5),
733 label="Templates",
734 color="#ff7f0e",
735 region=global_region,
736 ),
737 cloudwatch.Metric(
738 namespace="AWS/DynamoDB",
739 metric_name="ThrottledRequests",
740 dimensions_map={"TableName": webhooks_table},
741 statistic="Sum",
742 period=Duration.minutes(5),
743 label="Webhooks",
744 color="#9467bd",
745 region=global_region,
746 ),
747 ],
748 width=12,
749 height=6,
750 region=global_region,
751 )
752 widgets.append(throttle_widget)
754 # System errors
755 errors_widget = cloudwatch.GraphWidget(
756 title="DynamoDB - System Errors",
757 left=[
758 cloudwatch.Metric(
759 namespace="AWS/DynamoDB",
760 metric_name="SystemErrors",
761 dimensions_map={"TableName": jobs_table},
762 statistic="Sum",
763 period=Duration.minutes(5),
764 label="Jobs",
765 color="#d62728",
766 region=global_region,
767 ),
768 cloudwatch.Metric(
769 namespace="AWS/DynamoDB",
770 metric_name="SystemErrors",
771 dimensions_map={"TableName": templates_table},
772 statistic="Sum",
773 period=Duration.minutes(5),
774 label="Templates",
775 color="#ff7f0e",
776 region=global_region,
777 ),
778 ],
779 width=12,
780 height=6,
781 region=global_region,
782 )
783 widgets.append(errors_widget)
785 return widgets
787 def _create_eks_widgets(self) -> list[cloudwatch.IWidget]:
788 """Create EKS cluster monitoring widgets"""
789 widgets: list[cloudwatch.IWidget] = []
791 # Section header
792 widgets.append(
793 cloudwatch.TextWidget(
794 markdown="# EKS Clusters\nCluster resource utilization and node metrics",
795 width=24,
796 height=1,
797 )
798 )
800 # Build cluster info from regional stacks: (cluster_name, region)
801 cluster_info = [
802 (regional_stack.cluster.cluster_name, regional_stack.deployment_region)
803 for regional_stack in self.regional_stacks
804 ]
806 # EKS cluster status
807 cluster_status_widget = cloudwatch.SingleValueWidget(
808 title="EKS Clusters - Failed Requests",
809 metrics=[
810 cloudwatch.Metric(
811 namespace="AWS/EKS",
812 metric_name="cluster_failed_request_count",
813 dimensions_map={"cluster_name": cluster_name},
814 statistic="Sum",
815 period=Duration.minutes(5),
816 region=region,
817 )
818 for cluster_name, region in cluster_info
819 ],
820 width=12,
821 height=6,
822 )
823 widgets.append(cluster_status_widget)
825 # Container Insights - Node CPU utilization (aggregated across all nodes)
826 # Note: region parameter enables cross-region metrics in dashboard
827 cpu_widget = cloudwatch.GraphWidget(
828 title="EKS Clusters - Node CPU Utilization (%)",
829 left=[
830 cloudwatch.Metric(
831 namespace="ContainerInsights",
832 metric_name="node_cpu_utilization",
833 dimensions_map={"ClusterName": cluster_name},
834 statistic="Average",
835 period=Duration.minutes(5),
836 label=region,
837 region=region,
838 )
839 for cluster_name, region in cluster_info
840 ],
841 width=12,
842 height=6,
843 )
844 widgets.append(cpu_widget)
846 # Container Insights - Node Memory utilization (aggregated across all nodes)
847 memory_widget = cloudwatch.GraphWidget(
848 title="EKS Clusters - Node Memory Utilization (%)",
849 left=[
850 cloudwatch.Metric(
851 namespace="ContainerInsights",
852 metric_name="node_memory_utilization",
853 dimensions_map={"ClusterName": cluster_name},
854 statistic="Average",
855 period=Duration.minutes(5),
856 label=region,
857 region=region,
858 )
859 for cluster_name, region in cluster_info
860 ],
861 width=12,
862 height=6,
863 )
864 widgets.append(memory_widget)
866 # Node status - running pods capacity
867 node_widget = cloudwatch.GraphWidget(
868 title="EKS Clusters - Node Pod Capacity",
869 left=[
870 cloudwatch.Metric(
871 namespace="ContainerInsights",
872 metric_name="node_status_capacity_pods",
873 dimensions_map={"ClusterName": cluster_name},
874 statistic="Sum",
875 period=Duration.minutes(5),
876 label=f"{region} Capacity",
877 region=region,
878 )
879 for cluster_name, region in cluster_info
880 ],
881 right=[
882 cloudwatch.Metric(
883 namespace="ContainerInsights",
884 metric_name="node_number_of_running_pods",
885 dimensions_map={"ClusterName": cluster_name},
886 statistic="Sum",
887 period=Duration.minutes(5),
888 label=f"{region} Running",
889 region=region,
890 )
891 for cluster_name, region in cluster_info
892 ],
893 width=12,
894 height=6,
895 )
896 widgets.append(node_widget)
898 return widgets
900 def _create_gpu_widgets(self) -> list[cloudwatch.IWidget]:
901 """Create GPU monitoring widgets using DCGM Exporter metrics via ContainerInsights."""
902 widgets: list[cloudwatch.IWidget] = []
904 widgets.append(
905 cloudwatch.TextWidget(
906 markdown="# GPU Metrics\nGPU utilization, memory, and temperature from DCGM Exporter",
907 width=24,
908 height=1,
909 )
910 )
912 cluster_info = [
913 (regional_stack.cluster.cluster_name, regional_stack.deployment_region)
914 for regional_stack in self.regional_stacks
915 ]
917 # GPU utilization percentage
918 gpu_util_widget = cloudwatch.GraphWidget(
919 title="GPU Utilization (%)",
920 left=[
921 cloudwatch.Metric(
922 namespace="ContainerInsights",
923 metric_name="node_gpu_utilization",
924 dimensions_map={"ClusterName": cluster_name},
925 statistic="Average",
926 period=Duration.minutes(5),
927 label=region,
928 region=region,
929 )
930 for cluster_name, region in cluster_info
931 ],
932 width=12,
933 height=6,
934 )
935 widgets.append(gpu_util_widget)
937 # GPU memory utilization
938 gpu_mem_widget = cloudwatch.GraphWidget(
939 title="GPU Memory Utilization (%)",
940 left=[
941 cloudwatch.Metric(
942 namespace="ContainerInsights",
943 metric_name="node_gpu_memory_utilization",
944 dimensions_map={"ClusterName": cluster_name},
945 statistic="Average",
946 period=Duration.minutes(5),
947 label=region,
948 region=region,
949 )
950 for cluster_name, region in cluster_info
951 ],
952 width=12,
953 height=6,
954 )
955 widgets.append(gpu_mem_widget)
957 # GPU temperature
958 gpu_temp_widget = cloudwatch.GraphWidget(
959 title="GPU Temperature (°C)",
960 left=[
961 cloudwatch.Metric(
962 namespace="ContainerInsights",
963 metric_name="node_gpu_temperature",
964 dimensions_map={"ClusterName": cluster_name},
965 statistic="Maximum",
966 period=Duration.minutes(5),
967 label=region,
968 region=region,
969 )
970 for cluster_name, region in cluster_info
971 ],
972 width=12,
973 height=6,
974 )
975 widgets.append(gpu_temp_widget)
977 # GPU count (active GPUs)
978 gpu_count_widget = cloudwatch.GraphWidget(
979 title="Active GPU Count",
980 left=[
981 cloudwatch.Metric(
982 namespace="ContainerInsights",
983 metric_name="node_gpu_limit",
984 dimensions_map={"ClusterName": cluster_name},
985 statistic="Sum",
986 period=Duration.minutes(5),
987 label=region,
988 region=region,
989 )
990 for cluster_name, region in cluster_info
991 ],
992 width=12,
993 height=6,
994 )
995 widgets.append(gpu_count_widget)
997 return widgets
999 def _create_alb_widgets(self) -> list[cloudwatch.IWidget]:
1000 """Create ALB monitoring widgets.
1002 Note: ALBs are created by the AWS Load Balancer Controller in Kubernetes
1003 via Ingress resources, not by CDK. The controller uses a naming convention:
1004 k8s-<namespace>-<ingress-name>-<hash>
1006 Since we can't know the exact ALB name at CDK synth time (includes a hash),
1007 we use CloudWatch SEARCH expressions to dynamically find ALBs matching
1008 the prefix pattern at dashboard render time.
1009 """
1010 widgets: list[cloudwatch.IWidget] = []
1012 # Section header
1013 widgets.append(
1014 cloudwatch.TextWidget(
1015 markdown="# Application Load Balancers\n"
1016 "Request metrics and health status. "
1017 "Uses CloudWatch SEARCH to dynamically find ALBs created by "
1018 "AWS Load Balancer Controller.",
1019 width=24,
1020 height=1,
1021 )
1022 )
1024 # Create one widget per region for ALB request count
1025 for region in self.regions:
1026 request_count_widget = cloudwatch.GraphWidget(
1027 title=f"ALB - Request Count ({region})",
1028 left=[
1029 cloudwatch.MathExpression(
1030 expression=(
1031 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1032 'MetricName="RequestCount"\', "Sum", 300)'
1033 ),
1034 label="Request Count",
1035 period=Duration.minutes(5),
1036 )
1037 ],
1038 width=12,
1039 height=6,
1040 region=region,
1041 )
1042 widgets.append(request_count_widget)
1044 # Create one widget per region for ALB response time
1045 for region in self.regions:
1046 response_time_widget = cloudwatch.GraphWidget(
1047 title=f"ALB - Response Time ({region})",
1048 left=[
1049 cloudwatch.MathExpression(
1050 expression=(
1051 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1052 'MetricName="TargetResponseTime"\', "Average", 300)'
1053 ),
1054 label="Avg Response Time",
1055 period=Duration.minutes(5),
1056 )
1057 ],
1058 width=12,
1059 height=6,
1060 region=region,
1061 )
1062 widgets.append(response_time_widget)
1064 # Create one widget per region for ALB HTTP errors
1065 for region in self.regions:
1066 http_errors_widget = cloudwatch.GraphWidget(
1067 title=f"ALB - HTTP Errors ({region})",
1068 left=[
1069 cloudwatch.MathExpression(
1070 expression=(
1071 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1072 'MetricName="HTTPCode_Target_4XX_Count"\', "Sum", 300)'
1073 ),
1074 label="4XX Errors",
1075 period=Duration.minutes(5),
1076 )
1077 ],
1078 right=[
1079 cloudwatch.MathExpression(
1080 expression=(
1081 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1082 'MetricName="HTTPCode_Target_5XX_Count"\', "Sum", 300)'
1083 ),
1084 label="5XX Errors",
1085 period=Duration.minutes(5),
1086 )
1087 ],
1088 width=12,
1089 height=6,
1090 region=region,
1091 )
1092 widgets.append(http_errors_widget)
1094 # Create one widget per region for ALB active connections
1095 for region in self.regions:
1096 connections_widget = cloudwatch.GraphWidget(
1097 title=f"ALB - Active Connections ({region})",
1098 left=[
1099 cloudwatch.MathExpression(
1100 expression=(
1101 'SEARCH(\'Namespace="AWS/ApplicationELB" '
1102 'MetricName="ActiveConnectionCount"\', "Sum", 300)'
1103 ),
1104 label="Active Connections",
1105 period=Duration.minutes(5),
1106 )
1107 ],
1108 width=12,
1109 height=6,
1110 region=region,
1111 )
1112 widgets.append(connections_widget)
1114 return widgets
    def _create_application_widgets(self) -> list[cloudwatch.IWidget]:
        """Create custom application monitoring widgets.

        Builds the "Application Metrics" dashboard section in order:
        a text header, health-monitor resource utilization, manifest-processor
        submissions/failures, Container Insights pod restarts, and (when the
        API Gateway stack reference is available) secret-rotation Lambda
        invocations/errors; otherwise a fallback text widget.

        Returns:
            The widgets in dashboard display order.
        """
        widgets: list[cloudwatch.IWidget] = []

        # Section header pointing readers at the raw application logs.
        widgets.append(
            cloudwatch.TextWidget(
                markdown="# Application Metrics\n"
                "Health monitor and manifest processor metrics. "
                "Application logs are available in Container Insights at "
                "`/aws/containerinsights/<cluster>/application`.",
                width=24,
                height=1,
            )
        )

        # Build cluster info from regional stacks: (cluster_name, region).
        # Each pair drives one metric series in the per-cluster widgets below.
        cluster_info = [
            (regional_stack.cluster.cluster_name, regional_stack.deployment_region)
            for regional_stack in self.regional_stacks
        ]

        # Health monitor metrics: CPU on the left axis, memory on the right,
        # one series per cluster. The `region=` parameter makes CloudWatch
        # fetch each series from the cluster's own region (cross-region
        # dashboard support); the "Region" dimension mirrors it in the
        # custom-metric dimensions published under GCO/HealthMonitor.
        health_monitor_widget = cloudwatch.GraphWidget(
            title="Health Monitor - Resource Utilization",
            left=[
                cloudwatch.Metric(
                    namespace="GCO/HealthMonitor",
                    metric_name="ClusterCpuUtilization",
                    dimensions_map={
                        "ClusterName": cluster_name,
                        "Region": region,
                    },
                    statistic="Average",
                    period=Duration.minutes(5),
                    label=f"{region} CPU",
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            right=[
                cloudwatch.Metric(
                    namespace="GCO/HealthMonitor",
                    metric_name="ClusterMemoryUtilization",
                    dimensions_map={
                        "ClusterName": cluster_name,
                        "Region": region,
                    },
                    statistic="Average",
                    period=Duration.minutes(5),
                    label=f"{region} Memory",
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            width=12,
            height=6,
        )
        widgets.append(health_monitor_widget)

        # Manifest processor metrics: submissions (left axis) versus failures
        # (right axis, rendered red) per cluster.
        manifest_processor_widget = cloudwatch.GraphWidget(
            title="Manifest Processor - Submissions",
            left=[
                cloudwatch.Metric(
                    namespace="GCO/ManifestProcessor",
                    metric_name="ManifestSubmissions",
                    dimensions_map={
                        "ClusterName": cluster_name,
                        "Region": region,
                    },
                    statistic="Sum",
                    period=Duration.minutes(5),
                    label=f"{region} Submissions",
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            right=[
                cloudwatch.Metric(
                    namespace="GCO/ManifestProcessor",
                    metric_name="ManifestFailures",
                    dimensions_map={
                        "ClusterName": cluster_name,
                        "Region": region,
                    },
                    statistic="Sum",
                    period=Duration.minutes(5),
                    label=f"{region} Failures",
                    color="#d62728",  # red, to make failures stand out
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            width=12,
            height=6,
        )
        widgets.append(manifest_processor_widget)

        # Container Insights - Pod restarts (indicates application issues).
        # Note: unlike the custom metrics above, only ClusterName is a
        # dimension here; the series label is just the region string.
        pod_restarts_widget = cloudwatch.GraphWidget(
            title="Container Insights - Pod Restarts",
            left=[
                cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="pod_number_of_container_restarts",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Sum",
                    period=Duration.minutes(5),
                    label=f"{region}",
                    region=region,
                )
                for cluster_name, region in cluster_info
            ],
            width=12,
            height=6,
        )
        widgets.append(pod_restarts_widget)

        # Secret rotation Lambda metrics (Secrets Manager doesn't publish rotation metrics,
        # so we monitor the rotation Lambda function instead). 1-hour periods
        # because rotations are infrequent events.
        if self.api_gateway_stack:
            rotation_function_name = self.api_gateway_stack.rotation_lambda.function_name
            api_gw_region = self.config.get_api_gateway_region()

            rotation_widget = cloudwatch.GraphWidget(
                title="Secret Rotation Lambda - Invocations & Errors",
                left=[
                    cloudwatch.Metric(
                        namespace="AWS/Lambda",
                        metric_name="Invocations",
                        dimensions_map={"FunctionName": rotation_function_name},
                        statistic="Sum",
                        period=Duration.hours(1),
                        label="Invocations",
                        color="#2ca02c",  # green
                        region=api_gw_region,
                    ),
                ],
                right=[
                    cloudwatch.Metric(
                        namespace="AWS/Lambda",
                        metric_name="Errors",
                        dimensions_map={"FunctionName": rotation_function_name},
                        statistic="Sum",
                        period=Duration.hours(1),
                        label="Errors",
                        color="#d62728",  # red
                        region=api_gw_region,
                    ),
                ],
                width=12,
                height=6,
            )
            widgets.append(rotation_widget)
        else:
            # Fallback text widget if api_gateway_stack not available
            fallback_widget = cloudwatch.TextWidget(
                markdown="**Secret Rotation:** API Gateway stack not configured. "
                "Rotation Lambda metrics unavailable.",
                width=12,
                height=6,
            )
            widgets.append(fallback_widget)

        return widgets
1283 def _create_alarms(self) -> None:
1284 """Create CloudWatch alarms"""
1285 self._create_global_accelerator_alarms()
1286 self._create_api_gateway_alarms()
1287 self._create_lambda_alarms()
1288 self._create_sqs_alarms()
1289 self._create_dynamodb_alarms()
1290 self._create_eks_alarms()
1291 self._create_alb_alarms()
1292 self._create_application_alarms()
1294 def _create_global_accelerator_alarms(self) -> None:
1295 """Create Global Accelerator alarms.
1297 Note: Global Accelerator metrics are only available in us-west-2.
1298 CloudWatch Alarms must be in the same region as the metrics they monitor.
1299 Since this monitoring stack may be deployed in a different region,
1300 we skip GA alarms here. To monitor GA, either:
1301 1. Create alarms manually in us-west-2
1302 2. Use CloudWatch cross-region dashboard widgets (which we do)
1303 3. Deploy a separate alarm stack in us-west-2
1304 """
1305 # GA alarms skipped - metrics only available in us-west-2
1306 # Dashboard widgets use region parameter to display GA metrics correctly
1307 pass
1309 def _create_api_gateway_alarms(self) -> None:
1310 """Create API Gateway alarms"""
1311 # Get the actual API name from the api_gateway_stack
1312 api_name = (
1313 self.api_gateway_stack.api.rest_api_name if self.api_gateway_stack else "gco-global-api"
1314 )
1316 # High 5XX error rate
1317 api_5xx_alarm = cloudwatch.Alarm(
1318 self,
1319 "ApiGateway5xxAlarm",
1320 alarm_description="API Gateway has high 5XX error rate",
1321 metric=cloudwatch.Metric(
1322 namespace="AWS/ApiGateway",
1323 metric_name="5XXError",
1324 dimensions_map={"ApiName": api_name},
1325 statistic="Sum",
1326 period=Duration.minutes(5),
1327 ),
1328 threshold=10,
1329 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1330 evaluation_periods=2,
1331 datapoints_to_alarm=2,
1332 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1333 )
1334 api_5xx_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1336 # High latency
1337 api_latency_alarm = cloudwatch.Alarm(
1338 self,
1339 "ApiGatewayHighLatencyAlarm",
1340 alarm_description="API Gateway has high latency",
1341 metric=cloudwatch.Metric(
1342 namespace="AWS/ApiGateway",
1343 metric_name="Latency",
1344 dimensions_map={"ApiName": api_name},
1345 statistic="p99",
1346 period=Duration.minutes(5),
1347 ),
1348 threshold=10000, # 10 seconds
1349 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1350 evaluation_periods=3,
1351 datapoints_to_alarm=2,
1352 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1353 )
1354 api_latency_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1356 def _create_lambda_alarms(self) -> None:
1357 """Create Lambda function alarms"""
1358 # Get Lambda function names from api_gateway_stack if available
1359 if self.api_gateway_stack: 1359 ↛ exitline 1359 didn't return from function '_create_lambda_alarms' because the condition on line 1359 was always true
1360 proxy_function_name = self.api_gateway_stack.proxy_lambda.function_name
1361 rotation_function_name = self.api_gateway_stack.rotation_lambda.function_name
1363 # API Gateway Proxy Lambda errors
1364 proxy_errors_alarm = cloudwatch.Alarm(
1365 self,
1366 "ProxyLambdaErrorsAlarm",
1367 alarm_description="API Gateway proxy Lambda has errors",
1368 metric=cloudwatch.Metric(
1369 namespace="AWS/Lambda",
1370 metric_name="Errors",
1371 dimensions_map={"FunctionName": proxy_function_name},
1372 statistic="Sum",
1373 period=Duration.minutes(5),
1374 ),
1375 threshold=5,
1376 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1377 evaluation_periods=2,
1378 datapoints_to_alarm=2,
1379 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1380 )
1381 proxy_errors_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1383 # Proxy Lambda throttles
1384 proxy_throttles_alarm = cloudwatch.Alarm(
1385 self,
1386 "ProxyLambdaThrottlesAlarm",
1387 alarm_description="API Gateway proxy Lambda is being throttled",
1388 metric=cloudwatch.Metric(
1389 namespace="AWS/Lambda",
1390 metric_name="Throttles",
1391 dimensions_map={"FunctionName": proxy_function_name},
1392 statistic="Sum",
1393 period=Duration.minutes(5),
1394 ),
1395 threshold=1,
1396 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
1397 evaluation_periods=2,
1398 datapoints_to_alarm=2,
1399 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1400 )
1401 proxy_throttles_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1403 # Secret rotation Lambda errors
1404 rotation_errors_alarm = cloudwatch.Alarm(
1405 self,
1406 "RotationLambdaErrorsAlarm",
1407 alarm_description="Secret rotation Lambda has errors",
1408 metric=cloudwatch.Metric(
1409 namespace="AWS/Lambda",
1410 metric_name="Errors",
1411 dimensions_map={"FunctionName": rotation_function_name},
1412 statistic="Sum",
1413 period=Duration.hours(1),
1414 ),
1415 threshold=1,
1416 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
1417 evaluation_periods=1,
1418 datapoints_to_alarm=1,
1419 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1420 )
1421 rotation_errors_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1423 def _create_sqs_alarms(self) -> None:
1424 """Create SQS queue alarms"""
1425 for regional_stack in self.regional_stacks:
1426 region = regional_stack.deployment_region
1427 queue_name = regional_stack.job_queue.queue_name
1428 dlq_name = regional_stack.job_dlq.queue_name
1429 region_id = region.replace("-", "").title()
1431 # Old message alarm (stuck jobs)
1432 old_message_alarm = cloudwatch.Alarm(
1433 self,
1434 f"SqsOldMessageAlarm{region_id}",
1435 alarm_description=f"SQS queue in {region} has old messages (potential stuck jobs)",
1436 metric=cloudwatch.Metric(
1437 namespace="AWS/SQS",
1438 metric_name="ApproximateAgeOfOldestMessage",
1439 dimensions_map={"QueueName": queue_name},
1440 statistic="Maximum",
1441 period=Duration.minutes(5),
1442 ),
1443 threshold=3600, # 1 hour
1444 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1445 evaluation_periods=2,
1446 datapoints_to_alarm=2,
1447 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1448 )
1449 old_message_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1451 # Dead letter queue alarm
1452 dlq_alarm = cloudwatch.Alarm(
1453 self,
1454 f"SqsDlqAlarm{region_id}",
1455 alarm_description=f"SQS dead letter queue in {region} has messages",
1456 metric=cloudwatch.Metric(
1457 namespace="AWS/SQS",
1458 metric_name="ApproximateNumberOfMessagesVisible",
1459 dimensions_map={"QueueName": dlq_name},
1460 statistic="Sum",
1461 period=Duration.minutes(5),
1462 ),
1463 threshold=1,
1464 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
1465 evaluation_periods=1,
1466 datapoints_to_alarm=1,
1467 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1468 )
1469 dlq_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
    def _create_dynamodb_alarms(self) -> None:
        """Create DynamoDB alarms for the jobs table.

        Currently only the jobs table is covered (throttling and system
        errors); the templates and webhooks tables mentioned in the module
        docstring have no alarms here.

        NOTE(review): these alarm metrics set ``region=global_region`` while
        the GA-alarm docstring in this same stack states that CloudWatch
        alarms must live in the metric's region. Confirm these alarms behave
        as intended when the global region differs from the region this
        monitoring stack is deployed in.
        """
        # Get table names from global stack
        jobs_table = self.global_stack.jobs_table.table_name

        # DynamoDB tables are in the global region
        global_region = self.config.get_global_region()

        # Jobs table throttling alarm: any throttled request in 2 consecutive
        # 5-minute windows pages via the alert topic.
        jobs_throttle_alarm = cloudwatch.Alarm(
            self,
            "DynamoDBJobsThrottleAlarm",
            alarm_description="DynamoDB jobs table is being throttled",
            metric=cloudwatch.Metric(
                namespace="AWS/DynamoDB",
                metric_name="ThrottledRequests",
                dimensions_map={"TableName": jobs_table},
                statistic="Sum",
                period=Duration.minutes(5),
                region=global_region,
            ),
            threshold=1,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=2,
            datapoints_to_alarm=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        jobs_throttle_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

        # Jobs table system errors alarm: fires on the first 5XX-class error
        # datapoint (single evaluation period).
        jobs_errors_alarm = cloudwatch.Alarm(
            self,
            "DynamoDBJobsErrorsAlarm",
            alarm_description="DynamoDB jobs table has system errors",
            metric=cloudwatch.Metric(
                namespace="AWS/DynamoDB",
                metric_name="SystemErrors",
                dimensions_map={"TableName": jobs_table},
                statistic="Sum",
                period=Duration.minutes(5),
                region=global_region,
            ),
            threshold=1,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=1,
            datapoints_to_alarm=1,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        jobs_errors_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
    def _create_eks_alarms(self) -> None:
        """Create EKS cluster alarms (node CPU and memory utilization).

        One CPU and one memory alarm per regional cluster, built from
        Container Insights node-level metrics.

        NOTE(review): unlike the dashboard widgets, these alarm metrics carry
        no ``region=`` parameter, so they are evaluated in this stack's own
        region. For clusters deployed in other regions the metric may have no
        data here, and ``NOT_BREACHING`` would keep the alarm permanently
        green — confirm this is intended (see the GA-alarm docstring in this
        stack for the same-region constraint on alarms).
        """
        for regional_stack in self.regional_stacks:
            region = regional_stack.deployment_region
            cluster_name = regional_stack.cluster.cluster_name
            # Region slug usable in construct IDs, e.g. "us-east-1" -> "Useast1"
            region_id = region.replace("-", "").title()

            # High CPU utilization alarm (node-level metric): >80% average
            # in 2 of 3 five-minute windows.
            high_cpu_alarm = cloudwatch.Alarm(
                self,
                f"EksHighCpuAlarm{region_id}",
                alarm_description=f"EKS cluster {cluster_name} has high CPU utilization",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="node_cpu_utilization",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=80,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=3,
                datapoints_to_alarm=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            high_cpu_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

            # High memory utilization alarm (node-level metric): >85% average
            # in 2 of 3 five-minute windows.
            high_memory_alarm = cloudwatch.Alarm(
                self,
                f"EksHighMemoryAlarm{region_id}",
                alarm_description=f"EKS cluster {cluster_name} has high memory utilization",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="node_memory_utilization",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=85,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=3,
                datapoints_to_alarm=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            high_memory_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1568 def _create_alb_alarms(self) -> None:
1569 """Create ALB alarms.
1571 Note: ALBs are created dynamically by the AWS Load Balancer Controller
1572 in Kubernetes via Ingress resources. Since we can't know the exact ALB
1573 name at CDK synth time (it includes a hash), we cannot create alarms
1574 with specific ALB dimensions.
1576 CloudWatch Alarms don't support SEARCH expressions like dashboards do,
1577 so we skip ALB-specific alarms. Instead, rely on:
1578 1. Dashboard widgets with SEARCH expressions for monitoring
1579 2. EKS Container Insights alarms for pod/node health
1580 3. API Gateway alarms for request-level monitoring
1582 If ALB-specific alarms are needed, consider:
1583 - Using a custom resource to discover ALB names at deploy time
1584 - Creating alarms via AWS CLI/SDK after deployment
1585 - Using CloudWatch Anomaly Detection on the namespace level
1586 """
1587 # ALB alarms are skipped because ALB names are not known at synth time
1588 # The AWS Load Balancer Controller creates ALBs with names like:
1589 # k8s-<namespace>-<ingress>-<hash>
1590 pass
1592 def _create_application_alarms(self) -> None:
1593 """Create application-specific alarms"""
1594 for regional_stack in self.regional_stacks:
1595 region = regional_stack.deployment_region
1596 cluster_name = regional_stack.cluster.cluster_name
1597 region_id = region.replace("-", "").title()
1599 # High manifest failure rate alarm
1600 high_failure_rate_alarm = cloudwatch.Alarm(
1601 self,
1602 f"ManifestHighFailureRateAlarm{region_id}",
1603 alarm_description=f"Manifest processor in {region} has high failure rate",
1604 metric=cloudwatch.Metric(
1605 namespace="GCO/ManifestProcessor",
1606 metric_name="ManifestFailures",
1607 dimensions_map={"ClusterName": cluster_name, "Region": region},
1608 statistic="Sum",
1609 period=Duration.minutes(5),
1610 ),
1611 threshold=10,
1612 comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
1613 evaluation_periods=2,
1614 datapoints_to_alarm=2,
1615 treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
1616 )
1617 high_failure_rate_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
    def _create_composite_alarms(self) -> None:
        """Create composite alarms for better signal-to-noise ratio.

        Two groups are built:
        - Per region: CPU and memory supporting alarms at 90% (stricter than
          the standalone EKS alarms at 80/85%), AND-ed into one
          "regional critical" composite so it only fires when both breach.
        - Globally: an API Gateway 5XX alarm AND a proxy Lambda error alarm
          combined, signalling a correlated API-path failure.

        Only the composite alarms notify the alert topic; the supporting
        alarms have no actions of their own.
        """

        # Store individual alarms for composite alarm references,
        # keyed by deployment region.
        regional_alarms: dict[str, list[cloudwatch.Alarm]] = {}

        for regional_stack in self.regional_stacks:
            region = regional_stack.deployment_region
            cluster_name = regional_stack.cluster.cluster_name
            # Region slug for construct IDs, e.g. "us-east-1" -> "Useast1"
            region_id = region.replace("-", "").title()
            regional_alarms[region] = []

            # Create regional health composite alarm
            # Triggers when multiple issues occur in the same region
            eks_cpu_alarm = cloudwatch.Alarm(
                self,
                f"CompositeEksCpu{region_id}",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="node_cpu_utilization",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=90,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            regional_alarms[region].append(eks_cpu_alarm)

            eks_memory_alarm = cloudwatch.Alarm(
                self,
                f"CompositeEksMemory{region_id}",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="node_memory_utilization",
                    dimensions_map={"ClusterName": cluster_name},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=90,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            regional_alarms[region].append(eks_memory_alarm)

        # Create composite alarm for critical regional issues.
        # AlarmRule.all_of requires every supporting alarm to be in ALARM
        # state before the composite fires (logical AND).
        for region, alarms in regional_alarms.items():
            region_id = region.replace("-", "").title()
            if len(alarms) >= 2:
                composite_alarm = cloudwatch.CompositeAlarm(
                    self,
                    f"RegionalCriticalAlarm{region_id}",
                    alarm_description=f"Critical: Multiple issues detected in {region}",
                    alarm_rule=cloudwatch.AlarmRule.all_of(*alarms),
                )
                composite_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

        # API Gateway + Lambda composite alarm (only if api_gateway_stack is available)
        if self.api_gateway_stack:
            api_name = self.api_gateway_stack.api.rest_api_name
            proxy_function_name = self.api_gateway_stack.proxy_lambda.function_name

            # Supporting alarm: API Gateway 5XX errors (threshold 5, vs 10
            # for the standalone ApiGateway5xxAlarm).
            api_error_alarm = cloudwatch.Alarm(
                self,
                "CompositeApiErrors",
                metric=cloudwatch.Metric(
                    namespace="AWS/ApiGateway",
                    metric_name="5XXError",
                    dimensions_map={"ApiName": api_name},
                    statistic="Sum",
                    period=Duration.minutes(5),
                ),
                threshold=5,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )

            # Supporting alarm: proxy Lambda errors.
            lambda_error_alarm = cloudwatch.Alarm(
                self,
                "CompositeLambdaErrors",
                metric=cloudwatch.Metric(
                    namespace="AWS/Lambda",
                    metric_name="Errors",
                    dimensions_map={"FunctionName": proxy_function_name},
                    statistic="Sum",
                    period=Duration.minutes(5),
                ),
                threshold=3,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )

            # Fires only when BOTH the API and the Lambda are erroring.
            api_lambda_composite = cloudwatch.CompositeAlarm(
                self,
                "ApiLambdaCompositeAlarm",
                alarm_description="Critical: Both API Gateway and Lambda proxy have errors",
                alarm_rule=cloudwatch.AlarmRule.all_of(api_error_alarm, lambda_error_alarm),
            )
            api_lambda_composite.add_alarm_action(cw_actions.SnsAction(self.alert_topic))
1724 def _create_custom_metrics(self) -> None:
1725 """Create custom metric filters and log groups"""
1726 for regional_stack in self.regional_stacks:
1727 region = regional_stack.deployment_region
1728 region_id = region.replace("-", "").title()
1730 # Health monitor log group
1731 # log_group_name intentionally omitted - let CDK generate unique name
1732 logs.LogGroup(
1733 self,
1734 f"HealthMonitorLogGroup{region_id}",
1735 retention=logs.RetentionDays.ONE_MONTH,
1736 removal_policy=RemovalPolicy.DESTROY,
1737 )
1739 # Manifest processor log group
1740 # log_group_name intentionally omitted - let CDK generate unique name
1741 logs.LogGroup(
1742 self,
1743 f"ManifestProcessorLogGroup{region_id}",
1744 retention=logs.RetentionDays.ONE_MONTH,
1745 removal_policy=RemovalPolicy.DESTROY,
1746 )
1748 def _create_outputs(self) -> None:
1749 """Create CloudFormation outputs"""
1750 CfnOutput(
1751 self,
1752 "DashboardUrl",
1753 value=f"https://console.aws.amazon.com/cloudwatch/home?region={self.region}#dashboards:name={self.dashboard.dashboard_name}",
1754 description="CloudWatch Dashboard URL",
1755 )
1757 CfnOutput(
1758 self,
1759 "AlertTopicArn",
1760 value=self.alert_topic.topic_arn,
1761 description="SNS Topic ARN for monitoring alerts",
1762 )
1764 CfnOutput(
1765 self,
1766 "AlarmCount",
1767 value="See CloudWatch Alarms console for full list",
1768 description="Monitoring alarms created",
1769 )