Coverage for gco / stacks / monitoring_stack.py: 98%

266 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 21:47 +0000

1""" 

2Monitoring stack for GCO (Global Capacity Orchestrator on AWS) - Cross-region monitoring and observability. 

3 

4This stack creates centralized monitoring resources for all GCO deployments: 

5- CloudWatch Dashboard with comprehensive widgets for all regions 

6- SNS topic for alerting 

7- CloudWatch Alarms for critical metrics 

8- Log groups for application logs 

9- Anomaly detection for traffic patterns 

10- Composite alarms for better signal-to-noise 

11 

12Dashboard Sections: 

13- Global Accelerator: Flow counts, processed bytes 

14- API Gateway: Request counts, latency, error rates 

15- Lambda Functions: Invocations, errors, duration, throttles 

16- SQS Queues: Message counts, age, dead letter queue depth 

17- DynamoDB Tables: Capacity, latency, throttles, errors 

18- EKS Clusters: CPU/memory utilization per region 

19- ALBs: Request counts, response times, healthy hosts 

20- Applications: Custom metrics from health monitor and manifest processor 

21 

22Cross-Region Metrics: 

23 CloudWatch metrics are region-specific. This stack handles cross-region 

24 monitoring by specifying the `region` parameter on metrics: 

25 - Global Accelerator metrics: Always in us-west-2 

26 - DynamoDB metrics: In the global region (where tables are deployed) 

27 - Regional metrics: In each cluster's region 

28 

29Alarms: 

30- High CPU/memory utilization on EKS clusters 

31- Unhealthy hosts in ALB target groups 

32- High response times 

33- Manifest processing failures 

34- Lambda errors and throttles 

35- SQS message age (stuck jobs) 

36- DynamoDB throttling and system errors 

37- API Gateway 5XX errors 

38- Secret rotation failures 

39""" 

40 

41from typing import TYPE_CHECKING, Any 

42 

43from aws_cdk import ( 

44 CfnOutput, 

45 Duration, 

46 RemovalPolicy, 

47 Stack, 

48) 

49from aws_cdk import aws_cloudwatch as cloudwatch 

50from aws_cdk import aws_cloudwatch_actions as cw_actions 

51from aws_cdk import aws_logs as logs 

52from aws_cdk import aws_sns as sns 

53from constructs import Construct 

54 

55from gco.config.config_loader import ConfigLoader 

56 

57if TYPE_CHECKING: 

58 from gco.stacks.api_gateway_global_stack import GCOApiGatewayGlobalStack 

59 from gco.stacks.global_stack import GCOGlobalStack 

60 from gco.stacks.regional_stack import GCORegionalStack 

61 

62 

63class GCOMonitoringStack(Stack): 

64 """ 

65 Cross-region monitoring and observability stack. 

66 

67 Creates a centralized CloudWatch dashboard and alarms that aggregate 

68 metrics from all regional deployments. 

69 

70 Attributes: 

71 alert_topic: SNS topic for alarm notifications 

72 dashboard: CloudWatch dashboard with all monitoring widgets 

73 """ 

74 

75 def __init__( 

76 self, 

77 scope: Construct, 

78 construct_id: str, 

79 config: ConfigLoader, 

80 global_stack: GCOGlobalStack, 

81 regional_stacks: list[GCORegionalStack], 

82 api_gateway_stack: GCOApiGatewayGlobalStack | None = None, 

83 **kwargs: Any, 

84 ) -> None: 

85 super().__init__(scope, construct_id, **kwargs) 

86 

87 self.config = config 

88 self.global_stack = global_stack 

89 self.regional_stacks = regional_stacks 

90 self.api_gateway_stack = api_gateway_stack 

91 self.project_name = config.get_project_name() 

92 self.regions = config.get_regions() 

93 

94 # Create SNS topic for alerts 

95 self.alert_topic = self._create_alert_topic() 

96 

97 # Create CloudWatch dashboard 

98 self.dashboard = self._create_dashboard() 

99 

100 # Create alarms 

101 self._create_alarms() 

102 

103 # Create composite alarms 

104 self._create_composite_alarms() 

105 

106 # Create custom metrics 

107 self._create_custom_metrics() 

108 

109 # Export monitoring resources 

110 self._create_outputs() 

111 

112 # Apply cdk-nag suppressions 

113 self._apply_nag_suppressions() 

114 

115 def _apply_nag_suppressions(self) -> None: 

116 """Apply cdk-nag suppressions for this stack.""" 

117 from gco.stacks.nag_suppressions import apply_all_suppressions 

118 

119 apply_all_suppressions( 

120 self, 

121 stack_type="monitoring", 

122 regions=self.config.get_regions(), 

123 global_region=self.config.get_global_region(), 

124 ) 

125 

126 def _create_alert_topic(self) -> sns.Topic: 

127 """Create SNS topic for monitoring alerts""" 

128 topic = sns.Topic( 

129 self, 

130 "GCOAlertTopic", 

131 display_name="GCO (Global Capacity Orchestrator on AWS) Monitoring Alerts", 

132 enforce_ssl=True, 

133 ) 

134 return topic 

135 

136 def _create_dashboard(self) -> cloudwatch.Dashboard: 

137 """Create comprehensive CloudWatch dashboard for monitoring""" 

138 dashboard = cloudwatch.Dashboard( 

139 self, 

140 "GCODashboard", 

141 period_override=cloudwatch.PeriodOverride.AUTO, 

142 ) 

143 

144 # Add widgets in logical order 

145 dashboard.add_widgets(*self._create_global_accelerator_widgets()) 

146 dashboard.add_widgets(*self._create_api_gateway_widgets()) 

147 dashboard.add_widgets(*self._create_lambda_widgets()) 

148 dashboard.add_widgets(*self._create_sqs_widgets()) 

149 dashboard.add_widgets(*self._create_dynamodb_widgets()) 

150 dashboard.add_widgets(*self._create_eks_widgets()) 

151 dashboard.add_widgets(*self._create_gpu_widgets()) 

152 dashboard.add_widgets(*self._create_alb_widgets()) 

153 dashboard.add_widgets(*self._create_application_widgets()) 

154 

155 return dashboard 

156 

157 def _create_global_accelerator_widgets(self) -> list[cloudwatch.IWidget]: 

158 """Create Global Accelerator monitoring widgets. 

159 

160 Note: Global Accelerator metrics are only available in us-west-2, 

161 regardless of where the accelerator endpoints are located. 

162 CloudWatch uses the Accelerator ID (UUID), not the name. 

163 """ 

164 widgets: list[cloudwatch.IWidget] = [] 

165 

166 # Get the accelerator ID from the global stack (CloudWatch uses ID, not name) 

167 accelerator_id = self.global_stack.accelerator_id 

168 

169 # Global Accelerator metrics are always in us-west-2 

170 ga_metrics_region = "us-west-2" 

171 

172 # Section header 

173 widgets.append( 

174 cloudwatch.TextWidget( 

175 markdown="# Global Accelerator\nTraffic distribution and connectivity metrics", 

176 width=24, 

177 height=1, 

178 ) 

179 ) 

180 

181 # Flow count with anomaly detection 

182 flow_count_widget = cloudwatch.GraphWidget( 

183 title="Global Accelerator - New Flows", 

184 left=[ 

185 cloudwatch.Metric( 

186 namespace="AWS/GlobalAccelerator", 

187 metric_name="NewFlowCount", 

188 dimensions_map={"Accelerator": accelerator_id}, 

189 statistic="Sum", 

190 period=Duration.minutes(5), 

191 region=ga_metrics_region, 

192 ) 

193 ], 

194 width=12, 

195 height=6, 

196 region=ga_metrics_region, 

197 ) 

198 widgets.append(flow_count_widget) 

199 

200 # Processed bytes 

201 bytes_widget = cloudwatch.GraphWidget( 

202 title="Global Accelerator - Processed Bytes", 

203 left=[ 

204 cloudwatch.Metric( 

205 namespace="AWS/GlobalAccelerator", 

206 metric_name="ProcessedBytesIn", 

207 dimensions_map={"Accelerator": accelerator_id}, 

208 statistic="Sum", 

209 period=Duration.minutes(5), 

210 region=ga_metrics_region, 

211 ), 

212 cloudwatch.Metric( 

213 namespace="AWS/GlobalAccelerator", 

214 metric_name="ProcessedBytesOut", 

215 dimensions_map={"Accelerator": accelerator_id}, 

216 statistic="Sum", 

217 period=Duration.minutes(5), 

218 region=ga_metrics_region, 

219 ), 

220 ], 

221 width=12, 

222 height=6, 

223 region=ga_metrics_region, 

224 ) 

225 widgets.append(bytes_widget) 

226 

227 return widgets 

228 

229 def _create_api_gateway_widgets(self) -> list[cloudwatch.IWidget]: 

230 """Create API Gateway monitoring widgets""" 

231 widgets: list[cloudwatch.IWidget] = [] 

232 

233 # Get the actual API name from the api_gateway_stack 

234 api_name = ( 

235 self.api_gateway_stack.api.rest_api_name if self.api_gateway_stack else "gco-global-api" 

236 ) 

237 

238 # API Gateway metrics are in the region where the API is deployed 

239 api_gw_region = self.config.get_api_gateway_region() 

240 

241 # Section header 

242 widgets.append( 

243 cloudwatch.TextWidget( 

244 markdown="# API Gateway\nRequest metrics, latency, and error rates", 

245 width=24, 

246 height=1, 

247 ) 

248 ) 

249 

250 # Request count and latency 

251 request_widget = cloudwatch.GraphWidget( 

252 title="API Gateway - Requests & Latency", 

253 left=[ 

254 cloudwatch.Metric( 

255 namespace="AWS/ApiGateway", 

256 metric_name="Count", 

257 dimensions_map={"ApiName": api_name}, 

258 statistic="Sum", 

259 period=Duration.minutes(5), 

260 region=api_gw_region, 

261 ) 

262 ], 

263 right=[ 

264 cloudwatch.Metric( 

265 namespace="AWS/ApiGateway", 

266 metric_name="Latency", 

267 dimensions_map={"ApiName": api_name}, 

268 statistic="Average", 

269 period=Duration.minutes(5), 

270 region=api_gw_region, 

271 ), 

272 cloudwatch.Metric( 

273 namespace="AWS/ApiGateway", 

274 metric_name="Latency", 

275 dimensions_map={"ApiName": api_name}, 

276 statistic="p99", 

277 period=Duration.minutes(5), 

278 region=api_gw_region, 

279 ), 

280 ], 

281 width=12, 

282 height=6, 

283 region=api_gw_region, 

284 ) 

285 widgets.append(request_widget) 

286 

287 # Error rates (4XX and 5XX) 

288 error_widget = cloudwatch.GraphWidget( 

289 title="API Gateway - Error Rates", 

290 left=[ 

291 cloudwatch.Metric( 

292 namespace="AWS/ApiGateway", 

293 metric_name="4XXError", 

294 dimensions_map={"ApiName": api_name}, 

295 statistic="Sum", 

296 period=Duration.minutes(5), 

297 color="#ff7f0e", 

298 region=api_gw_region, 

299 ), 

300 cloudwatch.Metric( 

301 namespace="AWS/ApiGateway", 

302 metric_name="5XXError", 

303 dimensions_map={"ApiName": api_name}, 

304 statistic="Sum", 

305 period=Duration.minutes(5), 

306 color="#d62728", 

307 region=api_gw_region, 

308 ), 

309 ], 

310 width=12, 

311 height=6, 

312 region=api_gw_region, 

313 ) 

314 widgets.append(error_widget) 

315 

316 return widgets 

317 

318 def _create_lambda_widgets(self) -> list[cloudwatch.IWidget]: 

319 """Create Lambda function monitoring widgets""" 

320 widgets: list[cloudwatch.IWidget] = [] 

321 

322 # Section header 

323 widgets.append( 

324 cloudwatch.TextWidget( 

325 markdown="# Lambda Functions\nProxy, rotation, and regional Lambda metrics", 

326 width=24, 

327 height=1, 

328 ) 

329 ) 

330 

331 # Get API Gateway region for global Lambda functions 

332 api_gw_region = self.config.get_api_gateway_region() 

333 

334 # Build Lambda function list: (function_name, label, region) 

335 lambda_functions: list[tuple[str, str, str]] = [] 

336 

337 # Add API Gateway Lambda functions if available 

338 if self.api_gateway_stack: 338 ↛ 355line 338 didn't jump to line 355 because the condition on line 338 was always true

339 lambda_functions.append( 

340 ( 

341 self.api_gateway_stack.proxy_lambda.function_name, 

342 "API Gateway Proxy", 

343 api_gw_region, 

344 ) 

345 ) 

346 lambda_functions.append( 

347 ( 

348 self.api_gateway_stack.rotation_lambda.function_name, 

349 "Secret Rotation", 

350 api_gw_region, 

351 ) 

352 ) 

353 

354 # Add regional Lambda functions from each regional stack 

355 for regional_stack in self.regional_stacks: 

356 region = regional_stack.deployment_region 

357 lambda_functions.extend( 

358 [ 

359 ( 

360 regional_stack.kubectl_lambda_function_name, 

361 f"Kubectl Applier ({region})", 

362 region, 

363 ), 

364 ( 

365 regional_stack.helm_installer_lambda_function_name, 

366 f"Helm Installer ({region})", 

367 region, 

368 ), 

369 ] 

370 ) 

371 

372 # Invocations widget 

373 invocations_widget = cloudwatch.GraphWidget( 

374 title="Lambda - Invocations", 

375 left=[ 

376 cloudwatch.Metric( 

377 namespace="AWS/Lambda", 

378 metric_name="Invocations", 

379 dimensions_map={"FunctionName": func_name}, 

380 statistic="Sum", 

381 period=Duration.minutes(5), 

382 label=label, 

383 region=region, 

384 ) 

385 for func_name, label, region in lambda_functions[:5] 

386 ], 

387 width=12, 

388 height=6, 

389 ) 

390 widgets.append(invocations_widget) 

391 

392 errors_widget = cloudwatch.GraphWidget( 

393 title="Lambda - Errors", 

394 left=[ 

395 cloudwatch.Metric( 

396 namespace="AWS/Lambda", 

397 metric_name="Errors", 

398 dimensions_map={"FunctionName": func_name}, 

399 statistic="Sum", 

400 period=Duration.minutes(5), 

401 label=label, 

402 color="#d62728", 

403 region=region, 

404 ) 

405 for func_name, label, region in lambda_functions[:5] 

406 ], 

407 width=12, 

408 height=6, 

409 ) 

410 widgets.append(errors_widget) 

411 

412 # Duration widget 

413 duration_widget = cloudwatch.GraphWidget( 

414 title="Lambda - Duration (ms)", 

415 left=[ 

416 cloudwatch.Metric( 

417 namespace="AWS/Lambda", 

418 metric_name="Duration", 

419 dimensions_map={"FunctionName": func_name}, 

420 statistic="Average", 

421 period=Duration.minutes(5), 

422 label=label, 

423 region=region, 

424 ) 

425 for func_name, label, region in lambda_functions[:5] 

426 ], 

427 width=12, 

428 height=6, 

429 ) 

430 widgets.append(duration_widget) 

431 

432 # Throttles widget 

433 throttles_widget = cloudwatch.GraphWidget( 

434 title="Lambda - Throttles & Concurrent Executions", 

435 left=[ 

436 cloudwatch.Metric( 

437 namespace="AWS/Lambda", 

438 metric_name="Throttles", 

439 dimensions_map={"FunctionName": func_name}, 

440 statistic="Sum", 

441 period=Duration.minutes(5), 

442 label=f"{label} Throttles", 

443 region=region, 

444 ) 

445 for func_name, label, region in lambda_functions[:3] 

446 ], 

447 right=[ 

448 cloudwatch.Metric( 

449 namespace="AWS/Lambda", 

450 metric_name="ConcurrentExecutions", 

451 dimensions_map={"FunctionName": func_name}, 

452 statistic="Maximum", 

453 period=Duration.minutes(5), 

454 label=f"{label} Concurrent", 

455 region=region, 

456 ) 

457 for func_name, label, region in lambda_functions[:3] 

458 ], 

459 width=12, 

460 height=6, 

461 ) 

462 widgets.append(throttles_widget) 

463 

464 return widgets 

465 

466 def _create_sqs_widgets(self) -> list[cloudwatch.IWidget]: 

467 """Create SQS queue monitoring widgets""" 

468 widgets: list[cloudwatch.IWidget] = [] 

469 

470 # Section header 

471 widgets.append( 

472 cloudwatch.TextWidget( 

473 markdown="# SQS Queues\nJob submission queue metrics and dead letter queue", 

474 width=24, 

475 height=1, 

476 ) 

477 ) 

478 

479 # Build queue info from regional stacks: (queue_name, dlq_name, region) 

480 queue_info = [ 

481 ( 

482 regional_stack.job_queue.queue_name, 

483 regional_stack.job_dlq.queue_name, 

484 regional_stack.deployment_region, 

485 ) 

486 for regional_stack in self.regional_stacks 

487 ] 

488 

489 # Messages visible and in-flight per region 

490 messages_widget = cloudwatch.GraphWidget( 

491 title="SQS - Messages (Visible & In-Flight)", 

492 left=[ 

493 cloudwatch.Metric( 

494 namespace="AWS/SQS", 

495 metric_name="ApproximateNumberOfMessagesVisible", 

496 dimensions_map={"QueueName": queue_name}, 

497 statistic="Average", 

498 period=Duration.minutes(1), 

499 label=f"{region} Visible", 

500 region=region, 

501 ) 

502 for queue_name, _, region in queue_info 

503 ], 

504 right=[ 

505 cloudwatch.Metric( 

506 namespace="AWS/SQS", 

507 metric_name="ApproximateNumberOfMessagesNotVisible", 

508 dimensions_map={"QueueName": queue_name}, 

509 statistic="Average", 

510 period=Duration.minutes(1), 

511 label=f"{region} In-Flight", 

512 region=region, 

513 ) 

514 for queue_name, _, region in queue_info 

515 ], 

516 width=12, 

517 height=6, 

518 ) 

519 widgets.append(messages_widget) 

520 

521 # Age of oldest message (critical for detecting stuck jobs) 

522 age_widget = cloudwatch.GraphWidget( 

523 title="SQS - Age of Oldest Message (seconds)", 

524 left=[ 

525 cloudwatch.Metric( 

526 namespace="AWS/SQS", 

527 metric_name="ApproximateAgeOfOldestMessage", 

528 dimensions_map={"QueueName": queue_name}, 

529 statistic="Maximum", 

530 period=Duration.minutes(1), 

531 label=region, 

532 region=region, 

533 ) 

534 for queue_name, _, region in queue_info 

535 ], 

536 width=12, 

537 height=6, 

538 ) 

539 widgets.append(age_widget) 

540 

541 # Dead letter queue depth 

542 dlq_widget = cloudwatch.GraphWidget( 

543 title="SQS - Dead Letter Queue Depth", 

544 left=[ 

545 cloudwatch.Metric( 

546 namespace="AWS/SQS", 

547 metric_name="ApproximateNumberOfMessagesVisible", 

548 dimensions_map={"QueueName": dlq_name}, 

549 statistic="Average", 

550 period=Duration.minutes(1), 

551 label=f"{region} DLQ", 

552 color="#d62728", 

553 region=region, 

554 ) 

555 for _, dlq_name, region in queue_info 

556 ], 

557 width=12, 

558 height=6, 

559 ) 

560 widgets.append(dlq_widget) 

561 

562 # Messages sent/received/deleted 

563 throughput_widget = cloudwatch.GraphWidget( 

564 title="SQS - Throughput", 

565 left=[ 

566 cloudwatch.Metric( 

567 namespace="AWS/SQS", 

568 metric_name="NumberOfMessagesSent", 

569 dimensions_map={"QueueName": queue_name}, 

570 statistic="Sum", 

571 period=Duration.minutes(5), 

572 label=f"{region} Sent", 

573 region=region, 

574 ) 

575 for queue_name, _, region in queue_info 

576 ], 

577 right=[ 

578 cloudwatch.Metric( 

579 namespace="AWS/SQS", 

580 metric_name="NumberOfMessagesDeleted", 

581 dimensions_map={"QueueName": queue_name}, 

582 statistic="Sum", 

583 period=Duration.minutes(5), 

584 label=f"{region} Processed", 

585 region=region, 

586 ) 

587 for queue_name, _, region in queue_info 

588 ], 

589 width=12, 

590 height=6, 

591 ) 

592 widgets.append(throughput_widget) 

593 

594 return widgets 

595 

596 def _create_dynamodb_widgets(self) -> list[cloudwatch.IWidget]: 

597 """Create DynamoDB monitoring widgets for job queue, templates, and webhooks tables.""" 

598 widgets: list[cloudwatch.IWidget] = [] 

599 

600 # Get table names from global stack 

601 templates_table = self.global_stack.templates_table.table_name 

602 webhooks_table = self.global_stack.webhooks_table.table_name 

603 jobs_table = self.global_stack.jobs_table.table_name 

604 

605 # DynamoDB tables are in the global region 

606 global_region = self.config.get_global_region() 

607 

608 # Section header 

609 widgets.append( 

610 cloudwatch.TextWidget( 

611 markdown="# DynamoDB Tables\nJob queue, templates, and webhooks storage metrics", 

612 width=24, 

613 height=1, 

614 ) 

615 ) 

616 

617 # Read/Write capacity consumed 

618 capacity_widget = cloudwatch.GraphWidget( 

619 title="DynamoDB - Consumed Capacity", 

620 left=[ 

621 cloudwatch.Metric( 

622 namespace="AWS/DynamoDB", 

623 metric_name="ConsumedReadCapacityUnits", 

624 dimensions_map={"TableName": jobs_table}, 

625 statistic="Sum", 

626 period=Duration.minutes(5), 

627 label="Jobs Read", 

628 region=global_region, 

629 ), 

630 cloudwatch.Metric( 

631 namespace="AWS/DynamoDB", 

632 metric_name="ConsumedReadCapacityUnits", 

633 dimensions_map={"TableName": templates_table}, 

634 statistic="Sum", 

635 period=Duration.minutes(5), 

636 label="Templates Read", 

637 region=global_region, 

638 ), 

639 cloudwatch.Metric( 

640 namespace="AWS/DynamoDB", 

641 metric_name="ConsumedReadCapacityUnits", 

642 dimensions_map={"TableName": webhooks_table}, 

643 statistic="Sum", 

644 period=Duration.minutes(5), 

645 label="Webhooks Read", 

646 region=global_region, 

647 ), 

648 ], 

649 right=[ 

650 cloudwatch.Metric( 

651 namespace="AWS/DynamoDB", 

652 metric_name="ConsumedWriteCapacityUnits", 

653 dimensions_map={"TableName": jobs_table}, 

654 statistic="Sum", 

655 period=Duration.minutes(5), 

656 label="Jobs Write", 

657 region=global_region, 

658 ), 

659 cloudwatch.Metric( 

660 namespace="AWS/DynamoDB", 

661 metric_name="ConsumedWriteCapacityUnits", 

662 dimensions_map={"TableName": templates_table}, 

663 statistic="Sum", 

664 period=Duration.minutes(5), 

665 label="Templates Write", 

666 region=global_region, 

667 ), 

668 ], 

669 width=12, 

670 height=6, 

671 region=global_region, 

672 ) 

673 widgets.append(capacity_widget) 

674 

675 # Latency metrics 

676 latency_widget = cloudwatch.GraphWidget( 

677 title="DynamoDB - Latency (ms)", 

678 left=[ 

679 cloudwatch.Metric( 

680 namespace="AWS/DynamoDB", 

681 metric_name="SuccessfulRequestLatency", 

682 dimensions_map={"TableName": jobs_table, "Operation": "GetItem"}, 

683 statistic="Average", 

684 period=Duration.minutes(5), 

685 label="Jobs GetItem", 

686 region=global_region, 

687 ), 

688 cloudwatch.Metric( 

689 namespace="AWS/DynamoDB", 

690 metric_name="SuccessfulRequestLatency", 

691 dimensions_map={"TableName": jobs_table, "Operation": "PutItem"}, 

692 statistic="Average", 

693 period=Duration.minutes(5), 

694 label="Jobs PutItem", 

695 region=global_region, 

696 ), 

697 cloudwatch.Metric( 

698 namespace="AWS/DynamoDB", 

699 metric_name="SuccessfulRequestLatency", 

700 dimensions_map={"TableName": jobs_table, "Operation": "Query"}, 

701 statistic="Average", 

702 period=Duration.minutes(5), 

703 label="Jobs Query", 

704 region=global_region, 

705 ), 

706 ], 

707 width=12, 

708 height=6, 

709 region=global_region, 

710 ) 

711 widgets.append(latency_widget) 

712 

713 # Throttled requests 

714 throttle_widget = cloudwatch.GraphWidget( 

715 title="DynamoDB - Throttled Requests", 

716 left=[ 

717 cloudwatch.Metric( 

718 namespace="AWS/DynamoDB", 

719 metric_name="ThrottledRequests", 

720 dimensions_map={"TableName": jobs_table}, 

721 statistic="Sum", 

722 period=Duration.minutes(5), 

723 label="Jobs", 

724 color="#d62728", 

725 region=global_region, 

726 ), 

727 cloudwatch.Metric( 

728 namespace="AWS/DynamoDB", 

729 metric_name="ThrottledRequests", 

730 dimensions_map={"TableName": templates_table}, 

731 statistic="Sum", 

732 period=Duration.minutes(5), 

733 label="Templates", 

734 color="#ff7f0e", 

735 region=global_region, 

736 ), 

737 cloudwatch.Metric( 

738 namespace="AWS/DynamoDB", 

739 metric_name="ThrottledRequests", 

740 dimensions_map={"TableName": webhooks_table}, 

741 statistic="Sum", 

742 period=Duration.minutes(5), 

743 label="Webhooks", 

744 color="#9467bd", 

745 region=global_region, 

746 ), 

747 ], 

748 width=12, 

749 height=6, 

750 region=global_region, 

751 ) 

752 widgets.append(throttle_widget) 

753 

754 # System errors 

755 errors_widget = cloudwatch.GraphWidget( 

756 title="DynamoDB - System Errors", 

757 left=[ 

758 cloudwatch.Metric( 

759 namespace="AWS/DynamoDB", 

760 metric_name="SystemErrors", 

761 dimensions_map={"TableName": jobs_table}, 

762 statistic="Sum", 

763 period=Duration.minutes(5), 

764 label="Jobs", 

765 color="#d62728", 

766 region=global_region, 

767 ), 

768 cloudwatch.Metric( 

769 namespace="AWS/DynamoDB", 

770 metric_name="SystemErrors", 

771 dimensions_map={"TableName": templates_table}, 

772 statistic="Sum", 

773 period=Duration.minutes(5), 

774 label="Templates", 

775 color="#ff7f0e", 

776 region=global_region, 

777 ), 

778 ], 

779 width=12, 

780 height=6, 

781 region=global_region, 

782 ) 

783 widgets.append(errors_widget) 

784 

785 return widgets 

786 

787 def _create_eks_widgets(self) -> list[cloudwatch.IWidget]: 

788 """Create EKS cluster monitoring widgets""" 

789 widgets: list[cloudwatch.IWidget] = [] 

790 

791 # Section header 

792 widgets.append( 

793 cloudwatch.TextWidget( 

794 markdown="# EKS Clusters\nCluster resource utilization and node metrics", 

795 width=24, 

796 height=1, 

797 ) 

798 ) 

799 

800 # Build cluster info from regional stacks: (cluster_name, region) 

801 cluster_info = [ 

802 (regional_stack.cluster.cluster_name, regional_stack.deployment_region) 

803 for regional_stack in self.regional_stacks 

804 ] 

805 

806 # EKS cluster status 

807 cluster_status_widget = cloudwatch.SingleValueWidget( 

808 title="EKS Clusters - Failed Requests", 

809 metrics=[ 

810 cloudwatch.Metric( 

811 namespace="AWS/EKS", 

812 metric_name="cluster_failed_request_count", 

813 dimensions_map={"cluster_name": cluster_name}, 

814 statistic="Sum", 

815 period=Duration.minutes(5), 

816 region=region, 

817 ) 

818 for cluster_name, region in cluster_info 

819 ], 

820 width=12, 

821 height=6, 

822 ) 

823 widgets.append(cluster_status_widget) 

824 

825 # Container Insights - Node CPU utilization (aggregated across all nodes) 

826 # Note: region parameter enables cross-region metrics in dashboard 

827 cpu_widget = cloudwatch.GraphWidget( 

828 title="EKS Clusters - Node CPU Utilization (%)", 

829 left=[ 

830 cloudwatch.Metric( 

831 namespace="ContainerInsights", 

832 metric_name="node_cpu_utilization", 

833 dimensions_map={"ClusterName": cluster_name}, 

834 statistic="Average", 

835 period=Duration.minutes(5), 

836 label=region, 

837 region=region, 

838 ) 

839 for cluster_name, region in cluster_info 

840 ], 

841 width=12, 

842 height=6, 

843 ) 

844 widgets.append(cpu_widget) 

845 

846 # Container Insights - Node Memory utilization (aggregated across all nodes) 

847 memory_widget = cloudwatch.GraphWidget( 

848 title="EKS Clusters - Node Memory Utilization (%)", 

849 left=[ 

850 cloudwatch.Metric( 

851 namespace="ContainerInsights", 

852 metric_name="node_memory_utilization", 

853 dimensions_map={"ClusterName": cluster_name}, 

854 statistic="Average", 

855 period=Duration.minutes(5), 

856 label=region, 

857 region=region, 

858 ) 

859 for cluster_name, region in cluster_info 

860 ], 

861 width=12, 

862 height=6, 

863 ) 

864 widgets.append(memory_widget) 

865 

866 # Node status - running pods capacity 

867 node_widget = cloudwatch.GraphWidget( 

868 title="EKS Clusters - Node Pod Capacity", 

869 left=[ 

870 cloudwatch.Metric( 

871 namespace="ContainerInsights", 

872 metric_name="node_status_capacity_pods", 

873 dimensions_map={"ClusterName": cluster_name}, 

874 statistic="Sum", 

875 period=Duration.minutes(5), 

876 label=f"{region} Capacity", 

877 region=region, 

878 ) 

879 for cluster_name, region in cluster_info 

880 ], 

881 right=[ 

882 cloudwatch.Metric( 

883 namespace="ContainerInsights", 

884 metric_name="node_number_of_running_pods", 

885 dimensions_map={"ClusterName": cluster_name}, 

886 statistic="Sum", 

887 period=Duration.minutes(5), 

888 label=f"{region} Running", 

889 region=region, 

890 ) 

891 for cluster_name, region in cluster_info 

892 ], 

893 width=12, 

894 height=6, 

895 ) 

896 widgets.append(node_widget) 

897 

898 return widgets 

899 

900 def _create_gpu_widgets(self) -> list[cloudwatch.IWidget]: 

901 """Create GPU monitoring widgets using DCGM Exporter metrics via ContainerInsights.""" 

902 widgets: list[cloudwatch.IWidget] = [] 

903 

904 widgets.append( 

905 cloudwatch.TextWidget( 

906 markdown="# GPU Metrics\nGPU utilization, memory, and temperature from DCGM Exporter", 

907 width=24, 

908 height=1, 

909 ) 

910 ) 

911 

912 cluster_info = [ 

913 (regional_stack.cluster.cluster_name, regional_stack.deployment_region) 

914 for regional_stack in self.regional_stacks 

915 ] 

916 

917 # GPU utilization percentage 

918 gpu_util_widget = cloudwatch.GraphWidget( 

919 title="GPU Utilization (%)", 

920 left=[ 

921 cloudwatch.Metric( 

922 namespace="ContainerInsights", 

923 metric_name="node_gpu_utilization", 

924 dimensions_map={"ClusterName": cluster_name}, 

925 statistic="Average", 

926 period=Duration.minutes(5), 

927 label=region, 

928 region=region, 

929 ) 

930 for cluster_name, region in cluster_info 

931 ], 

932 width=12, 

933 height=6, 

934 ) 

935 widgets.append(gpu_util_widget) 

936 

937 # GPU memory utilization 

938 gpu_mem_widget = cloudwatch.GraphWidget( 

939 title="GPU Memory Utilization (%)", 

940 left=[ 

941 cloudwatch.Metric( 

942 namespace="ContainerInsights", 

943 metric_name="node_gpu_memory_utilization", 

944 dimensions_map={"ClusterName": cluster_name}, 

945 statistic="Average", 

946 period=Duration.minutes(5), 

947 label=region, 

948 region=region, 

949 ) 

950 for cluster_name, region in cluster_info 

951 ], 

952 width=12, 

953 height=6, 

954 ) 

955 widgets.append(gpu_mem_widget) 

956 

957 # GPU temperature 

958 gpu_temp_widget = cloudwatch.GraphWidget( 

959 title="GPU Temperature (°C)", 

960 left=[ 

961 cloudwatch.Metric( 

962 namespace="ContainerInsights", 

963 metric_name="node_gpu_temperature", 

964 dimensions_map={"ClusterName": cluster_name}, 

965 statistic="Maximum", 

966 period=Duration.minutes(5), 

967 label=region, 

968 region=region, 

969 ) 

970 for cluster_name, region in cluster_info 

971 ], 

972 width=12, 

973 height=6, 

974 ) 

975 widgets.append(gpu_temp_widget) 

976 

977 # GPU count (active GPUs) 

978 gpu_count_widget = cloudwatch.GraphWidget( 

979 title="Active GPU Count", 

980 left=[ 

981 cloudwatch.Metric( 

982 namespace="ContainerInsights", 

983 metric_name="node_gpu_limit", 

984 dimensions_map={"ClusterName": cluster_name}, 

985 statistic="Sum", 

986 period=Duration.minutes(5), 

987 label=region, 

988 region=region, 

989 ) 

990 for cluster_name, region in cluster_info 

991 ], 

992 width=12, 

993 height=6, 

994 ) 

995 widgets.append(gpu_count_widget) 

996 

997 return widgets 

998 

999 def _create_alb_widgets(self) -> list[cloudwatch.IWidget]: 

1000 """Create ALB monitoring widgets. 

1001 

1002 Note: ALBs are created by the AWS Load Balancer Controller in Kubernetes 

1003 via Ingress resources, not by CDK. The controller uses a naming convention: 

1004 k8s-<namespace>-<ingress-name>-<hash> 

1005 

1006 Since we can't know the exact ALB name at CDK synth time (includes a hash), 

1007 we use CloudWatch SEARCH expressions to dynamically find ALBs matching 

1008 the prefix pattern at dashboard render time. 

1009 """ 

1010 widgets: list[cloudwatch.IWidget] = [] 

1011 

1012 # Section header 

1013 widgets.append( 

1014 cloudwatch.TextWidget( 

1015 markdown="# Application Load Balancers\n" 

1016 "Request metrics and health status. " 

1017 "Uses CloudWatch SEARCH to dynamically find ALBs created by " 

1018 "AWS Load Balancer Controller.", 

1019 width=24, 

1020 height=1, 

1021 ) 

1022 ) 

1023 

1024 # Create one widget per region for ALB request count 

1025 for region in self.regions: 

1026 request_count_widget = cloudwatch.GraphWidget( 

1027 title=f"ALB - Request Count ({region})", 

1028 left=[ 

1029 cloudwatch.MathExpression( 

1030 expression=( 

1031 'SEARCH(\'Namespace="AWS/ApplicationELB" ' 

1032 'MetricName="RequestCount"\', "Sum", 300)' 

1033 ), 

1034 label="Request Count", 

1035 period=Duration.minutes(5), 

1036 ) 

1037 ], 

1038 width=12, 

1039 height=6, 

1040 region=region, 

1041 ) 

1042 widgets.append(request_count_widget) 

1043 

1044 # Create one widget per region for ALB response time 

1045 for region in self.regions: 

1046 response_time_widget = cloudwatch.GraphWidget( 

1047 title=f"ALB - Response Time ({region})", 

1048 left=[ 

1049 cloudwatch.MathExpression( 

1050 expression=( 

1051 'SEARCH(\'Namespace="AWS/ApplicationELB" ' 

1052 'MetricName="TargetResponseTime"\', "Average", 300)' 

1053 ), 

1054 label="Avg Response Time", 

1055 period=Duration.minutes(5), 

1056 ) 

1057 ], 

1058 width=12, 

1059 height=6, 

1060 region=region, 

1061 ) 

1062 widgets.append(response_time_widget) 

1063 

1064 # Create one widget per region for ALB HTTP errors 

1065 for region in self.regions: 

1066 http_errors_widget = cloudwatch.GraphWidget( 

1067 title=f"ALB - HTTP Errors ({region})", 

1068 left=[ 

1069 cloudwatch.MathExpression( 

1070 expression=( 

1071 'SEARCH(\'Namespace="AWS/ApplicationELB" ' 

1072 'MetricName="HTTPCode_Target_4XX_Count"\', "Sum", 300)' 

1073 ), 

1074 label="4XX Errors", 

1075 period=Duration.minutes(5), 

1076 ) 

1077 ], 

1078 right=[ 

1079 cloudwatch.MathExpression( 

1080 expression=( 

1081 'SEARCH(\'Namespace="AWS/ApplicationELB" ' 

1082 'MetricName="HTTPCode_Target_5XX_Count"\', "Sum", 300)' 

1083 ), 

1084 label="5XX Errors", 

1085 period=Duration.minutes(5), 

1086 ) 

1087 ], 

1088 width=12, 

1089 height=6, 

1090 region=region, 

1091 ) 

1092 widgets.append(http_errors_widget) 

1093 

1094 # Create one widget per region for ALB active connections 

1095 for region in self.regions: 

1096 connections_widget = cloudwatch.GraphWidget( 

1097 title=f"ALB - Active Connections ({region})", 

1098 left=[ 

1099 cloudwatch.MathExpression( 

1100 expression=( 

1101 'SEARCH(\'Namespace="AWS/ApplicationELB" ' 

1102 'MetricName="ActiveConnectionCount"\', "Sum", 300)' 

1103 ), 

1104 label="Active Connections", 

1105 period=Duration.minutes(5), 

1106 ) 

1107 ], 

1108 width=12, 

1109 height=6, 

1110 region=region, 

1111 ) 

1112 widgets.append(connections_widget) 

1113 

1114 return widgets 

1115 

def _create_application_widgets(self) -> list[cloudwatch.IWidget]:
    """Build the dashboard widgets for custom application metrics.

    Covers the health monitor and manifest processor custom namespaces,
    Container Insights pod restarts, and (when the API Gateway stack is
    attached) the secret rotation Lambda. Returns the widgets in render
    order; the caller adds them to the dashboard.
    """
    widgets: list[cloudwatch.IWidget] = []

    # Section header shown above all application widgets.
    widgets.append(
        cloudwatch.TextWidget(
            markdown="# Application Metrics\n"
            "Health monitor and manifest processor metrics. "
            "Application logs are available in Container Insights at "
            "`/aws/containerinsights/<cluster>/application`.",
            width=24,
            height=1,
        )
    )

    # (cluster_name, region) pairs for every regional deployment.
    clusters = [
        (stack.cluster.cluster_name, stack.deployment_region)
        for stack in self.regional_stacks
    ]

    def per_cluster_metrics(
        namespace: str,
        metric_name: str,
        statistic: str,
        label_suffix: str,
        color: Any = None,
    ) -> list[cloudwatch.Metric]:
        # One metric per cluster, read from the cluster's home region.
        return [
            cloudwatch.Metric(
                namespace=namespace,
                metric_name=metric_name,
                dimensions_map={"ClusterName": name, "Region": reg},
                statistic=statistic,
                period=Duration.minutes(5),
                label=f"{reg} {label_suffix}",
                color=color,
                region=reg,
            )
            for name, reg in clusters
        ]

    # Health monitor: CPU on the left axis, memory on the right.
    widgets.append(
        cloudwatch.GraphWidget(
            title="Health Monitor - Resource Utilization",
            left=per_cluster_metrics(
                "GCO/HealthMonitor", "ClusterCpuUtilization", "Average", "CPU"
            ),
            right=per_cluster_metrics(
                "GCO/HealthMonitor", "ClusterMemoryUtilization", "Average", "Memory"
            ),
            width=12,
            height=6,
        )
    )

    # Manifest processor: submissions vs failures (failures drawn in red).
    widgets.append(
        cloudwatch.GraphWidget(
            title="Manifest Processor - Submissions",
            left=per_cluster_metrics(
                "GCO/ManifestProcessor", "ManifestSubmissions", "Sum", "Submissions"
            ),
            right=per_cluster_metrics(
                "GCO/ManifestProcessor",
                "ManifestFailures",
                "Sum",
                "Failures",
                color="#d62728",
            ),
            width=12,
            height=6,
        )
    )

    # Pod restarts from Container Insights -- a proxy for app instability.
    widgets.append(
        cloudwatch.GraphWidget(
            title="Container Insights - Pod Restarts",
            left=[
                cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name="pod_number_of_container_restarts",
                    dimensions_map={"ClusterName": name},
                    statistic="Sum",
                    period=Duration.minutes(5),
                    label=f"{reg}",
                    region=reg,
                )
                for name, reg in clusters
            ],
            width=12,
            height=6,
        )
    )

    # Secrets Manager publishes no rotation metrics, so we watch the
    # rotation Lambda's own invocation/error counts instead.
    if not self.api_gateway_stack:
        # Fallback text widget when the API Gateway stack is not wired in.
        widgets.append(
            cloudwatch.TextWidget(
                markdown="**Secret Rotation:** API Gateway stack not configured. "
                "Rotation Lambda metrics unavailable.",
                width=12,
                height=6,
            )
        )
        return widgets

    fn_name = self.api_gateway_stack.rotation_lambda.function_name
    gw_region = self.config.get_api_gateway_region()

    def rotation_metric(metric_name: str, label: str, color: str) -> cloudwatch.Metric:
        # Hourly Lambda metric for the rotation function, in its region.
        return cloudwatch.Metric(
            namespace="AWS/Lambda",
            metric_name=metric_name,
            dimensions_map={"FunctionName": fn_name},
            statistic="Sum",
            period=Duration.hours(1),
            label=label,
            color=color,
            region=gw_region,
        )

    widgets.append(
        cloudwatch.GraphWidget(
            title="Secret Rotation Lambda - Invocations & Errors",
            left=[rotation_metric("Invocations", "Invocations", "#2ca02c")],
            right=[rotation_metric("Errors", "Errors", "#d62728")],
            width=12,
            height=6,
        )
    )

    return widgets

1282 

def _create_alarms(self) -> None:
    """Create every CloudWatch alarm family; one helper per service."""
    alarm_builders = (
        self._create_global_accelerator_alarms,
        self._create_api_gateway_alarms,
        self._create_lambda_alarms,
        self._create_sqs_alarms,
        self._create_dynamodb_alarms,
        self._create_eks_alarms,
        self._create_alb_alarms,
        self._create_application_alarms,
    )
    for build in alarm_builders:
        build()

1293 

def _create_global_accelerator_alarms(self) -> None:
    """Intentionally a no-op: Global Accelerator alarms cannot live here.

    GA publishes its metrics only in us-west-2, and a CloudWatch alarm
    must reside in the same region as the metric it evaluates. Since this
    monitoring stack may be deployed elsewhere, GA alarms are skipped.
    If GA alerting is required, either:
      1. Create alarms manually in us-west-2,
      2. Rely on the cross-region dashboard widgets (already in place), or
      3. Deploy a dedicated alarm stack in us-west-2.
    """
    # Dashboard widgets handle GA visibility via the `region` parameter.

1308 

def _create_api_gateway_alarms(self) -> None:
    """Create alarms on the global API Gateway (5XX volume, p99 latency)."""
    # Prefer the real API name; fall back to the conventional default
    # when no API Gateway stack is attached to this monitoring stack.
    if self.api_gateway_stack:
        api_name = self.api_gateway_stack.api.rest_api_name
    else:
        api_name = "gco-global-api"

    def api_metric(metric_name: str, statistic: str) -> cloudwatch.Metric:
        # 5-minute API Gateway metric scoped to our API.
        return cloudwatch.Metric(
            namespace="AWS/ApiGateway",
            metric_name=metric_name,
            dimensions_map={"ApiName": api_name},
            statistic=statistic,
            period=Duration.minutes(5),
        )

    # Sustained 5XX responses indicate a backend problem.
    server_errors = cloudwatch.Alarm(
        self,
        "ApiGateway5xxAlarm",
        alarm_description="API Gateway has high 5XX error rate",
        metric=api_metric("5XXError", "Sum"),
        threshold=10,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=2,
        datapoints_to_alarm=2,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
    )
    server_errors.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

    # p99 latency over 10 seconds for 2 of 3 periods.
    slow_requests = cloudwatch.Alarm(
        self,
        "ApiGatewayHighLatencyAlarm",
        alarm_description="API Gateway has high latency",
        metric=api_metric("Latency", "p99"),
        threshold=10000,  # milliseconds (10 seconds)
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=3,
        datapoints_to_alarm=2,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
    )
    slow_requests.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

1355 

def _create_lambda_alarms(self) -> None:
    """Create error/throttle alarms for the API Gateway Lambda functions.

    Covers the request proxy (errors and throttles) and the secret
    rotation function (errors). Skipped entirely when no API Gateway
    stack is attached, since the functions do not exist in that case.
    """
    if not self.api_gateway_stack:
        return

    proxy_fn = self.api_gateway_stack.proxy_lambda.function_name
    rotation_fn = self.api_gateway_stack.rotation_lambda.function_name

    def fn_metric(
        function_name: str, metric_name: str, period: Duration
    ) -> cloudwatch.Metric:
        # Sum of a Lambda metric for one function over the given period.
        return cloudwatch.Metric(
            namespace="AWS/Lambda",
            metric_name=metric_name,
            dimensions_map={"FunctionName": function_name},
            statistic="Sum",
            period=period,
        )

    # Proxy errors: a handful per 5 minutes means live requests failing.
    proxy_errors = cloudwatch.Alarm(
        self,
        "ProxyLambdaErrorsAlarm",
        alarm_description="API Gateway proxy Lambda has errors",
        metric=fn_metric(proxy_fn, "Errors", Duration.minutes(5)),
        threshold=5,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=2,
        datapoints_to_alarm=2,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
    )
    proxy_errors.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

    # Any throttling on the proxy drops traffic -- alert at >= 1.
    proxy_throttles = cloudwatch.Alarm(
        self,
        "ProxyLambdaThrottlesAlarm",
        alarm_description="API Gateway proxy Lambda is being throttled",
        metric=fn_metric(proxy_fn, "Throttles", Duration.minutes(5)),
        threshold=1,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=2,
        datapoints_to_alarm=2,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
    )
    proxy_throttles.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

    # Rotation runs infrequently: evaluate hourly, page on first error.
    rotation_errors = cloudwatch.Alarm(
        self,
        "RotationLambdaErrorsAlarm",
        alarm_description="Secret rotation Lambda has errors",
        metric=fn_metric(rotation_fn, "Errors", Duration.hours(1)),
        threshold=1,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=1,
        datapoints_to_alarm=1,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
    )
    rotation_errors.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

1422 

def _create_sqs_alarms(self) -> None:
    """Alarm on stuck jobs (old messages) and any DLQ arrivals, per region."""
    for stack in self.regional_stacks:
        region = stack.deployment_region
        suffix = region.replace("-", "").title()

        # Messages older than an hour suggest consumers are stuck or down.
        stuck_jobs = cloudwatch.Alarm(
            self,
            f"SqsOldMessageAlarm{suffix}",
            alarm_description=f"SQS queue in {region} has old messages (potential stuck jobs)",
            metric=cloudwatch.Metric(
                namespace="AWS/SQS",
                metric_name="ApproximateAgeOfOldestMessage",
                dimensions_map={"QueueName": stack.job_queue.queue_name},
                statistic="Maximum",
                period=Duration.minutes(5),
            ),
            threshold=3600,  # seconds (1 hour)
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=2,
            datapoints_to_alarm=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        stuck_jobs.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

        # Any message landing in the dead letter queue is worth alerting on.
        dlq_not_empty = cloudwatch.Alarm(
            self,
            f"SqsDlqAlarm{suffix}",
            alarm_description=f"SQS dead letter queue in {region} has messages",
            metric=cloudwatch.Metric(
                namespace="AWS/SQS",
                metric_name="ApproximateNumberOfMessagesVisible",
                dimensions_map={"QueueName": stack.job_dlq.queue_name},
                statistic="Sum",
                period=Duration.minutes(5),
            ),
            threshold=1,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
            evaluation_periods=1,
            datapoints_to_alarm=1,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        dlq_not_empty.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

1470 

def _create_dynamodb_alarms(self) -> None:
    """Create DynamoDB alarms (throttling, system errors) for the jobs table.

    Only the jobs table is alarmed here; no alarms are created for any
    other tables (the previous docstring mentioned templates/webhooks
    tables, but no such alarms exist). Metrics are read from the global
    region, where the DynamoDB tables are deployed.
    """
    jobs_table = self.global_stack.jobs_table.table_name

    # DynamoDB tables live in the global region, so point the metrics there.
    global_region = self.config.get_global_region()

    def jobs_metric(metric_name: str) -> cloudwatch.Metric:
        # 5-minute Sum of a jobs-table metric in the global region.
        return cloudwatch.Metric(
            namespace="AWS/DynamoDB",
            metric_name=metric_name,
            dimensions_map={"TableName": jobs_table},
            statistic="Sum",
            period=Duration.minutes(5),
            region=global_region,
        )

    # Throttling sustained across two consecutive periods is actionable.
    throttle_alarm = cloudwatch.Alarm(
        self,
        "DynamoDBJobsThrottleAlarm",
        alarm_description="DynamoDB jobs table is being throttled",
        metric=jobs_metric("ThrottledRequests"),
        threshold=1,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=2,
        datapoints_to_alarm=2,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
    )
    throttle_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

    # System errors are service-side faults; alert on the first occurrence.
    system_errors_alarm = cloudwatch.Alarm(
        self,
        "DynamoDBJobsErrorsAlarm",
        alarm_description="DynamoDB jobs table has system errors",
        metric=jobs_metric("SystemErrors"),
        threshold=1,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        evaluation_periods=1,
        datapoints_to_alarm=1,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
    )
    system_errors_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

1520 

def _create_eks_alarms(self) -> None:
    """Create node-level CPU/memory utilization alarms for every EKS cluster.

    NOTE(review): Container Insights metrics are emitted in each cluster's
    own region, and these alarms do not set a metric region override --
    confirm this stack is deployed where the metrics live, or mirror the
    approach documented for Global Accelerator alarms.
    """
    # (construct-id prefix, metric name, threshold %, resource label)
    alarm_specs = (
        ("EksHighCpuAlarm", "node_cpu_utilization", 80, "CPU"),
        ("EksHighMemoryAlarm", "node_memory_utilization", 85, "memory"),
    )

    for stack in self.regional_stacks:
        region = stack.deployment_region
        cluster = stack.cluster.cluster_name
        suffix = region.replace("-", "").title()

        for id_prefix, metric_name, threshold, resource in alarm_specs:
            alarm = cloudwatch.Alarm(
                self,
                f"{id_prefix}{suffix}",
                alarm_description=f"EKS cluster {cluster} has high {resource} utilization",
                metric=cloudwatch.Metric(
                    namespace="ContainerInsights",
                    metric_name=metric_name,
                    dimensions_map={"ClusterName": cluster},
                    statistic="Average",
                    period=Duration.minutes(5),
                ),
                threshold=threshold,
                comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                evaluation_periods=3,
                datapoints_to_alarm=2,
                treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
            )
            alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

1567 

def _create_alb_alarms(self) -> None:
    """Intentionally a no-op: ALB names are unknown at synth time.

    The AWS Load Balancer Controller creates ALBs from Kubernetes Ingress
    resources with hashed names (k8s-<namespace>-<ingress>-<hash>), so we
    cannot pin alarm dimensions during CDK synthesis -- and CloudWatch
    alarms, unlike dashboards, do not accept SEARCH expressions.

    Coverage instead comes from:
      1. Dashboard widgets with SEARCH expressions for ALB metrics.
      2. EKS Container Insights alarms for pod/node health.
      3. API Gateway alarms for request-level monitoring.

    If per-ALB alarms become necessary, options include discovering ALB
    names at deploy time via a custom resource, creating alarms with the
    AWS CLI/SDK after deployment, or namespace-level anomaly detection.
    """

1591 

def _create_application_alarms(self) -> None:
    """Alarm on elevated manifest-processing failures in each region."""
    for stack in self.regional_stacks:
        region = stack.deployment_region
        suffix = region.replace("-", "").title()

        # More than 10 failures per 5-minute window, twice in a row.
        failure_alarm = cloudwatch.Alarm(
            self,
            f"ManifestHighFailureRateAlarm{suffix}",
            alarm_description=f"Manifest processor in {region} has high failure rate",
            metric=cloudwatch.Metric(
                namespace="GCO/ManifestProcessor",
                metric_name="ManifestFailures",
                dimensions_map={
                    "ClusterName": stack.cluster.cluster_name,
                    "Region": region,
                },
                statistic="Sum",
                period=Duration.minutes(5),
            ),
            threshold=10,
            comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
            evaluation_periods=2,
            datapoints_to_alarm=2,
            treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
        )
        failure_alarm.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

1618 

def _create_composite_alarms(self) -> None:
    """Create composite alarms that only fire when multiple signals agree.

    Two families:
      * Per-region: node CPU AND memory both pegged above 90%.
      * Global: API Gateway 5XX AND proxy Lambda errors simultaneously.
    AND-ing related alarms keeps a single noisy metric from paging.
    """
    # Member alarms per region, in creation order (order matters for the
    # AND rule text).
    regional_alarms: dict[str, list[cloudwatch.Alarm]] = {}

    for stack in self.regional_stacks:
        region = stack.deployment_region
        cluster = stack.cluster.cluster_name
        suffix = region.replace("-", "").title()
        members: list[cloudwatch.Alarm] = []

        # Tighter (90%) thresholds than the standalone EKS alarms.
        for construct_id, metric_name in (
            (f"CompositeEksCpu{suffix}", "node_cpu_utilization"),
            (f"CompositeEksMemory{suffix}", "node_memory_utilization"),
        ):
            members.append(
                cloudwatch.Alarm(
                    self,
                    construct_id,
                    metric=cloudwatch.Metric(
                        namespace="ContainerInsights",
                        metric_name=metric_name,
                        dimensions_map={"ClusterName": cluster},
                        statistic="Average",
                        period=Duration.minutes(5),
                    ),
                    threshold=90,
                    comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
                    evaluation_periods=2,
                    treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
                )
            )
        regional_alarms[region] = members

    # AND the per-region members together into one critical alarm.
    for region, members in regional_alarms.items():
        if len(members) < 2:
            continue
        suffix = region.replace("-", "").title()
        regional_composite = cloudwatch.CompositeAlarm(
            self,
            f"RegionalCriticalAlarm{suffix}",
            alarm_description=f"Critical: Multiple issues detected in {region}",
            alarm_rule=cloudwatch.AlarmRule.all_of(*members),
        )
        regional_composite.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

    # API Gateway + Lambda composite requires the API Gateway stack.
    if not self.api_gateway_stack:
        return

    api_name = self.api_gateway_stack.api.rest_api_name
    proxy_fn = self.api_gateway_stack.proxy_lambda.function_name

    api_error_alarm = cloudwatch.Alarm(
        self,
        "CompositeApiErrors",
        metric=cloudwatch.Metric(
            namespace="AWS/ApiGateway",
            metric_name="5XXError",
            dimensions_map={"ApiName": api_name},
            statistic="Sum",
            period=Duration.minutes(5),
        ),
        threshold=5,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=2,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
    )

    lambda_error_alarm = cloudwatch.Alarm(
        self,
        "CompositeLambdaErrors",
        metric=cloudwatch.Metric(
            namespace="AWS/Lambda",
            metric_name="Errors",
            dimensions_map={"FunctionName": proxy_fn},
            statistic="Sum",
            period=Duration.minutes(5),
        ),
        threshold=3,
        comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
        evaluation_periods=2,
        treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING,
    )

    # Both layers erroring at once points at a genuine API outage.
    both_failing = cloudwatch.CompositeAlarm(
        self,
        "ApiLambdaCompositeAlarm",
        alarm_description="Critical: Both API Gateway and Lambda proxy have errors",
        alarm_rule=cloudwatch.AlarmRule.all_of(api_error_alarm, lambda_error_alarm),
    )
    both_failing.add_alarm_action(cw_actions.SnsAction(self.alert_topic))

1723 

def _create_custom_metrics(self) -> None:
    """Create per-region application log groups.

    Despite the method name, no CloudWatch metric filters are defined
    here -- only the health monitor and manifest processor log groups.
    The GCO/* custom metrics shown on the dashboard are published
    separately (presumably by the applications themselves -- confirm).

    The log group names are intentionally omitted so CDK generates
    unique, collision-free names; retention is one month and the groups
    are destroyed with the stack.
    """
    for stack in self.regional_stacks:
        suffix = stack.deployment_region.replace("-", "").title()
        for id_prefix in ("HealthMonitorLogGroup", "ManifestProcessorLogGroup"):
            logs.LogGroup(
                self,
                f"{id_prefix}{suffix}",
                retention=logs.RetentionDays.ONE_MONTH,
                removal_policy=RemovalPolicy.DESTROY,
            )

1747 

def _create_outputs(self) -> None:
    """Export the dashboard URL, alert topic ARN, and an alarms pointer."""
    dashboard_url = (
        "https://console.aws.amazon.com/cloudwatch/home"
        f"?region={self.region}#dashboards:name={self.dashboard.dashboard_name}"
    )

    for output_id, value, description in (
        ("DashboardUrl", dashboard_url, "CloudWatch Dashboard URL"),
        (
            "AlertTopicArn",
            self.alert_topic.topic_arn,
            "SNS Topic ARN for monitoring alerts",
        ),
        (
            "AlarmCount",
            "See CloudWatch Alarms console for full list",
            "Monitoring alarms created",
        ),
    ):
        CfnOutput(self, output_id, value=value, description=description)