Monitoring and Observability Guide

This guide covers comprehensive monitoring, logging, and observability setup for the GenAI IDP Accelerator.

Monitoring Architecture

Three Pillars of Observability

Metrics: Quantitative measurements of system behavior
Logs: Detailed records of events and transactions
Traces: Request flow through distributed components

Monitoring Stack

graph TB
    %% Telemetry sources: the application emits to CloudWatch and X-Ray
    App[Application] --> CW[CloudWatch]
    App --> XRay[X-Ray]
    %% CloudWatch feeds alarms and dashboards
    CW --> Alarms[CloudWatch Alarms]
    CW --> Dashboards[CloudWatch Dashboards]
    %% Alarm notifications fan out through SNS
    Alarms --> SNS[SNS Topics]
    SNS --> Email[Email Notifications]
    SNS --> Slack[Slack Integration]
    %% X-Ray outputs
    XRay --> ServiceMap[Service Map]
    XRay --> Traces[Distributed Traces]

CloudWatch Configuration

Log Groups

# Lambda function log groups
# Creates one log group per key of var.lambda_functions; the key becomes the
# suffix of the log group name.
resource "aws_cloudwatch_log_group" "lambda_logs" {
  for_each = var.lambda_functions

  name = "/aws/lambda/${var.environment}-idp-${each.key}"

  # Encryption key and retention window are supplied from outside this block.
  kms_key_id        = aws_kms_key.cloudwatch_logs_key.arn
  retention_in_days = var.log_retention_days

  tags = local.common_tags
}

# API Gateway log group
# Single log group for the IDP API, named per environment.
resource "aws_cloudwatch_log_group" "api_gateway_logs" {
  name = "/aws/apigateway/${var.environment}-idp-api"

  # Encryption key and retention window are supplied from outside this block.
  kms_key_id        = aws_kms_key.cloudwatch_logs_key.arn
  retention_in_days = var.log_retention_days

  tags = local.common_tags
}

# Step Functions log group
# Single log group for the IDP workflow, named per environment.
resource "aws_cloudwatch_log_group" "step_functions_logs" {
  name = "/aws/stepfunctions/${var.environment}-idp-workflow"

  # Encryption key and retention window are supplied from outside this block.
  kms_key_id        = aws_kms_key.cloudwatch_logs_key.arn
  retention_in_days = var.log_retention_days

  tags = local.common_tags
}

Custom Metrics

# Custom metric filters
# Publishes the "duration" field of matching document_processor log lines as
# the IDP/Performance DocumentProcessingDuration metric.
resource "aws_cloudwatch_log_metric_filter" "document_processing_duration" {
  name           = "${var.environment}-document-processing-duration"
  log_group_name = aws_cloudwatch_log_group.lambda_logs["document_processor"].name

  # Space-delimited pattern; the fourth field is captured as $duration.
  pattern = "[timestamp, request_id, \"PROCESSING_DURATION\", duration]"

  metric_transformation {
    namespace = "IDP/Performance"
    name      = "DocumentProcessingDuration"
    unit      = "Seconds"
    value     = "$duration"
  }
}

# Counts matching document_processor log lines: each match adds 1 to the
# IDP/Errors DocumentProcessingErrors metric.
resource "aws_cloudwatch_log_metric_filter" "document_processing_errors" {
  name           = "${var.environment}-document-processing-errors"
  log_group_name = aws_cloudwatch_log_group.lambda_logs["document_processor"].name

  # Third space-delimited field must be ERROR; trailing fields are ignored.
  pattern = "[timestamp, request_id, \"ERROR\", ...]"

  metric_transformation {
    namespace = "IDP/Errors"
    name      = "DocumentProcessingErrors"
    value     = "1"
  }
}

# Counts matching ai_processor log lines: each match adds 1 to the
# IDP/Usage BedrockAPICalls metric. The model and tokens fields are part of
# the pattern but are not published.
resource "aws_cloudwatch_log_metric_filter" "bedrock_api_calls" {
  name           = "${var.environment}-bedrock-api-calls"
  log_group_name = aws_cloudwatch_log_group.lambda_logs["ai_processor"].name

  pattern = "[timestamp, request_id, \"BEDROCK_API_CALL\", model, tokens]"

  metric_transformation {
    namespace = "IDP/Usage"
    name      = "BedrockAPICalls"
    value     = "1"
  }
}

CloudWatch Alarms

Performance Alarms

# Lambda function duration alarm
# One alarm per key of var.lambda_functions. Each alarm compares the average
# Duration over 300-second periods against that function's timeout_threshold.
resource "aws_cloudwatch_metric_alarm" "lambda_duration" {
  for_each = var.lambda_functions

  alarm_name        = "${var.environment}-idp-${each.key}-duration"
  alarm_description = "Lambda function ${each.key} duration is too high"

  # Metric being watched
  namespace   = "AWS/Lambda"
  metric_name = "Duration"
  statistic   = "Average"
  period      = "300"

  dimensions = {
    FunctionName = aws_lambda_function.functions[each.key].function_name
  }

  # Breach condition
  comparison_operator = "GreaterThanThreshold"
  threshold           = each.value.timeout_threshold
  evaluation_periods  = "2"

  # Notify the general alerts topic on both alarm and recovery.
  alarm_actions = [aws_sns_topic.alerts.arn]
  ok_actions    = [aws_sns_topic.alerts.arn]

  tags = local.common_tags
}

# Lambda function error rate alarm
# One alarm per key of var.lambda_functions. Despite the "rate" wording, the
# condition is a count: the Sum of Errors per 300-second period must exceed 5.
resource "aws_cloudwatch_metric_alarm" "lambda_error_rate" {
  for_each = var.lambda_functions

  alarm_name        = "${var.environment}-idp-${each.key}-error-rate"
  alarm_description = "Lambda function ${each.key} error rate is too high"

  # Metric being watched
  namespace   = "AWS/Lambda"
  metric_name = "Errors"
  statistic   = "Sum"
  period      = "300"

  dimensions = {
    FunctionName = aws_lambda_function.functions[each.key].function_name
  }

  # Breach condition
  comparison_operator = "GreaterThanThreshold"
  threshold           = "5"
  evaluation_periods  = "2"

  alarm_actions = [aws_sns_topic.alerts.arn]

  tags = local.common_tags
}

# API Gateway 4xx errors
# Alarms when the Sum of 4XXError per 300-second period exceeds 10.
resource "aws_cloudwatch_metric_alarm" "api_gateway_4xx_errors" {
  alarm_name        = "${var.environment}-idp-api-4xx-errors"
  alarm_description = "API Gateway 4xx error rate is too high"

  # Metric being watched
  namespace   = "AWS/ApiGateway"
  metric_name = "4XXError"
  statistic   = "Sum"
  period      = "300"

  dimensions = {
    ApiName = aws_api_gateway_rest_api.idp_api.name
  }

  # Breach condition
  comparison_operator = "GreaterThanThreshold"
  threshold           = "10"
  evaluation_periods  = "2"

  alarm_actions = [aws_sns_topic.alerts.arn]

  tags = local.common_tags
}

# API Gateway 5xx errors
# Pages the critical topic as soon as a single 5xx response is recorded in a
# 300-second period.
resource "aws_cloudwatch_metric_alarm" "api_gateway_5xx_errors" {
  alarm_name        = "${var.environment}-idp-api-5xx-errors"
  alarm_description = "API Gateway 5xx error detected"

  # Metric being watched
  namespace   = "AWS/ApiGateway"
  metric_name = "5XXError"
  statistic   = "Sum"
  period      = 300

  dimensions = {
    ApiName = aws_api_gateway_rest_api.idp_api.name
  }

  # ">= 1" rather than "> 1": the previous strict comparison against 1 only
  # fired on the second error, contradicting the alarm description.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1

  # Periods without any datapoint are treated as healthy so the alarm can
  # return to OK instead of sitting in INSUFFICIENT_DATA.
  treat_missing_data = "notBreaching"

  alarm_actions = [aws_sns_topic.critical_alerts.arn]

  tags = local.common_tags
}

Resource Utilization Alarms

# DynamoDB throttling alarm
# Alarms when the table records any throttled read or write event in two
# consecutive 300-second periods.
#
# ThrottledRequests is published per (TableName, Operation), so an alarm that
# specifies only TableName never receives data. ReadThrottleEvents and
# WriteThrottleEvents are available at table level and are summed here.
resource "aws_cloudwatch_metric_alarm" "dynamodb_throttling" {
  alarm_name        = "${var.environment}-idp-dynamodb-throttling"
  alarm_description = "DynamoDB requests are being throttled"

  comparison_operator = "GreaterThanThreshold"
  threshold           = 0
  evaluation_periods  = 2

  # Throttle metrics are only emitted while throttling happens; absence of
  # data therefore means "healthy".
  treat_missing_data = "notBreaching"

  metric_query {
    id          = "throttles"
    label       = "Throttled read + write events"
    expression  = "FILL(reads, 0) + FILL(writes, 0)"
    return_data = true
  }

  metric_query {
    id = "reads"

    metric {
      namespace   = "AWS/DynamoDB"
      metric_name = "ReadThrottleEvents"
      stat        = "Sum"
      period      = 300

      dimensions = {
        TableName = aws_dynamodb_table.document_metadata.name
      }
    }
  }

  metric_query {
    id = "writes"

    metric {
      namespace   = "AWS/DynamoDB"
      metric_name = "WriteThrottleEvents"
      stat        = "Sum"
      period      = 300

      dimensions = {
        TableName = aws_dynamodb_table.document_metadata.name
      }
    }
  }

  alarm_actions = [aws_sns_topic.alerts.arn]

  tags = local.common_tags
}

# S3 bucket size alarm
# Compares the average StandardStorage BucketSizeBytes over one 86400-second
# period against var.s3_size_threshold_bytes.
resource "aws_cloudwatch_metric_alarm" "s3_bucket_size" {
  alarm_name        = "${var.environment}-idp-s3-bucket-size"
  alarm_description = "S3 bucket size is approaching limits"

  # Metric being watched
  namespace   = "AWS/S3"
  metric_name = "BucketSizeBytes"
  statistic   = "Average"
  period      = "86400" # Daily

  dimensions = {
    BucketName  = aws_s3_bucket.documents.bucket
    StorageType = "StandardStorage"
  }

  # Breach condition
  comparison_operator = "GreaterThanThreshold"
  threshold           = var.s3_size_threshold_bytes
  evaluation_periods  = "1"

  alarm_actions = [aws_sns_topic.alerts.arn]

  tags = local.common_tags
}

CloudWatch Dashboards

Main Dashboard

# Main operational dashboard: Lambda, API Gateway, DynamoDB and custom
# application metrics on a 24-column grid, plus a table of recent errors.
resource "aws_cloudwatch_dashboard" "idp_main_dashboard" {
  dashboard_name = "${var.environment}-idp-main-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      # Row 1, left half: document processor Lambda
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["AWS/Lambda", "Invocations", "FunctionName", aws_lambda_function.document_processor.function_name],
            [".", "Duration", ".", "."],
            [".", "Errors", ".", "."],
            [".", "Throttles", ".", "."]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.region
          title   = "Document Processor Lambda Metrics"
          period  = 300
        }
      },
      # Row 1, right half: API Gateway
      {
        type   = "metric"
        x      = 12
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["AWS/ApiGateway", "Count", "ApiName", aws_api_gateway_rest_api.idp_api.name],
            [".", "Latency", ".", "."],
            [".", "4XXError", ".", "."],
            [".", "5XXError", ".", "."]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.region
          title   = "API Gateway Metrics"
          period  = 300
        }
      },
      # Row 2, first third: DynamoDB
      {
        type   = "metric"
        x      = 0
        y      = 6
        width  = 8
        height = 6

        properties = {
          # ThrottledRequests is published per (TableName, Operation) and
          # would render empty with only TableName; the table-level
          # Read/WriteThrottleEvents metrics are charted instead.
          metrics = [
            ["AWS/DynamoDB", "ConsumedReadCapacityUnits", "TableName", aws_dynamodb_table.document_metadata.name],
            [".", "ConsumedWriteCapacityUnits", ".", "."],
            [".", "ReadThrottleEvents", ".", "."],
            [".", "WriteThrottleEvents", ".", "."]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.region
          title   = "DynamoDB Metrics"
          period  = 300
        }
      },
      # Row 2, second third: metrics produced by the log metric filters
      {
        type   = "metric"
        x      = 8
        y      = 6
        width  = 8
        height = 6

        properties = {
          metrics = [
            ["IDP/Performance", "DocumentProcessingDuration"],
            ["IDP/Errors", "DocumentProcessingErrors"],
            ["IDP/Usage", "BedrockAPICalls"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.region
          title   = "Custom Application Metrics"
          period  = 300
        }
      },
      # Row 2, last third: latest ERROR lines from the document processor
      {
        type   = "log"
        x      = 16
        y      = 6
        width  = 8
        height = 6

        properties = {
          # The log group name is taken from the log group resource. The
          # previous hard-coded "...-idp-document-processor" (hyphen) did not
          # match the group created from the "document_processor" key.
          query  = "SOURCE '${aws_cloudwatch_log_group.lambda_logs["document_processor"].name}' | fields @timestamp, @message | filter @message like /ERROR/ | sort @timestamp desc | limit 20"
          region = var.region
          title  = "Recent Errors"
          view   = "table"
        }
      }
    ]
  })
}

Performance Dashboard

# Performance dashboard: processing-duration percentiles across the full
# width, then Lambda concurrency and S3 storage side by side.
resource "aws_cloudwatch_dashboard" "idp_performance_dashboard" {
  dashboard_name = "${var.environment}-idp-performance-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      # Row 1: DocumentProcessingDuration as Average, p95 and p99
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 24
        height = 6

        properties = {
          title   = "Document Processing Duration (Average, P95, P99)"
          region  = var.region
          view    = "timeSeries"
          stacked = false
          period  = 300
          yAxis   = { left = { min = 0 } }
          metrics = [
            ["IDP/Performance", "DocumentProcessingDuration", { stat = "Average" }],
            [".", ".", { stat = "p95" }],
            [".", ".", { stat = "p99" }]
          ]
        }
      },
      # Row 2, left: concurrent executions of both processor functions
      {
        type   = "metric"
        x      = 0
        y      = 6
        width  = 12
        height = 6

        properties = {
          title   = "Lambda Concurrent Executions"
          region  = var.region
          view    = "timeSeries"
          stacked = false
          period  = 300
          metrics = [
            ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", aws_lambda_function.document_processor.function_name],
            [".", ".", "FunctionName", aws_lambda_function.ai_processor.function_name]
          ]
        }
      },
      # Row 2, right: object count and standard-storage size of the bucket
      {
        type   = "metric"
        x      = 12
        y      = 6
        width  = 12
        height = 6

        properties = {
          title   = "S3 Storage Metrics"
          region  = var.region
          view    = "timeSeries"
          stacked = false
          period  = 86400
          metrics = [
            ["AWS/S3", "NumberOfObjects", "BucketName", aws_s3_bucket.documents.bucket, "StorageType", "AllStorageTypes"],
            [".", "BucketSizeBytes", ".", ".", ".", "StandardStorage"]
          ]
        }
      }
    ]
  })
}

X-Ray Tracing

Configuration

# X-Ray sampling rule
# Applies to services whose name matches "<environment>-idp-*"; every other
# match criterion is a wildcard.
resource "aws_xray_sampling_rule" "idp_sampling_rule" {
  rule_name = "${var.environment}-idp-sampling-rule"
  version   = 1
  priority  = 9000

  # Sampling volume: a reservoir of 1, then the configurable fixed rate.
  reservoir_size = 1
  fixed_rate     = var.xray_sampling_rate

  # Match criteria
  service_name = "${var.environment}-idp-*"
  service_type = "*"
  host         = "*"
  http_method  = "*"
  url_path     = "*"
  resource_arn = "*"

  tags = local.common_tags
}

# Lambda tracing configuration
# Excerpt: only the tracing block is shown; the rest of the function
# definition is elided.
resource "aws_lambda_function" "document_processor" {
  # ... other configuration ...

  tracing_config {
    # "Active" when var.enable_xray_tracing is true, otherwise "PassThrough".
    mode = var.enable_xray_tracing ? "Active" : "PassThrough"
  }
}

# API Gateway tracing
# Excerpt: the stage is named after the environment; remaining stage settings
# are elided.
resource "aws_api_gateway_stage" "idp_api_stage" {
  deployment_id = aws_api_gateway_deployment.idp_api_deployment.id
  rest_api_id   = aws_api_gateway_rest_api.idp_api.id
  stage_name    = var.environment

  # Driven by the same toggle as the Lambda tracing mode.
  xray_tracing_enabled = var.enable_xray_tracing

  # ... other configuration ...
}

Custom Tracing

# Lambda function with custom X-Ray tracing
import json
import boto3
from aws_xray_sdk.core import xray_recorder
from aws_xray_sdk.core import patch_all

# Patch AWS SDK calls
patch_all()

@xray_recorder.capture('document_processor')
def lambda_handler(event, context):
    """Extract text from a document with Textract and analyze it with Bedrock.

    Each stage runs in its own X-Ray subsegment so it shows up separately in
    the trace.

    NOTE(review): the event is assumed to carry 'bucket_name' and
    'document_key'; adapt validate_input to the real event schema.

    Args:
        event: Invocation payload (see note above).
        context: Lambda context object (unused).

    Returns:
        dict with 'statusCode' 200 and a JSON 'body' on success.

    Raises:
        ValueError: if a required event field is missing.
        Exception: any error from Textract or Bedrock is re-raised unchanged.
    """

    @xray_recorder.capture('validate_input')
    def validate_input(event_data):
        # Input validation logic
        required_fields = ('bucket_name', 'document_key')
        missing = [field for field in required_fields if not event_data.get(field)]
        if missing:
            raise ValueError(f"Missing required event field(s): {', '.join(missing)}")
        return {field: event_data[field] for field in required_fields}

    @xray_recorder.capture('process_with_textract')
    def process_with_textract(bucket_name, document_key):
        textract = boto3.client('textract')

        # Add custom metadata to trace
        xray_recorder.current_subsegment().put_metadata('document_key', document_key)

        response = textract.detect_document_text(
            Document={'S3Object': {'Bucket': bucket_name, 'Name': document_key}}
        )

        # Add result metadata
        xray_recorder.current_subsegment().put_metadata('text_blocks_count', len(response['Blocks']))

        return response

    @xray_recorder.capture('process_with_bedrock')
    def process_with_bedrock(extracted_text):
        bedrock = boto3.client('bedrock-runtime')

        # Add custom annotation for filtering
        xray_recorder.current_subsegment().put_annotation('model_used', 'claude-3-sonnet')

        response = bedrock.invoke_model(
            modelId='anthropic.claude-3-sonnet-20240229-v1:0',
            body=json.dumps({
                'anthropic_version': 'bedrock-2023-05-31',
                'max_tokens': 1000,
                'messages': [{'role': 'user', 'content': f'Analyze this text: {extracted_text}'}]
            })
        )

        return response

    try:
        # Process the document
        validated_input = validate_input(event)
        textract_result = process_with_textract(
            validated_input['bucket_name'], validated_input['document_key']
        )

        # The Textract response has no 'text' key; the recognized text lives
        # in the 'Text' field of the LINE blocks.
        extracted_text = '\n'.join(
            block.get('Text', '')
            for block in textract_result['Blocks']
            if block.get('BlockType') == 'LINE'
        )
        bedrock_result = process_with_bedrock(extracted_text)

        # Add success annotation
        xray_recorder.current_subsegment().put_annotation('processing_status', 'success')

        return {
            'statusCode': 200,
            'body': json.dumps({'result': 'success'})
        }

    except Exception:
        # Add error annotation
        xray_recorder.current_subsegment().put_annotation('processing_status', 'error')

        # No manual add_exception() here: the capture decorator records the
        # exception (with its stack) on the subsegment as it propagates, and
        # add_exception() requires a stack argument the old call did not pass.
        raise

Application Insights

CloudWatch Application Insights

# Application Insights application over the IDP resource group, with
# automatic configuration and creation enabled.
#
# NOTE(review): `log_pattern` does not appear among the documented arguments
# of aws_applicationinsights_application in the Terraform AWS provider —
# confirm against the provider docs before applying; these blocks may need to
# be removed or managed outside this resource.
resource "aws_applicationinsights_application" "idp_application" {
  resource_group_name = aws_resourcegroups_group.idp_resources.name
  auto_config_enabled = true
  auto_create         = true

  # Pattern intended to flag log lines whose level field is ERROR.
  log_pattern {
    pattern_name = "LambdaErrors"
    pattern      = "[timestamp, request_id, level=\"ERROR\", ...]"
    rank         = 1
  }

  # Pattern intended to flag access-log lines whose status code starts with 5.
  log_pattern {
    pattern_name = "APIGatewayErrors"
    pattern      = "[timestamp, request_id, ip, user, timestamp2, method, resource, protocol, status_code=5*, ...]"
    rank         = 1
  }

  tags = local.common_tags
}

# Resource group for application insights
# Tag-based group: resources of any supported type tagged with this
# environment and the genai-idp-accelerator project.
resource "aws_resourcegroups_group" "idp_resources" {
  name = "${var.environment}-idp-resources"

  resource_query {
    query = jsonencode({
      ResourceTypeFilters = ["AWS::AllSupported"]
      TagFilters = [
        { Key = "Environment", Values = [var.environment] },
        { Key = "Project", Values = ["genai-idp-accelerator"] }
      ]
    })
  }

  tags = local.common_tags
}

Notification Setup

# General alerts topic
# Target of the non-critical alarm actions in this guide.
resource "aws_sns_topic" "alerts" {
  name = "${var.environment}-idp-alerts"
  tags = local.common_tags
}

# Critical alerts topic
# Target of the 5xx and health-check alarm actions in this guide.
resource "aws_sns_topic" "critical_alerts" {
  name = "${var.environment}-idp-critical-alerts"
  tags = local.common_tags
}

# Email subscriptions
# One subscription per address in var.alert_email_addresses.
resource "aws_sns_topic_subscription" "email_alerts" {
  count = length(var.alert_email_addresses)

  protocol  = "email"
  topic_arn = aws_sns_topic.alerts.arn
  endpoint  = var.alert_email_addresses[count.index]
}

# One subscription per address in var.critical_alert_email_addresses.
resource "aws_sns_topic_subscription" "critical_email_alerts" {
  count = length(var.critical_alert_email_addresses)

  protocol  = "email"
  topic_arn = aws_sns_topic.critical_alerts.arn
  endpoint  = var.critical_alert_email_addresses[count.index]
}

Slack Integration

# Lambda function for Slack notifications
# All Slack resources are created only when var.slack_webhook_url is non-empty.
resource "aws_lambda_function" "slack_notifier" {
  count = var.slack_webhook_url != "" ? 1 : 0

  filename      = "slack_notifier.zip"
  function_name = "${var.environment}-idp-slack-notifier"
  role          = aws_iam_role.slack_notifier_role[0].arn
  handler       = "index.handler"
  runtime       = "python3.9"
  timeout       = 30

  environment {
    variables = {
      SLACK_WEBHOOK_URL = var.slack_webhook_url
      ENVIRONMENT       = var.environment
    }
  }

  tags = local.common_tags
}

# Allow the alerts topic to invoke the notifier. Without this resource-based
# permission the SNS subscription below is created but deliveries are denied.
resource "aws_lambda_permission" "slack_notifier_allow_sns" {
  count = var.slack_webhook_url != "" ? 1 : 0

  statement_id  = "AllowExecutionFromSNS"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.slack_notifier[0].function_name
  principal     = "sns.amazonaws.com"
  source_arn    = aws_sns_topic.alerts.arn
}

# SNS subscription for Slack notifications
resource "aws_sns_topic_subscription" "slack_alerts" {
  count     = var.slack_webhook_url != "" ? 1 : 0
  topic_arn = aws_sns_topic.alerts.arn
  protocol  = "lambda"
  endpoint  = aws_lambda_function.slack_notifier[0].arn

  # Make sure the invoke permission exists before messages can be delivered.
  depends_on = [aws_lambda_permission.slack_notifier_allow_sns]
}

Log Analysis

CloudWatch Insights Queries

Common Queries:

# Find all errors in the last hour
# (set the time range to 1h in the console or API call)
fields @timestamp, @message
| filter @message like /ERROR/
| sort @timestamp desc
| limit 100

# Analyze processing duration
fields @timestamp, @message
| filter @message like /PROCESSING_DURATION/
| parse @message "PROCESSING_DURATION: * seconds" as duration
| stats avg(duration), max(duration), min(duration) by bin(5m)

# Track API Gateway response times
# Percentiles use pct(field, N); there is no p95() function.
fields @timestamp, @message
| filter @message like /\d+ms/
| parse @message /(?<responseTime>\d+)ms/
| stats avg(responseTime), max(responseTime), pct(responseTime, 95) by bin(5m)

# Monitor Bedrock API usage
fields @timestamp, @message
| filter @message like /BEDROCK_API_CALL/
| parse @message "BEDROCK_API_CALL: model=* tokens=*" as model, tokens
| stats sum(tokens) by model, bin(1h)

Automated Log Analysis

# Saved CloudWatch Logs Insights query definition
# Stores the query below against the two processor log groups and the API
# Gateway log group. Nothing in this resource schedules the query; it is run
# on demand.
# NOTE(review): the final sort references @timestamp after a stats
# aggregation — confirm it orders the hourly buckets as intended.
resource "aws_cloudwatch_query_definition" "error_analysis" {
  name = "${var.environment}-idp-error-analysis"

  log_group_names = [
    aws_cloudwatch_log_group.lambda_logs["document_processor"].name,
    aws_cloudwatch_log_group.lambda_logs["ai_processor"].name,
    aws_cloudwatch_log_group.api_gateway_logs.name
  ]

  query_string = <<EOF
fields @timestamp, @message, @logStream
| filter @message like /ERROR/
| stats count() by @logStream, bin(1h)
| sort @timestamp desc
EOF
}

Cost Monitoring

Cost and Usage Tracking

# Cost anomaly detection
# The Terraform AWS provider resource is aws_ce_anomaly_monitor (there is no
# aws_ce_anomaly_detector). A DIMENSIONAL monitor takes monitor_dimension
# rather than a JSON specification and evaluates spend per AWS service, which
# covers Amazon Bedrock, AWS Lambda, Amazon S3 and Amazon DynamoDB.
# NOTE(review): AWS is understood to allow only one SERVICE-dimension monitor
# per account — import the existing one if this account already has it.
resource "aws_ce_anomaly_monitor" "idp_cost_anomaly" {
  name              = "${var.environment}-idp-cost-anomaly"
  monitor_type      = "DIMENSIONAL"
  monitor_dimension = "SERVICE"

  tags = local.common_tags
}

# Daily e-mail digest of anomalies whose total impact is at least 100
# (in the account's billing currency).
resource "aws_ce_anomaly_subscription" "idp_cost_anomaly_subscription" {
  name      = "${var.environment}-idp-cost-anomaly-subscription"
  frequency = "DAILY"

  monitor_arn_list = [
    aws_ce_anomaly_monitor.idp_cost_anomaly.arn
  ]

  subscriber {
    type    = "EMAIL"
    address = var.cost_alert_email
  }

  # A single condition is expressed directly as a dimension; wrapping one
  # operand in an "and" block is not a valid expression.
  threshold_expression {
    dimension {
      key           = "ANOMALY_TOTAL_IMPACT_ABSOLUTE"
      match_options = ["GREATER_THAN_OR_EQUAL"]
      values        = ["100"]
    }
  }

  tags = local.common_tags
}

Health Checks

Application Health Monitoring

# Route 53 health check for API endpoint
# Probes https://<api_domain_name>/health every 30 seconds; three consecutive
# failures mark the endpoint unhealthy.
#
# cloudwatch_alarm_name, cloudwatch_alarm_region and
# insufficient_data_health_status are deliberately not set: they apply only to
# health checks of type CLOUDWATCH_METRIC, and pointing them at the alarm
# below (which itself references this health check) created a dependency
# cycle that Terraform rejects.
resource "aws_route53_health_check" "idp_api_health" {
  count = var.enable_health_checks ? 1 : 0

  type              = "HTTPS"
  fqdn              = var.api_domain_name
  port              = 443
  resource_path     = "/health"
  failure_threshold = 3
  request_interval  = 30

  tags = local.common_tags
}

# Health check alarm
# HealthCheckStatus is 1 while healthy and 0 while unhealthy, so a minimum
# below 1 in two consecutive 60-second periods raises the alarm.
# NOTE(review): Route 53 publishes health check metrics in us-east-1; if
# var.region differs, this alarm (and the SNS topic it notifies) need a
# us-east-1 provider — confirm for your deployment.
resource "aws_cloudwatch_metric_alarm" "api_health_alarm" {
  count = var.enable_health_checks ? 1 : 0

  alarm_name        = "${var.environment}-idp-api-health"
  alarm_description = "API health check failed"

  namespace   = "AWS/Route53"
  metric_name = "HealthCheckStatus"
  statistic   = "Minimum"
  period      = 60

  dimensions = {
    HealthCheckId = aws_route53_health_check.idp_api_health[0].id
  }

  comparison_operator = "LessThanThreshold"
  threshold           = 1
  evaluation_periods  = 2

  alarm_actions = [aws_sns_topic.critical_alerts.arn]

  tags = local.common_tags
}

Next: Cost Optimization | Best Practices