Coverage for gco / models / health_models.py: 97%
73 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
1"""
2Health monitoring data models for GCO (Global Capacity Orchestrator on AWS).
4This module defines dataclasses for health monitoring including:
5- ResourceUtilization: Current CPU/memory/GPU utilization percentages
6- HealthStatus: Complete health status report with utilization and thresholds
8These models are used by the health monitor service to track and report
9cluster health status for load balancer health checks and monitoring.
10"""
12from __future__ import annotations
14from dataclasses import dataclass
15from datetime import datetime
16from typing import TYPE_CHECKING, Literal
18if TYPE_CHECKING:
19 from .cluster_models import ResourceThresholds
@dataclass
class RequestedResources:
    """
    Resources requested by pending pods (absolute values).

    Attributes:
        cpu_vcpus: Total vCPUs requested by pending pods
        memory_gb: Total GB memory requested by pending pods
        gpus: Total GPUs requested by pending pods (defaults to 0)

    Raises:
        ValueError: If cpu_vcpus or memory_gb is not an int/float, is
            negative, or is NaN; or if gpus is not a non-negative int.
    """

    cpu_vcpus: float
    memory_gb: float
    gpus: int = 0

    def __post_init__(self) -> None:
        """Validate requested values"""
        for name in ("cpu_vcpus", "memory_gb"):
            amount = getattr(self, name)
            # `not amount >= 0.0` instead of `amount < 0.0`: every comparison
            # with NaN is False, so the latter would silently accept NaN and
            # make later threshold comparisons meaningless.
            if not isinstance(amount, (int, float)) or not amount >= 0.0:
                raise ValueError(f"{name} must be a non-negative number, got {amount}")

        if not isinstance(self.gpus, int) or self.gpus < 0:
            raise ValueError(f"gpus must be a non-negative integer, got {self.gpus}")
@dataclass
class ResourceUtilization:
    """
    Snapshot of how heavily a cluster's resources are being used.

    Attributes:
        cpu: CPU utilization percentage (0.0-100.0)
        memory: Memory utilization percentage (0.0-100.0)
        gpu: GPU utilization percentage (0.0-100.0)

    Raises:
        ValueError: If any field is not an int/float or lies outside
            the inclusive range 0.0-100.0.
    """

    cpu: float
    memory: float
    gpu: float

    def __post_init__(self) -> None:
        """Ensure every utilization figure is a number within 0.0-100.0."""
        for metric in ("cpu", "memory", "gpu"):
            reading = getattr(self, metric)
            # Type check comes first so the range comparison is never
            # attempted on a non-numeric value.
            if isinstance(reading, (int, float)) and 0.0 <= reading <= 100.0:
                continue
            raise ValueError(f"{metric} must be a number between 0.0 and 100.0, got {reading}")
@dataclass
class HealthStatus:
    """
    Health status report for a cluster.

    Attributes:
        cluster_id: Cluster identifier (must be non-empty)
        region: Region of the cluster (must be non-empty)
        timestamp: Time associated with this report
        status: Either "healthy" or "unhealthy"
        resource_utilization: Current CPU/memory/GPU utilization
        thresholds: Limits the utilization and pending figures are compared to
        active_jobs: Number of active jobs (must not be negative)
        pending_pods: Number of pending pods (must not be negative)
        pending_requested: Resources requested by pending pods, if known
        message: Optional free-form message
    """

    cluster_id: str
    region: str
    timestamp: datetime
    status: Literal["healthy", "unhealthy"]
    resource_utilization: ResourceUtilization
    thresholds: ResourceThresholds  # Forward reference to avoid circular import
    active_jobs: int
    pending_pods: int = 0
    pending_requested: RequestedResources | None = None
    message: str | None = None

    def __post_init__(self) -> None:
        """Reject empty identifiers, negative counters and unknown status values."""
        if not self.cluster_id:
            raise ValueError("Cluster ID cannot be empty")
        if not self.region:
            raise ValueError("Region cannot be empty")
        if self.active_jobs < 0:
            raise ValueError("Active jobs count cannot be negative")
        if self.pending_pods < 0:
            raise ValueError("Pending pods count cannot be negative")
        if self.status not in ("healthy", "unhealthy"):
            raise ValueError("Status must be 'healthy' or 'unhealthy'")

    def is_healthy(self) -> bool:
        """Return True when every measured value is at or below its threshold."""
        usage = self.resource_utilization
        limits = self.thresholds

        # Utilization percentages against their per-resource limits.
        within_utilization = (
            usage.cpu <= limits.cpu_threshold
            and usage.memory <= limits.memory_threshold
            and usage.gpu <= limits.gpu_threshold
        )

        # Count of pending pods against its limit.
        within_pod_backlog = self.pending_pods <= limits.pending_pods_threshold

        # Requested-resource totals are only compared when they were supplied.
        backlog = self.pending_requested
        if backlog:
            within_requested = (
                backlog.cpu_vcpus <= limits.pending_requested_cpu_vcpus
                and backlog.memory_gb <= limits.pending_requested_memory_gb
                and backlog.gpus <= limits.pending_requested_gpus
            )
        else:
            within_requested = True

        return within_utilization and within_pod_backlog and within_requested

    def get_threshold_violations(self) -> list[str]:
        """Return one human-readable message per exceeded threshold (empty if none)."""
        usage = self.resource_utilization
        limits = self.thresholds
        found: list[str] = []

        if usage.cpu > limits.cpu_threshold:
            found.append(f"CPU: {usage.cpu:.1f}% > {limits.cpu_threshold}%")
        if usage.memory > limits.memory_threshold:
            found.append(f"Memory: {usage.memory:.1f}% > {limits.memory_threshold}%")
        if usage.gpu > limits.gpu_threshold:
            found.append(f"GPU: {usage.gpu:.1f}% > {limits.gpu_threshold}%")

        if self.pending_pods > limits.pending_pods_threshold:
            found.append(f"Pending Pods: {self.pending_pods} > {limits.pending_pods_threshold}")

        backlog = self.pending_requested
        if backlog:
            if backlog.cpu_vcpus > limits.pending_requested_cpu_vcpus:
                found.append(
                    f"Pending CPU: {backlog.cpu_vcpus:.1f} vCPUs > {limits.pending_requested_cpu_vcpus} vCPUs"
                )
            if backlog.memory_gb > limits.pending_requested_memory_gb:
                found.append(
                    f"Pending Memory: {backlog.memory_gb:.1f} GB > {limits.pending_requested_memory_gb} GB"
                )
            if backlog.gpus > limits.pending_requested_gpus:
                found.append(f"Pending GPUs: {backlog.gpus} > {limits.pending_requested_gpus}")

        return found