Coverage for gco/models/health_models.py

1"""

2Health monitoring data models for GCO (Global Capacity Orchestrator on AWS).

4This module defines dataclasses for health monitoring including:

5- ResourceUtilization: Current CPU/memory/GPU utilization percentages

6- HealthStatus: Complete health status report with utilization and thresholds

8These models are used by the health monitor service to track and report

9cluster health status for load balancer health checks and monitoring.

10"""

12from __future__ import annotations

14from dataclasses import dataclass

15from datetime import datetime

16from typing import TYPE_CHECKING, Literal

18if TYPE_CHECKING:

19 from .cluster_models import ResourceThresholds

@dataclass
class RequestedResources:
    """
    Resources requested by pending pods (absolute values).

    Attributes:
        cpu_vcpus: Total vCPUs requested by pending pods
        memory_gb: Total GB memory requested by pending pods
        gpus: Total GPUs requested by pending pods (defaults to 0)

    Raises:
        ValueError: If cpu_vcpus or memory_gb is not a non-negative number
            (NaN is rejected), or if gpus is not a non-negative integer.
    """

    cpu_vcpus: float
    memory_gb: float
    gpus: int = 0

    def __post_init__(self) -> None:
        """Validate requested values"""
        # The float range checks are written as ``not value >= 0.0`` instead of
        # ``value < 0.0``: NaN compares False against everything, so the
        # ``<`` form would silently accept it as a "non-negative number".
        if not isinstance(self.cpu_vcpus, (int, float)) or not self.cpu_vcpus >= 0.0:
            raise ValueError(f"cpu_vcpus must be a non-negative number, got {self.cpu_vcpus}")
        if not isinstance(self.memory_gb, (int, float)) or not self.memory_gb >= 0.0:
            raise ValueError(f"memory_gb must be a non-negative number, got {self.memory_gb}")
        # GPUs are counted in whole devices, so only integers are accepted.
        if not isinstance(self.gpus, int) or self.gpus < 0:
            raise ValueError(f"gpus must be a non-negative integer, got {self.gpus}")

@dataclass
class ResourceUtilization:
    """
    Snapshot of how much of a cluster's resources are currently in use.

    Attributes:
        cpu: CPU utilization as a percentage (0.0-100.0)
        memory: Memory utilization as a percentage (0.0-100.0)
        gpu: GPU utilization as a percentage (0.0-100.0)

    Raises:
        ValueError: If any metric is not an int/float inside 0.0-100.0.
    """

    cpu: float
    memory: float
    gpu: float

    def __post_init__(self) -> None:
        """Reject any metric that is not a number within 0.0-100.0"""
        for field_name in ("cpu", "memory", "gpu"):
            value = getattr(self, field_name)
            # The type test comes first so the range comparison is never
            # attempted on a non-numeric value.
            if isinstance(value, (int, float)) and 0.0 <= value <= 100.0:
                continue
            raise ValueError(
                f"{field_name} must be a number between 0.0 and 100.0, got {value}"
            )

@dataclass
class HealthStatus:
    """
    Point-in-time health report for a single cluster.

    Bundles the reported status string with the measured utilization, the
    thresholds it is judged against, and the current job/pod backlog.

    Raises:
        ValueError: On construction, if cluster_id or region is empty, if
            active_jobs or pending_pods is negative, or if status is not
            'healthy' or 'unhealthy'.
    """

    cluster_id: str
    region: str
    timestamp: datetime
    status: Literal["healthy", "unhealthy"]
    resource_utilization: ResourceUtilization
    thresholds: ResourceThresholds  # Forward reference to avoid circular import
    active_jobs: int
    pending_pods: int = 0
    pending_requested: RequestedResources | None = None
    message: str | None = None

    def __post_init__(self) -> None:
        """Check identifiers, counters and the status string after init"""
        # Checks run in a fixed order; the first failure is the one reported.
        if not self.cluster_id:
            raise ValueError("Cluster ID cannot be empty")
        if not self.region:
            raise ValueError("Region cannot be empty")
        if self.active_jobs < 0:
            raise ValueError("Active jobs count cannot be negative")
        if self.pending_pods < 0:
            raise ValueError("Pending pods count cannot be negative")
        if self.status not in ("healthy", "unhealthy"):
            raise ValueError("Status must be 'healthy' or 'unhealthy'")

    def is_healthy(self) -> bool:
        """
        Report whether every measured value is within its threshold.

        The result is computed from utilization, pending pod count and (when
        present) pending requested resources; the ``status`` field is not
        consulted.

        Returns:
            True when nothing exceeds its threshold, False otherwise.
        """
        usage = self.resource_utilization
        limits = self.thresholds

        # Utilization percentages against their limits
        within_utilization = (
            usage.cpu <= limits.cpu_threshold
            and usage.memory <= limits.memory_threshold
            and usage.gpu <= limits.gpu_threshold
        )

        # Number of pods waiting to be scheduled
        within_pod_backlog = self.pending_pods <= limits.pending_pods_threshold

        # Resources requested by pending pods; passes when none were reported
        backlog = self.pending_requested
        within_requested = not backlog or (
            backlog.cpu_vcpus <= limits.pending_requested_cpu_vcpus
            and backlog.memory_gb <= limits.pending_requested_memory_gb
            and backlog.gpus <= limits.pending_requested_gpus
        )

        return within_utilization and within_pod_backlog and within_requested

    def get_threshold_violations(self) -> list[str]:
        """
        Describe every threshold that is currently exceeded.

        Returns:
            Human-readable messages in a fixed order: CPU, memory, GPU,
            pending pods, then pending requested CPU, memory and GPUs.
            Empty when nothing is exceeded.
        """
        usage = self.resource_utilization
        limits = self.thresholds
        exceeded = []

        if usage.cpu > limits.cpu_threshold:
            exceeded.append(f"CPU: {usage.cpu:.1f}% > {limits.cpu_threshold}%")

        if usage.memory > limits.memory_threshold:
            exceeded.append(f"Memory: {usage.memory:.1f}% > {limits.memory_threshold}%")

        if usage.gpu > limits.gpu_threshold:
            exceeded.append(f"GPU: {usage.gpu:.1f}% > {limits.gpu_threshold}%")

        if self.pending_pods > limits.pending_pods_threshold:
            exceeded.append(
                f"Pending Pods: {self.pending_pods} > {limits.pending_pods_threshold}"
            )

        backlog = self.pending_requested
        if backlog:
            if backlog.cpu_vcpus > limits.pending_requested_cpu_vcpus:
                exceeded.append(
                    f"Pending CPU: {backlog.cpu_vcpus:.1f} vCPUs > "
                    f"{limits.pending_requested_cpu_vcpus} vCPUs"
                )
            if backlog.memory_gb > limits.pending_requested_memory_gb:
                exceeded.append(
                    f"Pending Memory: {backlog.memory_gb:.1f} GB > "
                    f"{limits.pending_requested_memory_gb} GB"
                )
            if backlog.gpus > limits.pending_requested_gpus:
                exceeded.append(
                    f"Pending GPUs: {backlog.gpus} > {limits.pending_requested_gpus}"
                )

        return exceeded

Coverage for gco / models / health_models.py: 97%

73 statements