Coverage for gco / models / health_models.py: 97%

73 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 21:47 +0000

1""" 

2Health monitoring data models for GCO (Global Capacity Orchestrator on AWS). 

3 

4This module defines dataclasses for health monitoring including: 

5- ResourceUtilization: Current CPU/memory/GPU utilization percentages 

6- HealthStatus: Complete health status report with utilization and thresholds 

7 

8These models are used by the health monitor service to track and report 

9cluster health status for load balancer health checks and monitoring. 

10""" 

11 

12from __future__ import annotations 

13 

14from dataclasses import dataclass 

15from datetime import datetime 

16from typing import TYPE_CHECKING, Literal 

17 

18if TYPE_CHECKING: 

19 from .cluster_models import ResourceThresholds 

20 

21 

@dataclass
class RequestedResources:
    """
    Resources requested by pending pods (absolute values).

    Attributes:
        cpu_vcpus: Total vCPUs requested by pending pods
        memory_gb: Total GB memory requested by pending pods
        gpus: Total GPUs requested by pending pods (defaults to 0)

    Raises:
        ValueError: If cpu_vcpus or memory_gb is not a non-negative number
            (NaN is rejected as well), or if gpus is not a non-negative integer.
    """

    cpu_vcpus: float
    memory_gb: float
    gpus: int = 0

    def __post_init__(self) -> None:
        """Validate requested values"""
        # Checked in declaration order so the first offending field is reported.
        for field_name, value in (("cpu_vcpus", self.cpu_vcpus), ("memory_gb", self.memory_gb)):
            # "not value >= 0.0" instead of "value < 0.0": every ordering
            # comparison against NaN is False, so the former rejects NaN while
            # the latter would silently let it through.
            if not isinstance(value, (int, float)) or not value >= 0.0:
                raise ValueError(f"{field_name} must be a non-negative number, got {value}")

        if not isinstance(self.gpus, int) or self.gpus < 0:
            raise ValueError(f"gpus must be a non-negative integer, got {self.gpus}")

45 

46 

@dataclass
class ResourceUtilization:
    """
    Snapshot of a cluster's resource usage, expressed as percentages.

    Attributes:
        cpu: CPU utilization percentage (0.0-100.0)
        memory: Memory utilization percentage (0.0-100.0)
        gpu: GPU utilization percentage (0.0-100.0)

    Raises:
        ValueError: If any metric is not an int/float or lies outside 0.0-100.0.
    """

    cpu: float
    memory: float
    gpu: float

    def __post_init__(self) -> None:
        """Reject any metric that is non-numeric or outside the 0.0-100.0 range"""
        readings = {"cpu": self.cpu, "memory": self.memory, "gpu": self.gpu}
        for label, reading in readings.items():
            # Only a numeric reading inside the closed percentage range is valid.
            if isinstance(reading, (int, float)) and 0.0 <= reading <= 100.0:
                continue
            raise ValueError(f"{label} must be a number between 0.0 and 100.0, got {reading}")

69 

70 

@dataclass
class HealthStatus:
    """
    Health status report for a cluster.

    Attributes:
        cluster_id: Identifier of the cluster (must be non-empty)
        region: Region of the cluster (must be non-empty)
        timestamp: Time associated with this report
        status: Reported status, either "healthy" or "unhealthy"
        resource_utilization: Current utilization percentages
        thresholds: Limits the utilization and pending values are compared against
        active_jobs: Number of active jobs (must not be negative)
        pending_pods: Number of pending pods (must not be negative)
        pending_requested: Resources requested by pending pods, if known
        message: Optional free-form message
    """

    cluster_id: str
    region: str
    timestamp: datetime
    status: Literal["healthy", "unhealthy"]
    resource_utilization: ResourceUtilization
    thresholds: ResourceThresholds  # Forward reference to avoid circular import
    active_jobs: int
    pending_pods: int = 0
    pending_requested: RequestedResources | None = None
    message: str | None = None

    def __post_init__(self) -> None:
        """Validate identifiers, counters and the status label, in that order"""
        required_text = (
            (self.cluster_id, "Cluster ID cannot be empty"),
            (self.region, "Region cannot be empty"),
        )
        for text, complaint in required_text:
            if not text:
                raise ValueError(complaint)

        counters = (
            (self.active_jobs, "Active jobs count cannot be negative"),
            (self.pending_pods, "Pending pods count cannot be negative"),
        )
        for count, complaint in counters:
            if count < 0:
                raise ValueError(complaint)

        if self.status not in ("healthy", "unhealthy"):
            raise ValueError("Status must be 'healthy' or 'unhealthy'")

    def is_healthy(self) -> bool:
        """
        Report whether every measured value is within its threshold.

        The result is derived from utilization, pending pods and (when present)
        pending requested resources; the ``status`` field is not consulted.
        """
        usage = self.resource_utilization
        limits = self.thresholds

        within_utilization = (
            usage.cpu <= limits.cpu_threshold
            and usage.memory <= limits.memory_threshold
            and usage.gpu <= limits.gpu_threshold
        )
        within_pending_pods = self.pending_pods <= limits.pending_pods_threshold

        requested = self.pending_requested
        if requested:
            within_requested = (
                requested.cpu_vcpus <= limits.pending_requested_cpu_vcpus
                and requested.memory_gb <= limits.pending_requested_memory_gb
                and requested.gpus <= limits.pending_requested_gpus
            )
        else:
            # Nothing requested means there is nothing to exceed.
            within_requested = True

        return within_utilization and within_pending_pods and within_requested

    def get_threshold_violations(self) -> list[str]:
        """
        Describe each value that exceeds its threshold.

        Returns:
            Human-readable messages in a fixed order: CPU, memory, GPU,
            pending pods, then pending requested CPU, memory and GPUs.
            The list is empty when nothing is exceeded.
        """
        usage = self.resource_utilization
        limits = self.thresholds
        found: list[str] = []

        if usage.cpu > limits.cpu_threshold:
            found.append(f"CPU: {usage.cpu:.1f}% > {limits.cpu_threshold}%")

        if usage.memory > limits.memory_threshold:
            found.append(f"Memory: {usage.memory:.1f}% > {limits.memory_threshold}%")

        if usage.gpu > limits.gpu_threshold:
            found.append(f"GPU: {usage.gpu:.1f}% > {limits.gpu_threshold}%")

        if self.pending_pods > limits.pending_pods_threshold:
            found.append(f"Pending Pods: {self.pending_pods} > {limits.pending_pods_threshold}")

        requested = self.pending_requested
        if not requested:
            return found

        if requested.cpu_vcpus > limits.pending_requested_cpu_vcpus:
            found.append(
                f"Pending CPU: {requested.cpu_vcpus:.1f} vCPUs > {limits.pending_requested_cpu_vcpus} vCPUs"
            )
        if requested.memory_gb > limits.pending_requested_memory_gb:
            found.append(
                f"Pending Memory: {requested.memory_gb:.1f} GB > {limits.pending_requested_memory_gb} GB"
            )
        if requested.gpus > limits.pending_requested_gpus:
            found.append(f"Pending GPUs: {requested.gpus} > {limits.pending_requested_gpus}")

        return found