Coverage for cli / commands / capacity_cmd.py: 96%
310 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
1"""Capacity checking commands."""
3import sys
4from typing import Any
6import click
8from ..capacity import get_capacity_checker
9from ..config import GCOConfig
10from ..output import format_capacity_table, get_output_formatter
12pass_config = click.make_pass_decorator(GCOConfig, ensure=True)
@click.group()
@pass_config
def capacity(config: Any) -> None:
    """Check EC2 capacity availability."""
    # Pure command group: the docstring alone is a complete function body.
    # Subcommands register themselves through ``@capacity.command(...)``.
@capacity.command("check")
@click.option("--instance-type", "-i", required=True, help="EC2 instance type")
@click.option("--region", "-r", required=True, help="AWS region")
@click.option(
    "--type",
    "-t",
    "capacity_type",
    type=click.Choice(["spot", "on-demand", "both"]),
    default="both",
    help="Capacity type to check",
)
@pass_config
def check_capacity(config: Any, instance_type: Any, region: Any, capacity_type: Any) -> None:
    """Check capacity availability for an instance type.

    Provides estimates based on spot price history and availability patterns.
    """
    out = get_output_formatter(config)
    probe = get_capacity_checker(config)

    try:
        results = probe.estimate_capacity(instance_type, region, capacity_type)

        # Only the "table" format goes through the dedicated capacity table
        # renderer; every other format is delegated to the output formatter.
        if config.output_format != "table":
            out.print(results)
        else:
            print(format_capacity_table(results))

    except Exception as exc:
        # CLI boundary: report the failure and exit with a non-zero status.
        out.print_error(f"Failed to check capacity: {exc}")
        sys.exit(1)
@capacity.command("recommend")
@click.option("--instance-type", "-i", required=True, help="EC2 instance type")
@click.option("--region", "-r", required=True, help="AWS region")
@click.option(
    "--fault-tolerance",
    "-f",
    type=click.Choice(["high", "medium", "low"]),
    default="medium",
    help="Fault tolerance level",
)
@pass_config
def recommend_capacity(config: Any, instance_type: Any, region: Any, fault_tolerance: Any) -> None:
    """Get capacity type recommendation for a workload."""
    out = get_output_formatter(config)
    probe = get_capacity_checker(config)

    try:
        # The checker returns a (capacity type, explanation) pair.
        choice, rationale = probe.recommend_capacity_type(
            instance_type, region, fault_tolerance
        )

        headline = choice.upper()
        out.print_info(f"Recommended: {headline}")
        out.print_info(f"Reason: {rationale}")

    except Exception as exc:
        # CLI boundary: report the failure and exit with a non-zero status.
        out.print_error(f"Failed to get recommendation: {exc}")
        sys.exit(1)
@capacity.command("spot-prices")
@click.option("--instance-type", "-i", required=True, help="EC2 instance type")
@click.option("--region", "-r", required=True, help="AWS region")
@click.option("--days", "-d", default=7, help="Days of history")
@pass_config
def spot_prices(config: Any, instance_type: Any, region: Any, days: Any) -> None:
    """Get spot price history for an instance type."""
    out = get_output_formatter(config)
    probe = get_capacity_checker(config)

    # Columns handed to the formatter for the price listing, in display order.
    price_columns = [
        "availability_zone",
        "current_price",
        "avg_price_7d",
        "min_price_7d",
        "max_price_7d",
        "price_stability",
    ]

    try:
        history = probe.get_spot_price_history(instance_type, region, days)

        if history:
            out.print(history, columns=price_columns)
        else:
            # An empty result is reported as a warning, not as a failure.
            out.print_warning(f"No spot price data for {instance_type} in {region}")

    except Exception as exc:
        # CLI boundary: report the failure and exit with a non-zero status.
        out.print_error(f"Failed to get spot prices: {exc}")
        sys.exit(1)
@capacity.command("instance-info")
@click.argument("instance_type")
@pass_config
def instance_info(config: Any, instance_type: Any) -> None:
    """Get information about an instance type."""
    out = get_output_formatter(config)
    probe = get_capacity_checker(config)

    try:
        details = probe.get_instance_info(instance_type)
        if not details:
            # A falsy lookup result is treated as "unknown instance type".
            out.print_error(f"Instance type {instance_type} not found")
            sys.exit(1)
        else:
            out.print(details)
    except Exception as exc:
        # SystemExit raised above is not an Exception subclass, so it passes
        # through this handler untouched.
        out.print_error(f"Failed to get instance info: {exc}")
        sys.exit(1)
@capacity.command("status")
@click.option("--region", "-r", help="Specific region to check")
@click.option("--all-regions", "-a", is_flag=True, default=True, help="Check all regions (default)")
@pass_config
def capacity_status(config: Any, region: Any, all_regions: Any) -> None:
    """Show comprehensive resource utilization across regions.

    Displays pending/running workloads, GPU/CPU utilization, queue depth,
    and active job counts for one or all GCO clusters.

    Examples:
        gco capacity status
        gco capacity status --region us-east-1
        gco capacity status --all-regions
    """
    from ..capacity import get_multi_region_capacity_checker

    out = get_output_formatter(config)

    def by_score(entry: Any) -> Any:
        # Ordering key shared by the table sort and the "best region" pick.
        return entry.recommendation_score

    try:
        multi = get_multi_region_capacity_checker(config)

        # ``all_regions`` is not consulted: the all-regions view is what runs
        # whenever no specific --region is given.
        if region:
            snapshot = multi.get_region_capacity(region)
            out.print(snapshot)
            return

        entries = multi.get_all_regions_capacity()
        if not entries:
            out.print_warning("No GCO stacks found")
            return

        # Table header, then one row per region in ascending score order.
        print("\n REGION QUEUE RUNNING GPU% CPU% SCORE")
        print(" " + "-" * 55)
        for entry in sorted(entries, key=by_score):
            print(
                f" {entry.region:<15} {entry.queue_depth:>5} {entry.running_jobs:>7} "
                f"{entry.gpu_utilization:>4.0f}% {entry.cpu_utilization:>4.0f}% {entry.recommendation_score:>5.0f}"
            )

        # Recommendation footer: the lowest score wins.
        print()
        winner = min(entries, key=by_score)
        out.print_info(f"Recommended region: {winner.region} (lowest score = best)")

    except Exception as exc:
        # CLI boundary: report the failure and exit with a non-zero status.
        out.print_error(f"Failed to get capacity status: {exc}")
        sys.exit(1)
@capacity.command("recommend-region")
@click.option("--gpu", is_flag=True, help="Job requires GPUs")
@click.option("--min-gpus", default=0, help="Minimum GPUs required")
@click.option(
    "--instance-type", "-i", default=None, help="Specific instance type for workload-aware scoring"
)
@click.option("--gpu-count", default=0, help="Number of GPUs required")
@pass_config
def recommend_region(
    config: Any, gpu: Any, min_gpus: Any, instance_type: Any, gpu_count: Any
) -> None:
    """Recommend optimal region for job placement.

    Analyzes capacity across all deployed EKS regions and recommends
    the best region. When --instance-type is provided, uses weighted
    multi-signal scoring that factors in spot placement scores, pricing,
    queue depth, GPU utilization, and running job counts.

    Without --instance-type, uses a simpler composite score based on
    queue depth, GPU utilization, and running jobs.

    Examples:
        gco capacity recommend-region
        gco capacity recommend-region --gpu
        gco capacity recommend-region -i g5.xlarge
        gco capacity recommend-region -i p4d.24xlarge --gpu-count 8
    """
    from ..capacity import get_multi_region_capacity_checker

    out = get_output_formatter(config)

    try:
        multi = get_multi_region_capacity_checker(config)

        # CLI options map one-to-one onto the checker's keyword arguments.
        job_spec = {
            "gpu_required": gpu,
            "min_gpus": min_gpus,
            "instance_type": instance_type,
            "gpu_count": gpu_count,
        }
        verdict = multi.recommend_region_for_job(**job_spec)

        out.print_success(f"Recommended region: {verdict['region']}")
        out.print_info(f"Reason: {verdict['reason']}")

        # The per-region ranking is only shown in verbose mode.
        if not config.verbose:
            return

        print("\nAll regions ranked:")
        for ranked in verdict.get("all_regions", []):
            print(
                f" {ranked['region']}: score={ranked['score']:.4f}, "
                f"queue={ranked['queue_depth']}, gpu={ranked['gpu_utilization']:.0f}%"
            )

    except Exception as exc:
        # CLI boundary: report the failure and exit with a non-zero status.
        out.print_error(f"Failed to get recommendation: {exc}")
        sys.exit(1)
def _reasoning_sentences(reasoning: str) -> list[str]:
    """Split *reasoning* on ". " into one display line per sentence.

    Each returned sentence is stripped and ends with terminal punctuation.
    A period is appended only when the fragment does not already end in
    ".", "!" or "?", so the final sentence (which keeps its own period after
    the split) is not printed with a doubled "..".
    """
    sentences = []
    for fragment in reasoning.split(". "):
        sentence = fragment.strip()
        if not sentence:
            continue
        if not sentence.endswith((".", "!", "?")):
            sentence += "."
        sentences.append(sentence)
    return sentences


@capacity.command("ai-recommend")
@click.option("--workload", "-w", help="Description of your workload")
@click.option(
    "--instance-type",
    "-i",
    multiple=True,
    help="Instance types to consider (can specify multiple)",
)
@click.option("--region", "-r", multiple=True, help="Regions to consider (can specify multiple)")
@click.option("--gpu", is_flag=True, help="Workload requires GPUs")
@click.option("--min-gpus", default=0, help="Minimum GPUs required")
@click.option("--min-memory-gb", default=0, help="Minimum memory in GB")
@click.option(
    "--fault-tolerance",
    "-f",
    type=click.Choice(["high", "medium", "low"]),
    default="medium",
    help="Fault tolerance level",
)
@click.option("--max-cost", type=float, help="Maximum cost per hour in USD")
@click.option(
    "--model",
    "-m",
    default="us.anthropic.claude-sonnet-4-5-20250929-v1:0",
    help="Bedrock model ID to use",
)
@click.option("--raw", is_flag=True, help="Show raw AI response")
@pass_config
def ai_recommend(
    config: Any,
    workload: Any,
    instance_type: Any,
    region: Any,
    gpu: Any,
    min_gpus: Any,
    min_memory_gb: Any,
    fault_tolerance: Any,
    max_cost: Any,
    model: Any,
    raw: Any,
) -> None:
    """Get AI-powered capacity recommendation using Amazon Bedrock.

    This command gathers comprehensive capacity data including:
    - Spot placement scores and pricing across regions
    - On-demand availability and pricing
    - Current cluster utilization (queue depth, GPU/CPU usage)
    - Running and pending job counts

    The data is analyzed by an LLM to provide intelligent recommendations
    for where to place your workload.

    ⚠️ DISCLAIMER: Recommendations are AI-generated and should be validated
    before making production decisions. Capacity availability and pricing
    can change rapidly.

    REQUIREMENTS:
    - AWS credentials with bedrock:InvokeModel permission
    - The specified Bedrock model must be enabled in your account
    - Default model: Claude Sonnet 4.5 (us.anthropic.claude-sonnet-4-5-20250929-v1:0)

    Examples:
        gco capacity ai-recommend --workload "Training a large language model"

        gco capacity ai-recommend -w "Inference workload" --gpu --min-gpus 4

        gco capacity ai-recommend -i g5.xlarge -i g5.2xlarge -r us-east-1 -r us-west-2

        gco capacity ai-recommend --fault-tolerance high --max-cost 5.00
    """
    from ..capacity import get_bedrock_capacity_advisor

    formatter = get_output_formatter(config)

    # Print disclaimer
    print()
    print(" " + "=" * 70)
    print(" ⚠️ AI-POWERED RECOMMENDATION DISCLAIMER")
    print(" " + "-" * 70)
    print(" This recommendation is generated by an AI model and should be")
    print(" validated before making production decisions.")
    print(" ")
    print(" • Capacity availability can change rapidly")
    print(" • Spot instances may be interrupted at any time")
    print(" • Pricing data may not reflect real-time prices")
    print(" • AI recommendations are not guaranteed to be optimal")
    print(" " + "=" * 70)
    print()

    try:
        formatter.print_info("Gathering capacity data across regions...")

        advisor = get_bedrock_capacity_advisor(config, model_id=model)

        # Build the requirements dict. Zero-valued numeric options mean
        # "not specified" and are dropped together with any other None value.
        requirements = {
            "gpu_required": gpu,
            "min_gpus": min_gpus if min_gpus > 0 else None,
            "min_memory_gb": min_memory_gb if min_memory_gb > 0 else None,
            "fault_tolerance": fault_tolerance,
            "max_cost_per_hour": max_cost,
        }
        requirements = {k: v for k, v in requirements.items() if v is not None}

        formatter.print_info(f"Analyzing with {model}...")

        # click delivers ``multiple=True`` options as tuples; an empty tuple
        # is passed on as None (no restriction).
        recommendation = advisor.get_recommendation(
            workload_description=workload,
            instance_types=list(instance_type) if instance_type else None,
            regions=list(region) if region else None,
            requirements=requirements if requirements else None,
        )

        # Display recommendation
        print()
        print(" " + "=" * 70)
        print(" 🤖 AI RECOMMENDATION")
        print(" " + "=" * 70)
        print()
        print(f" Region: {recommendation.recommended_region}")
        print(f" Instance Type: {recommendation.recommended_instance_type}")
        print(f" Capacity Type: {recommendation.recommended_capacity_type.upper()}")
        print(f" Confidence: {recommendation.confidence.upper()}")
        if recommendation.cost_estimate:
            print(f" Est. Cost: {recommendation.cost_estimate}")
        print()
        print(" REASONING:")
        print(" " + "-" * 68)
        # One sentence per line; the helper avoids doubling the final period.
        for sentence in _reasoning_sentences(recommendation.reasoning):
            print(f" {sentence}")
        print()

        # Show alternatives (at most the first three)
        if recommendation.alternative_options:
            print(" ALTERNATIVE OPTIONS:")
            print(" " + "-" * 68)
            for i, alt in enumerate(recommendation.alternative_options[:3], 1):
                # A key that is present but null must not crash on .upper()
                # and abort the rest of the report.
                capacity_label = str(alt.get("capacity_type") or "N/A").upper()
                print(
                    f" {i}. {alt.get('region', 'N/A')} / "
                    f"{alt.get('instance_type', 'N/A')} / "
                    f"{capacity_label}"
                )
                if alt.get("reason"):
                    print(f" {alt['reason']}")
            print()

        # Show warnings
        if recommendation.warnings:
            print(" ⚠️ WARNINGS:")
            print(" " + "-" * 68)
            for warning in recommendation.warnings:
                print(f" • {warning}")
            print()

        # Show raw response if requested
        if raw:
            print(" RAW AI RESPONSE:")
            print(" " + "-" * 68)
            print(recommendation.raw_response)
            print()

        print(" " + "=" * 70)
        print()

    except Exception as e:
        # CLI boundary: report the failure and exit with a non-zero status.
        formatter.print_error(f"Failed to get AI recommendation: {e}")
        sys.exit(1)
@capacity.command("reservations")
@click.option("--instance-type", "-i", help="Filter by instance type")
@click.option("--region", "-r", help="Specific region (default: all deployed regions)")
@pass_config
def list_reservations(config: Any, instance_type: Any, region: Any) -> None:
    """List On-Demand Capacity Reservations (ODCRs) across regions.

    Shows all active capacity reservations with utilization details.

    Examples:
        gco capacity reservations
        gco capacity reservations -i p5.48xlarge
        gco capacity reservations -r us-east-1
    """
    out = get_output_formatter(config)
    probe = get_capacity_checker(config)

    try:
        if not region:
            summary = probe.list_all_reservations(instance_type=instance_type)
        else:
            # Single-region lookup: assemble a summary with the same keys the
            # rest of this command reads from the all-regions result.
            found = probe.list_capacity_reservations(region, instance_type=instance_type)
            summary = {
                "regions_checked": [region],
                "total_reservations": len(found),
                "total_reserved_instances": sum(item["total_instances"] for item in found),
                "total_available_instances": sum(item["available_instances"] for item in found),
                "reservations": found,
            }

        # Non-table formats get the raw summary via the formatter.
        if config.output_format != "table":
            out.print(summary)
            return

        rows = summary["reservations"]
        if not rows:
            out.print_info("No active capacity reservations found")
            return

        rule = " " + "-" * 90
        print(f"\n Capacity Reservations ({len(rows)} found)")
        print(rule)
        print(
            f" {'INSTANCE TYPE':<18} {'REGION':<15} {'AZ':<18} "
            f"{'TOTAL':>5} {'AVAIL':>5} {'USED%':>6} {'MATCH CRITERIA'}"
        )
        print(rule)
        for row in rows:
            print(
                f" {row['instance_type']:<18} {row['region']:<15} "
                f"{row['availability_zone']:<18} {row['total_instances']:>5} "
                f"{row['available_instances']:>5} {row['utilization_pct']:>5.1f}% "
                f"{row.get('instance_match_criteria', 'open')}"
            )

        print()
        print(
            f" Total: {summary['total_reserved_instances']} reserved, "
            f"{summary['total_available_instances']} available"
        )
        print()

    except Exception as exc:
        # CLI boundary: report the failure and exit with a non-zero status.
        out.print_error(f"Failed to list reservations: {exc}")
        sys.exit(1)
@capacity.command("reservation-check")
@click.option("--instance-type", "-i", required=True, help="Instance type to check")
@click.option("--region", "-r", help="Specific region (default: all deployed regions)")
@click.option("--count", "-c", default=1, help="Minimum instances needed")
@click.option(
    "--include-blocks/--no-blocks",
    default=True,
    help="Include Capacity Block offerings (default: yes)",
)
@click.option(
    "--block-duration",
    default=24,
    type=int,
    help="Capacity Block duration in hours (default: 24)",
)
@pass_config
def reservation_check(
    config: Any,
    instance_type: Any,
    region: Any,
    count: Any,
    include_blocks: Any,
    block_duration: Any,
) -> None:
    """Check reservation availability and Capacity Block offerings.

    Checks both existing ODCRs and purchasable Capacity Blocks for ML
    workloads. Capacity Blocks provide guaranteed GPU capacity for a
    fixed duration at a known price.

    Examples:
        gco capacity reservation-check -i p5.48xlarge
        gco capacity reservation-check -i p4d.24xlarge -c 2 --block-duration 48
        gco capacity reservation-check -i g5.48xlarge -r us-east-1 --no-blocks
    """
    out = get_output_formatter(config)
    probe = get_capacity_checker(config)

    try:
        plural = "s" if count > 1 else ""
        out.print_info(
            f"Checking reservations for {instance_type} (min {count} instance{plural})..."
        )

        report = probe.check_reservation_availability(
            instance_type=instance_type,
            region=region,
            min_count=count,
            include_capacity_blocks=include_blocks,
            block_duration_hours=block_duration,
        )

        # Non-table formats get the raw result via the formatter.
        if config.output_format != "table":
            out.print(report)
            return

        section_rule = " " + "-" * 60

        # ODCR section
        odcr_summary = report["odcr"]
        print(f"\n On-Demand Capacity Reservations for {instance_type}")
        print(section_rule)
        if not odcr_summary["reservations"]:
            print(" No active ODCRs found for this instance type")
        else:
            for entry in odcr_summary["reservations"]:
                print(
                    f" ✓ {entry['availability_zone']}: "
                    f"{entry['available_instances']}/{entry['total_instances']} available "
                    f"({entry['reservation_id']})"
                )
            print(
                f"\n Total: {odcr_summary['total_available_instances']} available "
                f"of {odcr_summary['total_reserved_instances']} reserved"
            )

        # Capacity Blocks section (skipped entirely with --no-blocks)
        if include_blocks:
            block_summary = report["capacity_blocks"]
            print(f"\n Capacity Block Offerings ({block_duration}h)")
            print(section_rule)
            if not block_summary["offerings"]:
                print(" No Capacity Block offerings available")
            else:
                for offer in block_summary["offerings"]:
                    # start_date is cut to its first 16 characters for display.
                    print(
                        f" ✓ {offer['availability_zone']}: "
                        f"{offer['instance_count']}x {offer['duration_hours']}h "
                        f"starting {offer['start_date'][:16]} — ${offer['upfront_fee']}"
                    )

        # Recommendation
        print()
        print(f" 💡 {report['recommendation']}")
        print()

    except Exception as exc:
        # CLI boundary: report the failure and exit with a non-zero status.
        out.print_error(f"Failed to check reservations: {exc}")
        sys.exit(1)
@capacity.command("reserve")
@click.option(
    "--offering-id",
    "-o",
    required=True,
    help="Capacity Block offering ID (cb-xxx) from reservation-check",
)
@click.option("--region", "-r", required=True, help="AWS region where the offering exists")
@click.option(
    "--dry-run",
    is_flag=True,
    help="Validate the offering without purchasing (no cost incurred)",
)
@pass_config
def reserve_capacity(config: Any, offering_id: Any, region: Any, dry_run: Any) -> None:
    """Purchase a Capacity Block offering by its ID.

    Use 'gco capacity reservation-check' first to find available offerings
    and their IDs, then purchase with this command.

    ⚠️ WARNING: This command purchases capacity and incurs charges.
    Use --dry-run to validate first.

    Examples:
        # First, find offerings:
        gco capacity reservation-check -i p4d.24xlarge -r us-east-1

        # Validate without purchasing:
        gco capacity reserve -o cb-0123456789abcdef0 -r us-east-1 --dry-run

        # Purchase:
        gco capacity reserve -o cb-0123456789abcdef0 -r us-east-1
    """
    out = get_output_formatter(config)
    probe = get_capacity_checker(config)

    try:
        notice = (
            f"Dry run: validating offering {offering_id} in {region}..."
            if dry_run
            else f"Purchasing Capacity Block {offering_id} in {region}..."
        )
        out.print_info(notice)

        outcome = probe.purchase_capacity_block(
            offering_id=offering_id,
            region=region,
            dry_run=dry_run,
        )

        # Non-table formats get the raw result via the formatter.
        if config.output_format != "table":
            out.print(outcome)
            return

        if not outcome["success"]:
            out.print_error(
                f"Failed: {outcome.get('error_code', 'Unknown')}: {outcome.get('error', '')}"
            )
            sys.exit(1)
        elif dry_run:
            print()
            print(f" ✓ Dry run passed — offering {offering_id} is valid and purchasable")
            print(f" Region: {region}")
            print()
            print(" To purchase, run without --dry-run:")
            print(f" gco capacity reserve -o {offering_id} -r {region}")
            print()
        else:
            print()
            print(" ✓ Capacity Block purchased successfully")
            print(f" Reservation ID: {outcome['reservation_id']}")
            print(f" Instance Type: {outcome['instance_type']}")
            print(f" AZ: {outcome['availability_zone']}")
            print(f" Instances: {outcome['total_instances']}")
            print(f" Start: {outcome.get('start_date', 'N/A')}")
            print(f" End: {outcome.get('end_date', 'N/A')}")
            print()
            # Follow-up hint built from the reservation that was just created.
            print(" To create a NodePool for this reservation:")
            print(
                f" gco nodepools create-odcr -n my-pool -r {region} "
                f"-c {outcome['reservation_id']} -i {outcome['instance_type']}"
            )
            print()

    except Exception as exc:
        # SystemExit raised above is not an Exception subclass, so it passes
        # through this handler untouched.
        out.print_error(f"Failed to reserve capacity: {exc}")
        sys.exit(1)