Coverage for cli/commands/inference_cmd.py: 88%
504 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-15 15:07 +0000
1"""Inference endpoint commands."""
3import sys
4from typing import Any
6import click
8from ..config import GCOConfig
9from ..output import get_output_formatter
# Decorator that hands the GCOConfig object stored on the click context to a
# command callback as its first argument; ensure=True makes click create and
# remember a GCOConfig when the context does not carry one yet.
pass_config = click.make_pass_decorator(GCOConfig, ensure=True)
@click.group()
@pass_config
def inference(config: Any) -> None:
    """Manage multi-region inference endpoints."""
    # Pure command group: it only anchors the subcommands registered below,
    # so the callback itself has nothing to do.
def _parse_key_value_options(values: Any, option_name: str, formatter: Any) -> dict:
    """Turn repeatable ``KEY=VALUE`` option values into a dict.

    Entries without an ``=`` cannot be interpreted; they are skipped, but a
    warning is printed so a typo does not silently disappear from the spec.
    Only the first ``=`` separates key from value, so values may contain ``=``.
    """
    parsed = {}
    for raw in values:
        key, separator, value = raw.partition("=")
        if not separator:
            formatter.print_warning(f"Ignoring {option_name} '{raw}': expected KEY=VALUE")
            continue
        parsed[key] = value
    return parsed


def _build_autoscaling_config(autoscale_metric: Any, min_replicas: Any, max_replicas: Any) -> Any:
    """Build the autoscaling section of the deploy request, or None if disabled.

    Each metric is ``type:target`` (e.g. ``cpu:70``); a bare ``type`` gets a
    target of 70.

    Raises:
        ValueError: if a metric target is not an integer.
    """
    if not autoscale_metric:
        return None

    metrics = []
    for raw in autoscale_metric:
        metric_type, separator, target = raw.partition(":")
        if not separator:
            metrics.append({"type": raw, "target": 70})
            continue
        try:
            metrics.append({"type": metric_type, "target": int(target)})
        except ValueError:
            raise ValueError(
                f"Invalid --autoscale-metric '{raw}': target must be an integer (e.g. cpu:70)"
            ) from None

    return {
        "enabled": True,
        # Compare against None so that an explicit 0 from the user is kept
        # instead of being replaced by the default.
        "min_replicas": 1 if min_replicas is None else min_replicas,
        "max_replicas": 10 if max_replicas is None else max_replicas,
        "metrics": metrics,
    }


@inference.command("deploy")
@click.argument("endpoint_name")
@click.option("--image", "-i", required=True, help="Container image (e.g. vllm/vllm-openai:v0.8.0)")
@click.option(
    "--region",
    "-r",
    multiple=True,
    help="Target region(s). Repeatable. Default: all deployed regions",
)
@click.option("--replicas", default=1, help="Replicas per region (default: 1)")
@click.option("--gpu-count", default=1, help="GPUs per replica (default: 1)")
@click.option("--gpu-type", help="GPU instance type hint (e.g. g5.xlarge)")
@click.option("--port", default=8000, help="Container port (default: 8000)")
@click.option("--model-path", help="EFS path for model weights")
@click.option(
    "--model-source",
    help="S3 URI for model weights (e.g. s3://bucket/models/llama3). "
    "Auto-synced to each region via init container.",
)
@click.option("--health-path", default="/health", help="Health check path (default: /health)")
@click.option("--env", "-e", multiple=True, help="Environment variable (KEY=VALUE). Repeatable")
@click.option("--namespace", "-n", default="gco-inference", help="Kubernetes namespace")
@click.option("--label", "-l", multiple=True, help="Label (key=value). Repeatable")
@click.option("--min-replicas", type=int, default=None, help="Autoscaling: minimum replicas")
@click.option("--max-replicas", type=int, default=None, help="Autoscaling: maximum replicas")
@click.option(
    "--autoscale-metric",
    multiple=True,
    help="Autoscaling metric (cpu:70, memory:80, gpu:60). Repeatable. Enables autoscaling.",
)
@click.option(
    "--capacity-type",
    type=click.Choice(["on-demand", "spot"]),
    default=None,
    help="Node capacity type. 'spot' uses cheaper preemptible instances.",
)
@click.option(
    "--extra-args",
    multiple=True,
    help="Extra arguments passed to the container (e.g. '--kv-transfer-config {...}'). Repeatable.",
)
@click.option(
    "--accelerator",
    type=click.Choice(["nvidia", "neuron"]),
    default="nvidia",
    help="Accelerator type: 'nvidia' for GPU instances (default), 'neuron' for Trainium/Inferentia.",
)
@click.option(
    "--node-selector",
    multiple=True,
    help="Node selector (key=value). Repeatable. E.g. --node-selector eks.amazonaws.com/instance-family=inf2",
)
@click.option(
    "--no-rewrite-image",
    is_flag=True,
    default=False,
    help="Skip the per-region ECR URI rewrite. The image URI is sent verbatim "
    "to every target region (operator owns cross-region pulls).",
)
@pass_config
def inference_deploy(
    config: Any,
    endpoint_name: Any,
    image: Any,
    region: Any,
    replicas: Any,
    gpu_count: Any,
    gpu_type: Any,
    port: Any,
    model_path: Any,
    model_source: Any,
    health_path: Any,
    env: Any,
    namespace: Any,
    label: Any,
    min_replicas: Any,
    max_replicas: Any,
    autoscale_metric: Any,
    capacity_type: Any,
    extra_args: Any,
    accelerator: Any,
    node_selector: Any,
    no_rewrite_image: Any,
) -> None:
    """Deploy an inference endpoint to one or more regions.

    The endpoint is registered in DynamoDB and the inference_monitor
    in each target region creates the Kubernetes resources automatically.

    Examples:
        gco inference deploy my-llm -i vllm/vllm-openai:v0.8.0

        gco inference deploy llama3-70b \\
            -i vllm/vllm-openai:v0.8.0 \\
            -r us-east-1 -r eu-west-1 \\
            --replicas 2 --gpu-count 4 \\
            --model-path /mnt/gco/models/llama3-70b \\
            -e MODEL_NAME=meta-llama/Llama-3-70B
    """
    # The docstring above doubles as the `--help` text, so implementation
    # notes live in comments:
    #   * repeatable options (region, env, label, autoscale_metric, extra_args,
    #     node_selector) arrive as tuples; empty ones are forwarded as None;
    #   * a ValueError from the manager or from option parsing is printed
    #     verbatim, any other exception is prefixed with "Failed to deploy
    #     endpoint"; both exit with status 1.
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    env_dict = _parse_key_value_options(env, "--env", formatter)
    labels_dict = _parse_key_value_options(label, "--label", formatter)
    node_selector_dict = _parse_key_value_options(node_selector, "--node-selector", formatter)

    try:
        # Built inside the try so a malformed metric is reported through the
        # ValueError handler below instead of escaping as a traceback.
        autoscaling_config = _build_autoscaling_config(autoscale_metric, min_replicas, max_replicas)

        manager = get_inference_manager(config)
        result = manager.deploy(
            endpoint_name=endpoint_name,
            image=image,
            target_regions=list(region) if region else None,
            replicas=replicas,
            gpu_count=gpu_count,
            gpu_type=gpu_type,
            port=port,
            model_path=model_path,
            model_source=model_source,
            health_check_path=health_path,
            env=env_dict or None,
            namespace=namespace,
            labels=labels_dict or None,
            autoscaling=autoscaling_config,
            capacity_type=capacity_type,
            extra_args=list(extra_args) if extra_args else None,
            accelerator=accelerator,
            node_selector=node_selector_dict or None,
            rewrite_image=not no_rewrite_image,
        )

        formatter.print_success(f"Endpoint '{endpoint_name}' registered for deployment")
        regions_str = ", ".join(result.get("target_regions", []))
        formatter.print_info(f"Target regions: {regions_str}")
        formatter.print_info(f"Ingress path: {result.get('ingress_path', '')}")
        formatter.print_info(
            "The inference_monitor in each region will create the resources. "
            "Use 'gco inference status' to track progress."
        )

        # Warn if deploying to a subset of regions. The endpoint is already
        # registered at this point, so a failure while discovering the other
        # regions must not be reported as a failed deployment.
        if region:
            try:
                from ..aws_client import get_aws_client as _get_client

                all_stacks = _get_client(config).discover_regional_stacks()
                missing = set(all_stacks.keys()) - set(result.get("target_regions", []))
            except Exception as e:
                formatter.print_warning(f"Could not verify regional coverage: {e}")
            else:
                if missing:
                    formatter.print_warning(
                        f"Endpoint is NOT deployed to: {', '.join(sorted(missing))}. "
                        "Global Accelerator may route users to those regions where "
                        "the endpoint won't exist. Consider deploying to all regions "
                        "(omit -r) for consistent global routing."
                    )

        if config.output_format != "table":
            formatter.print(result)

    except ValueError as e:
        formatter.print_error(str(e))
        sys.exit(1)
    except Exception as e:
        formatter.print_error(f"Failed to deploy endpoint: {e}")
        sys.exit(1)
@inference.command("list")
@click.option("--state", "-s", help="Filter by state (deploying, running, stopped, deleted)")
@click.option("--region", "-r", help="Filter by target region")
@pass_config
def inference_list(config: Any, state: Any, region: Any) -> None:
    """List inference endpoints.

    Examples:
        gco inference list
        gco inference list --state running
        gco inference list -r us-east-1
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        records = get_inference_manager(config).list_endpoints(desired_state=state, region=region)

        # Structured formats receive the raw records untouched.
        if config.output_format != "table":
            formatter.print(records)
            return

        if not records:
            formatter.print_info("No inference endpoints found")
            return

        divider = " " + "-" * 85
        print(f"\n Inference Endpoints ({len(records)} found)")
        print(divider)
        print(f" {'NAME':<25} {'STATE':<12} {'REGIONS':<25} {'REPLICAS':>8} {'IMAGE'}")
        print(divider)

        for record in records:
            # Name and region list are clipped to 24 characters, the image
            # to 40, so long values do not push the columns apart.
            name = record.get("endpoint_name", "")[:24]
            ep_state = record.get("desired_state", "unknown")
            regions = ", ".join(record.get("target_regions", []))[:24]
            spec = record.get("spec", {})
            if isinstance(spec, dict):
                replicas = spec.get("replicas", 1)
                image = spec.get("image", "")[:40]
            else:
                # A spec that is not a dict falls back to placeholder values.
                replicas = 1
                image = ""
            print(f" {name:<25} {ep_state:<12} {regions:<25} {replicas:>8} {image}")

        print()

    except Exception as exc:
        formatter.print_error(f"Failed to list endpoints: {exc}")
        sys.exit(1)
@inference.command("status")
@click.argument("endpoint_name")
@pass_config
def inference_status(config: Any, endpoint_name: Any) -> None:
    """Show detailed status of an inference endpoint.

    Examples:
        gco inference status my-llm
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        endpoint = get_inference_manager(config).get_endpoint(endpoint_name)

        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        # Structured formats get the stored record as-is.
        if config.output_format != "table":
            formatter.print(endpoint)
            return

        spec = endpoint.get("spec", {})
        print(f"\n Endpoint: {endpoint_name}")
        print(" " + "-" * 60)
        print(f" State: {endpoint.get('desired_state', 'unknown')}")
        print(f" Image: {spec.get('image', 'N/A')}")
        print(f" Replicas: {spec.get('replicas', 1)}")
        print(f" GPUs: {spec.get('gpu_count', 0)}")
        print(f" Port: {spec.get('port', 8000)}")
        print(f" Path: {endpoint.get('ingress_path', 'N/A')}")
        print(f" Namespace: {endpoint.get('namespace', 'N/A')}")
        print(f" Created: {endpoint.get('created_at', 'N/A')}")

        per_region = endpoint.get("region_status", {})
        if not per_region:
            # No region has reported yet: show where the endpoint is headed.
            target_regions = endpoint.get("target_regions", [])
            print(f"\n Target regions: {', '.join(target_regions)}")
            print(" (Waiting for inference_monitor to sync)")
        else:
            print("\n Region Status:")
            print(f" {'REGION':<18} {'STATE':<12} {'READY':>5} {'DESIRED':>7} {'LAST SYNC'}")
            print(" " + "-" * 65)
            for region_name, status in per_region.items():
                if not isinstance(status, dict):
                    continue
                r_state = status.get("state", "unknown")
                ready = status.get("replicas_ready", 0)
                desired = status.get("replicas_desired", 0)
                last_sync = status.get("last_sync", "N/A")
                # Keep at most the first 19 characters of the sync value.
                if last_sync and len(last_sync) > 19:
                    last_sync = last_sync[:19]
                print(f" {region_name:<18} {r_state:<12} {ready:>5} {desired:>7} {last_sync}")

        print()

    except Exception as exc:
        formatter.print_error(f"Failed to get endpoint status: {exc}")
        sys.exit(1)
@inference.command("scale")
@click.argument("endpoint_name")
@click.option("--replicas", "-r", required=True, type=int, help="New replica count")
@pass_config
def inference_scale(config: Any, endpoint_name: Any, replicas: Any) -> None:
    """Scale an inference endpoint.

    Examples:
        gco inference scale my-llm --replicas 4
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        scaled = get_inference_manager(config).scale(endpoint_name, replicas)

        # A falsy result from the manager is reported as "not found".
        if not scaled:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(f"Endpoint '{endpoint_name}' scaled to {replicas} replicas")

    except Exception as exc:
        formatter.print_error(f"Failed to scale endpoint: {exc}")
        sys.exit(1)
@inference.command("stop")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_stop(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Stop an inference endpoint (scale to zero, keep config).

    Examples:
        gco inference stop my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    # abort=True makes click raise Abort when the user declines.
    if not yes:
        click.confirm(f"Stop endpoint '{endpoint_name}'?", abort=True)

    try:
        stopped = get_inference_manager(config).stop(endpoint_name)

        # A falsy result from the manager is reported as "not found".
        if not stopped:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(f"Endpoint '{endpoint_name}' marked for stop")

    except Exception as exc:
        formatter.print_error(f"Failed to stop endpoint: {exc}")
        sys.exit(1)
@inference.command("start")
@click.argument("endpoint_name")
@pass_config
def inference_start(config: Any, endpoint_name: Any) -> None:
    """Start a stopped inference endpoint.

    Examples:
        gco inference start my-llm
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        started = get_inference_manager(config).start(endpoint_name)

        # A falsy result from the manager is reported as "not found".
        if not started:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(f"Endpoint '{endpoint_name}' marked for start")

    except Exception as exc:
        formatter.print_error(f"Failed to start endpoint: {exc}")
        sys.exit(1)
@inference.command("delete")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_delete(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Delete an inference endpoint from all regions.

    The inference_monitor in each region will clean up the K8s resources.

    Examples:
        gco inference delete my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    # abort=True makes click raise Abort when the user declines.
    if not yes:
        click.confirm(f"Delete endpoint '{endpoint_name}' from all regions?", abort=True)

    try:
        deleted = get_inference_manager(config).delete(endpoint_name)

        # A falsy result from the manager is reported as "not found".
        if not deleted:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(
            f"Endpoint '{endpoint_name}' marked for deletion. "
            "The inference_monitor will clean up resources in each region."
        )

    except Exception as exc:
        formatter.print_error(f"Failed to delete endpoint: {exc}")
        sys.exit(1)
@inference.command("update-image")
@click.argument("endpoint_name")
@click.option("--image", "-i", required=True, help="New container image")
@pass_config
def inference_update_image(config: Any, endpoint_name: Any, image: Any) -> None:
    """Update the container image for an inference endpoint.

    Triggers a rolling update across all target regions.

    Examples:
        gco inference update-image my-llm -i vllm/vllm-openai:v0.9.0
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        updated = get_inference_manager(config).update_image(endpoint_name, image)

        # A falsy result from the manager is reported as "not found".
        if not updated:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(f"Endpoint '{endpoint_name}' image updated to {image}")
        formatter.print_info("Rolling update will be applied by inference_monitor")

    except Exception as exc:
        formatter.print_error(f"Failed to update image: {exc}")
        sys.exit(1)
def _default_api_path(image: str) -> str:
    """Pick the API sub-path from substrings of the container image name."""
    if "vllm" in image:
        return "/v1/completions"
    if "text-generation-inference" in image or "tgi" in image:
        return "/generate"
    # "tritonserver" contains "triton", so one substring test covers both.
    if "triton" in image:
        return "/v2/models"
    return "/v1/completions"


def _resolve_model_name(
    client: Any, endpoint_name: Any, spec: Any, image: str, ingress_path: str, region: Any
) -> Any:
    """Work out the ``model`` field for an OpenAI-compatible request body.

    Later sources win: the endpoint name is the default, the ``MODEL`` env var
    in the spec overrides it, and a ``--model`` container argument overrides
    that. If the name is still the endpoint name and the image looks like
    vLLM, the running endpoint is asked for its first loaded model.
    """
    model_name = endpoint_name
    if not isinstance(spec, dict):
        return model_name

    # `or {}` / `or []` also cover keys that are present but set to None.
    model_name = (spec.get("env") or {}).get("MODEL", model_name)
    args_list = spec.get("args") or []
    for i, arg in enumerate(args_list):
        if arg == "--model" and i + 1 < len(args_list):
            model_name = args_list[i + 1]
            break

    if model_name == endpoint_name and "vllm" in image:
        try:
            # Use the endpoint's real ingress path (not a rebuilt default) so
            # detection also works for endpoints with a custom path.
            models_resp = client.make_authenticated_request(
                method="GET",
                path=f"{ingress_path}/v1/models",
                target_region=region,
            )
            if models_resp.ok:
                models_data = models_resp.json().get("data", [])
                if models_data:
                    model_name = models_data[0]["id"]
        except Exception:
            # Deliberately best-effort: detection must never block the actual
            # request, so fall through to the endpoint name as the model.
            pass
    return model_name


def _extract_generated_text(payload: Any) -> Any:
    """Return the generated text from an OpenAI- or TGI-style response, else None."""
    if isinstance(payload, dict):
        if "choices" in payload:
            # OpenAI format: completions use "text", chat uses message.content.
            choices = payload["choices"]
            if choices and isinstance(choices[0], dict):
                first = choices[0]
                return first.get("text") or (first.get("message") or {}).get("content")
            return None
        # TGI format (single object).
        return payload.get("generated_text")
    if isinstance(payload, list) and payload and isinstance(payload[0], dict):
        # TGI format (list of generations).
        return payload[0].get("generated_text")
    return None


@inference.command("invoke")
@click.argument("endpoint_name")
@click.option("--prompt", "-p", help="Text prompt to send")
@click.option("--data", "-d", help="Raw JSON body to send")
@click.option(
    "--path", "api_path", default=None, help="API sub-path (default: auto-detect from framework)"
)
@click.option("--region", "-r", help="Target region for the request")
@click.option(
    "--max-tokens", type=int, default=100, help="Maximum tokens to generate (default: 100)"
)
@click.option("--stream/--no-stream", default=False, help="Stream the response")
@pass_config
def inference_invoke(
    config: Any,
    endpoint_name: Any,
    prompt: Any,
    data: Any,
    api_path: Any,
    region: Any,
    max_tokens: Any,
    stream: Any,
) -> None:
    """Send a request to an inference endpoint and print the response.

    Automatically discovers the endpoint's ingress path and routes the
    request through the API Gateway with SigV4 authentication.

    Examples:
        gco inference invoke my-llm -p "What is GPU orchestration?"

        gco inference invoke my-llm -d '{"prompt": "Hello", "max_tokens": 50}'

        gco inference invoke my-llm -p "Explain K8s" --path /v1/completions
    """
    # The docstring above doubles as the `--help` text, so implementation
    # notes live in comments:
    #   * --data wins over --prompt when both are given and is sent as-is;
    #   * --max-tokens and --stream only affect bodies built from --prompt;
    #   * every failure path prints an error and exits with status 1.
    import json as _json

    from ..aws_client import get_aws_client
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    if not prompt and not data:
        formatter.print_error("Provide --prompt (-p) or --data (-d)")
        sys.exit(1)

    try:
        # Look up the endpoint to get its ingress path and spec
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)
        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}")
        spec = endpoint.get("spec", {})
        image = (spec.get("image") or "") if isinstance(spec, dict) else ""

        # Auto-detect the API sub-path based on the container image
        if api_path is None:
            api_path = _default_api_path(image)

        full_path = f"{ingress_path}{api_path}"
        client = get_aws_client(config)

        # Build the request body
        if data:
            try:
                body = _json.loads(data)
            except ValueError as exc:
                # Give a targeted message instead of the generic handler's.
                formatter.print_error(f"Invalid JSON in --data: {exc}")
                sys.exit(1)
        elif "generate" in api_path:
            # TGI format
            body = {"inputs": prompt, "parameters": {"max_new_tokens": max_tokens}}
        elif "/v2/" in api_path:
            # Triton: the prompt is not used for this path
            body = {}
        else:
            # OpenAI-compatible (vLLM, etc.)
            body = {
                "model": _resolve_model_name(
                    client, endpoint_name, spec, image, ingress_path, region
                ),
                "prompt": prompt,
                "max_tokens": max_tokens,
                "stream": stream,
            }

        formatter.print_info(f"POST {full_path}")

        # A body always exists at this point (prompt or data is mandatory),
        # so the request is always a POST.
        response = client.make_authenticated_request(
            method="POST",
            path=full_path,
            body=body,
            target_region=region,
        )

        if not response.ok:
            formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}")
            sys.exit(1)

        try:
            resp_json = response.json()
        except ValueError:
            # Not JSON (e.g. a streamed response): show the raw text.
            print(response.text)
            return

        text = _extract_generated_text(resp_json)
        if isinstance(text, str) and text and config.output_format == "table":
            print(f"\n{text.strip()}\n")
        else:
            print(_json.dumps(resp_json, indent=2))

    except Exception as e:
        formatter.print_error(f"Failed to invoke endpoint: {e}")
        sys.exit(1)
@inference.command("canary")
@click.argument("endpoint_name")
@click.option("--image", "-i", required=True, help="New container image for canary")
@click.option(
    "--weight",
    "-w",
    default=10,
    type=int,
    help="Percentage of traffic to canary (1-99, default: 10)",
)
@click.option(
    "--replicas", "-r", default=1, type=int, help="Number of canary replicas (default: 1)"
)
@pass_config
def inference_canary(
    config: Any, endpoint_name: Any, image: Any, weight: Any, replicas: Any
) -> None:
    """Start a canary deployment with a new image.

    Routes a percentage of traffic to the canary while the primary
    continues serving the rest. Use 'promote' to make the canary
    the new primary, or 'rollback' to remove it.

    Examples:
        gco inference canary my-llm -i vllm/vllm-openai:v0.9.0 --weight 10
        gco inference canary my-llm -i new-image:latest -w 25 -r 2
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        started = get_inference_manager(config).canary_deploy(
            endpoint_name, image, weight=weight, replicas=replicas
        )

        # A falsy result from the manager is reported as "not found".
        if not started:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(
            f"Canary started: {weight}% traffic → {image} ({replicas} replica(s))"
        )
        # Point the operator at the follow-up commands.
        follow_ups = (
            f"Monitor with: gco inference status {endpoint_name}",
            f"Promote with: gco inference promote {endpoint_name}",
            f"Rollback with: gco inference rollback {endpoint_name}",
        )
        for hint in follow_ups:
            formatter.print_info(hint)

    except ValueError as exc:
        # ValueError messages are printed verbatim, without a prefix.
        formatter.print_error(str(exc))
        sys.exit(1)
    except Exception as exc:
        formatter.print_error(f"Failed to start canary: {exc}")
        sys.exit(1)
@inference.command("promote")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_promote(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Promote the canary to primary.

    Replaces the primary image with the canary image and removes
    the canary deployment. All traffic goes to the new image.

    Examples:
        gco inference promote my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)

        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        canary = endpoint.get("spec", {}).get("canary")
        if not canary:
            formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary")
            sys.exit(1)

        if not yes:
            # Show what is about to change before asking for confirmation.
            current_image = endpoint.get("spec", {}).get("image", "unknown")
            click.echo(f" Current primary: {current_image}")
            click.echo(f" Canary image: {canary.get('image', 'unknown')}")
            click.echo(f" Canary weight: {canary.get('weight', 0)}%")
            go_ahead = click.confirm(" Promote canary to primary?")
            if not go_ahead:
                formatter.print_info("Cancelled")
                return

        promoted = manager.promote_canary(endpoint_name)
        if not promoted:
            formatter.print_error("Promotion failed")
            sys.exit(1)

        new_image = promoted.get("spec", {}).get("image", "unknown")
        formatter.print_success(f"Promoted: all traffic now serving {new_image}")

    except ValueError as exc:
        # ValueError messages are printed verbatim, without a prefix.
        formatter.print_error(str(exc))
        sys.exit(1)
    except Exception as exc:
        formatter.print_error(f"Failed to promote canary: {exc}")
        sys.exit(1)
@inference.command("rollback")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_rollback(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Remove the canary deployment, keeping the primary unchanged.

    All traffic returns to the primary deployment.

    Examples:
        gco inference rollback my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)

        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        canary = endpoint.get("spec", {}).get("canary")
        if not canary:
            formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary")
            sys.exit(1)

        if not yes:
            # Show which canary is about to be removed before asking.
            click.echo(f" Canary image: {canary.get('image', 'unknown')}")
            click.echo(f" Canary weight: {canary.get('weight', 0)}%")
            go_ahead = click.confirm(" Remove canary and restore full traffic to primary?")
            if not go_ahead:
                formatter.print_info("Cancelled")
                return

        rolled_back = manager.rollback_canary(endpoint_name)
        if not rolled_back:
            formatter.print_error("Rollback failed")
            sys.exit(1)

        primary_image = rolled_back.get("spec", {}).get("image", "unknown")
        formatter.print_success(f"Rolled back: all traffic now serving {primary_image}")

    except ValueError as exc:
        # ValueError messages are printed verbatim, without a prefix.
        formatter.print_error(str(exc))
        sys.exit(1)
    except Exception as exc:
        formatter.print_error(f"Failed to rollback canary: {exc}")
        sys.exit(1)
@inference.command("health")
@click.argument("endpoint_name")
@click.option("--region", "-r", help="Target region to check")
@pass_config
def inference_health(config: Any, endpoint_name: Any, region: Any) -> None:
    """Check if an inference endpoint is healthy and ready to serve.

    Hits the endpoint's health check path and reports status and latency.

    Examples:
        gco inference health my-llm

        gco inference health my-llm -r us-east-1
    """
    # The docstring above doubles as the `--help` text, so implementation
    # notes live in comments:
    #   * "healthy" means the response's `ok` attribute is truthy;
    #   * an unhealthy response is reported but does not change the exit
    #     status; only a missing endpoint or a request error exits with 1.
    import json as _json
    import time as _time

    from ..aws_client import get_aws_client
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)
        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}")
        spec = endpoint.get("spec", {})
        health_path = "/health"
        if isinstance(spec, dict):
            # NOTE(review): `deploy` hands the path to the manager as
            # `health_check_path`; the stored spec presumably uses one of these
            # two keys. `health_path` stays first so existing behavior wins.
            # Confirm against the manager's spec schema.
            health_path = spec.get("health_path") or spec.get("health_check_path") or "/health"
        full_path = f"{ingress_path}{health_path}"

        client = get_aws_client(config)
        # Monotonic clock: the latency cannot be skewed by wall-clock changes.
        start = _time.monotonic()
        response = client.make_authenticated_request(
            method="GET",
            path=full_path,
            target_region=region,
        )
        latency_ms = (_time.monotonic() - start) * 1000

        result = {
            "endpoint": endpoint_name,
            "status": "healthy" if response.ok else "unhealthy",
            "http_status": response.status_code,
            "latency_ms": round(latency_ms, 1),
            "path": full_path,
        }

        # Health endpoints often return plain text; keep a short excerpt then.
        try:
            result["body"] = response.json()
        except Exception:
            result["body"] = response.text[:200] if response.text else None

        if config.output_format == "json":
            print(_json.dumps(result, indent=2))
        elif config.output_format != "table":
            # Same rule as the other commands: every non-table format gets the
            # structured result through the formatter.
            formatter.print(result)
        else:
            status_icon = "✓" if response.ok else "✗"
            formatter.print_info(
                f"{status_icon} {endpoint_name}: {result['status']} "
                f"(HTTP {response.status_code}, {result['latency_ms']}ms)"
            )

    except Exception as e:
        formatter.print_error(f"Health check failed: {e}")
        sys.exit(1)
@inference.command("models")
@click.argument("endpoint_name")
@click.option("--region", "-r", help="Target region to query")
@pass_config
def inference_models(config: Any, endpoint_name: Any, region: Any) -> None:
    """List models loaded on an inference endpoint.

    Queries the /v1/models path (OpenAI-compatible) to discover loaded models.

    Examples:
        gco inference models my-llm
    """
    import json as _json

    from ..aws_client import get_aws_client
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        endpoint = get_inference_manager(config).get_endpoint(endpoint_name)
        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        base_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}")
        full_path = f"{base_path}/v1/models"

        response = get_aws_client(config).make_authenticated_request(
            method="GET",
            path=full_path,
            target_region=region,
        )

        if not response.ok:
            formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}")
            sys.exit(1)

        # Pretty-print JSON when possible, otherwise show the raw text.
        try:
            listing = response.json()
            print(_json.dumps(listing, indent=2))
        except _json.JSONDecodeError:
            print(response.text)

    except Exception as exc:
        formatter.print_error(f"Failed to list models: {exc}")
        sys.exit(1)