Coverage for cli/commands/inference_cmd.py: 88%
502 statements
coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
1"""Inference endpoint commands."""
3import sys
4from typing import Any
6import click
8from ..config import GCOConfig
9from ..output import get_output_formatter
11pass_config = click.make_pass_decorator(GCOConfig, ensure=True)
14@click.group()
15@pass_config
16def inference(config: Any) -> None:
17 """Manage multi-region inference endpoints."""
18 pass
21@inference.command("deploy")
22@click.argument("endpoint_name")
23@click.option("--image", "-i", required=True, help="Container image (e.g. vllm/vllm-openai:v0.8.0)")
24@click.option(
25 "--region",
26 "-r",
27 multiple=True,
28 help="Target region(s). Repeatable. Default: all deployed regions",
29)
30@click.option("--replicas", default=1, help="Replicas per region (default: 1)")
31@click.option("--gpu-count", default=1, help="GPUs per replica (default: 1)")
32@click.option("--gpu-type", help="GPU instance type hint (e.g. g5.xlarge)")
33@click.option("--port", default=8000, help="Container port (default: 8000)")
34@click.option("--model-path", help="EFS path for model weights")
35@click.option(
36 "--model-source",
37 help="S3 URI for model weights (e.g. s3://bucket/models/llama3). "
38 "Auto-synced to each region via init container.",
39)
40@click.option("--health-path", default="/health", help="Health check path (default: /health)")
41@click.option("--env", "-e", multiple=True, help="Environment variable (KEY=VALUE). Repeatable")
42@click.option("--namespace", "-n", default="gco-inference", help="Kubernetes namespace")
43@click.option("--label", "-l", multiple=True, help="Label (key=value). Repeatable")
44@click.option("--min-replicas", type=int, default=None, help="Autoscaling: minimum replicas")
45@click.option("--max-replicas", type=int, default=None, help="Autoscaling: maximum replicas")
46@click.option(
47 "--autoscale-metric",
48 multiple=True,
49 help="Autoscaling metric (cpu:70, memory:80, gpu:60). Repeatable. Enables autoscaling.",
50)
51@click.option(
52 "--capacity-type",
53 type=click.Choice(["on-demand", "spot"]),
54 default=None,
55 help="Node capacity type. 'spot' uses cheaper preemptible instances.",
56)
57@click.option(
58 "--extra-args",
59 multiple=True,
60 help="Extra arguments passed to the container (e.g. '--kv-transfer-config {...}'). Repeatable.",
61)
62@click.option(
63 "--accelerator",
64 type=click.Choice(["nvidia", "neuron"]),
65 default="nvidia",
66 help="Accelerator type: 'nvidia' for GPU instances (default), 'neuron' for Trainium/Inferentia.",
67)
68@click.option(
69 "--node-selector",
70 multiple=True,
71 help="Node selector (key=value). Repeatable. E.g. --node-selector eks.amazonaws.com/instance-family=inf2",
72)
73@pass_config
74def inference_deploy(
75 config: Any,
76 endpoint_name: Any,
77 image: Any,
78 region: Any,
79 replicas: Any,
80 gpu_count: Any,
81 gpu_type: Any,
82 port: Any,
83 model_path: Any,
84 model_source: Any,
85 health_path: Any,
86 env: Any,
87 namespace: Any,
88 label: Any,
89 min_replicas: Any,
90 max_replicas: Any,
91 autoscale_metric: Any,
92 capacity_type: Any,
93 extra_args: Any,
94 accelerator: Any,
95 node_selector: Any,
96) -> None:
97 """Deploy an inference endpoint to one or more regions.
99 The endpoint is registered in DynamoDB and the inference_monitor
100 in each target region creates the Kubernetes resources automatically.
102 Examples:
103 gco inference deploy my-llm -i vllm/vllm-openai:v0.8.0
105 gco inference deploy llama3-70b \\
106 -i vllm/vllm-openai:v0.8.0 \\
107 -r us-east-1 -r eu-west-1 \\
108 --replicas 2 --gpu-count 4 \\
109 --model-path /mnt/gco/models/llama3-70b \\
110 -e MODEL_NAME=meta-llama/Llama-3-70B
111 """
112 from ..inference import get_inference_manager
114 formatter = get_output_formatter(config)
116 # Parse env vars and labels
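    # (repeatable "key=value" options: e.g. -e MODEL_NAME=llama3 -l team=ml
    # parse to {"MODEL_NAME": "llama3"} and {"team": "ml"})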
    env_dict = {}
    for e_var in env:
        if "=" in e_var:
            k, v = e_var.split("=", 1)
            env_dict[k] = v

    labels_dict = {}
    for lbl in label:
        if "=" in lbl:
            k, v = lbl.split("=", 1)
            labels_dict[k] = v

    node_selector_dict = {}
    for ns in node_selector:
        if "=" in ns:
            k, v = ns.split("=", 1)
            node_selector_dict[k] = v

    # Build autoscaling config
    autoscaling_config = None
    if autoscale_metric:
        metrics = []
        for m in autoscale_metric:
            if ":" in m:
                mtype, mtarget = m.split(":", 1)
                metrics.append({"type": mtype, "target": int(mtarget)})
            else:
                metrics.append({"type": m, "target": 70})
        autoscaling_config = {
            "enabled": True,
            "min_replicas": min_replicas or 1,
            "max_replicas": max_replicas or 10,
            "metrics": metrics,
        }
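        # e.g. --autoscale-metric cpu:70 --autoscale-metric gpu:60 with default
        # bounds produces:
        #   {"enabled": True, "min_replicas": 1, "max_replicas": 10,
        #    "metrics": [{"type": "cpu", "target": 70}, {"type": "gpu", "target": 60}]}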

    try:
        manager = get_inference_manager(config)
        result = manager.deploy(
            endpoint_name=endpoint_name,
            image=image,
            target_regions=list(region) if region else None,
            replicas=replicas,
            gpu_count=gpu_count,
            gpu_type=gpu_type,
            port=port,
            model_path=model_path,
            model_source=model_source,
            health_check_path=health_path,
            env=env_dict if env_dict else None,
            namespace=namespace,
            labels=labels_dict if labels_dict else None,
            autoscaling=autoscaling_config,
            capacity_type=capacity_type,
            extra_args=list(extra_args) if extra_args else None,
            accelerator=accelerator,
            node_selector=node_selector_dict if node_selector_dict else None,
        )

        formatter.print_success(f"Endpoint '{endpoint_name}' registered for deployment")
        regions_str = ", ".join(result.get("target_regions", []))
        formatter.print_info(f"Target regions: {regions_str}")
        formatter.print_info(f"Ingress path: {result.get('ingress_path', '')}")
        formatter.print_info(
            "The inference_monitor in each region will create the resources. "
            "Use 'gco inference status' to track progress."
        )

        # Warn if deploying to a subset of regions
        if region:
            from ..aws_client import get_aws_client as _get_client

            all_stacks = _get_client(config).discover_regional_stacks()
            all_regions = set(all_stacks.keys())
            target_set = set(result.get("target_regions", []))
            missing = all_regions - target_set
            if missing:
                formatter.print_warning(
                    f"Endpoint is NOT deployed to: {', '.join(sorted(missing))}. "
                    "Global Accelerator may route users to those regions where "
                    "the endpoint won't exist. Consider deploying to all regions "
                    "(omit -r) for consistent global routing."
                )

        if config.output_format != "table":
            formatter.print(result)

    except ValueError as e:
        formatter.print_error(str(e))
        sys.exit(1)
    except Exception as e:
        formatter.print_error(f"Failed to deploy endpoint: {e}")
        sys.exit(1)


@inference.command("list")
@click.option("--state", "-s", help="Filter by state (deploying, running, stopped, deleted)")
@click.option("--region", "-r", help="Filter by target region")
@pass_config
def inference_list(config: Any, state: Any, region: Any) -> None:
    """List inference endpoints.

    Examples:
        gco inference list
        gco inference list --state running
        gco inference list -r us-east-1
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoints = manager.list_endpoints(desired_state=state, region=region)

        if config.output_format != "table":
            formatter.print(endpoints)
            return

        if not endpoints:
            formatter.print_info("No inference endpoints found")
            return

        print(f"\n Inference Endpoints ({len(endpoints)} found)")
        print(" " + "-" * 85)
        print(f" {'NAME':<25} {'STATE':<12} {'REGIONS':<25} {'REPLICAS':>8} {'IMAGE'}")
        print(" " + "-" * 85)
        for ep in endpoints:
            name = ep.get("endpoint_name", "")[:24]
            ep_state = ep.get("desired_state", "unknown")
            regions = ", ".join(ep.get("target_regions", []))[:24]
            spec = ep.get("spec", {})
            replicas = spec.get("replicas", 1) if isinstance(spec, dict) else 1
            image = spec.get("image", "")[:40] if isinstance(spec, dict) else ""
            print(f" {name:<25} {ep_state:<12} {regions:<25} {replicas:>8} {image}")

        print()

    except Exception as e:
        formatter.print_error(f"Failed to list endpoints: {e}")
        sys.exit(1)


@inference.command("status")
@click.argument("endpoint_name")
@pass_config
def inference_status(config: Any, endpoint_name: Any) -> None:
    """Show detailed status of an inference endpoint.

    Examples:
        gco inference status my-llm
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)

        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        if config.output_format != "table":
            formatter.print(endpoint)
            return

        spec = endpoint.get("spec", {})
        print(f"\n Endpoint: {endpoint_name}")
        print(" " + "-" * 60)
        print(f" State: {endpoint.get('desired_state', 'unknown')}")
        print(f" Image: {spec.get('image', 'N/A')}")
        print(f" Replicas: {spec.get('replicas', 1)}")
        print(f" GPUs: {spec.get('gpu_count', 0)}")
        print(f" Port: {spec.get('port', 8000)}")
        print(f" Path: {endpoint.get('ingress_path', 'N/A')}")
        print(f" Namespace: {endpoint.get('namespace', 'N/A')}")
        print(f" Created: {endpoint.get('created_at', 'N/A')}")

        # Region status
        region_status = endpoint.get("region_status", {})
        if region_status:
            print("\n Region Status:")
            print(f" {'REGION':<18} {'STATE':<12} {'READY':>5} {'DESIRED':>7} {'LAST SYNC'}")
            print(" " + "-" * 65)
            for r, status in region_status.items():
                if isinstance(status, dict):
                    r_state = status.get("state", "unknown")
                    ready = status.get("replicas_ready", 0)
                    desired = status.get("replicas_desired", 0)
                    last_sync = status.get("last_sync", "N/A")
                    if last_sync and len(last_sync) > 19:
                        last_sync = last_sync[:19]
                    print(f" {r:<18} {r_state:<12} {ready:>5} {desired:>7} {last_sync}")
        else:
            target_regions = endpoint.get("target_regions", [])
            print(f"\n Target regions: {', '.join(target_regions)}")
            print(" (Waiting for inference_monitor to sync)")

        print()

    except Exception as e:
        formatter.print_error(f"Failed to get endpoint status: {e}")
        sys.exit(1)


@inference.command("scale")
@click.argument("endpoint_name")
@click.option("--replicas", "-r", required=True, type=int, help="New replica count")
@pass_config
def inference_scale(config: Any, endpoint_name: Any, replicas: Any) -> None:
    """Scale an inference endpoint.

    Examples:
        gco inference scale my-llm --replicas 4
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        result = manager.scale(endpoint_name, replicas)

        if result:
            formatter.print_success(f"Endpoint '{endpoint_name}' scaled to {replicas} replicas")
        else:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

    except Exception as e:
        formatter.print_error(f"Failed to scale endpoint: {e}")
        sys.exit(1)


@inference.command("stop")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_stop(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Stop an inference endpoint (scale to zero, keep config).

    Examples:
        gco inference stop my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    if not yes:
        click.confirm(f"Stop endpoint '{endpoint_name}'?", abort=True)

    try:
        manager = get_inference_manager(config)
        result = manager.stop(endpoint_name)

        if result:
            formatter.print_success(f"Endpoint '{endpoint_name}' marked for stop")
        else:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

    except Exception as e:
        formatter.print_error(f"Failed to stop endpoint: {e}")
        sys.exit(1)


@inference.command("start")
@click.argument("endpoint_name")
@pass_config
def inference_start(config: Any, endpoint_name: Any) -> None:
    """Start a stopped inference endpoint.

    Examples:
        gco inference start my-llm
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        result = manager.start(endpoint_name)

        if result:
            formatter.print_success(f"Endpoint '{endpoint_name}' marked for start")
        else:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

    except Exception as e:
        formatter.print_error(f"Failed to start endpoint: {e}")
        sys.exit(1)


@inference.command("delete")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_delete(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Delete an inference endpoint from all regions.

    The inference_monitor in each region will clean up the K8s resources.

    Examples:
        gco inference delete my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    if not yes:
        click.confirm(f"Delete endpoint '{endpoint_name}' from all regions?", abort=True)

    try:
        manager = get_inference_manager(config)
        result = manager.delete(endpoint_name)

        if result:
            formatter.print_success(
                f"Endpoint '{endpoint_name}' marked for deletion. "
                "The inference_monitor will clean up resources in each region."
            )
        else:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

    except Exception as e:
        formatter.print_error(f"Failed to delete endpoint: {e}")
        sys.exit(1)


@inference.command("update-image")
@click.argument("endpoint_name")
@click.option("--image", "-i", required=True, help="New container image")
@pass_config
def inference_update_image(config: Any, endpoint_name: Any, image: Any) -> None:
    """Update the container image for an inference endpoint.

    Triggers a rolling update across all target regions.

    Examples:
        gco inference update-image my-llm -i vllm/vllm-openai:v0.9.0
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        result = manager.update_image(endpoint_name, image)

        if result:
            formatter.print_success(f"Endpoint '{endpoint_name}' image updated to {image}")
            formatter.print_info("Rolling update will be applied by inference_monitor")
        else:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

    except Exception as e:
        formatter.print_error(f"Failed to update image: {e}")
        sys.exit(1)


@inference.command("invoke")
@click.argument("endpoint_name")
@click.option("--prompt", "-p", help="Text prompt to send")
@click.option("--data", "-d", help="Raw JSON body to send")
@click.option(
    "--path", "api_path", default=None, help="API sub-path (default: auto-detect from framework)"
)
@click.option("--region", "-r", help="Target region for the request")
@click.option(
    "--max-tokens", type=int, default=100, help="Maximum tokens to generate (default: 100)"
)
@click.option("--stream/--no-stream", default=False, help="Stream the response")
@pass_config
def inference_invoke(
    config: Any,
    endpoint_name: Any,
    prompt: Any,
    data: Any,
    api_path: Any,
    region: Any,
    max_tokens: Any,
    stream: Any,
) -> None:
    """Send a request to an inference endpoint and print the response.

    Automatically discovers the endpoint's ingress path and routes the
    request through the API Gateway with SigV4 authentication.

    Examples:
        gco inference invoke my-llm -p "What is GPU orchestration?"

        gco inference invoke my-llm -d '{"prompt": "Hello", "max_tokens": 50}'

        gco inference invoke my-llm -p "Explain K8s" --path /v1/completions
    """
    import json as _json

    from ..aws_client import get_aws_client
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    if not prompt and not data:
        formatter.print_error("Provide --prompt (-p) or --data (-d)")
        sys.exit(1)

    try:
        # Look up the endpoint to get its ingress path and spec
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)
        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}")
        spec = endpoint.get("spec", {})
        image = spec.get("image", "") if isinstance(spec, dict) else ""

        # Auto-detect the API sub-path based on the container image
        if api_path is None:
            if "vllm" in image:
                api_path = "/v1/completions"
            elif "text-generation-inference" in image or "tgi" in image:
                api_path = "/generate"
            elif "tritonserver" in image or "triton" in image:
                api_path = "/v2/models"
            else:
                api_path = "/v1/completions"

        full_path = f"{ingress_path}{api_path}"
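        # e.g. /inference/my-llm/v1/completions for a vLLM image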

        # Build the request body
        if data:
            body_str = data
        elif prompt:
            # Build a sensible default body based on framework
            if "generate" in api_path:
                # TGI format
                body_dict = {"inputs": prompt, "parameters": {"max_new_tokens": max_tokens}}
            elif "/v2/" in api_path:
                # Triton — just list models, prompt not used for this path
                body_dict = {}
            else:
                # OpenAI-compatible (vLLM, etc.)
                # Determine model name for OpenAI-compatible request
                model_name = endpoint_name
                if isinstance(spec, dict):
                    # Check env vars first
                    model_name = spec.get("env", {}).get("MODEL", model_name)
                    # Check container args for --model (vLLM, etc.)
                    args_list = spec.get("args") or []
                    for i, arg in enumerate(args_list):
                        if arg == "--model" and i + 1 < len(args_list):
                            model_name = args_list[i + 1]
                            break
                    # Default for vLLM with no explicit model — auto-detect
                    # by querying /v1/models on the running endpoint
                    if model_name == endpoint_name and "vllm" in image:
                        try:
                            detect_client = get_aws_client(config)
                            models_path = f"/inference/{endpoint_name}/v1/models"
                            models_resp = detect_client.make_authenticated_request(
                                method="GET",
                                path=models_path,
                                target_region=region,
                            )
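                            # Expect an OpenAI-style body: {"data": [{"id": ...}]};
                            # use the first listed model id when present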
                            if models_resp.ok:
                                models_data = models_resp.json().get("data", [])
                                if models_data:
                                    model_name = models_data[0]["id"]
                        except Exception:
                            pass  # Fall through to endpoint_name as model
                body_dict = {
                    "model": model_name,
                    "prompt": prompt,
                    "max_tokens": max_tokens,
                    "stream": stream,
                }
            body_str = _json.dumps(body_dict)
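            # e.g. '{"model": "my-llm", "prompt": "Hello", "max_tokens": 100, "stream": false}'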

        formatter.print_info(f"POST {full_path}")

        # Make the authenticated request
        client = get_aws_client(config)
        response = client.make_authenticated_request(
            method="POST" if body_str else "GET",
            path=full_path,
            body=_json.loads(body_str) if body_str else None,
            target_region=region,
        )

        # Print the response
        if response.ok:
            try:
                resp_json = response.json()
                # Extract the generated text for common formats
                text = None
                if "choices" in resp_json:
                    # OpenAI format
                    choices = resp_json["choices"]
                    if choices:
                        text = choices[0].get("text") or choices[0].get("message", {}).get(
                            "content"
                        )
                elif "generated_text" in resp_json:
                    # TGI format
                    text = resp_json["generated_text"]
                elif isinstance(resp_json, list) and resp_json and "generated_text" in resp_json[0]:
                    text = resp_json[0]["generated_text"]

                if text and config.output_format == "table":
                    print(f"\n{text.strip()}\n")
                else:
                    print(_json.dumps(resp_json, indent=2))
            except _json.JSONDecodeError:
                print(response.text)
        else:
            formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}")
            sys.exit(1)

    except Exception as e:
        formatter.print_error(f"Failed to invoke endpoint: {e}")
        sys.exit(1)


@inference.command("canary")
@click.argument("endpoint_name")
@click.option("--image", "-i", required=True, help="New container image for canary")
@click.option(
    "--weight",
    "-w",
    default=10,
    type=int,
    help="Percentage of traffic to canary (1-99, default: 10)",
)
@click.option(
    "--replicas", "-r", default=1, type=int, help="Number of canary replicas (default: 1)"
)
@pass_config
def inference_canary(
    config: Any, endpoint_name: Any, image: Any, weight: Any, replicas: Any
) -> None:
    """Start a canary deployment with a new image.

    Routes a percentage of traffic to the canary while the primary
    continues serving the rest. Use 'promote' to make the canary
    the new primary, or 'rollback' to remove it.

    Examples:
        gco inference canary my-llm -i vllm/vllm-openai:v0.9.0 --weight 10
        gco inference canary my-llm -i new-image:latest -w 25 -r 2
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        result = manager.canary_deploy(endpoint_name, image, weight=weight, replicas=replicas)

        if not result:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(
            f"Canary started: {weight}% traffic → {image} ({replicas} replica(s))"
        )
        formatter.print_info(f"Monitor with: gco inference status {endpoint_name}")
        formatter.print_info(f"Promote with: gco inference promote {endpoint_name}")
        formatter.print_info(f"Rollback with: gco inference rollback {endpoint_name}")

    except ValueError as e:
        formatter.print_error(str(e))
        sys.exit(1)
    except Exception as e:
        formatter.print_error(f"Failed to start canary: {e}")
        sys.exit(1)


@inference.command("promote")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_promote(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Promote the canary to primary.

    Replaces the primary image with the canary image and removes
    the canary deployment. All traffic goes to the new image.

    Examples:
        gco inference promote my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)

        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        canary = endpoint.get("spec", {}).get("canary")
        if not canary:
            formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary")
            sys.exit(1)

        if not yes:
            current_image = endpoint.get("spec", {}).get("image", "unknown")
            click.echo(f" Current primary: {current_image}")
            click.echo(f" Canary image: {canary.get('image', 'unknown')}")
            click.echo(f" Canary weight: {canary.get('weight', 0)}%")
            if not click.confirm(" Promote canary to primary?"):
                formatter.print_info("Cancelled")
                return

        result = manager.promote_canary(endpoint_name)
        if result:
            new_image = result.get("spec", {}).get("image", "unknown")
            formatter.print_success(f"Promoted: all traffic now serving {new_image}")
        else:
            formatter.print_error("Promotion failed")
            sys.exit(1)

    except ValueError as e:
        formatter.print_error(str(e))
        sys.exit(1)
    except Exception as e:
        formatter.print_error(f"Failed to promote canary: {e}")
        sys.exit(1)


@inference.command("rollback")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_rollback(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Remove the canary deployment, keeping the primary unchanged.

    All traffic returns to the primary deployment.

    Examples:
        gco inference rollback my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)

        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        canary = endpoint.get("spec", {}).get("canary")
        if not canary:
            formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary")
            sys.exit(1)

        if not yes:
            click.echo(f" Canary image: {canary.get('image', 'unknown')}")
            click.echo(f" Canary weight: {canary.get('weight', 0)}%")
            if not click.confirm(" Remove canary and restore full traffic to primary?"):
                formatter.print_info("Cancelled")
                return

        result = manager.rollback_canary(endpoint_name)
        if result:
            primary_image = result.get("spec", {}).get("image", "unknown")
            formatter.print_success(f"Rolled back: all traffic now serving {primary_image}")
        else:
            formatter.print_error("Rollback failed")
            sys.exit(1)

    except ValueError as e:
        formatter.print_error(str(e))
        sys.exit(1)
    except Exception as e:
        formatter.print_error(f"Failed to rollback canary: {e}")
        sys.exit(1)


@inference.command("health")
@click.argument("endpoint_name")
@click.option("--region", "-r", help="Target region to check")
@pass_config
def inference_health(config: Any, endpoint_name: Any, region: Any) -> None:
    """Check if an inference endpoint is healthy and ready to serve.

    Hits the endpoint's health check path and reports status and latency.

    Examples:
        gco inference health my-llm

        gco inference health my-llm -r us-east-1
    """
    import json as _json
    import time as _time

    from ..aws_client import get_aws_client
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)
        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}")
        spec = endpoint.get("spec", {})
        health_path = spec.get("health_path", "/health") if isinstance(spec, dict) else "/health"
        full_path = f"{ingress_path}{health_path}"

        client = get_aws_client(config)
        start = _time.monotonic()
        response = client.make_authenticated_request(
            method="GET",
            path=full_path,
            target_region=region,
        )
        latency_ms = (_time.monotonic() - start) * 1000

        result = {
            "endpoint": endpoint_name,
            "status": "healthy" if response.ok else "unhealthy",
            "http_status": response.status_code,
            "latency_ms": round(latency_ms, 1),
            "path": full_path,
        }
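        # e.g. {"endpoint": "my-llm", "status": "healthy", "http_status": 200,
        #       "latency_ms": 42.3, "path": "/inference/my-llm/health"}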

        try:
            result["body"] = response.json()
        except Exception:
            result["body"] = response.text[:200] if response.text else None

        if config.output_format == "json":
            print(_json.dumps(result, indent=2))
        else:
            status_icon = "✓" if response.ok else "✗"
            formatter.print_info(
                f"{status_icon} {endpoint_name}: {result['status']} "
                f"(HTTP {response.status_code}, {result['latency_ms']}ms)"
            )

    except Exception as e:
        formatter.print_error(f"Health check failed: {e}")
        sys.exit(1)


@inference.command("models")
@click.argument("endpoint_name")
@click.option("--region", "-r", help="Target region to query")
@pass_config
def inference_models(config: Any, endpoint_name: Any, region: Any) -> None:
    """List models loaded on an inference endpoint.

    Queries the /v1/models path (OpenAI-compatible) to discover loaded models.

    Examples:
        gco inference models my-llm
    """
    import json as _json

    from ..aws_client import get_aws_client
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)
        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}")
        full_path = f"{ingress_path}/v1/models"

        client = get_aws_client(config)
        response = client.make_authenticated_request(
            method="GET",
            path=full_path,
            target_region=region,
        )

        if response.ok:
            try:
                resp_json = response.json()
                print(_json.dumps(resp_json, indent=2))
            except _json.JSONDecodeError:
                print(response.text)
        else:
            formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}")
            sys.exit(1)

    except Exception as e:
        formatter.print_error(f"Failed to list models: {e}")
        sys.exit(1)