Coverage for cli/commands/inference_cmd.py: 88%

502 statements  

coverage.py v7.13.5, created at 2026-04-30 21:47 +0000

1"""Inference endpoint commands.""" 

2 

3import sys 

4from typing import Any 

5 

6import click 

7 

8from ..config import GCOConfig 

9from ..output import get_output_formatter 

10 

11pass_config = click.make_pass_decorator(GCOConfig, ensure=True) 

12 

13 

14@click.group() 

15@pass_config 

16def inference(config: Any) -> None: 

17 """Manage multi-region inference endpoints.""" 

18 pass 

19 

20 

21@inference.command("deploy") 

22@click.argument("endpoint_name") 

23@click.option("--image", "-i", required=True, help="Container image (e.g. vllm/vllm-openai:v0.8.0)") 

24@click.option( 

25 "--region", 

26 "-r", 

27 multiple=True, 

28 help="Target region(s). Repeatable. Default: all deployed regions", 

29) 

30@click.option("--replicas", default=1, help="Replicas per region (default: 1)") 

31@click.option("--gpu-count", default=1, help="GPUs per replica (default: 1)") 

32@click.option("--gpu-type", help="GPU instance type hint (e.g. g5.xlarge)") 

33@click.option("--port", default=8000, help="Container port (default: 8000)") 

34@click.option("--model-path", help="EFS path for model weights") 

35@click.option( 

36 "--model-source", 

37 help="S3 URI for model weights (e.g. s3://bucket/models/llama3). " 

38 "Auto-synced to each region via init container.", 

39) 

40@click.option("--health-path", default="/health", help="Health check path (default: /health)") 

41@click.option("--env", "-e", multiple=True, help="Environment variable (KEY=VALUE). Repeatable") 

42@click.option("--namespace", "-n", default="gco-inference", help="Kubernetes namespace") 

43@click.option("--label", "-l", multiple=True, help="Label (key=value). Repeatable") 

44@click.option("--min-replicas", type=int, default=None, help="Autoscaling: minimum replicas") 

45@click.option("--max-replicas", type=int, default=None, help="Autoscaling: maximum replicas") 

46@click.option( 

47 "--autoscale-metric", 

48 multiple=True, 

49 help="Autoscaling metric (cpu:70, memory:80, gpu:60). Repeatable. Enables autoscaling.", 

50) 

51@click.option( 

52 "--capacity-type", 

53 type=click.Choice(["on-demand", "spot"]), 

54 default=None, 

55 help="Node capacity type. 'spot' uses cheaper preemptible instances.", 

56) 

57@click.option( 

58 "--extra-args", 

59 multiple=True, 

60 help="Extra arguments passed to the container (e.g. '--kv-transfer-config {...}'). Repeatable.", 

61) 

62@click.option( 

63 "--accelerator", 

64 type=click.Choice(["nvidia", "neuron"]), 

65 default="nvidia", 

66 help="Accelerator type: 'nvidia' for GPU instances (default), 'neuron' for Trainium/Inferentia.", 

67) 

68@click.option( 

69 "--node-selector", 

70 multiple=True, 

71 help="Node selector (key=value). Repeatable. E.g. --node-selector eks.amazonaws.com/instance-family=inf2", 

72) 

73@pass_config 

74def inference_deploy( 

75 config: Any, 

76 endpoint_name: Any, 

77 image: Any, 

78 region: Any, 

79 replicas: Any, 

80 gpu_count: Any, 

81 gpu_type: Any, 

82 port: Any, 

83 model_path: Any, 

84 model_source: Any, 

85 health_path: Any, 

86 env: Any, 

87 namespace: Any, 

88 label: Any, 

89 min_replicas: Any, 

90 max_replicas: Any, 

91 autoscale_metric: Any, 

92 capacity_type: Any, 

93 extra_args: Any, 

94 accelerator: Any, 

95 node_selector: Any, 

96) -> None: 

97 """Deploy an inference endpoint to one or more regions. 

98 

99 The endpoint is registered in DynamoDB and the inference_monitor 

100 in each target region creates the Kubernetes resources automatically. 

101 

102 Examples: 

103 gco inference deploy my-llm -i vllm/vllm-openai:v0.8.0 

104 

105 gco inference deploy llama3-70b \\ 

106 -i vllm/vllm-openai:v0.8.0 \\ 

107 -r us-east-1 -r eu-west-1 \\ 

108 --replicas 2 --gpu-count 4 \\ 

109 --model-path /mnt/gco/models/llama3-70b \\ 

110 -e MODEL_NAME=meta-llama/Llama-3-70B 

111 """ 

112 from ..inference import get_inference_manager 

113 

114 formatter = get_output_formatter(config) 

115 

116 # Parse env vars and labels 

117 env_dict = {} 

118 for e_var in env: 

119 if "=" in e_var:  119 ↛ 118  (line 119 didn't jump to line 118 because the condition on line 119 was always true)

120 k, v = e_var.split("=", 1) 

121 env_dict[k] = v 

122 

123 labels_dict = {} 

124 for lbl in label: 

125 if "=" in lbl:  125 ↛ 124  (line 125 didn't jump to line 124 because the condition on line 125 was always true)

126 k, v = lbl.split("=", 1) 

127 labels_dict[k] = v 

128 

129 node_selector_dict = {} 

130 for ns in node_selector:  130 ↛ 131  (line 130 didn't jump to line 131 because the loop on line 130 never started)

131 if "=" in ns: 

132 k, v = ns.split("=", 1) 

133 node_selector_dict[k] = v 

134 

135 # Build autoscaling config 

136 autoscaling_config = None 

137 if autoscale_metric: 

138 metrics = [] 

139 for m in autoscale_metric: 

140 if ":" in m: 

141 mtype, mtarget = m.split(":", 1) 

142 metrics.append({"type": mtype, "target": int(mtarget)}) 

143 else: 

144 metrics.append({"type": m, "target": 70}) 

145 autoscaling_config = { 

146 "enabled": True, 

147 "min_replicas": min_replicas or 1, 

148 "max_replicas": max_replicas or 10, 

149 "metrics": metrics, 

150 } 

151 

152 try: 

153 manager = get_inference_manager(config) 

154 result = manager.deploy( 

155 endpoint_name=endpoint_name, 

156 image=image, 

157 target_regions=list(region) if region else None, 

158 replicas=replicas, 

159 gpu_count=gpu_count, 

160 gpu_type=gpu_type, 

161 port=port, 

162 model_path=model_path, 

163 model_source=model_source, 

164 health_check_path=health_path, 

165 env=env_dict if env_dict else None, 

166 namespace=namespace, 

167 labels=labels_dict if labels_dict else None, 

168 autoscaling=autoscaling_config, 

169 capacity_type=capacity_type, 

170 extra_args=list(extra_args) if extra_args else None, 

171 accelerator=accelerator, 

172 node_selector=node_selector_dict if node_selector_dict else None, 

173 ) 

174 

175 formatter.print_success(f"Endpoint '{endpoint_name}' registered for deployment") 

176 regions_str = ", ".join(result.get("target_regions", [])) 

177 formatter.print_info(f"Target regions: {regions_str}") 

178 formatter.print_info(f"Ingress path: {result.get('ingress_path', '')}") 

179 formatter.print_info( 

180 "The inference_monitor in each region will create the resources. " 

181 "Use 'gco inference status' to track progress." 

182 ) 

183 

184 # Warn if deploying to a subset of regions 

185 if region: 

186 from ..aws_client import get_aws_client as _get_client 

187 

188 all_stacks = _get_client(config).discover_regional_stacks() 

189 all_regions = set(all_stacks.keys()) 

190 target_set = set(result.get("target_regions", [])) 

191 missing = all_regions - target_set 

192 if missing: 

193 formatter.print_warning( 

194 f"Endpoint is NOT deployed to: {', '.join(sorted(missing))}. " 

195 "Global Accelerator may route users to those regions where " 

196 "the endpoint won't exist. Consider deploying to all regions " 

197 "(omit -r) for consistent global routing." 

198 ) 

199 

200 if config.output_format != "table":  200 ↛ 201  (line 200 didn't jump to line 201 because the condition on line 200 was never true)

201 formatter.print(result) 

202 

203 except ValueError as e: 

204 formatter.print_error(str(e)) 

205 sys.exit(1) 

206 except Exception as e: 

207 formatter.print_error(f"Failed to deploy endpoint: {e}") 

208 sys.exit(1) 

209 

210 

211@inference.command("list") 

212@click.option("--state", "-s", help="Filter by state (deploying, running, stopped, deleted)") 

213@click.option("--region", "-r", help="Filter by target region") 

214@pass_config 

215def inference_list(config: Any, state: Any, region: Any) -> None: 

216 """List inference endpoints. 

217 

218 Examples: 

219 gco inference list 

220 gco inference list --state running 

221 gco inference list -r us-east-1 

222 """ 

223 from ..inference import get_inference_manager 

224 

225 formatter = get_output_formatter(config) 

226 

227 try: 

228 manager = get_inference_manager(config) 

229 endpoints = manager.list_endpoints(desired_state=state, region=region) 

230 

231 if config.output_format != "table":  231 ↛ 232  (line 231 didn't jump to line 232 because the condition on line 231 was never true)

232 formatter.print(endpoints) 

233 return 

234 

235 if not endpoints: 

236 formatter.print_info("No inference endpoints found") 

237 return 

238 

239 print(f"\n Inference Endpoints ({len(endpoints)} found)") 

240 print(" " + "-" * 85) 

241 print(f" {'NAME':<25} {'STATE':<12} {'REGIONS':<25} {'REPLICAS':>8} {'IMAGE'}") 

242 print(" " + "-" * 85) 

243 for ep in endpoints: 

244 name = ep.get("endpoint_name", "")[:24] 

245 ep_state = ep.get("desired_state", "unknown") 

246 regions = ", ".join(ep.get("target_regions", []))[:24] 

247 spec = ep.get("spec", {}) 

248 replicas = spec.get("replicas", 1) if isinstance(spec, dict) else 1 

249 image = spec.get("image", "")[:40] if isinstance(spec, dict) else "" 

250 print(f" {name:<25} {ep_state:<12} {regions:<25} {replicas:>8} {image}") 

251 

252 print() 

253 

254 except Exception as e: 

255 formatter.print_error(f"Failed to list endpoints: {e}") 

256 sys.exit(1) 

257 

258 

259@inference.command("status") 

260@click.argument("endpoint_name") 

261@pass_config 

262def inference_status(config: Any, endpoint_name: Any) -> None: 

263 """Show detailed status of an inference endpoint. 

264 

265 Examples: 

266 gco inference status my-llm 

267 """ 

268 from ..inference import get_inference_manager 

269 

270 formatter = get_output_formatter(config) 

271 

272 try: 

273 manager = get_inference_manager(config) 

274 endpoint = manager.get_endpoint(endpoint_name) 

275 

276 if not endpoint: 

277 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

278 sys.exit(1) 

279 

280 if config.output_format != "table":  280 ↛ 281  (line 280 didn't jump to line 281 because the condition on line 280 was never true)

281 formatter.print(endpoint) 

282 return 

283 

284 spec = endpoint.get("spec", {}) 

285 print(f"\n Endpoint: {endpoint_name}") 

286 print(" " + "-" * 60) 

287 print(f" State: {endpoint.get('desired_state', 'unknown')}") 

288 print(f" Image: {spec.get('image', 'N/A')}") 

289 print(f" Replicas: {spec.get('replicas', 1)}") 

290 print(f" GPUs: {spec.get('gpu_count', 0)}") 

291 print(f" Port: {spec.get('port', 8000)}") 

292 print(f" Path: {endpoint.get('ingress_path', 'N/A')}") 

293 print(f" Namespace: {endpoint.get('namespace', 'N/A')}") 

294 print(f" Created: {endpoint.get('created_at', 'N/A')}") 

295 

296 # Region status 

297 region_status = endpoint.get("region_status", {}) 

298 if region_status: 

299 print("\n Region Status:") 

300 print(f" {'REGION':<18} {'STATE':<12} {'READY':>5} {'DESIRED':>7} {'LAST SYNC'}") 

301 print(" " + "-" * 65) 

302 for r, status in region_status.items(): 

303 if isinstance(status, dict):  303 ↛ 302  (line 303 didn't jump to line 302 because the condition on line 303 was always true)

304 r_state = status.get("state", "unknown") 

305 ready = status.get("replicas_ready", 0) 

306 desired = status.get("replicas_desired", 0) 

307 last_sync = status.get("last_sync", "N/A") 

308 if last_sync and len(last_sync) > 19:  308 ↛ 310  (line 308 didn't jump to line 310 because the condition on line 308 was always true)

309 last_sync = last_sync[:19] 

310 print(f" {r:<18} {r_state:<12} {ready:>5} {desired:>7} {last_sync}") 

311 else: 

312 target_regions = endpoint.get("target_regions", []) 

313 print(f"\n Target regions: {', '.join(target_regions)}") 

314 print(" (Waiting for inference_monitor to sync)") 

315 

316 print() 

317 

318 except Exception as e: 

319 formatter.print_error(f"Failed to get endpoint status: {e}") 

320 sys.exit(1) 

321 

322 

323@inference.command("scale") 

324@click.argument("endpoint_name") 

325@click.option("--replicas", "-r", required=True, type=int, help="New replica count") 

326@pass_config 

327def inference_scale(config: Any, endpoint_name: Any, replicas: Any) -> None: 

328 """Scale an inference endpoint. 

329 

330 Examples: 

331 gco inference scale my-llm --replicas 4 

332 """ 

333 from ..inference import get_inference_manager 

334 

335 formatter = get_output_formatter(config) 

336 

337 try: 

338 manager = get_inference_manager(config) 

339 result = manager.scale(endpoint_name, replicas) 

340 

341 if result: 

342 formatter.print_success(f"Endpoint '{endpoint_name}' scaled to {replicas} replicas") 

343 else: 

344 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

345 sys.exit(1) 

346 

347 except Exception as e: 

348 formatter.print_error(f"Failed to scale endpoint: {e}") 

349 sys.exit(1) 

350 

351 

352@inference.command("stop") 

353@click.argument("endpoint_name") 

354@click.option("--yes", "-y", is_flag=True, help="Skip confirmation") 

355@pass_config 

356def inference_stop(config: Any, endpoint_name: Any, yes: Any) -> None: 

357 """Stop an inference endpoint (scale to zero, keep config). 

358 

359 Examples: 

360 gco inference stop my-llm -y 

361 """ 

362 from ..inference import get_inference_manager 

363 

364 formatter = get_output_formatter(config) 

365 

366 if not yes:  366 ↛ 367  (line 366 didn't jump to line 367 because the condition on line 366 was never true)

367 click.confirm(f"Stop endpoint '{endpoint_name}'?", abort=True) 

368 

369 try: 

370 manager = get_inference_manager(config) 

371 result = manager.stop(endpoint_name) 

372 

373 if result: 

374 formatter.print_success(f"Endpoint '{endpoint_name}' marked for stop") 

375 else: 

376 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

377 sys.exit(1) 

378 

379 except Exception as e: 

380 formatter.print_error(f"Failed to stop endpoint: {e}") 

381 sys.exit(1) 

382 

383 

384@inference.command("start") 

385@click.argument("endpoint_name") 

386@pass_config 

387def inference_start(config: Any, endpoint_name: Any) -> None: 

388 """Start a stopped inference endpoint. 

389 

390 Examples: 

391 gco inference start my-llm 

392 """ 

393 from ..inference import get_inference_manager 

394 

395 formatter = get_output_formatter(config) 

396 

397 try: 

398 manager = get_inference_manager(config) 

399 result = manager.start(endpoint_name) 

400 

401 if result: 

402 formatter.print_success(f"Endpoint '{endpoint_name}' marked for start") 

403 else: 

404 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

405 sys.exit(1) 

406 

407 except Exception as e: 

408 formatter.print_error(f"Failed to start endpoint: {e}") 

409 sys.exit(1) 

410 

411 

412@inference.command("delete") 

413@click.argument("endpoint_name") 

414@click.option("--yes", "-y", is_flag=True, help="Skip confirmation") 

415@pass_config 

416def inference_delete(config: Any, endpoint_name: Any, yes: Any) -> None: 

417 """Delete an inference endpoint from all regions. 

418 

419 The inference_monitor in each region will clean up the K8s resources. 

420 

421 Examples: 

422 gco inference delete my-llm -y 

423 """ 

424 from ..inference import get_inference_manager 

425 

426 formatter = get_output_formatter(config) 

427 

428 if not yes:  428 ↛ 429  (line 428 didn't jump to line 429 because the condition on line 428 was never true)

429 click.confirm(f"Delete endpoint '{endpoint_name}' from all regions?", abort=True) 

430 

431 try: 

432 manager = get_inference_manager(config) 

433 result = manager.delete(endpoint_name) 

434 

435 if result: 

436 formatter.print_success( 

437 f"Endpoint '{endpoint_name}' marked for deletion. " 

438 "The inference_monitor will clean up resources in each region." 

439 ) 

440 else: 

441 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

442 sys.exit(1) 

443 

444 except Exception as e: 

445 formatter.print_error(f"Failed to delete endpoint: {e}") 

446 sys.exit(1) 

447 

448 

449@inference.command("update-image") 

450@click.argument("endpoint_name") 

451@click.option("--image", "-i", required=True, help="New container image") 

452@pass_config 

453def inference_update_image(config: Any, endpoint_name: Any, image: Any) -> None: 

454 """Update the container image for an inference endpoint. 

455 

456 Triggers a rolling update across all target regions. 

457 

458 Examples: 

459 gco inference update-image my-llm -i vllm/vllm-openai:v0.9.0 

460 """ 

461 from ..inference import get_inference_manager 

462 

463 formatter = get_output_formatter(config) 

464 

465 try: 

466 manager = get_inference_manager(config) 

467 result = manager.update_image(endpoint_name, image) 

468 

469 if result: 

470 formatter.print_success(f"Endpoint '{endpoint_name}' image updated to {image}") 

471 formatter.print_info("Rolling update will be applied by inference_monitor") 

472 else: 

473 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

474 sys.exit(1) 

475 

476 except Exception as e: 

477 formatter.print_error(f"Failed to update image: {e}") 

478 sys.exit(1) 

479 

480 

481@inference.command("invoke") 

482@click.argument("endpoint_name") 

483@click.option("--prompt", "-p", help="Text prompt to send") 

484@click.option("--data", "-d", help="Raw JSON body to send") 

485@click.option( 

486 "--path", "api_path", default=None, help="API sub-path (default: auto-detect from framework)" 

487) 

488@click.option("--region", "-r", help="Target region for the request") 

489@click.option( 

490 "--max-tokens", type=int, default=100, help="Maximum tokens to generate (default: 100)" 

491) 

492@click.option("--stream/--no-stream", default=False, help="Stream the response") 

493@pass_config 

494def inference_invoke( 

495 config: Any, 

496 endpoint_name: Any, 

497 prompt: Any, 

498 data: Any, 

499 api_path: Any, 

500 region: Any, 

501 max_tokens: Any, 

502 stream: Any, 

503) -> None: 

504 """Send a request to an inference endpoint and print the response. 

505 

506 Automatically discovers the endpoint's ingress path and routes the 

507 request through the API Gateway with SigV4 authentication. 

508 

509 Examples: 

510 gco inference invoke my-llm -p "What is GPU orchestration?" 

511 

512 gco inference invoke my-llm -d '{"prompt": "Hello", "max_tokens": 50}' 

513 

514 gco inference invoke my-llm -p "Explain K8s" --path /v1/completions 

515 """ 

516 import json as _json 

517 

518 from ..aws_client import get_aws_client 

519 from ..inference import get_inference_manager 

520 

521 formatter = get_output_formatter(config) 

522 

523 if not prompt and not data: 

524 formatter.print_error("Provide --prompt (-p) or --data (-d)") 

525 sys.exit(1) 

526 

527 try: 

528 # Look up the endpoint to get its ingress path and spec 

529 manager = get_inference_manager(config) 

530 endpoint = manager.get_endpoint(endpoint_name) 

531 if not endpoint: 

532 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

533 sys.exit(1) 

534 

535 ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}") 

536 spec = endpoint.get("spec", {}) 

537 image = spec.get("image", "") if isinstance(spec, dict) else "" 

538 

539 # Auto-detect the API sub-path based on the container image 

540 if api_path is None: 

541 if "vllm" in image: 

542 api_path = "/v1/completions" 

543 elif "text-generation-inference" in image or "tgi" in image: 

544 api_path = "/generate" 

545 elif "tritonserver" in image or "triton" in image:  545 ↛ 548  (line 545 didn't jump to line 548 because the condition on line 545 was always true)

546 api_path = "/v2/models" 

547 else: 

548 api_path = "/v1/completions" 

549 

550 full_path = f"{ingress_path}{api_path}" 

551 

552 # Build the request body 

553 if data: 

554 body_str = data 

555 elif prompt:  555 ↛ 601  (line 555 didn't jump to line 601 because the condition on line 555 was always true)

556 # Build a sensible default body based on framework 

557 if "generate" in api_path: 

558 # TGI format 

559 body_dict = {"inputs": prompt, "parameters": {"max_new_tokens": max_tokens}} 

560 elif "/v2/" in api_path: 

561 # Triton — just list models, prompt not used for this path 

562 body_dict = {} 

563 else: 

564 # OpenAI-compatible (vLLM, etc.) 

565 # Determine model name for OpenAI-compatible request 

566 model_name = endpoint_name 

567 if isinstance(spec, dict):  567 ↛ 593  (line 567 didn't jump to line 593 because the condition on line 567 was always true)

568 # Check env vars first 

569 model_name = spec.get("env", {}).get("MODEL", model_name) 

570 # Check container args for --model (vLLM, etc.) 

571 args_list = spec.get("args") or [] 

572 for i, arg in enumerate(args_list):  572 ↛ 573  (line 572 didn't jump to line 573 because the loop on line 572 never started)

573 if arg == "--model" and i + 1 < len(args_list): 

574 model_name = args_list[i + 1] 

575 break 

576 # Default for vLLM with no explicit model — auto-detect 

577 # by querying /v1/models on the running endpoint 

578 if model_name == endpoint_name and "vllm" in image:  578 ↛ 593  (line 578 didn't jump to line 593 because the condition on line 578 was always true)

579 try: 

580 detect_client = get_aws_client(config) 

581 models_path = f"/inference/{endpoint_name}/v1/models" 

582 models_resp = detect_client.make_authenticated_request( 

583 method="GET", 

584 path=models_path, 

585 target_region=region, 

586 ) 

587 if models_resp.ok: 

588 models_data = models_resp.json().get("data", []) 

589 if models_data:  589 ↛ 590  (line 589 didn't jump to line 590 because the condition on line 589 was never true)

590 model_name = models_data[0]["id"] 

591 except Exception: 

592 pass # Fall through to endpoint_name as model 

593 body_dict = { 

594 "model": model_name, 

595 "prompt": prompt, 

596 "max_tokens": max_tokens, 

597 "stream": stream, 

598 } 

599 body_str = _json.dumps(body_dict) 

600 

601 formatter.print_info(f"POST {full_path}") 

602 

603 # Make the authenticated request 

604 client = get_aws_client(config) 

605 response = client.make_authenticated_request( 

606 method="POST" if body_str else "GET", 

607 path=full_path, 

608 body=_json.loads(body_str) if body_str else None, 

609 target_region=region, 

610 ) 

611 

612 # Print the response 

613 if response.ok: 

614 try: 

615 resp_json = response.json() 

616 # Extract the generated text for common formats 

617 text = None 

618 if "choices" in resp_json: 

619 # OpenAI format 

620 choices = resp_json["choices"] 

621 if choices:  621 ↛ 631  (line 621 didn't jump to line 631 because the condition on line 621 was always true)

622 text = choices[0].get("text") or choices[0].get("message", {}).get( 

623 "content" 

624 ) 

625 elif "generated_text" in resp_json:  625 ↛ 627  (line 625 didn't jump to line 627 because the condition on line 625 was never true)

626 # TGI format 

627 text = resp_json["generated_text"] 

628 elif isinstance(resp_json, list) and resp_json and "generated_text" in resp_json[0]: 

629 text = resp_json[0]["generated_text"] 

630 

631 if text and config.output_format == "table": 

632 print(f"\n{text.strip()}\n") 

633 else: 

634 print(_json.dumps(resp_json, indent=2)) 

635 except _json.JSONDecodeError: 

636 print(response.text) 

637 else: 

638 formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}") 

639 sys.exit(1) 

640 

641 except Exception as e: 

642 formatter.print_error(f"Failed to invoke endpoint: {e}") 

643 sys.exit(1) 

644 

645 

646@inference.command("canary") 

647@click.argument("endpoint_name") 

648@click.option("--image", "-i", required=True, help="New container image for canary") 

649@click.option( 

650 "--weight", 

651 "-w", 

652 default=10, 

653 type=int, 

654 help="Percentage of traffic to canary (1-99, default: 10)", 

655) 

656@click.option( 

657 "--replicas", "-r", default=1, type=int, help="Number of canary replicas (default: 1)" 

658) 

659@pass_config 

660def inference_canary( 

661 config: Any, endpoint_name: Any, image: Any, weight: Any, replicas: Any 

662) -> None: 

663 """Start a canary deployment with a new image. 

664 

665 Routes a percentage of traffic to the canary while the primary 

666 continues serving the rest. Use 'promote' to make the canary 

667 the new primary, or 'rollback' to remove it. 

668 

669 Examples: 

670 gco inference canary my-llm -i vllm/vllm-openai:v0.9.0 --weight 10 

671 gco inference canary my-llm -i new-image:latest -w 25 -r 2 

672 """ 

673 from ..inference import get_inference_manager 

674 

675 formatter = get_output_formatter(config) 

676 

677 try: 

678 manager = get_inference_manager(config) 

679 result = manager.canary_deploy(endpoint_name, image, weight=weight, replicas=replicas) 

680 

681 if not result: 

682 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

683 sys.exit(1) 

684 

685 formatter.print_success( 

686 f"Canary started: {weight}% traffic → {image} ({replicas} replica(s))" 

687 ) 

688 formatter.print_info(f"Monitor with: gco inference status {endpoint_name}") 

689 formatter.print_info(f"Promote with: gco inference promote {endpoint_name}") 

690 formatter.print_info(f"Rollback with: gco inference rollback {endpoint_name}") 

691 

692 except ValueError as e: 

693 formatter.print_error(str(e)) 

694 sys.exit(1) 

695 except Exception as e: 

696 formatter.print_error(f"Failed to start canary: {e}") 

697 sys.exit(1) 

698 

699 

700@inference.command("promote") 

701@click.argument("endpoint_name") 

702@click.option("--yes", "-y", is_flag=True, help="Skip confirmation") 

703@pass_config 

704def inference_promote(config: Any, endpoint_name: Any, yes: Any) -> None: 

705 """Promote the canary to primary. 

706 

707 Replaces the primary image with the canary image and removes 

708 the canary deployment. All traffic goes to the new image. 

709 

710 Examples: 

711 gco inference promote my-llm -y 

712 """ 

713 from ..inference import get_inference_manager 

714 

715 formatter = get_output_formatter(config) 

716 

717 try: 

718 manager = get_inference_manager(config) 

719 endpoint = manager.get_endpoint(endpoint_name) 

720 

721 if not endpoint: 

722 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

723 sys.exit(1) 

724 

725 canary = endpoint.get("spec", {}).get("canary") 

726 if not canary: 

727 formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary") 

728 sys.exit(1) 

729 

730 if not yes:  730 ↛ 731  (line 730 didn't jump to line 731 because the condition on line 730 was never true)

731 current_image = endpoint.get("spec", {}).get("image", "unknown") 

732 click.echo(f" Current primary: {current_image}") 

733 click.echo(f" Canary image: {canary.get('image', 'unknown')}") 

734 click.echo(f" Canary weight: {canary.get('weight', 0)}%") 

735 if not click.confirm(" Promote canary to primary?"): 

736 formatter.print_info("Cancelled") 

737 return 

738 

739 result = manager.promote_canary(endpoint_name) 

740 if result:  740 ↛ 744  (line 740 didn't jump to line 744 because the condition on line 740 was always true)

741 new_image = result.get("spec", {}).get("image", "unknown") 

742 formatter.print_success(f"Promoted: all traffic now serving {new_image}") 

743 else: 

744 formatter.print_error("Promotion failed") 

745 sys.exit(1) 

746 

747 except ValueError as e: 

748 formatter.print_error(str(e)) 

749 sys.exit(1) 

750 except Exception as e: 

751 formatter.print_error(f"Failed to promote canary: {e}") 

752 sys.exit(1) 

753 

754 

755@inference.command("rollback") 

756@click.argument("endpoint_name") 

757@click.option("--yes", "-y", is_flag=True, help="Skip confirmation") 

758@pass_config 

759def inference_rollback(config: Any, endpoint_name: Any, yes: Any) -> None: 

760 """Remove the canary deployment, keeping the primary unchanged. 

761 

762 All traffic returns to the primary deployment. 

763 

764 Examples: 

765 gco inference rollback my-llm -y 

766 """ 

767 from ..inference import get_inference_manager 

768 

769 formatter = get_output_formatter(config) 

770 

771 try: 

772 manager = get_inference_manager(config) 

773 endpoint = manager.get_endpoint(endpoint_name) 

774 

775 if not endpoint:  775 ↛ 776  (line 775 didn't jump to line 776 because the condition on line 775 was never true)

776 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

777 sys.exit(1) 

778 

779 canary = endpoint.get("spec", {}).get("canary") 

780 if not canary: 

781 formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary") 

782 sys.exit(1) 

783 

784 if not yes:  784 ↛ 785  (line 784 didn't jump to line 785 because the condition on line 784 was never true)

785 click.echo(f" Canary image: {canary.get('image', 'unknown')}") 

786 click.echo(f" Canary weight: {canary.get('weight', 0)}%") 

787 if not click.confirm(" Remove canary and restore full traffic to primary?"): 

788 formatter.print_info("Cancelled") 

789 return 

790 

791 result = manager.rollback_canary(endpoint_name) 

792 if result:  792 ↛ 796  (line 792 didn't jump to line 796 because the condition on line 792 was always true)

793 primary_image = result.get("spec", {}).get("image", "unknown") 

794 formatter.print_success(f"Rolled back: all traffic now serving {primary_image}") 

795 else: 

796 formatter.print_error("Rollback failed") 

797 sys.exit(1) 

798 

799 except ValueError as e: 

800 formatter.print_error(str(e)) 

801 sys.exit(1) 

802 except Exception as e: 

803 formatter.print_error(f"Failed to rollback canary: {e}") 

804 sys.exit(1) 

805 

806 

807@inference.command("health") 

808@click.argument("endpoint_name") 

809@click.option("--region", "-r", help="Target region to check") 

810@pass_config 

811def inference_health(config: Any, endpoint_name: Any, region: Any) -> None: 

812 """Check if an inference endpoint is healthy and ready to serve. 

813 

814 Hits the endpoint's health check path and reports status and latency. 

815 

816 Examples: 

817 gco inference health my-llm 

818 

819 gco inference health my-llm -r us-east-1 

820 """ 

821 import json as _json 

822 import time as _time 

823 

824 from ..aws_client import get_aws_client 

825 from ..inference import get_inference_manager 

826 

827 formatter = get_output_formatter(config) 

828 

829 try: 

830 manager = get_inference_manager(config) 

831 endpoint = manager.get_endpoint(endpoint_name) 

832 if not endpoint: 

833 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

834 sys.exit(1) 

835 

836 ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}") 

837 spec = endpoint.get("spec", {}) 

838 health_path = spec.get("health_path", "/health") if isinstance(spec, dict) else "/health" 

839 full_path = f"{ingress_path}{health_path}" 

840 

841 client = get_aws_client(config) 

842 start = _time.monotonic() 

843 response = client.make_authenticated_request( 

844 method="GET", 

845 path=full_path, 

846 target_region=region, 

847 ) 

848 latency_ms = (_time.monotonic() - start) * 1000 

849 

850 result = { 

851 "endpoint": endpoint_name, 

852 "status": "healthy" if response.ok else "unhealthy", 

853 "http_status": response.status_code, 

854 "latency_ms": round(latency_ms, 1), 

855 "path": full_path, 

856 } 

857 

858 try: 

859 result["body"] = response.json() 

860 except Exception: 

861 result["body"] = response.text[:200] if response.text else None 

862 

863 if config.output_format == "json":  863 ↛ 864  (line 863 didn't jump to line 864 because the condition on line 863 was never true)

864 print(_json.dumps(result, indent=2)) 

865 else: 

866 status_icon = "✓" if response.ok else "✗" 

867 formatter.print_info( 

868 f"{status_icon} {endpoint_name}: {result['status']} " 

869 f"(HTTP {response.status_code}, {result['latency_ms']}ms)" 

870 ) 

871 

872 except Exception as e: 

873 formatter.print_error(f"Health check failed: {e}") 

874 sys.exit(1) 

875 

876 

877@inference.command("models") 

878@click.argument("endpoint_name") 

879@click.option("--region", "-r", help="Target region to query") 

880@pass_config 

881def inference_models(config: Any, endpoint_name: Any, region: Any) -> None: 

882 """List models loaded on an inference endpoint. 

883 

884 Queries the /v1/models path (OpenAI-compatible) to discover loaded models. 

885 

886 Examples: 

887 gco inference models my-llm 

888 """ 

889 import json as _json 

890 

891 from ..aws_client import get_aws_client 

892 from ..inference import get_inference_manager 

893 

894 formatter = get_output_formatter(config) 

895 

896 try: 

897 manager = get_inference_manager(config) 

898 endpoint = manager.get_endpoint(endpoint_name) 

899 if not endpoint: 

900 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

901 sys.exit(1) 

902 

903 ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}") 

904 full_path = f"{ingress_path}/v1/models" 

905 

906 client = get_aws_client(config) 

907 response = client.make_authenticated_request( 

908 method="GET", 

909 path=full_path, 

910 target_region=region, 

911 ) 

912 

913 if response.ok: 

914 try: 

915 resp_json = response.json() 

916 print(_json.dumps(resp_json, indent=2)) 

917 except _json.JSONDecodeError: 

918 print(response.text) 

919 else: 

920 formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}") 

921 sys.exit(1) 

922 

923 except Exception as e: 

924 formatter.print_error(f"Failed to list models: {e}") 

925 sys.exit(1)