Coverage for cli/commands/inference_cmd.py: 88%

504 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1"""Inference endpoint commands.""" 

2 

3import sys 

4from typing import Any 

5 

6import click 

7 

8from ..config import GCOConfig 

9from ..output import get_output_formatter 

10 

# Decorator that hands the GCOConfig stored on the click context to a command;
# ensure=True asks click to create the object when the context has none yet.
pass_config = click.make_pass_decorator(GCOConfig, ensure=True)


@click.group()
@pass_config
def inference(config: Any) -> None:
    """Manage multi-region inference endpoints."""
    # Nothing to do here: the group only serves as the parent of the
    # subcommands registered below via @inference.command(...).

19 

20 

def _parse_key_value_options(values: Any, option_name: str, formatter: Any) -> "dict[str, str]":
    """Turn repeated ``KEY=VALUE`` option values into a dict.

    Each entry is split on its first ``=`` so the value part may itself
    contain ``=``. Entries without ``=`` are skipped; a warning is printed so
    the user notices that the value was ignored.
    """
    parsed: dict[str, str] = {}
    for item in values:
        key, sep, value = item.partition("=")
        if not sep:
            formatter.print_warning(
                f"Ignoring malformed {option_name} value '{item}' (expected KEY=VALUE)"
            )
            continue
        parsed[key] = value
    return parsed


def _build_autoscaling_config(autoscale_metric: Any, min_replicas: Any, max_replicas: Any) -> Any:
    """Build the autoscaling section from ``--autoscale-metric`` values.

    Returns ``None`` when no metric was given (autoscaling stays disabled).
    A metric is either ``type:target`` or a bare ``type``, which gets a
    target of 70.

    Raises:
        ValueError: if a ``type:target`` entry has a non-integer target.
    """
    if not autoscale_metric:
        return None

    metrics = []
    for entry in autoscale_metric:
        metric_type, sep, raw_target = entry.partition(":")
        if not sep:
            metrics.append({"type": entry, "target": 70})
            continue
        try:
            target = int(raw_target)
        except ValueError:
            raise ValueError(
                f"Invalid --autoscale-metric '{entry}': target must be an integer (e.g. cpu:70)"
            ) from None
        metrics.append({"type": metric_type, "target": target})

    return {
        "enabled": True,
        "min_replicas": min_replicas or 1,
        "max_replicas": max_replicas or 10,
        "metrics": metrics,
    }


@inference.command("deploy")
@click.argument("endpoint_name")
@click.option("--image", "-i", required=True, help="Container image (e.g. vllm/vllm-openai:v0.8.0)")
@click.option(
    "--region",
    "-r",
    multiple=True,
    help="Target region(s). Repeatable. Default: all deployed regions",
)
@click.option("--replicas", default=1, help="Replicas per region (default: 1)")
@click.option("--gpu-count", default=1, help="GPUs per replica (default: 1)")
@click.option("--gpu-type", help="GPU instance type hint (e.g. g5.xlarge)")
@click.option("--port", default=8000, help="Container port (default: 8000)")
@click.option("--model-path", help="EFS path for model weights")
@click.option(
    "--model-source",
    help="S3 URI for model weights (e.g. s3://bucket/models/llama3). "
    "Auto-synced to each region via init container.",
)
@click.option("--health-path", default="/health", help="Health check path (default: /health)")
@click.option("--env", "-e", multiple=True, help="Environment variable (KEY=VALUE). Repeatable")
@click.option("--namespace", "-n", default="gco-inference", help="Kubernetes namespace")
@click.option("--label", "-l", multiple=True, help="Label (key=value). Repeatable")
@click.option("--min-replicas", type=int, default=None, help="Autoscaling: minimum replicas")
@click.option("--max-replicas", type=int, default=None, help="Autoscaling: maximum replicas")
@click.option(
    "--autoscale-metric",
    multiple=True,
    help="Autoscaling metric (cpu:70, memory:80, gpu:60). Repeatable. Enables autoscaling.",
)
@click.option(
    "--capacity-type",
    type=click.Choice(["on-demand", "spot"]),
    default=None,
    help="Node capacity type. 'spot' uses cheaper preemptible instances.",
)
@click.option(
    "--extra-args",
    multiple=True,
    help="Extra arguments passed to the container (e.g. '--kv-transfer-config {...}'). Repeatable.",
)
@click.option(
    "--accelerator",
    type=click.Choice(["nvidia", "neuron"]),
    default="nvidia",
    help="Accelerator type: 'nvidia' for GPU instances (default), 'neuron' for Trainium/Inferentia.",
)
@click.option(
    "--node-selector",
    multiple=True,
    help="Node selector (key=value). Repeatable. E.g. --node-selector eks.amazonaws.com/instance-family=inf2",
)
@click.option(
    "--no-rewrite-image",
    is_flag=True,
    default=False,
    help="Skip the per-region ECR URI rewrite. The image URI is sent verbatim "
    "to every target region (operator owns cross-region pulls).",
)
@pass_config
def inference_deploy(
    config: Any,
    endpoint_name: Any,
    image: Any,
    region: Any,
    replicas: Any,
    gpu_count: Any,
    gpu_type: Any,
    port: Any,
    model_path: Any,
    model_source: Any,
    health_path: Any,
    env: Any,
    namespace: Any,
    label: Any,
    min_replicas: Any,
    max_replicas: Any,
    autoscale_metric: Any,
    capacity_type: Any,
    extra_args: Any,
    accelerator: Any,
    node_selector: Any,
    no_rewrite_image: Any,
) -> None:
    """Deploy an inference endpoint to one or more regions.

    The endpoint is registered in DynamoDB and the inference_monitor
    in each target region creates the Kubernetes resources automatically.

    Examples:
        gco inference deploy my-llm -i vllm/vllm-openai:v0.8.0

        gco inference deploy llama3-70b \\
            -i vllm/vllm-openai:v0.8.0 \\
            -r us-east-1 -r eu-west-1 \\
            --replicas 2 --gpu-count 4 \\
            --model-path /mnt/gco/models/llama3-70b \\
            -e MODEL_NAME=meta-llama/Llama-3-70B
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    # Repeated KEY=VALUE options; malformed entries are skipped with a warning.
    env_dict = _parse_key_value_options(env, "--env", formatter)
    labels_dict = _parse_key_value_options(label, "--label", formatter)
    node_selector_dict = _parse_key_value_options(node_selector, "--node-selector", formatter)

    try:
        # Built inside the try so a bad metric target is reported through the
        # ValueError handler below instead of surfacing as a traceback.
        autoscaling_config = _build_autoscaling_config(
            autoscale_metric, min_replicas, max_replicas
        )

        manager = get_inference_manager(config)
        result = manager.deploy(
            endpoint_name=endpoint_name,
            image=image,
            target_regions=list(region) if region else None,
            replicas=replicas,
            gpu_count=gpu_count,
            gpu_type=gpu_type,
            port=port,
            model_path=model_path,
            model_source=model_source,
            health_check_path=health_path,
            env=env_dict if env_dict else None,
            namespace=namespace,
            labels=labels_dict if labels_dict else None,
            autoscaling=autoscaling_config,
            capacity_type=capacity_type,
            extra_args=list(extra_args) if extra_args else None,
            accelerator=accelerator,
            node_selector=node_selector_dict if node_selector_dict else None,
            rewrite_image=not no_rewrite_image,
        )

        formatter.print_success(f"Endpoint '{endpoint_name}' registered for deployment")
        regions_str = ", ".join(result.get("target_regions", []))
        formatter.print_info(f"Target regions: {regions_str}")
        formatter.print_info(f"Ingress path: {result.get('ingress_path', '')}")
        formatter.print_info(
            "The inference_monitor in each region will create the resources. "
            "Use 'gco inference status' to track progress."
        )

        # Warn if deploying to a subset of regions. The endpoint is already
        # registered at this point, so a failure of this check must not be
        # reported as a failed deployment: it is downgraded to a warning.
        if region:
            try:
                from ..aws_client import get_aws_client as _get_client

                all_stacks = _get_client(config).discover_regional_stacks()
            except Exception as e:
                formatter.print_warning(f"Could not verify region coverage: {e}")
            else:
                all_regions = set(all_stacks.keys())
                target_set = set(result.get("target_regions", []))
                missing = all_regions - target_set
                if missing:
                    formatter.print_warning(
                        f"Endpoint is NOT deployed to: {', '.join(sorted(missing))}. "
                        "Global Accelerator may route users to those regions where "
                        "the endpoint won't exist. Consider deploying to all regions "
                        "(omit -r) for consistent global routing."
                    )

        # Structured output formats additionally receive the raw result.
        if config.output_format != "table":
            formatter.print(result)

    except ValueError as e:
        formatter.print_error(str(e))
        sys.exit(1)
    except Exception as e:
        formatter.print_error(f"Failed to deploy endpoint: {e}")
        sys.exit(1)

218 

219 

@inference.command("list")
@click.option("--state", "-s", help="Filter by state (deploying, running, stopped, deleted)")
@click.option("--region", "-r", help="Filter by target region")
@pass_config
def inference_list(config: Any, state: Any, region: Any) -> None:
    """List inference endpoints.

    Examples:
        gco inference list
        gco inference list --state running
        gco inference list -r us-east-1
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        endpoints = get_inference_manager(config).list_endpoints(
            desired_state=state, region=region
        )

        # Non-table formats get the raw list and nothing else.
        if config.output_format != "table":
            formatter.print(endpoints)
            return

        if not endpoints:
            formatter.print_info("No inference endpoints found")
            return

        rule = " " + "-" * 85
        print(f"\n Inference Endpoints ({len(endpoints)} found)")
        print(rule)
        print(f" {'NAME':<25} {'STATE':<12} {'REGIONS':<25} {'REPLICAS':>8} {'IMAGE'}")
        print(rule)

        for ep in endpoints:
            # Name and regions are truncated to 24 chars to fit their 25-wide columns.
            name = ep.get("endpoint_name", "")[:24]
            ep_state = ep.get("desired_state", "unknown")
            regions = ", ".join(ep.get("target_regions", []))[:24]

            # Fall back to defaults when the stored spec is not a dict.
            spec = ep.get("spec", {})
            if isinstance(spec, dict):
                replicas = spec.get("replicas", 1)
                image = spec.get("image", "")[:40]
            else:
                replicas = 1
                image = ""

            print(f" {name:<25} {ep_state:<12} {regions:<25} {replicas:>8} {image}")

        print()

    except Exception as exc:
        formatter.print_error(f"Failed to list endpoints: {exc}")
        sys.exit(1)

266 

267 

@inference.command("status")
@click.argument("endpoint_name")
@pass_config
def inference_status(config: Any, endpoint_name: Any) -> None:
    """Show detailed status of an inference endpoint.

    Examples:
        gco inference status my-llm
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        endpoint = get_inference_manager(config).get_endpoint(endpoint_name)

        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        # Non-table formats get the raw endpoint record and nothing else.
        if config.output_format != "table":
            formatter.print(endpoint)
            return

        spec = endpoint.get("spec", {})
        print(f"\n Endpoint: {endpoint_name}")
        print(" " + "-" * 60)
        print(f" State: {endpoint.get('desired_state', 'unknown')}")
        print(f" Image: {spec.get('image', 'N/A')}")
        print(f" Replicas: {spec.get('replicas', 1)}")
        print(f" GPUs: {spec.get('gpu_count', 0)}")
        print(f" Port: {spec.get('port', 8000)}")
        print(f" Path: {endpoint.get('ingress_path', 'N/A')}")
        print(f" Namespace: {endpoint.get('namespace', 'N/A')}")
        print(f" Created: {endpoint.get('created_at', 'N/A')}")

        region_status = endpoint.get("region_status", {})
        if not region_status:
            # No per-region status recorded yet: show the targets instead.
            target_regions = endpoint.get("target_regions", [])
            print(f"\n Target regions: {', '.join(target_regions)}")
            print(" (Waiting for inference_monitor to sync)")
        else:
            print("\n Region Status:")
            print(f" {'REGION':<18} {'STATE':<12} {'READY':>5} {'DESIRED':>7} {'LAST SYNC'}")
            print(" " + "-" * 65)
            for region_name, status in region_status.items():
                # Entries that are not dicts are skipped.
                if not isinstance(status, dict):
                    continue
                r_state = status.get("state", "unknown")
                ready = status.get("replicas_ready", 0)
                desired = status.get("replicas_desired", 0)
                last_sync = status.get("last_sync", "N/A")
                # Keep at most the first 19 characters of the sync timestamp.
                if last_sync and len(last_sync) > 19:
                    last_sync = last_sync[:19]
                print(f" {region_name:<18} {r_state:<12} {ready:>5} {desired:>7} {last_sync}")

        print()

    except Exception as exc:
        formatter.print_error(f"Failed to get endpoint status: {exc}")
        sys.exit(1)

330 

331 

@inference.command("scale")
@click.argument("endpoint_name")
@click.option("--replicas", "-r", required=True, type=int, help="New replica count")
@pass_config
def inference_scale(config: Any, endpoint_name: Any, replicas: Any) -> None:
    """Scale an inference endpoint.

    Examples:
        gco inference scale my-llm --replicas 4
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        scaled = get_inference_manager(config).scale(endpoint_name, replicas)

        # A falsy result from the manager is reported as "not found".
        if not scaled:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(f"Endpoint '{endpoint_name}' scaled to {replicas} replicas")

    except Exception as exc:
        formatter.print_error(f"Failed to scale endpoint: {exc}")
        sys.exit(1)

359 

360 

@inference.command("stop")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_stop(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Stop an inference endpoint (scale to zero, keep config).

    Examples:
        gco inference stop my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    # Ask before stopping unless --yes was given; abort=True makes click
    # abort the command when the user declines.
    if not yes:
        click.confirm(f"Stop endpoint '{endpoint_name}'?", abort=True)

    try:
        stopped = get_inference_manager(config).stop(endpoint_name)

        # A falsy result from the manager is reported as "not found".
        if not stopped:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(f"Endpoint '{endpoint_name}' marked for stop")

    except Exception as exc:
        formatter.print_error(f"Failed to stop endpoint: {exc}")
        sys.exit(1)

391 

392 

@inference.command("start")
@click.argument("endpoint_name")
@pass_config
def inference_start(config: Any, endpoint_name: Any) -> None:
    """Start a stopped inference endpoint.

    Examples:
        gco inference start my-llm
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        started = get_inference_manager(config).start(endpoint_name)

        # A falsy result from the manager is reported as "not found".
        if not started:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(f"Endpoint '{endpoint_name}' marked for start")

    except Exception as exc:
        formatter.print_error(f"Failed to start endpoint: {exc}")
        sys.exit(1)

419 

420 

@inference.command("delete")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_delete(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Delete an inference endpoint from all regions.

    The inference_monitor in each region will clean up the K8s resources.

    Examples:
        gco inference delete my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    # Ask before deleting unless --yes was given; abort=True makes click
    # abort the command when the user declines.
    if not yes:
        click.confirm(f"Delete endpoint '{endpoint_name}' from all regions?", abort=True)

    try:
        deleted = get_inference_manager(config).delete(endpoint_name)

        # A falsy result from the manager is reported as "not found".
        if not deleted:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(
            f"Endpoint '{endpoint_name}' marked for deletion. "
            "The inference_monitor will clean up resources in each region."
        )

    except Exception as exc:
        formatter.print_error(f"Failed to delete endpoint: {exc}")
        sys.exit(1)

456 

457 

@inference.command("update-image")
@click.argument("endpoint_name")
@click.option("--image", "-i", required=True, help="New container image")
@pass_config
def inference_update_image(config: Any, endpoint_name: Any, image: Any) -> None:
    """Update the container image for an inference endpoint.

    Triggers a rolling update across all target regions.

    Examples:
        gco inference update-image my-llm -i vllm/vllm-openai:v0.9.0
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        updated = get_inference_manager(config).update_image(endpoint_name, image)

        # A falsy result from the manager is reported as "not found".
        if not updated:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(f"Endpoint '{endpoint_name}' image updated to {image}")
        formatter.print_info("Rolling update will be applied by inference_monitor")

    except Exception as exc:
        formatter.print_error(f"Failed to update image: {exc}")
        sys.exit(1)

488 

489 

def _default_api_path(image: str) -> str:
    """Pick the API sub-path from substrings of the container image name."""
    if "vllm" in image:
        return "/v1/completions"
    if "text-generation-inference" in image or "tgi" in image:
        return "/generate"
    if "triton" in image:  # also matches "tritonserver"
        return "/v2/models"
    return "/v1/completions"


def _resolve_model_name(
    config: Any, endpoint_name: Any, spec: Any, image: str, ingress_path: str, region: Any
) -> Any:
    """Determine the ``model`` field for an OpenAI-compatible request.

    Precedence: the ``MODEL`` entry of the spec's env, then the value that
    follows ``--model`` in the spec's args. If neither overrides the default
    (the endpoint name) and the image looks like vLLM, the endpoint's
    ``/v1/models`` route is queried and the first listed model id is used.
    """
    model_name = endpoint_name
    if not isinstance(spec, dict):
        return model_name

    # "env" may be stored as None, so normalise the same way as "args" below.
    model_name = (spec.get("env") or {}).get("MODEL", model_name)

    args_list = spec.get("args") or []
    for i, arg in enumerate(args_list):
        if arg == "--model" and i + 1 < len(args_list):
            model_name = args_list[i + 1]
            break

    if model_name == endpoint_name and "vllm" in image:
        try:
            from ..aws_client import get_aws_client

            # Use the endpoint's real ingress path rather than assuming
            # /inference/<name>, matching what the main request uses.
            models_resp = get_aws_client(config).make_authenticated_request(
                method="GET",
                path=f"{ingress_path}/v1/models",
                target_region=region,
            )
            if models_resp.ok:
                models_data = models_resp.json().get("data", [])
                if models_data:
                    model_name = models_data[0]["id"]
        except Exception:
            # Deliberate best-effort: on any failure keep endpoint_name as model.
            pass

    return model_name


def _extract_generated_text(resp_json: Any) -> Any:
    """Return the generated text from a known response shape, else ``None``.

    Recognised shapes: a dict with ``choices`` (first choice's ``text`` or
    ``message.content``), a dict with ``generated_text``, or a non-empty list
    whose first item is a dict with ``generated_text``.
    """
    text = None
    if isinstance(resp_json, dict):
        if "choices" in resp_json:
            choices = resp_json["choices"]
            first = choices[0] if isinstance(choices, list) and choices else None
            if isinstance(first, dict):
                text = first.get("text") or (first.get("message") or {}).get("content")
        elif "generated_text" in resp_json:
            text = resp_json["generated_text"]
    elif isinstance(resp_json, list) and resp_json and isinstance(resp_json[0], dict):
        text = resp_json[0].get("generated_text")
    # The caller strips the text, so only hand back real strings.
    return text if isinstance(text, str) else None


@inference.command("invoke")
@click.argument("endpoint_name")
@click.option("--prompt", "-p", help="Text prompt to send")
@click.option("--data", "-d", help="Raw JSON body to send")
@click.option(
    "--path", "api_path", default=None, help="API sub-path (default: auto-detect from framework)"
)
@click.option("--region", "-r", help="Target region for the request")
@click.option(
    "--max-tokens", type=int, default=100, help="Maximum tokens to generate (default: 100)"
)
@click.option("--stream/--no-stream", default=False, help="Stream the response")
@pass_config
def inference_invoke(
    config: Any,
    endpoint_name: Any,
    prompt: Any,
    data: Any,
    api_path: Any,
    region: Any,
    max_tokens: Any,
    stream: Any,
) -> None:
    """Send a request to an inference endpoint and print the response.

    Automatically discovers the endpoint's ingress path and routes the
    request through the API Gateway with SigV4 authentication.

    Examples:
        gco inference invoke my-llm -p "What is GPU orchestration?"

        gco inference invoke my-llm -d '{"prompt": "Hello", "max_tokens": 50}'

        gco inference invoke my-llm -p "Explain K8s" --path /v1/completions
    """
    import json as _json

    from ..aws_client import get_aws_client
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    if not prompt and not data:
        formatter.print_error("Provide --prompt (-p) or --data (-d)")
        sys.exit(1)

    try:
        # Look up the endpoint to get its ingress path and spec
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)
        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}")
        spec = endpoint.get("spec", {})
        # A missing or None image is treated as "" so the substring checks are safe.
        image = (spec.get("image") or "") if isinstance(spec, dict) else ""

        # Auto-detect the API sub-path based on the container image
        if api_path is None:
            api_path = _default_api_path(image)

        full_path = f"{ingress_path}{api_path}"

        # Build the request body. --data wins over --prompt; the guard above
        # ensures at least one of them is set, so a body always exists.
        if data:
            try:
                body = _json.loads(data)
            except ValueError as e:
                formatter.print_error(f"Invalid JSON in --data: {e}")
                sys.exit(1)
        elif "generate" in api_path:
            # TGI format
            body = {"inputs": prompt, "parameters": {"max_new_tokens": max_tokens}}
        elif "/v2/" in api_path:
            # Triton: just list models, prompt not used for this path
            body = {}
        else:
            # OpenAI-compatible (vLLM, etc.)
            body = {
                "model": _resolve_model_name(
                    config, endpoint_name, spec, image, ingress_path, region
                ),
                "prompt": prompt,
                "max_tokens": max_tokens,
                "stream": stream,
            }

        formatter.print_info(f"POST {full_path}")

        # Make the authenticated request (always a POST, see body note above)
        client = get_aws_client(config)
        response = client.make_authenticated_request(
            method="POST",
            path=full_path,
            body=body,
            target_region=region,
        )

        if not response.ok:
            formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}")
            sys.exit(1)

        try:
            resp_json = response.json()
        except ValueError:
            # Not JSON: print the raw body. ValueError is the common base of
            # the JSON decode errors, so this covers any decoder in use.
            print(response.text)
            return

        # Table output shows just the generated text when one was recognised;
        # otherwise the full JSON response is printed.
        text = _extract_generated_text(resp_json)
        if text and config.output_format == "table":
            print(f"\n{text.strip()}\n")
        else:
            print(_json.dumps(resp_json, indent=2))

    except Exception as e:
        formatter.print_error(f"Failed to invoke endpoint: {e}")
        sys.exit(1)

654 

655 

@inference.command("canary")
@click.argument("endpoint_name")
@click.option("--image", "-i", required=True, help="New container image for canary")
@click.option(
    "--weight",
    "-w",
    default=10,
    type=int,
    help="Percentage of traffic to canary (1-99, default: 10)",
)
@click.option(
    "--replicas", "-r", default=1, type=int, help="Number of canary replicas (default: 1)"
)
@pass_config
def inference_canary(
    config: Any, endpoint_name: Any, image: Any, weight: Any, replicas: Any
) -> None:
    """Start a canary deployment with a new image.

    Routes a percentage of traffic to the canary while the primary
    continues serving the rest. Use 'promote' to make the canary
    the new primary, or 'rollback' to remove it.

    Examples:
        gco inference canary my-llm -i vllm/vllm-openai:v0.9.0 --weight 10
        gco inference canary my-llm -i new-image:latest -w 25 -r 2
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        outcome = get_inference_manager(config).canary_deploy(
            endpoint_name, image, weight=weight, replicas=replicas
        )

        # A falsy result from the manager is reported as "not found".
        if not outcome:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        formatter.print_success(
            f"Canary started: {weight}% traffic → {image} ({replicas} replica(s))"
        )
        # Point the user at the follow-up commands.
        formatter.print_info(f"Monitor with: gco inference status {endpoint_name}")
        formatter.print_info(f"Promote with: gco inference promote {endpoint_name}")
        formatter.print_info(f"Rollback with: gco inference rollback {endpoint_name}")

    except ValueError as exc:
        # ValueError messages are shown verbatim, without a prefix.
        formatter.print_error(str(exc))
        sys.exit(1)
    except Exception as exc:
        formatter.print_error(f"Failed to start canary: {exc}")
        sys.exit(1)

708 

709 

@inference.command("promote")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_promote(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Promote the canary to primary.

    Replaces the primary image with the canary image and removes
    the canary deployment. All traffic goes to the new image.

    Examples:
        gco inference promote my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)

        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        # Promotion only makes sense while a canary is recorded in the spec.
        canary = endpoint.get("spec", {}).get("canary")
        if not canary:
            formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary")
            sys.exit(1)

        # Without --yes, show what will change and ask for confirmation.
        if not yes:
            current_image = endpoint.get("spec", {}).get("image", "unknown")
            click.echo(f" Current primary: {current_image}")
            click.echo(f" Canary image: {canary.get('image', 'unknown')}")
            click.echo(f" Canary weight: {canary.get('weight', 0)}%")
            if not click.confirm(" Promote canary to primary?"):
                formatter.print_info("Cancelled")
                return

        promoted = manager.promote_canary(endpoint_name)
        if not promoted:
            formatter.print_error("Promotion failed")
            sys.exit(1)

        new_image = promoted.get("spec", {}).get("image", "unknown")
        formatter.print_success(f"Promoted: all traffic now serving {new_image}")

    except ValueError as exc:
        # ValueError messages are shown verbatim, without a prefix.
        formatter.print_error(str(exc))
        sys.exit(1)
    except Exception as exc:
        formatter.print_error(f"Failed to promote canary: {exc}")
        sys.exit(1)

763 

764 

@inference.command("rollback")
@click.argument("endpoint_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation")
@pass_config
def inference_rollback(config: Any, endpoint_name: Any, yes: Any) -> None:
    """Remove the canary deployment, keeping the primary unchanged.

    All traffic returns to the primary deployment.

    Examples:
        gco inference rollback my-llm -y
    """
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        manager = get_inference_manager(config)
        endpoint = manager.get_endpoint(endpoint_name)

        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        # Rollback only makes sense while a canary is recorded in the spec.
        canary = endpoint.get("spec", {}).get("canary")
        if not canary:
            formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary")
            sys.exit(1)

        # Without --yes, show what will be removed and ask for confirmation.
        if not yes:
            click.echo(f" Canary image: {canary.get('image', 'unknown')}")
            click.echo(f" Canary weight: {canary.get('weight', 0)}%")
            if not click.confirm(" Remove canary and restore full traffic to primary?"):
                formatter.print_info("Cancelled")
                return

        rolled_back = manager.rollback_canary(endpoint_name)
        if not rolled_back:
            formatter.print_error("Rollback failed")
            sys.exit(1)

        primary_image = rolled_back.get("spec", {}).get("image", "unknown")
        formatter.print_success(f"Rolled back: all traffic now serving {primary_image}")

    except ValueError as exc:
        # ValueError messages are shown verbatim, without a prefix.
        formatter.print_error(str(exc))
        sys.exit(1)
    except Exception as exc:
        formatter.print_error(f"Failed to rollback canary: {exc}")
        sys.exit(1)

815 

816 

@inference.command("health")
@click.argument("endpoint_name")
@click.option("--region", "-r", help="Target region to check")
@pass_config
def inference_health(config: Any, endpoint_name: Any, region: Any) -> None:
    """Check if an inference endpoint is healthy and ready to serve.

    Hits the endpoint's health check path and reports status and latency.

    Examples:
        gco inference health my-llm

        gco inference health my-llm -r us-east-1
    """
    import json as _json
    import time as _time

    from ..aws_client import get_aws_client
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        endpoint = get_inference_manager(config).get_endpoint(endpoint_name)
        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}")
        spec = endpoint.get("spec", {})
        # NOTE(review): the deploy command passes this value to the manager as
        # `health_check_path`; confirm the stored spec key really is "health_path".
        if isinstance(spec, dict):
            health_path = spec.get("health_path", "/health")
        else:
            health_path = "/health"
        full_path = f"{ingress_path}{health_path}"

        # Time only the request itself; monotonic() is immune to clock changes.
        client = get_aws_client(config)
        started_at = _time.monotonic()
        response = client.make_authenticated_request(
            method="GET",
            path=full_path,
            target_region=region,
        )
        latency_ms = (_time.monotonic() - started_at) * 1000

        result = {
            "endpoint": endpoint_name,
            "status": "healthy" if response.ok else "unhealthy",
            "http_status": response.status_code,
            "latency_ms": round(latency_ms, 1),
            "path": full_path,
        }

        # Attach the parsed body when it is JSON, else up to 200 chars of text.
        try:
            result["body"] = response.json()
        except Exception:
            result["body"] = response.text[:200] if response.text else None

        # NOTE(review): an unhealthy endpoint is reported but the command still
        # exits 0; only a failed request exits 1. Confirm this is intended.
        if config.output_format == "json":
            print(_json.dumps(result, indent=2))
        else:
            status_icon = "✓" if response.ok else "✗"
            formatter.print_info(
                f"{status_icon} {endpoint_name}: {result['status']} "
                f"(HTTP {response.status_code}, {result['latency_ms']}ms)"
            )

    except Exception as exc:
        formatter.print_error(f"Health check failed: {exc}")
        sys.exit(1)

885 

886 

@inference.command("models")
@click.argument("endpoint_name")
@click.option("--region", "-r", help="Target region to query")
@pass_config
def inference_models(config: Any, endpoint_name: Any, region: Any) -> None:
    """List models loaded on an inference endpoint.

    Queries the /v1/models path (OpenAI-compatible) to discover loaded models.

    Examples:
        gco inference models my-llm
    """
    import json as _json

    from ..aws_client import get_aws_client
    from ..inference import get_inference_manager

    formatter = get_output_formatter(config)

    try:
        endpoint = get_inference_manager(config).get_endpoint(endpoint_name)
        if not endpoint:
            formatter.print_error(f"Endpoint '{endpoint_name}' not found")
            sys.exit(1)

        # The models route lives under the endpoint's own ingress path.
        ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}")
        full_path = f"{ingress_path}/v1/models"

        response = get_aws_client(config).make_authenticated_request(
            method="GET",
            path=full_path,
            target_region=region,
        )

        if not response.ok:
            formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}")
            sys.exit(1)

        # Pretty-print JSON responses; fall back to the raw text otherwise.
        try:
            resp_json = response.json()
            print(_json.dumps(resp_json, indent=2))
        except _json.JSONDecodeError:
            print(response.text)

    except Exception as exc:
        formatter.print_error(f"Failed to list models: {exc}")
        sys.exit(1)