Coverage for cli/commands/inference_cmd.py: 88%

502 statements  

coverage.py v7.13.5, created at 2026-04-30 21:47 +0000

1"""Inference endpoint commands.""" 

2 

3import sys 

4from typing import Any 

5 

6import click 

7 

8from ..config import GCOConfig 

9from ..output import get_output_formatter 

10 

11pass_config = click.make_pass_decorator(GCOConfig, ensure=True) 

12 

13 

14@click.group() 

15@pass_config 

16def inference(config: Any) -> None: 

17 """Manage multi-region inference endpoints.""" 

18 pass 

19 

20 

21@inference.command("deploy") 

22@click.argument("endpoint_name") 

23@click.option("--image", "-i", required=True, help="Container image (e.g. vllm/vllm-openai:v0.8.0)") 

24@click.option( 

25 "--region", 

26 "-r", 

27 multiple=True, 

28 help="Target region(s). Repeatable. Default: all deployed regions", 

29) 

30@click.option("--replicas", default=1, help="Replicas per region (default: 1)") 

31@click.option("--gpu-count", default=1, help="GPUs per replica (default: 1)") 

32@click.option("--gpu-type", help="GPU instance type hint (e.g. g5.xlarge)") 

33@click.option("--port", default=8000, help="Container port (default: 8000)") 

34@click.option("--model-path", help="EFS path for model weights") 

35@click.option( 

36 "--model-source", 

37 help="S3 URI for model weights (e.g. s3://bucket/models/llama3). " 

38 "Auto-synced to each region via init container.", 

39) 

40@click.option("--health-path", default="/health", help="Health check path (default: /health)") 

41@click.option("--env", "-e", multiple=True, help="Environment variable (KEY=VALUE). Repeatable") 

42@click.option("--namespace", "-n", default="gco-inference", help="Kubernetes namespace") 

43@click.option("--label", "-l", multiple=True, help="Label (key=value). Repeatable") 

44@click.option("--min-replicas", type=int, default=None, help="Autoscaling: minimum replicas") 

45@click.option("--max-replicas", type=int, default=None, help="Autoscaling: maximum replicas") 

46@click.option( 

47 "--autoscale-metric", 

48 multiple=True, 

49 help="Autoscaling metric (cpu:70, memory:80, gpu:60). Repeatable. Enables autoscaling.", 

50) 

51@click.option( 

52 "--capacity-type", 

53 type=click.Choice(["on-demand", "spot"]), 

54 default=None, 

55 help="Node capacity type. 'spot' uses cheaper preemptible instances.", 

56) 

57@click.option( 

58 "--extra-args", 

59 multiple=True, 

60 help="Extra arguments passed to the container (e.g. '--kv-transfer-config {...}'). Repeatable.", 

61) 

62@click.option( 

63 "--accelerator", 

64 type=click.Choice(["nvidia", "neuron"]), 

65 default="nvidia", 

66 help="Accelerator type: 'nvidia' for GPU instances (default), 'neuron' for Trainium/Inferentia.", 

67) 

68@click.option( 

69 "--node-selector", 

70 multiple=True, 

71 help="Node selector (key=value). Repeatable. E.g. --node-selector eks.amazonaws.com/instance-family=inf2", 

72) 

73@pass_config 

74def inference_deploy( 

75 config: Any, 

76 endpoint_name: Any, 

77 image: Any, 

78 region: Any, 

79 replicas: Any, 

80 gpu_count: Any, 

81 gpu_type: Any, 

82 port: Any, 

83 model_path: Any, 

84 model_source: Any, 

85 health_path: Any, 

86 env: Any, 

87 namespace: Any, 

88 label: Any, 

89 min_replicas: Any, 

90 max_replicas: Any, 

91 autoscale_metric: Any, 

92 capacity_type: Any, 

93 extra_args: Any, 

94 accelerator: Any, 

95 node_selector: Any, 

96) -> None: 

97 """Deploy an inference endpoint to one or more regions. 

98 

99 The endpoint is registered in DynamoDB and the inference_monitor 

100 in each target region creates the Kubernetes resources automatically. 

101 

102 Examples: 

103 gco inference deploy my-llm -i vllm/vllm-openai:v0.8.0 

104 

105 gco inference deploy llama3-70b \\ 

106 -i vllm/vllm-openai:v0.8.0 \\ 

107 -r us-east-1 -r eu-west-1 \\ 

108 --replicas 2 --gpu-count 4 \\ 

109 --model-path /mnt/gco/models/llama3-70b \\ 

110 -e MODEL_NAME=meta-llama/Llama-3-70B 

111 """ 

112 from ..inference import get_inference_manager 

113 

114 formatter = get_output_formatter(config) 

115 

116 # Parse env vars and labels 

117 env_dict = {} 

118 for e_var in env: 

119 if "=" in e_var:  119 ↛ 118  (line 119 didn't jump to line 118 because the condition on line 119 was always true)

120 k, v = e_var.split("=", 1) 

121 env_dict[k] = v 

122 

123 labels_dict = {} 

124 for lbl in label: 

125 if "=" in lbl:  125 ↛ 124  (line 125 didn't jump to line 124 because the condition on line 125 was always true)

126 k, v = lbl.split("=", 1) 

127 labels_dict[k] = v 

128 

129 node_selector_dict = {} 

130 for ns in node_selector:  130 ↛ 131  (line 130 didn't jump to line 131 because the loop on line 130 never started)

131 if "=" in ns: 

132 k, v = ns.split("=", 1) 

133 node_selector_dict[k] = v 

134 

135 # Build autoscaling config 

136 autoscaling_config = None 

137 if autoscale_metric: 

138 metrics = [] 

139 for m in autoscale_metric: 

140 if ":" in m: 

141 mtype, mtarget = m.split(":", 1) 

142 metrics.append({"type": mtype, "target": int(mtarget)}) 

143 else: 

144 metrics.append({"type": m, "target": 70}) 

145 autoscaling_config = { 

146 "enabled": True, 

147 "min_replicas": min_replicas or 1, 

148 "max_replicas": max_replicas or 10, 

149 "metrics": metrics, 

150 } 

151 

152 try: 

153 manager = get_inference_manager(config) 

154 result = manager.deploy( 

155 endpoint_name=endpoint_name, 

156 image=image, 

157 target_regions=list(region) if region else None, 

158 replicas=replicas, 

159 gpu_count=gpu_count, 

160 gpu_type=gpu_type, 

161 port=port, 

162 model_path=model_path, 

163 model_source=model_source, 

164 health_check_path=health_path, 

165 env=env_dict if env_dict else None, 

166 namespace=namespace, 

167 labels=labels_dict if labels_dict else None, 

168 autoscaling=autoscaling_config, 

169 capacity_type=capacity_type, 

170 extra_args=list(extra_args) if extra_args else None, 

171 accelerator=accelerator, 

172 node_selector=node_selector_dict if node_selector_dict else None, 

173 ) 

174 

175 formatter.print_success(f"Endpoint '{endpoint_name}' registered for deployment") 

176 regions_str = ", ".join(result.get("target_regions", [])) 

177 formatter.print_info(f"Target regions: {regions_str}") 

178 formatter.print_info(f"Ingress path: {result.get('ingress_path', '')}") 

179 formatter.print_info( 

180 "The inference_monitor in each region will create the resources. " 

181 "Use 'gco inference status' to track progress." 

182 ) 

183 

184 # Warn if deploying to a subset of regions 

185 if region: 

186 from ..aws_client import get_aws_client as _get_client 

187 

188 all_stacks = _get_client(config).discover_regional_stacks() 

189 all_regions = set(all_stacks.keys()) 

190 target_set = set(result.get("target_regions", [])) 

191 missing = all_regions - target_set 

192 if missing: 

193 formatter.print_warning( 

194 f"Endpoint is NOT deployed to: {', '.join(sorted(missing))}. " 

195 "Global Accelerator may route users to those regions where " 

196 "the endpoint won't exist. Consider deploying to all regions " 

197 "(omit -r) for consistent global routing." 

198 ) 

199 

200 if config.output_format != "table":  200 ↛ 201  (line 200 didn't jump to line 201 because the condition on line 200 was never true)

201 formatter.print(result) 

202 

203 except ValueError as e: 

204 formatter.print_error(str(e)) 

205 sys.exit(1) 

206 except Exception as e: 

207 formatter.print_error(f"Failed to deploy endpoint: {e}") 

208 sys.exit(1) 

209 

210 

211@inference.command("list") 

212@click.option("--state", "-s", help="Filter by state (deploying, running, stopped, deleted)") 

213@click.option("--region", "-r", help="Filter by target region") 

214@pass_config 

215def inference_list(config: Any, state: Any, region: Any) -> None: 

216 """List inference endpoints. 

217 

218 Examples: 

219 gco inference list 

220 gco inference list --state running 

221 gco inference list -r us-east-1 

222 """ 

223 from ..inference import get_inference_manager 

224 

225 formatter = get_output_formatter(config) 

226 

227 try: 

228 manager = get_inference_manager(config) 

229 endpoints = manager.list_endpoints(desired_state=state, region=region) 

230 

231 if config.output_format != "table":  231 ↛ 232  (line 231 didn't jump to line 232 because the condition on line 231 was never true)

232 formatter.print(endpoints) 

233 return 

234 

235 if not endpoints: 

236 formatter.print_info("No inference endpoints found") 

237 return 

238 

239 print(f"\n Inference Endpoints ({len(endpoints)} found)") 

240 print(" " + "-" * 85) 

241 print(f" {'NAME':<25} {'STATE':<12} {'REGIONS':<25} {'REPLICAS':>8} {'IMAGE'}") 

242 print(" " + "-" * 85) 

243 for ep in endpoints: 

244 name = ep.get("endpoint_name", "")[:24] 

245 ep_state = ep.get("desired_state", "unknown") 

246 regions = ", ".join(ep.get("target_regions", []))[:24] 

247 spec = ep.get("spec", {}) 

248 replicas = spec.get("replicas", 1) if isinstance(spec, dict) else 1 

249 image = spec.get("image", "")[:40] if isinstance(spec, dict) else "" 

250 print(f" {name:<25} {ep_state:<12} {regions:<25} {replicas:>8} {image}") 

251 

252 print() 

253 

254 except Exception as e: 

255 formatter.print_error(f"Failed to list endpoints: {e}") 

256 sys.exit(1) 

257 

258 

259@inference.command("status") 

260@click.argument("endpoint_name") 

261@pass_config 

262def inference_status(config: Any, endpoint_name: Any) -> None: 

263 """Show detailed status of an inference endpoint. 

264 

265 Examples: 

266 gco inference status my-llm 

267 """ 

268 from ..inference import get_inference_manager 

269 

270 formatter = get_output_formatter(config) 

271 

272 try: 

273 manager = get_inference_manager(config) 

274 endpoint = manager.get_endpoint(endpoint_name) 

275 

276 if not endpoint: 

277 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

278 sys.exit(1) 

279 

280 if config.output_format != "table":  280 ↛ 281  (line 280 didn't jump to line 281 because the condition on line 280 was never true)

281 formatter.print(endpoint) 

282 return 

283 

284 spec = endpoint.get("spec", {}) 

285 print(f"\n Endpoint: {endpoint_name}") 

286 print(" " + "-" * 60) 

287 print(f" State: {endpoint.get('desired_state', 'unknown')}") 

288 print(f" Image: {spec.get('image', 'N/A')}") 

289 print(f" Replicas: {spec.get('replicas', 1)}") 

290 print(f" GPUs: {spec.get('gpu_count', 0)}") 

291 print(f" Port: {spec.get('port', 8000)}") 

292 print(f" Path: {endpoint.get('ingress_path', 'N/A')}") 

293 print(f" Namespace: {endpoint.get('namespace', 'N/A')}") 

294 print(f" Created: {endpoint.get('created_at', 'N/A')}") 

295 

296 # Region status 

297 region_status = endpoint.get("region_status", {}) 

298 if region_status: 

299 print("\n Region Status:") 

300 print(f" {'REGION':<18} {'STATE':<12} {'READY':>5} {'DESIRED':>7} {'LAST SYNC'}") 

301 print(" " + "-" * 65) 

302 for r, status in region_status.items(): 

303 if isinstance(status, dict):  303 ↛ 302  (line 303 didn't jump to line 302 because the condition on line 303 was always true)

304 r_state = status.get("state", "unknown") 

305 ready = status.get("replicas_ready", 0) 

306 desired = status.get("replicas_desired", 0) 

307 last_sync = status.get("last_sync", "N/A") 

308 if last_sync and len(last_sync) > 19:  308 ↛ 310  (line 308 didn't jump to line 310 because the condition on line 308 was always true)

309 last_sync = last_sync[:19] 

310 print(f" {r:<18} {r_state:<12} {ready:>5} {desired:>7} {last_sync}") 

311 else: 

312 target_regions = endpoint.get("target_regions", []) 

313 print(f"\n Target regions: {', '.join(target_regions)}") 

314 print(" (Waiting for inference_monitor to sync)") 

315 

316 print() 

317 

318 except Exception as e: 

319 formatter.print_error(f"Failed to get endpoint status: {e}") 

320 sys.exit(1) 

321 

322 

323@inference.command("scale") 

324@click.argument("endpoint_name") 

325@click.option("--replicas", "-r", required=True, type=int, help="New replica count") 

326@pass_config 

327def inference_scale(config: Any, endpoint_name: Any, replicas: Any) -> None: 

328 """Scale an inference endpoint. 

329 

330 Examples: 

331 gco inference scale my-llm --replicas 4 

332 """ 

333 from ..inference import get_inference_manager 

334 

335 formatter = get_output_formatter(config) 

336 

337 try: 

338 manager = get_inference_manager(config) 

339 result = manager.scale(endpoint_name, replicas) 

340 

341 if result: 

342 formatter.print_success(f"Endpoint '{endpoint_name}' scaled to {replicas} replicas") 

343 else: 

344 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

345 sys.exit(1) 

346 

347 except Exception as e: 

348 formatter.print_error(f"Failed to scale endpoint: {e}") 

349 sys.exit(1) 

350 

351 

352@inference.command("stop") 

353@click.argument("endpoint_name") 

354@click.option("--yes", "-y", is_flag=True, help="Skip confirmation") 

355@pass_config 

356def inference_stop(config: Any, endpoint_name: Any, yes: Any) -> None: 

357 """Stop an inference endpoint (scale to zero, keep config). 

358 

359 Examples: 

360 gco inference stop my-llm -y 

361 """ 

362 from ..inference import get_inference_manager 

363 

364 formatter = get_output_formatter(config) 

365 

366 if not yes:  366 ↛ 367  (line 366 didn't jump to line 367 because the condition on line 366 was never true)

367 click.confirm(f"Stop endpoint '{endpoint_name}'?", abort=True) 

368 

369 try: 

370 manager = get_inference_manager(config) 

371 result = manager.stop(endpoint_name) 

372 

373 if result: 

374 formatter.print_success(f"Endpoint '{endpoint_name}' marked for stop") 

375 else: 

376 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

377 sys.exit(1) 

378 

379 except Exception as e: 

380 formatter.print_error(f"Failed to stop endpoint: {e}") 

381 sys.exit(1) 

382 

383 

384@inference.command("start") 

385@click.argument("endpoint_name") 

386@pass_config 

387def inference_start(config: Any, endpoint_name: Any) -> None: 

388 """Start a stopped inference endpoint. 

389 

390 Examples: 

391 gco inference start my-llm 

392 """ 

393 from ..inference import get_inference_manager 

394 

395 formatter = get_output_formatter(config) 

396 

397 try: 

398 manager = get_inference_manager(config) 

399 result = manager.start(endpoint_name) 

400 

401 if result: 

402 formatter.print_success(f"Endpoint '{endpoint_name}' marked for start") 

403 else: 

404 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

405 sys.exit(1) 

406 

407 except Exception as e: 

408 formatter.print_error(f"Failed to start endpoint: {e}") 

409 sys.exit(1) 

410 

411 

412@inference.command("delete") 

413@click.argument("endpoint_name") 

414@click.option("--yes", "-y", is_flag=True, help="Skip confirmation") 

415@pass_config 

416def inference_delete(config: Any, endpoint_name: Any, yes: Any) -> None: 

417 """Delete an inference endpoint from all regions. 

418 

419 The inference_monitor in each region will clean up the K8s resources. 

420 

421 Examples: 

422 gco inference delete my-llm -y 

423 """ 

424 from ..inference import get_inference_manager 

425 

426 formatter = get_output_formatter(config) 

427 

428 if not yes:  428 ↛ 429  (line 428 didn't jump to line 429 because the condition on line 428 was never true)

429 click.confirm(f"Delete endpoint '{endpoint_name}' from all regions?", abort=True) 

430 

431 try: 

432 manager = get_inference_manager(config) 

433 result = manager.delete(endpoint_name) 

434 

435 if result: 

436 formatter.print_success( 

437 f"Endpoint '{endpoint_name}' marked for deletion. " 

438 "The inference_monitor will clean up resources in each region." 

439 ) 

440 else: 

441 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

442 sys.exit(1) 

443 

444 except Exception as e: 

445 formatter.print_error(f"Failed to delete endpoint: {e}") 

446 sys.exit(1) 

447 

448 

449@inference.command("update-image") 

450@click.argument("endpoint_name") 

451@click.option("--image", "-i", required=True, help="New container image") 

452@pass_config 

453def inference_update_image(config: Any, endpoint_name: Any, image: Any) -> None: 

454 """Update the container image for an inference endpoint. 

455 

456 Triggers a rolling update across all target regions. 

457 

458 Examples: 

459 gco inference update-image my-llm -i vllm/vllm-openai:v0.9.0 

460 """ 

461 from ..inference import get_inference_manager 

462 

463 formatter = get_output_formatter(config) 

464 

465 try: 

466 manager = get_inference_manager(config) 

467 result = manager.update_image(endpoint_name, image) 

468 

469 if result: 

470 formatter.print_success(f"Endpoint '{endpoint_name}' image updated to {image}") 

471 formatter.print_info("Rolling update will be applied by inference_monitor") 

472 else: 

473 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

474 sys.exit(1) 

475 

476 except Exception as e: 

477 formatter.print_error(f"Failed to update image: {e}") 

478 sys.exit(1) 

479 

480 

481@inference.command("invoke") 

482@click.argument("endpoint_name") 

483@click.option("--prompt", "-p", help="Text prompt to send") 

484@click.option("--data", "-d", help="Raw JSON body to send") 

485@click.option( 

486 "--path", "api_path", default=None, help="API sub-path (default: auto-detect from framework)" 

487) 

488@click.option("--region", "-r", help="Target region for the request") 

489@click.option( 

490 "--max-tokens", type=int, default=100, help="Maximum tokens to generate (default: 100)" 

491) 

492@click.option("--stream/--no-stream", default=False, help="Stream the response") 

493@pass_config 

494def inference_invoke( 

495 config: Any, 

496 endpoint_name: Any, 

497 prompt: Any, 

498 data: Any, 

499 api_path: Any, 

500 region: Any, 

501 max_tokens: Any, 

502 stream: Any, 

503) -> None: 

504 """Send a request to an inference endpoint and print the response. 

505 

506 Automatically discovers the endpoint's ingress path and routes the 

507 request through the API Gateway with SigV4 authentication. 

508 

509 Examples: 

510 gco inference invoke my-llm -p "What is GPU orchestration?" 

511 

512 gco inference invoke my-llm -d '{"prompt": "Hello", "max_tokens": 50}' 

513 

514 gco inference invoke my-llm -p "Explain K8s" --path /v1/completions 

515 """ 

516 import json as _json 

517 

518 from ..aws_client import get_aws_client 

519 from ..inference import get_inference_manager 

520 

521 formatter = get_output_formatter(config) 

522 

523 if not prompt and not data: 

524 formatter.print_error("Provide --prompt (-p) or --data (-d)") 

525 sys.exit(1) 

526 

527 try: 

528 # Look up the endpoint to get its ingress path and spec 

529 manager = get_inference_manager(config) 

530 endpoint = manager.get_endpoint(endpoint_name) 

531 if not endpoint: 

532 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

533 sys.exit(1) 

534 

535 ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}") 

536 spec = endpoint.get("spec", {}) 

537 image = spec.get("image", "") if isinstance(spec, dict) else "" 

538 

539 # Auto-detect the API sub-path based on the container image 

540 if api_path is None: 

541 if "vllm" in image: 

542 api_path = "/v1/completions" 

543 elif "text-generation-inference" in image or "tgi" in image: 

544 api_path = "/generate" 

545 elif "tritonserver" in image or "triton" in image:  545 ↛ 548  (line 545 didn't jump to line 548 because the condition on line 545 was always true)

546 api_path = "/v2/models" 

547 else: 

548 api_path = "/v1/completions" 

549 

550 full_path = f"{ingress_path}{api_path}" 

551 

552 # Build the request body 

553 if data: 

554 body_str = data 

555 elif prompt:  555 ↛ 601  (line 555 didn't jump to line 601 because the condition on line 555 was always true)

556 # Build a sensible default body based on framework 

557 if "generate" in api_path: 

558 # TGI format 

559 body_dict = {"inputs": prompt, "parameters": {"max_new_tokens": max_tokens}} 

560 elif "/v2/" in api_path: 

561 # Triton — just list models, prompt not used for this path 

562 body_dict = {} 

563 else: 

564 # OpenAI-compatible (vLLM, etc.) 

565 # Determine model name for OpenAI-compatible request 

566 model_name = endpoint_name 

567 if isinstance(spec, dict):  567 ↛ 593  (line 567 didn't jump to line 593 because the condition on line 567 was always true)

568 # Check env vars first 

569 model_name = spec.get("env", {}).get("MODEL", model_name) 

570 # Check container args for --model (vLLM, etc.) 

571 args_list = spec.get("args") or [] 

572 for i, arg in enumerate(args_list):  572 ↛ 573  (line 572 didn't jump to line 573 because the loop on line 572 never started)

573 if arg == "--model" and i + 1 < len(args_list): 

574 model_name = args_list[i + 1] 

575 break 

576 # Default for vLLM with no explicit model — auto-detect 

577 # by querying /v1/models on the running endpoint 

578 if model_name == endpoint_name and "vllm" in image:  578 ↛ 593  (line 578 didn't jump to line 593 because the condition on line 578 was always true)

579 try: 

580 detect_client = get_aws_client(config) 

581 models_path = f"/inference/{endpoint_name}/v1/models" 

582 models_resp = detect_client.make_authenticated_request( 

583 method="GET", 

584 path=models_path, 

585 target_region=region, 

586 ) 

587 if models_resp.ok: 

588 models_data = models_resp.json().get("data", []) 

589 if models_data:  589 ↛ 590  (line 589 didn't jump to line 590 because the condition on line 589 was never true)

590 model_name = models_data[0]["id"] 

591 except Exception: 

592 pass # Fall through to endpoint_name as model 

593 body_dict = { 

594 "model": model_name, 

595 "prompt": prompt, 

596 "max_tokens": max_tokens, 

597 "stream": stream, 

598 } 

599 body_str = _json.dumps(body_dict) 

600 

601 formatter.print_info(f"POST {full_path}") 

602 

603 # Make the authenticated request 

604 client = get_aws_client(config) 

605 response = client.make_authenticated_request( 

606 method="POST" if body_str else "GET", 

607 path=full_path, 

608 body=_json.loads(body_str) if body_str else None, 

609 target_region=region, 

610 ) 

611 

612 # Print the response 

613 if response.ok: 

614 try: 

615 resp_json = response.json() 

616 # Extract the generated text for common formats 

617 text = None 

618 if "choices" in resp_json: 

619 # OpenAI format 

620 choices = resp_json["choices"] 

621 if choices:  621 ↛ 631  (line 621 didn't jump to line 631 because the condition on line 621 was always true)

622 text = choices[0].get("text") or choices[0].get("message", {}).get( 

623 "content" 

624 ) 

625 elif "generated_text" in resp_json:  625 ↛ 627  (line 625 didn't jump to line 627 because the condition on line 625 was never true)

626 # TGI format 

627 text = resp_json["generated_text"] 

628 elif isinstance(resp_json, list) and resp_json and "generated_text" in resp_json[0]: 

629 text = resp_json[0]["generated_text"] 

630 

631 if text and config.output_format == "table": 

632 print(f"\n{text.strip()}\n") 

633 else: 

634 print(_json.dumps(resp_json, indent=2)) 

635 except _json.JSONDecodeError: 

636 print(response.text) 

637 else: 

638 formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}") 

639 sys.exit(1) 

640 

641 except Exception as e: 

642 formatter.print_error(f"Failed to invoke endpoint: {e}") 

643 sys.exit(1) 

644 

645 

646@inference.command("canary") 

647@click.argument("endpoint_name") 

648@click.option("--image", "-i", required=True, help="New container image for canary") 

649@click.option( 

650 "--weight", 

651 "-w", 

652 default=10, 

653 type=int, 

654 help="Percentage of traffic to canary (1-99, default: 10)", 

655) 

656@click.option( 

657 "--replicas", "-r", default=1, type=int, help="Number of canary replicas (default: 1)" 

658) 

659@pass_config 

660def inference_canary( 

661 config: Any, endpoint_name: Any, image: Any, weight: Any, replicas: Any 

662) -> None: 

663 """Start a canary deployment with a new image. 

664 

665 Routes a percentage of traffic to the canary while the primary 

666 continues serving the rest. Use 'promote' to make the canary 

667 the new primary, or 'rollback' to remove it. 

668 

669 Examples: 

670 gco inference canary my-llm -i vllm/vllm-openai:v0.9.0 --weight 10 

671 gco inference canary my-llm -i new-image:latest -w 25 -r 2 

672 """ 

673 from ..inference import get_inference_manager 

674 

675 formatter = get_output_formatter(config) 

676 

677 try: 

678 manager = get_inference_manager(config) 

679 result = manager.canary_deploy(endpoint_name, image, weight=weight, replicas=replicas) 

680 

681 if not result: 

682 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

683 sys.exit(1) 

684 

685 formatter.print_success( 

686 f"Canary started: {weight}% traffic → {image} ({replicas} replica(s))" 

687 ) 

688 formatter.print_info(f"Monitor with: gco inference status {endpoint_name}") 

689 formatter.print_info(f"Promote with: gco inference promote {endpoint_name}") 

690 formatter.print_info(f"Rollback with: gco inference rollback {endpoint_name}") 

691 

692 except ValueError as e: 

693 formatter.print_error(str(e)) 

694 sys.exit(1) 

695 except Exception as e: 

696 formatter.print_error(f"Failed to start canary: {e}") 

697 sys.exit(1) 

698 

699 

700@inference.command("promote") 

701@click.argument("endpoint_name") 

702@click.option("--yes", "-y", is_flag=True, help="Skip confirmation") 

703@pass_config 

704def inference_promote(config: Any, endpoint_name: Any, yes: Any) -> None: 

705 """Promote the canary to primary. 

706 

707 Replaces the primary image with the canary image and removes 

708 the canary deployment. All traffic goes to the new image. 

709 

710 Examples: 

711 gco inference promote my-llm -y 

712 """ 

713 from ..inference import get_inference_manager 

714 

715 formatter = get_output_formatter(config) 

716 

717 try: 

718 manager = get_inference_manager(config) 

719 endpoint = manager.get_endpoint(endpoint_name) 

720 

721 if not endpoint: 

722 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

723 sys.exit(1) 

724 

725 canary = endpoint.get("spec", {}).get("canary") 

726 if not canary: 

727 formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary") 

728 sys.exit(1) 

729 

730 if not yes:  730 ↛ 731  (line 730 didn't jump to line 731 because the condition on line 730 was never true)

731 current_image = endpoint.get("spec", {}).get("image", "unknown") 

732 click.echo(f" Current primary: {current_image}") 

733 click.echo(f" Canary image: {canary.get('image', 'unknown')}") 

734 click.echo(f" Canary weight: {canary.get('weight', 0)}%") 

735 if not click.confirm(" Promote canary to primary?"): 

736 formatter.print_info("Cancelled") 

737 return 

738 

739 result = manager.promote_canary(endpoint_name) 

740 if result:  740 ↛ 744  (line 740 didn't jump to line 744 because the condition on line 740 was always true)

741 new_image = result.get("spec", {}).get("image", "unknown") 

742 formatter.print_success(f"Promoted: all traffic now serving {new_image}") 

743 else: 

744 formatter.print_error("Promotion failed") 

745 sys.exit(1) 

746 

747 except ValueError as e: 

748 formatter.print_error(str(e)) 

749 sys.exit(1) 

750 except Exception as e: 

751 formatter.print_error(f"Failed to promote canary: {e}") 

752 sys.exit(1) 

753 

754 

755@inference.command("rollback") 

756@click.argument("endpoint_name") 

757@click.option("--yes", "-y", is_flag=True, help="Skip confirmation") 

758@pass_config 

759def inference_rollback(config: Any, endpoint_name: Any, yes: Any) -> None: 

760 """Remove the canary deployment, keeping the primary unchanged. 

761 

762 All traffic returns to the primary deployment. 

763 

764 Examples: 

765 gco inference rollback my-llm -y 

766 """ 

767 from ..inference import get_inference_manager 

768 

769 formatter = get_output_formatter(config) 

770 

771 try: 

772 manager = get_inference_manager(config) 

773 endpoint = manager.get_endpoint(endpoint_name) 

774 

775 if not endpoint:  775 ↛ 776  (line 775 didn't jump to line 776 because the condition on line 775 was never true)

776 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

777 sys.exit(1) 

778 

779 canary = endpoint.get("spec", {}).get("canary") 

780 if not canary: 

781 formatter.print_error(f"Endpoint '{endpoint_name}' has no active canary") 

782 sys.exit(1) 

783 

784 if not yes:  784 ↛ 785  (line 784 didn't jump to line 785 because the condition on line 784 was never true)

785 click.echo(f" Canary image: {canary.get('image', 'unknown')}") 

786 click.echo(f" Canary weight: {canary.get('weight', 0)}%") 

787 if not click.confirm(" Remove canary and restore full traffic to primary?"): 

788 formatter.print_info("Cancelled") 

789 return 

790 

791 result = manager.rollback_canary(endpoint_name) 

792 if result:  792 ↛ 796  (line 792 didn't jump to line 796 because the condition on line 792 was always true)

793 primary_image = result.get("spec", {}).get("image", "unknown") 

794 formatter.print_success(f"Rolled back: all traffic now serving {primary_image}") 

795 else: 

796 formatter.print_error("Rollback failed") 

797 sys.exit(1) 

798 

799 except ValueError as e: 

800 formatter.print_error(str(e)) 

801 sys.exit(1) 

802 except Exception as e: 

803 formatter.print_error(f"Failed to rollback canary: {e}") 

804 sys.exit(1) 

805 

806 

807@inference.command("health") 

808@click.argument("endpoint_name") 

809@click.option("--region", "-r", help="Target region to check") 

810@pass_config 

811def inference_health(config: Any, endpoint_name: Any, region: Any) -> None: 

812 """Check if an inference endpoint is healthy and ready to serve. 

813 

814 Hits the endpoint's health check path and reports status and latency. 

815 

816 Examples: 

817 gco inference health my-llm 

818 

819 gco inference health my-llm -r us-east-1 

820 """ 

821 import json as _json 

822 import time as _time 

823 

824 from ..aws_client import get_aws_client 

825 from ..inference import get_inference_manager 

826 

827 formatter = get_output_formatter(config) 

828 

829 try: 

830 manager = get_inference_manager(config) 

831 endpoint = manager.get_endpoint(endpoint_name) 

832 if not endpoint: 

833 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

834 sys.exit(1) 

835 

836 ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}") 

837 spec = endpoint.get("spec", {}) 

838 health_path = spec.get("health_path", "/health") if isinstance(spec, dict) else "/health" 

839 full_path = f"{ingress_path}{health_path}" 

840 

841 client = get_aws_client(config) 

842 start = _time.monotonic() 

843 response = client.make_authenticated_request( 

844 method="GET", 

845 path=full_path, 

846 target_region=region, 

847 ) 

848 latency_ms = (_time.monotonic() - start) * 1000 

849 

850 result = { 

851 "endpoint": endpoint_name, 

852 "status": "healthy" if response.ok else "unhealthy", 

853 "http_status": response.status_code, 

854 "latency_ms": round(latency_ms, 1), 

855 "path": full_path, 

856 } 

857 

858 try: 

859 result["body"] = response.json() 

860 except Exception: 

861 result["body"] = response.text[:200] if response.text else None 

862 

863 if config.output_format == "json":  863 ↛ 864  (line 863 didn't jump to line 864 because the condition on line 863 was never true)

864 print(_json.dumps(result, indent=2)) 

865 else: 

866 status_icon = "✓" if response.ok else "✗" 

867 formatter.print_info( 

868 f"{status_icon} {endpoint_name}: {result['status']} " 

869 f"(HTTP {response.status_code}, {result['latency_ms']}ms)" 

870 ) 

871 

872 except Exception as e: 

873 formatter.print_error(f"Health check failed: {e}") 

874 sys.exit(1) 

875 

876 

877@inference.command("models") 

878@click.argument("endpoint_name") 

879@click.option("--region", "-r", help="Target region to query") 

880@pass_config 

881def inference_models(config: Any, endpoint_name: Any, region: Any) -> None: 

882 """List models loaded on an inference endpoint. 

883 

884 Queries the /v1/models path (OpenAI-compatible) to discover loaded models. 

885 

886 Examples: 

887 gco inference models my-llm 

888 """ 

889 import json as _json 

890 

891 from ..aws_client import get_aws_client 

892 from ..inference import get_inference_manager 

893 

894 formatter = get_output_formatter(config) 

895 

896 try: 

897 manager = get_inference_manager(config) 

898 endpoint = manager.get_endpoint(endpoint_name) 

899 if not endpoint: 

900 formatter.print_error(f"Endpoint '{endpoint_name}' not found") 

901 sys.exit(1) 

902 

903 ingress_path = endpoint.get("ingress_path", f"/inference/{endpoint_name}") 

904 full_path = f"{ingress_path}/v1/models" 

905 

906 client = get_aws_client(config) 

907 response = client.make_authenticated_request( 

908 method="GET", 

909 path=full_path, 

910 target_region=region, 

911 ) 

912 

913 if response.ok: 

914 try: 

915 resp_json = response.json() 

916 print(_json.dumps(resp_json, indent=2)) 

917 except _json.JSONDecodeError: 

918 print(response.text) 

919 else: 

920 formatter.print_error(f"HTTP {response.status_code}: {response.text[:500]}") 

921 sys.exit(1) 

922 

923 except Exception as e: 

924 formatter.print_error(f"Failed to list models: {e}") 

925 sys.exit(1)