From a9453f5acdd4f1a8badc95bdd0d716b22a4082b4 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Tue, 21 Apr 2026 01:11:28 +0000 Subject: [PATCH 01/11] chore: bump model-engine image tag to latest main (2e9d0078) Co-Authored-By: Claude Sonnet 4.6 --- charts/model-engine/values_sample.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/model-engine/values_sample.yaml b/charts/model-engine/values_sample.yaml index 301fca89..eee15c54 100644 --- a/charts/model-engine/values_sample.yaml +++ b/charts/model-engine/values_sample.yaml @@ -24,7 +24,7 @@ celery_broker_type_redis: null # - ALL # tag [required] is the LLM Engine docker image tag -tag: e360bfb1d21d9d4e7b7fcb6b29ca752095b4d0f4 +tag: 2e9d00786419ef44ec5c9d3305d8d6451d6aabfb # context is a user-specified deployment tag. Can be used to context: production image: From 0212a9c59bc5fa34ad937a95e4051a07a13abb23 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 02:09:43 +0000 Subject: [PATCH 02/11] feat(devx): add local control plane dev setup (MLI-6681) Adds a one-command local development workflow for the model engine control plane so developers can iterate on gateway/service-builder code without building prod images or touching live infra. - docker-compose.local.yml: spins up Postgres 15 + Redis 7 - service_configs/service_config_local.yaml: HMI config for local services - Makefile: dev-up / dev-migrate / dev-server / dev-down / test targets - LOCAL=true env var now activates fake queue/docker implementations (parallel to existing CIRCLECI=true path) and skips GIT_TAG requirement - README: new "Control Plane Local Setup" section with full walkthrough Co-Authored-By: Claude Sonnet 4.6 --- model-engine/Makefile | 35 ++++++++ model-engine/README.md | 82 +++++++++++++++++++ model-engine/docker-compose.local.yml | 23 ++++++ .../model_engine_server/api/dependencies.py | 10 +-- .../model_engine_server/common/env_vars.py | 2 +- .../service_configs/service_config_local.yaml | 32 ++++++++ 6 files changed, 178 insertions(+), 6 deletions(-) create mode 100644 model-engine/Makefile create mode 100644 model-engine/docker-compose.local.yml create mode 100644 model-engine/service_configs/service_config_local.yaml diff --git a/model-engine/Makefile b/model-engine/Makefile new file mode 100644 index 00000000..cbf11a85 --- /dev/null +++ b/model-engine/Makefile @@ -0,0 +1,35 @@ +.PHONY: install dev-up dev-down dev-migrate dev-server test + +MODEL_ENGINE_DIR := $(abspath .) +DB_URL := postgresql://postgres:password@localhost:5432/llm_engine + +# Local dev environment variables +LOCAL_ENV := \ + LOCAL=true \ + GIT_TAG=local \ + ML_INFRA_DATABASE_URL=$(DB_URL) \ + DEPLOY_SERVICE_CONFIG_PATH=$(MODEL_ENGINE_DIR)/service_configs/service_config_local.yaml + +install: + pip install -r requirements.txt -r requirements-test.txt -r requirements_override.txt + pip install -e . + +dev-up: + docker compose -f docker-compose.local.yml up -d + @echo "Waiting for services to be healthy..." + @until docker compose -f docker-compose.local.yml exec postgres pg_isready -U postgres -q; do sleep 1; done + @echo "Postgres ready." + @until docker compose -f docker-compose.local.yml exec redis redis-cli ping | grep -q PONG; do sleep 1; done + @echo "Redis ready." 
+
+dev-down:
+	docker compose -f docker-compose.local.yml down
+
+dev-migrate:
+	$(LOCAL_ENV) bash model_engine_server/db/migrations/run_database_migration.sh
+
+dev-server:
+	$(LOCAL_ENV) start-fastapi-server --port 5000 --num-workers 1 --debug
+
+test:
+	pytest tests/unit/
diff --git a/model-engine/README.md b/model-engine/README.md
index febdda78..1160de17 100644
--- a/model-engine/README.md
+++ b/model-engine/README.md
@@ -129,6 +129,88 @@ For OpenAI-compatible V2 APIs, we generate Pydantic models from OpenAI's spec:
 
 ## Local Development
 
+### Control Plane Local Setup
+
+The control plane (Gateway API server, Service Builder, K8s Cache) can be run entirely
+locally without GPU hardware or cloud credentials. Endpoint creation calls succeed
+against a fake k8s/SQS/ECR backend, letting you iterate on control plane code quickly.
+
+**Prerequisites:** Python 3.10+, Docker
+
+#### One-time setup
+
+```bash
+cd model-engine/
+
+# Install Python dependencies
+make install
+
+# Start Postgres + Redis
+make dev-up
+
+# Apply database migrations
+make dev-migrate
+```
+
+#### Run the API server
+
+```bash
+make dev-server
+```
+
+The gateway starts at http://localhost:5000 with auto-reload on file changes.
+Authentication is skipped automatically (the local config sets no
+`identity_service_url`), so any token works.
+
+#### Make API calls
+
+```bash
+# List model endpoints
+curl http://localhost:5000/v1/model-endpoints \
+  -H "Authorization: Bearer test-user"
+
+# Create an LLM endpoint (uses fake k8s — no real infra needed)
+curl -X POST http://localhost:5000/v1/llm/model-endpoints \
+  -H "Authorization: Bearer test-user" \
+  -H "Content-Type: application/json" \
+  -d '{"name":"local-test","model_name":"meta-llama/Meta-Llama-3.1-8B-Instruct","inference_framework":"vllm","min_workers":0,"max_workers":1,"gpus":1,"gpu_type":"nvidia-ampere-a10","endpoint_type":"sync"}'
+```
+
+#### Stop backing services
+
+```bash
+make dev-down
+```
+
+#### What `LOCAL=true` does
+
+Running with `LOCAL=true` (set automatically by `make dev-server` and `make dev-migrate`):
+
+- Skips the `GIT_TAG` env var requirement
+- Uses a **fake queue delegate** (no SQS/Azure Service Bus needed)
+- Uses a **fake Docker repository** (no ECR/ACR/GAR needed)
+- Auth is skipped when `identity_service_url` is absent from config (default)
+- Postgres and Redis are real local services (via docker-compose)
+
+This means you can create/update/delete endpoints via the API and see them reflected
+in Postgres, without any Kubernetes cluster or cloud account (a quick sanity check is
+sketched below).
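+
+To confirm those writes are actually landing, you can query Postgres directly. A
+minimal sketch, assuming the migrations create a `hosted_model_inference.endpoints`
+table (the schema, table, and column names here are assumptions, so list the tables
+first):
+
+```bash
+# See which tables the migrations created
+docker compose -f docker-compose.local.yml exec postgres \
+  psql -U postgres -d llm_engine -c '\dt hosted_model_inference.*'
+
+# Inspect endpoint rows (adjust names to the output above)
+docker compose -f docker-compose.local.yml exec postgres \
+  psql -U postgres -d llm_engine -c 'SELECT name, endpoint_type FROM hosted_model_inference.endpoints;'
+```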
+ +#### Running individual components manually + +If you prefer to set env vars yourself rather than use `make`: + +```bash +export LOCAL=true +export GIT_TAG=local +export ML_INFRA_DATABASE_URL=postgresql://postgres:password@localhost:5432/llm_engine +export DEPLOY_SERVICE_CONFIG_PATH=$(pwd)/service_configs/service_config_local.yaml + +# Gateway +start-fastapi-server --port 5000 --num-workers 1 --debug + +# Database migration +bash model_engine_server/db/migrations/run_database_migration.sh +``` + ### Testing the HTTP Forwarder Start an endpoint on port 5005: diff --git a/model-engine/docker-compose.local.yml b/model-engine/docker-compose.local.yml new file mode 100644 index 00000000..9db7e313 --- /dev/null +++ b/model-engine/docker-compose.local.yml @@ -0,0 +1,23 @@ +services: + postgres: + image: postgres:15 + environment: + POSTGRES_PASSWORD: password + POSTGRES_DB: llm_engine + ports: + - "5432:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 5s + timeout: 5s + retries: 5 + + redis: + image: redis:7 + ports: + - "6379:6379" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 5s + retries: 5 diff --git a/model-engine/model_engine_server/api/dependencies.py b/model-engine/model_engine_server/api/dependencies.py index f28425d8..24c60a06 100644 --- a/model-engine/model_engine_server/api/dependencies.py +++ b/model-engine/model_engine_server/api/dependencies.py @@ -10,7 +10,7 @@ from model_engine_server.common.aioredis_pool import build_aioredis_pool from model_engine_server.common.config import hmi_config from model_engine_server.common.dtos.model_endpoints import BrokerType -from model_engine_server.common.env_vars import CIRCLECI +from model_engine_server.common.env_vars import CIRCLECI, LOCAL from model_engine_server.core.auth.authentication_repository import AuthenticationRepository, User from model_engine_server.core.auth.fake_authentication_repository import ( FakeAuthenticationRepository, @@ -241,7 +241,7 @@ def _get_external_interfaces( ) queue_delegate: QueueEndpointResourceDelegate - if CIRCLECI: + if CIRCLECI or LOCAL: queue_delegate = FakeQueueEndpointResourceDelegate() elif infra_config().cloud_provider == "onprem": queue_delegate = OnPremQueueEndpointResourceDelegate() @@ -257,8 +257,8 @@ def _get_external_interfaces( inference_task_queue_gateway: TaskQueueGateway infra_task_queue_gateway: TaskQueueGateway - if CIRCLECI or infra_config().cloud_provider == "onprem": - # On-prem uses Redis-based task queues + if CIRCLECI or LOCAL or infra_config().cloud_provider == "onprem": + # On-prem and local dev use Redis-based task queues inference_task_queue_gateway = redis_24h_task_queue_gateway infra_task_queue_gateway = redis_task_queue_gateway elif infra_config().cloud_provider == "azure": @@ -391,7 +391,7 @@ def _get_external_interfaces( registry_type = infra_config().docker_registry_type or infer_registry_type( infra_config().docker_repo_prefix ) - if CIRCLECI: + if CIRCLECI or LOCAL: docker_repository = FakeDockerRepository() elif registry_type == "ecr": docker_repository = ECRDockerRepository() diff --git a/model-engine/model_engine_server/common/env_vars.py b/model-engine/model_engine_server/common/env_vars.py index 2a69cbff..9376f93f 100644 --- a/model-engine/model_engine_server/common/env_vars.py +++ b/model-engine/model_engine_server/common/env_vars.py @@ -76,5 +76,5 @@ def get_boolean_env_var(name: str) -> bool: logger.warning("LOCAL development & testing mode is ON") GIT_TAG: str = os.environ.get("GIT_TAG", 
"GIT_TAG_NOT_FOUND") -if GIT_TAG == "GIT_TAG_NOT_FOUND" and "pytest" not in sys.modules: +if GIT_TAG == "GIT_TAG_NOT_FOUND" and "pytest" not in sys.modules and not LOCAL: raise ValueError("GIT_TAG environment variable must be set") diff --git a/model-engine/service_configs/service_config_local.yaml b/model-engine/service_configs/service_config_local.yaml new file mode 100644 index 00000000..5f87a594 --- /dev/null +++ b/model-engine/service_configs/service_config_local.yaml @@ -0,0 +1,32 @@ +gateway_namespace: default +endpoint_namespace: model-engine +model_primitive_host: "none" + +# Local Redis (started via docker-compose.local.yml) +cache_redis_aws_url: redis://localhost:6379/15 + +sqs_profile: nonexistent_sqs_profile +sqs_queue_policy_template: > + { + "Version": "2012-10-17", + "Statement": [] + } +sqs_queue_tag_template: > + {} + +billing_queue_arn: none +cloud_file_llm_fine_tune_repository: "s3://local-bucket/fine_tune_repository/local" + +dd_trace_enabled: false +istio_enabled: false +sensitive_log_mode: false +tgi_repository: "text-generation-inference" +vllm_repository: "vllm" +lightllm_repository: "lightllm" +tensorrt_llm_repository: "tensorrt-llm" +batch_inference_vllm_repository: "llm-engine/batch-infer-vllm" +user_inference_base_repository: "launch/inference" +user_inference_pytorch_repository: "hosted-model-inference/async-pytorch" +user_inference_tensorflow_repository: "hosted-model-inference/async-tensorflow-cpu" +docker_image_layer_cache_repository: "kaniko-cache" +hf_user_fine_tuned_weights_prefix: "s3://local-bucket/model-weights" From 59e8a2fc8b97506fbf3ba15cebb3aba4cd84683f Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 02:23:36 +0000 Subject: [PATCH 03/11] fix(devx): use cache_redis_onprem_url and pin ML_INFRA_SERVICES_CONFIG_PATH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - service_config_local.yaml: switch from cache_redis_aws_url to cache_redis_onprem_url so the Redis URL is resolved before the cloud_provider assertion fires — fixes startup failure for non-AWS configs - Makefile: pin ML_INFRA_SERVICES_CONFIG_PATH to default.yaml so local dev is not affected by a developer's ambient infra config Co-Authored-By: Claude Sonnet 4.6 --- model-engine/Makefile | 3 ++- model-engine/service_configs/service_config_local.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/model-engine/Makefile b/model-engine/Makefile index cbf11a85..9599b3da 100644 --- a/model-engine/Makefile +++ b/model-engine/Makefile @@ -8,7 +8,8 @@ LOCAL_ENV := \ LOCAL=true \ GIT_TAG=local \ ML_INFRA_DATABASE_URL=$(DB_URL) \ - DEPLOY_SERVICE_CONFIG_PATH=$(MODEL_ENGINE_DIR)/service_configs/service_config_local.yaml + DEPLOY_SERVICE_CONFIG_PATH=$(MODEL_ENGINE_DIR)/service_configs/service_config_local.yaml \ + ML_INFRA_SERVICES_CONFIG_PATH=$(MODEL_ENGINE_DIR)/model_engine_server/core/configs/default.yaml install: pip install -r requirements.txt -r requirements-test.txt -r requirements_override.txt diff --git a/model-engine/service_configs/service_config_local.yaml b/model-engine/service_configs/service_config_local.yaml index 5f87a594..541da4b3 100644 --- a/model-engine/service_configs/service_config_local.yaml +++ b/model-engine/service_configs/service_config_local.yaml @@ -3,7 +3,8 @@ endpoint_namespace: model-engine model_primitive_host: "none" # Local Redis (started via docker-compose.local.yml) -cache_redis_aws_url: redis://localhost:6379/15 +# Use onprem_url so it's checked before cloud_provider assertions 
+cache_redis_onprem_url: redis://localhost:6379/15 sqs_profile: nonexistent_sqs_profile sqs_queue_policy_template: > From 26797262352f6856199427ff70856bef64718c92 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 02:32:36 +0000 Subject: [PATCH 04/11] fix(devx): pin infra config in manual example and add postgres volume - README: add ML_INFRA_SERVICES_CONFIG_PATH to the manual env-var snippet so developers with non-AWS ambient configs don't accidentally hit the cloud_provider assertion - docker-compose.local.yml: mount a named volume for Postgres so the database survives dev-down/dev-up cycles Co-Authored-By: Claude Sonnet 4.6 --- model-engine/README.md | 1 + model-engine/docker-compose.local.yml | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/model-engine/README.md b/model-engine/README.md index 1160de17..573d15e4 100644 --- a/model-engine/README.md +++ b/model-engine/README.md @@ -203,6 +203,7 @@ export LOCAL=true export GIT_TAG=local export ML_INFRA_DATABASE_URL=postgresql://postgres:password@localhost:5432/llm_engine export DEPLOY_SERVICE_CONFIG_PATH=$(pwd)/service_configs/service_config_local.yaml +export ML_INFRA_SERVICES_CONFIG_PATH=$(pwd)/model_engine_server/core/configs/default.yaml # Gateway start-fastapi-server --port 5000 --num-workers 1 --debug diff --git a/model-engine/docker-compose.local.yml b/model-engine/docker-compose.local.yml index 9db7e313..ff07c202 100644 --- a/model-engine/docker-compose.local.yml +++ b/model-engine/docker-compose.local.yml @@ -6,6 +6,8 @@ services: POSTGRES_DB: llm_engine ports: - "5432:5432" + volumes: + - postgres_data:/var/lib/postgresql/data healthcheck: test: ["CMD-SHELL", "pg_isready -U postgres"] interval: 5s @@ -21,3 +23,6 @@ services: interval: 5s timeout: 5s retries: 5 + +volumes: + postgres_data: From df1f0006a14c14c20875ceefd4c1f4cd0de0a4cc Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 02:34:40 +0000 Subject: [PATCH 05/11] fix(devx): use docker compose --wait instead of unbounded polling loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the manual until-loops in dev-up with `docker compose up --wait`, which blocks until healthchecks pass and exits non-zero if they fail — eliminating the infinite-spin on container crash. Co-Authored-By: Claude Sonnet 4.6 --- model-engine/Makefile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/model-engine/Makefile b/model-engine/Makefile index 9599b3da..93dd2638 100644 --- a/model-engine/Makefile +++ b/model-engine/Makefile @@ -16,12 +16,7 @@ install: pip install -e . dev-up: - docker compose -f docker-compose.local.yml up -d - @echo "Waiting for services to be healthy..." - @until docker compose -f docker-compose.local.yml exec postgres pg_isready -U postgres -q; do sleep 1; done - @echo "Postgres ready." - @until docker compose -f docker-compose.local.yml exec redis redis-cli ping | grep -q PONG; do sleep 1; done - @echo "Redis ready." 
+ docker compose -f docker-compose.local.yml up -d --wait dev-down: docker compose -f docker-compose.local.yml down From 957f5734e0d014f9341c47f4abc758d0dc36c7e2 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 03:07:20 +0000 Subject: [PATCH 06/11] feat(devx): add full end-to-end local flow with kind (MLI-6681) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the local dev setup so the complete control plane → Service Builder → k8s inference pod flow can be tested locally without cloud credentials. Changes: - local-full.yaml: new onprem infra config pointing to localhost Redis/kind - dependencies.py: LOCAL=true + cloud_provider=onprem falls through to real Redis queue delegate instead of the fake (enabling full k8s flow) - service_builder/celery.py: fix onprem to use redis backend not s3 - env_vars.py: default GIT_TAG to "local" when LOCAL=true so k8s templates reference the correct model-engine:local image loaded into kind - Makefile: kind-up/kind-down/kind-image targets + dev-server-full, dev-service-builder, dev-k8s-cacher targets using FULL_LOCAL_ENV - README: full end-to-end setup section with step-by-step instructions, example endpoint creation, and flow table Co-Authored-By: Claude Sonnet 4.6 --- model-engine/Makefile | 46 +++++++- model-engine/README.md | 102 ++++++++++++++++++ .../model_engine_server/api/dependencies.py | 2 +- .../model_engine_server/common/env_vars.py | 2 +- .../core/configs/local-full.yaml | 16 +++ .../service_builder/celery.py | 6 +- 6 files changed, 168 insertions(+), 6 deletions(-) create mode 100644 model-engine/model_engine_server/core/configs/local-full.yaml diff --git a/model-engine/Makefile b/model-engine/Makefile index 93dd2638..b16349ec 100644 --- a/model-engine/Makefile +++ b/model-engine/Makefile @@ -1,9 +1,13 @@ -.PHONY: install dev-up dev-down dev-migrate dev-server test +.PHONY: install dev-up dev-down dev-migrate dev-server test \ + kind-up kind-down kind-image \ + dev-service-builder dev-k8s-cacher dev-server-full MODEL_ENGINE_DIR := $(abspath .) -DB_URL := postgresql://postgres:password@localhost:5432/llm_engine +DB_URL := postgresql://postgres:password@localhost:5432/llm_engine +KIND_CLUSTER := llm-engine +KUBE_CONTEXT := kind-$(KIND_CLUSTER) -# Local dev environment variables +# ── Control-plane-only (no k8s, fake queue/docker) ────────────────────────── LOCAL_ENV := \ LOCAL=true \ GIT_TAG=local \ @@ -11,10 +15,22 @@ LOCAL_ENV := \ DEPLOY_SERVICE_CONFIG_PATH=$(MODEL_ENGINE_DIR)/service_configs/service_config_local.yaml \ ML_INFRA_SERVICES_CONFIG_PATH=$(MODEL_ENGINE_DIR)/model_engine_server/core/configs/default.yaml +# ── Full end-to-end (real k8s via kind, real Redis queue, fake docker) ─────── +FULL_LOCAL_ENV := \ + LOCAL=true \ + GIT_TAG=local \ + ML_INFRA_DATABASE_URL=$(DB_URL) \ + DEPLOY_SERVICE_CONFIG_PATH=$(MODEL_ENGINE_DIR)/service_configs/service_config_local.yaml \ + ML_INFRA_SERVICES_CONFIG_PATH=$(MODEL_ENGINE_DIR)/model_engine_server/core/configs/local-full.yaml \ + REDIS_HOST=localhost \ + REDIS_PORT=6379 + +# ── One-time setup ─────────────────────────────────────────────────────────── install: pip install -r requirements.txt -r requirements-test.txt -r requirements_override.txt pip install -e . 
+# ── Backing services (Postgres + Redis) ───────────────────────────────────── dev-up: docker compose -f docker-compose.local.yml up -d --wait @@ -24,8 +40,32 @@ dev-down: dev-migrate: $(LOCAL_ENV) bash model_engine_server/db/migrations/run_database_migration.sh +# ── Control-plane-only server ──────────────────────────────────────────────── dev-server: $(LOCAL_ENV) start-fastapi-server --port 5000 --num-workers 1 --debug +# ── kind cluster (for full end-to-end flow) ────────────────────────────────── +kind-up: + kind create cluster --name $(KIND_CLUSTER) + kubectl --context $(KUBE_CONTEXT) create namespace model-engine --dry-run=client -o yaml | kubectl --context $(KUBE_CONTEXT) apply -f - + +kind-down: + kind delete cluster --name $(KIND_CLUSTER) + +kind-image: + docker build -t model-engine:local .. + kind load docker-image model-engine:local --name $(KIND_CLUSTER) + +# ── Full end-to-end processes (run each in a separate terminal) ─────────────── +dev-server-full: + $(FULL_LOCAL_ENV) start-fastapi-server --port 5000 --num-workers 1 --debug + +dev-service-builder: + $(FULL_LOCAL_ENV) celery -A model_engine_server.service_builder.celery worker --loglevel=info --concurrency=2 + +dev-k8s-cacher: + $(FULL_LOCAL_ENV) python model_engine_server/entrypoints/k8s_cache.py --sleep-interval-seconds 5 + +# ── Tests ───────────────────────────────────────────────────────────────────── test: pytest tests/unit/ diff --git a/model-engine/README.md b/model-engine/README.md index 573d15e4..be5fe204 100644 --- a/model-engine/README.md +++ b/model-engine/README.md @@ -212,6 +212,108 @@ start-fastapi-server --port 5000 --num-workers 1 --debug bash model_engine_server/db/migrations/run_database_migration.sh ``` +### Full End-to-End Local Flow (control plane + real inference pod) + +This setup uses [kind](https://kind.sigs.k8s.io/) (Kubernetes in Docker) to run a real +local k8s cluster. The Service Builder creates actual Deployments in kind; the K8s Cacher +polls kind and updates Redis. No GPU required — we use the built-in echo server as the +inference container. 
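+
+To make the division of labor concrete, here is a minimal sketch of the cacher half of
+that loop: poll the cluster, mirror Deployment readiness into Redis. This is an
+illustration only, not the real `k8s_cache.py`; the Redis key format is invented for
+the example:
+
+```python
+import json
+import time
+
+import redis
+from kubernetes import client, config
+
+# Talk to the kind cluster created by `make kind-up`
+config.load_kube_config(context="kind-llm-engine")
+apps = client.AppsV1Api()
+cache = redis.Redis(host="localhost", port=6379)
+
+while True:
+    for dep in apps.list_namespaced_deployment(namespace="model-engine").items:
+        status = {
+            "ready_replicas": dep.status.ready_replicas or 0,
+            "replicas": dep.spec.replicas,
+        }
+        # Key layout is made up for this sketch; the real cacher has its own format
+        cache.set(f"endpoint-status:{dep.metadata.name}", json.dumps(status))
+    time.sleep(5)
+```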
+ +**Prerequisites:** Python 3.10+, Docker, [`kind`](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) + +#### One-time cluster + image setup + +```bash +cd model-engine/ + +# Start Postgres + Redis (if not already running) +make dev-up + +# Apply DB migrations (if not already done) +make dev-migrate + +# Create kind cluster and the model-engine namespace +make kind-up + +# Build model-engine:local and load it into kind +make kind-image # takes ~2-3 min on first build +``` + +#### Run the full stack (4 terminals) + +```bash +# Terminal 1 — Gateway +make dev-server-full + +# Terminal 2 — Service Builder (picks up endpoint creation tasks from Redis) +make dev-service-builder + +# Terminal 3 — K8s Cacher (polls kind, writes endpoint status to Redis) +make dev-k8s-cacher +``` + +#### Create a test endpoint and watch it spin up + +```bash +# Terminal 4 — create a sync CPU endpoint using the echo server +curl -X POST http://localhost:5000/v1/model-endpoints \ + -H "Authorization: Bearer test-user" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "local-echo", + "bundle_name": "echo-bundle", + "endpoint_type": "sync", + "cpus": 0.25, + "memory": "256Mi", + "min_workers": 1, + "max_workers": 1, + "per_worker": 1, + "model_bundle": { + "name": "echo-bundle", + "metadata": {}, + "flavor": { + "flavor": "runnable_image", + "repository": "model-engine", + "tag": "local", + "command": [ + "python", "-m", + "model_engine_server.inference.forwarding.echo_server", + "--port", "5005" + ], + "predict_route": "/predict", + "healthcheck_route": "/healthz", + "readiness_initial_delay_seconds": 15 + } + } + }' + +# Poll status — transitions PENDING → UPDATE_PENDING → READY (30-60 s) +curl http://localhost:5000/v1/model-endpoints/ \ + -H "Authorization: Bearer test-user" + +# Watch the pod come up in kind +kubectl --context kind-llm-engine get pods -n model-engine -w +``` + +#### Tear down + +```bash +make kind-down # delete kind cluster +make dev-down # stop Postgres + Redis +``` + +#### How the full flow works + +| Component | Mode | What it does locally | +|---|---|---| +| Gateway (`dev-server-full`) | `cloud_provider=onprem` + `LOCAL=true` | Real Redis queue, fake Docker registry | +| Service Builder | `cloud_provider=onprem` + Redis broker | Creates real k8s Deployments in kind | +| K8s Cacher | `cloud_provider=onprem` | Polls kind, writes status to Redis | +| Inference pod | `model-engine:local` in kind | Runs echo server on port 5005 | +| Forwarder sidecar | `model-engine:local` in kind | HTTP forwarder proxies requests | + +> **Note:** LLM endpoints (vLLM, TGI) require GPU hardware and pulling large images — use the generic sync endpoint with the echo server for local flow testing. 
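+
+Once the endpoint is READY you can also bypass the gateway and poke the inference
+container directly. A sketch under assumptions: the pod picked below and the echo
+server's request shape are guesses, so check `kubectl get pods` for the real names:
+
+```bash
+# Grab the first pod in the namespace (assumes only the test endpoint is running)
+POD=$(kubectl --context kind-llm-engine get pods -n model-engine -o name | head -n 1)
+kubectl --context kind-llm-engine port-forward -n model-engine "$POD" 5005:5005 &
+
+# predict_route and port come from the bundle definition above; the payload is a guess
+curl -X POST localhost:5005/predict \
+  -H "Content-Type: application/json" \
+  -d '{"args": {"text": "hello"}}'
+```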
+ ### Testing the HTTP Forwarder Start an endpoint on port 5005: diff --git a/model-engine/model_engine_server/api/dependencies.py b/model-engine/model_engine_server/api/dependencies.py index 24c60a06..4cc0bb7b 100644 --- a/model-engine/model_engine_server/api/dependencies.py +++ b/model-engine/model_engine_server/api/dependencies.py @@ -241,7 +241,7 @@ def _get_external_interfaces( ) queue_delegate: QueueEndpointResourceDelegate - if CIRCLECI or LOCAL: + if CIRCLECI or (LOCAL and infra_config().cloud_provider != "onprem"): queue_delegate = FakeQueueEndpointResourceDelegate() elif infra_config().cloud_provider == "onprem": queue_delegate = OnPremQueueEndpointResourceDelegate() diff --git a/model-engine/model_engine_server/common/env_vars.py b/model-engine/model_engine_server/common/env_vars.py index 9376f93f..b39e16ce 100644 --- a/model-engine/model_engine_server/common/env_vars.py +++ b/model-engine/model_engine_server/common/env_vars.py @@ -75,6 +75,6 @@ def get_boolean_env_var(name: str) -> bool: if LOCAL: logger.warning("LOCAL development & testing mode is ON") -GIT_TAG: str = os.environ.get("GIT_TAG", "GIT_TAG_NOT_FOUND") +GIT_TAG: str = os.environ.get("GIT_TAG", "local" if LOCAL else "GIT_TAG_NOT_FOUND") if GIT_TAG == "GIT_TAG_NOT_FOUND" and "pytest" not in sys.modules and not LOCAL: raise ValueError("GIT_TAG environment variable must be set") diff --git a/model-engine/model_engine_server/core/configs/local-full.yaml b/model-engine/model_engine_server/core/configs/local-full.yaml new file mode 100644 index 00000000..cc0fb02f --- /dev/null +++ b/model-engine/model_engine_server/core/configs/local-full.yaml @@ -0,0 +1,16 @@ +cloud_provider: onprem +env: local +k8s_cluster_name: kind-llm-engine +dns_host_domain: localhost +default_region: us-east-1 +ml_account_id: local +docker_repo_prefix: "localhost" +s3_bucket: local-bucket +redis_host: localhost +redis_port: 6379 +celery_broker_type_redis: true +db_engine_pool_size: 5 +db_engine_max_overflow: 5 +db_engine_echo: false +db_engine_echo_pool: false +db_engine_disconnect_strategy: pessimistic diff --git a/model-engine/model_engine_server/service_builder/celery.py b/model-engine/model_engine_server/service_builder/celery.py index e9e09df4..de57484c 100644 --- a/model-engine/model_engine_server/service_builder/celery.py +++ b/model-engine/model_engine_server/service_builder/celery.py @@ -31,7 +31,11 @@ def get_broker_type(cloud_provider: str, is_ci: bool, force_redis: bool) -> str: backend_protocol=( "abs" if infra_config().cloud_provider == "azure" - else ("redis" if infra_config().cloud_provider == "gcp" else "s3") + else ( + "redis" + if infra_config().cloud_provider in ("gcp", "onprem") + else "s3" + ) ), # Add detailed task tracking for debugging task_track_started=True, From 0c42fc8861c85ed38c54bb2cea8f3be0d7b272e9 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 03:16:24 +0000 Subject: [PATCH 07/11] fix(devx): align celery_task_queue_gateway backend_protocol for onprem The gateway's module-level backend_protocol had the same aws/gcp/azure mapping as service_builder/celery.py. Without this fix, the Service Builder writes task results to Redis but the Gateway looks in S3, leaving endpoints stuck in PENDING under the kind-based full local flow. 
Co-Authored-By: Claude Sonnet 4.6 --- .../infra/gateways/celery_task_queue_gateway.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/model-engine/model_engine_server/infra/gateways/celery_task_queue_gateway.py b/model-engine/model_engine_server/infra/gateways/celery_task_queue_gateway.py index 62eab418..ac1d3238 100644 --- a/model-engine/model_engine_server/infra/gateways/celery_task_queue_gateway.py +++ b/model-engine/model_engine_server/infra/gateways/celery_task_queue_gateway.py @@ -28,7 +28,9 @@ logger = make_logger(logger_name()) _cloud_provider = infra_config().cloud_provider backend_protocol = ( - "abs" if _cloud_provider == "azure" else ("redis" if _cloud_provider == "gcp" else "s3") + "abs" + if _cloud_provider == "azure" + else ("redis" if _cloud_provider in ("gcp", "onprem") else "s3") ) celery_redis = celery_app( From f649274612b0febedc2580acd5a464ef406a0376 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 03:30:38 +0000 Subject: [PATCH 08/11] fix(test): include otlp.proto.grpc in OTEL_AVAILABLE guard The exporter package was imported unconditionally under the OTEL_AVAILABLE flag which only checked the base SDK, not the exporter. Include it in the try block so OTEL_AVAILABLE stays False when the exporter is absent, fixing the ImportError that caused run_unit_tests_server to fail. Co-Authored-By: Claude Sonnet 4.6 --- .../model_engine_server/common/startup_tracing/correlation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/model-engine/model_engine_server/common/startup_tracing/correlation.py b/model-engine/model_engine_server/common/startup_tracing/correlation.py index c09156a6..32cc134b 100644 --- a/model-engine/model_engine_server/common/startup_tracing/correlation.py +++ b/model-engine/model_engine_server/common/startup_tracing/correlation.py @@ -12,6 +12,7 @@ try: from opentelemetry import trace from opentelemetry.context import Context + from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter # noqa: F401 from opentelemetry.sdk.trace import TracerProvider # noqa: F401 - SDK availability check from opentelemetry.trace import NonRecordingSpan, SpanContext, TraceFlags From 89336be911a0b44671cca92ad10990011d8aa6c3 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 20:26:56 +0000 Subject: [PATCH 09/11] fix(ci): fix black/isort formatting and FastAPI 0.135 API compat in schema gateway - Reformat correlation.py and celery.py to satisfy black - Move noqa comment to the from...import( line so ruff F401 is suppressed correctly - Pass schema_generator=GenerateJsonSchema() (new required kwarg) to get_definitions() and get_openapi_path() in live_model_endpoints_schema_gateway, creating a fresh instance per route since pydantic rejects reuse Co-Authored-By: Claude Sonnet 4.6 --- .../common/startup_tracing/correlation.py | 4 +++- .../infra/gateways/live_model_endpoints_schema_gateway.py | 3 +++ model-engine/model_engine_server/service_builder/celery.py | 6 +----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/model-engine/model_engine_server/common/startup_tracing/correlation.py b/model-engine/model_engine_server/common/startup_tracing/correlation.py index 32cc134b..220507a5 100644 --- a/model-engine/model_engine_server/common/startup_tracing/correlation.py +++ b/model-engine/model_engine_server/common/startup_tracing/correlation.py @@ -12,7 +12,9 @@ try: from opentelemetry import trace from opentelemetry.context import Context - from opentelemetry.exporter.otlp.proto.grpc.metric_exporter 
import OTLPMetricExporter # noqa: F401 + from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( # noqa: F401 + OTLPMetricExporter, + ) from opentelemetry.sdk.trace import TracerProvider # noqa: F401 - SDK availability check from opentelemetry.trace import NonRecordingSpan, SpanContext, TraceFlags diff --git a/model-engine/model_engine_server/infra/gateways/live_model_endpoints_schema_gateway.py b/model-engine/model_engine_server/infra/gateways/live_model_endpoints_schema_gateway.py index f09d74a3..1cb55dc4 100644 --- a/model-engine/model_engine_server/infra/gateways/live_model_endpoints_schema_gateway.py +++ b/model-engine/model_engine_server/infra/gateways/live_model_endpoints_schema_gateway.py @@ -123,14 +123,17 @@ def get_openapi( prefix = model_endpoint_name model_name_map = LiveModelEndpointsSchemaGateway.get_model_name_map(prefix) all_fields = get_fields_from_routes([route]) + schema_generator = GenerateJsonSchema() field_mapping, _ = get_definitions( fields=all_fields, + schema_generator=schema_generator, model_name_map=model_name_map, ) result = get_openapi_path( route=route, operation_ids=operation_ids, + schema_generator=schema_generator, model_name_map=model_name_map, field_mapping=field_mapping, ) diff --git a/model-engine/model_engine_server/service_builder/celery.py b/model-engine/model_engine_server/service_builder/celery.py index de57484c..c6845b8a 100644 --- a/model-engine/model_engine_server/service_builder/celery.py +++ b/model-engine/model_engine_server/service_builder/celery.py @@ -31,11 +31,7 @@ def get_broker_type(cloud_provider: str, is_ci: bool, force_redis: bool) -> str: backend_protocol=( "abs" if infra_config().cloud_provider == "azure" - else ( - "redis" - if infra_config().cloud_provider in ("gcp", "onprem") - else "s3" - ) + else ("redis" if infra_config().cloud_provider in ("gcp", "onprem") else "s3") ), # Add detailed task tracking for debugging task_track_started=True, From f15a827aea5f793e13861b5a745d04acdc7471f2 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 21:01:09 +0000 Subject: [PATCH 10/11] =?UTF-8?q?fix(ci):=20revert=20schema=5Fgenerator=20?= =?UTF-8?q?from=20gateway=20=E2=80=94=20FastAPI=200.135.1=20API=20does=20n?= =?UTF-8?q?ot=20have=20this=20param?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The param was added to fix a local test failure (FastAPI 0.110.0 requires it) but FastAPI 0.135.1 (pinned in requirements.txt, used by CI) does not accept it, causing mypy call-arg errors. Revert to the original signature. 
Co-Authored-By: Claude Sonnet 4.6 --- .../infra/gateways/live_model_endpoints_schema_gateway.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/model-engine/model_engine_server/infra/gateways/live_model_endpoints_schema_gateway.py b/model-engine/model_engine_server/infra/gateways/live_model_endpoints_schema_gateway.py index 1cb55dc4..f09d74a3 100644 --- a/model-engine/model_engine_server/infra/gateways/live_model_endpoints_schema_gateway.py +++ b/model-engine/model_engine_server/infra/gateways/live_model_endpoints_schema_gateway.py @@ -123,17 +123,14 @@ def get_openapi( prefix = model_endpoint_name model_name_map = LiveModelEndpointsSchemaGateway.get_model_name_map(prefix) all_fields = get_fields_from_routes([route]) - schema_generator = GenerateJsonSchema() field_mapping, _ = get_definitions( fields=all_fields, - schema_generator=schema_generator, model_name_map=model_name_map, ) result = get_openapi_path( route=route, operation_ids=operation_ids, - schema_generator=schema_generator, model_name_map=model_name_map, field_mapping=field_mapping, ) From dd8e8c00eed41d41767ff734ede443f0a186b20f Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 7 May 2026 21:26:21 +0000 Subject: [PATCH 11/11] docs(devx): replace curl endpoint example with launch-python-client example Co-Authored-By: Claude Sonnet 4.6 --- model-engine/README.md | 86 ++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/model-engine/README.md b/model-engine/README.md index be5fe204..1df093b6 100644 --- a/model-engine/README.md +++ b/model-engine/README.md @@ -254,44 +254,56 @@ make dev-k8s-cacher #### Create a test endpoint and watch it spin up -```bash -# Terminal 4 — create a sync CPU endpoint using the echo server -curl -X POST http://localhost:5000/v1/model-endpoints \ - -H "Authorization: Bearer test-user" \ - -H "Content-Type: application/json" \ - -d '{ - "name": "local-echo", - "bundle_name": "echo-bundle", - "endpoint_type": "sync", - "cpus": 0.25, - "memory": "256Mi", - "min_workers": 1, - "max_workers": 1, - "per_worker": 1, - "model_bundle": { - "name": "echo-bundle", - "metadata": {}, - "flavor": { - "flavor": "runnable_image", - "repository": "model-engine", - "tag": "local", - "command": [ - "python", "-m", - "model_engine_server.inference.forwarding.echo_server", - "--port", "5005" - ], - "predict_route": "/predict", - "healthcheck_route": "/healthz", - "readiness_initial_delay_seconds": 15 - } - } - }' - -# Poll status — transitions PENDING → UPDATE_PENDING → READY (30-60 s) -curl http://localhost:5000/v1/model-endpoints/ \ - -H "Authorization: Bearer test-user" +```python +# Terminal 4 — create a sync CPU endpoint using the echo server (launch-python-client) +import time +from launch import LaunchClient, EndpointRequest + +# Any token works — LOCAL=true skips auth; the token becomes the user/owner ID +client = LaunchClient(api_key="test-user", endpoint="http://localhost:5000") + +# Create the model bundle (echo server image loaded into kind via `make kind-image`) +bundle = client.create_model_bundle_from_runnable_image_v2( + model_bundle_name="echo-bundle", + repository="model-engine", + tag="local", + command=[ + "python", "-m", + "model_engine_server.inference.forwarding.echo_server", + "--port", "5005", + ], + predict_route="/predict", + healthcheck_route="/healthz", + readiness_initial_delay_seconds=15, +) + +# Create a sync CPU endpoint +client.create_model_endpoint( + endpoint_name="local-echo", + model_bundle=bundle, + endpoint_type="sync", + 
cpus=0.25, + memory="256Mi", + min_workers=1, + max_workers=1, + per_worker=1, +) + +# Poll until READY — transitions PENDING → UPDATE_PENDING → READY (~30-60 s) +while True: + ep = client.get_model_endpoint("local-echo") + print(f"status: {ep.status}") + if ep.status == "READY": + break + time.sleep(5) + +# Make a prediction against the echo server +response = ep.predict(request=EndpointRequest(args={"text": "hello"})) +print(response) +``` -# Watch the pod come up in kind +```bash +# Watch the pod come up in kind (separate terminal) kubectl --context kind-llm-engine get pods -n model-engine -w ```
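+
+#### Clean up the test endpoint
+
+When you're done, the endpoint can be deleted through the same client. A sketch,
+assuming `LaunchClient` exposes `delete_model_endpoint` (check the launch-python-client
+docs for the exact name):
+
+```python
+# Removes the endpoint record and tears down its k8s Deployment in kind
+client.delete_model_endpoint("local-echo")
+```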