2 changes: 1 addition & 1 deletion charts/model-engine/values_sample.yaml
@@ -24,7 +24,7 @@ celery_broker_type_redis: null
# - ALL

# tag [required] is the LLM Engine docker image tag
-tag: e360bfb1d21d9d4e7b7fcb6b29ca752095b4d0f4
+tag: 2e9d00786419ef44ec5c9d3305d8d6451d6aabfb
# context is a user-specified deployment tag. Can be used to
context: production
image:
71 changes: 71 additions & 0 deletions model-engine/Makefile
@@ -0,0 +1,71 @@
.PHONY: install dev-up dev-down dev-migrate dev-server test \
	kind-up kind-down kind-image \
	dev-service-builder dev-k8s-cacher dev-server-full

MODEL_ENGINE_DIR := $(abspath .)
DB_URL := postgresql://postgres:password@localhost:5432/llm_engine
KIND_CLUSTER := llm-engine
KUBE_CONTEXT := kind-$(KIND_CLUSTER)

# ── Control-plane-only (no k8s, fake queue/docker) ──────────────────────────
LOCAL_ENV := \
	LOCAL=true \
	GIT_TAG=local \
	ML_INFRA_DATABASE_URL=$(DB_URL) \
	DEPLOY_SERVICE_CONFIG_PATH=$(MODEL_ENGINE_DIR)/service_configs/service_config_local.yaml \
	ML_INFRA_SERVICES_CONFIG_PATH=$(MODEL_ENGINE_DIR)/model_engine_server/core/configs/default.yaml

# ── Full end-to-end (real k8s via kind, real Redis queue, fake docker) ───────
FULL_LOCAL_ENV := \
	LOCAL=true \
	GIT_TAG=local \
	ML_INFRA_DATABASE_URL=$(DB_URL) \
	DEPLOY_SERVICE_CONFIG_PATH=$(MODEL_ENGINE_DIR)/service_configs/service_config_local.yaml \
	ML_INFRA_SERVICES_CONFIG_PATH=$(MODEL_ENGINE_DIR)/model_engine_server/core/configs/local-full.yaml \
	REDIS_HOST=localhost \
	REDIS_PORT=6379

# ── One-time setup ───────────────────────────────────────────────────────────
install:
	pip install -r requirements.txt -r requirements-test.txt -r requirements_override.txt
	pip install -e .

# ── Backing services (Postgres + Redis) ─────────────────────────────────────
dev-up:
	docker compose -f docker-compose.local.yml up -d --wait

dev-down:
	docker compose -f docker-compose.local.yml down

dev-migrate:
	$(LOCAL_ENV) bash model_engine_server/db/migrations/run_database_migration.sh

# ── Control-plane-only server ────────────────────────────────────────────────
dev-server:
	$(LOCAL_ENV) start-fastapi-server --port 5000 --num-workers 1 --debug

# ── kind cluster (for full end-to-end flow) ──────────────────────────────────
kind-up:
	kind create cluster --name $(KIND_CLUSTER)
	kubectl --context $(KUBE_CONTEXT) create namespace model-engine --dry-run=client -o yaml | kubectl --context $(KUBE_CONTEXT) apply -f -

kind-down:
	kind delete cluster --name $(KIND_CLUSTER)

kind-image:
	docker build -t model-engine:local ..
	kind load docker-image model-engine:local --name $(KIND_CLUSTER)

# ── Full end-to-end processes (run each in a separate terminal) ───────────────
dev-server-full:
	$(FULL_LOCAL_ENV) start-fastapi-server --port 5000 --num-workers 1 --debug

dev-service-builder:
	$(FULL_LOCAL_ENV) celery -A model_engine_server.service_builder.celery worker --loglevel=info --concurrency=2

dev-k8s-cacher:
	$(FULL_LOCAL_ENV) python model_engine_server/entrypoints/k8s_cache.py --sleep-interval-seconds 5

# ── Tests ─────────────────────────────────────────────────────────────────────
test:
	pytest tests/unit/
197 changes: 197 additions & 0 deletions model-engine/README.md
@@ -129,6 +129,203 @@ For OpenAI-compatible V2 APIs, we generate Pydantic models from OpenAI's spec:

## Local Development

### Control Plane Local Setup

The control plane (Gateway API server, Service Builder, K8s Cache) can be run entirely
locally without GPU hardware or cloud credentials. Endpoint creation calls succeed
against a fake k8s/SQS/ECR backend, letting you iterate on control plane code quickly.

**Prerequisites:** Python 3.10+, Docker

#### One-time setup

```bash
cd model-engine/

# Install Python dependencies
make install

# Start Postgres + Redis
make dev-up

# Apply database migrations
make dev-migrate
```

#### Run the API server

```bash
make dev-server
```

The gateway starts at http://localhost:5000 with auto-reload on file changes.
Authentication is skipped automatically (`SKIP_AUTH=true`) so any token works.

#### Make API calls

```bash
# List model endpoints
curl http://localhost:5000/v1/model-endpoints \
  -H "Authorization: Bearer test-user"

# Create an LLM endpoint (uses fake k8s — no real infra needed)
curl -X POST http://localhost:5000/v1/llm/model-endpoints \
  -H "Authorization: Bearer test-user" \
  -H "Content-Type: application/json" \
  -d '{"name":"local-test","model_name":"meta-llama/Meta-Llama-3.1-8B-Instruct","inference_framework":"vllm","min_workers":0,"max_workers":1,"gpus":1,"gpu_type":"nvidia-ampere-a10","endpoint_type":"sync"}'
```

#### Stop backing services

```bash
make dev-down
```

#### What `LOCAL=true` does

Running with `LOCAL=true` (set automatically by `make dev-server` and `make dev-migrate`):

- Skips the `GIT_TAG` env var requirement
- Uses a **fake queue delegate** (no SQS/Azure Service Bus needed)
- Uses a **fake Docker repository** (no ECR/ACR/GAR needed)
- Auth is skipped when `identity_service_url` is absent from config (default)
- Postgres and Redis are real local services (via docker-compose)

This means you can create/update/delete endpoints via the API and see them reflected
in Postgres, without any Kubernetes cluster or cloud account.
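
Because all state lands in Postgres, you can verify endpoint create/update/delete calls directly against the database. A minimal sketch, assuming the service and database names from `docker-compose.local.yml` (table names depend on the migration version):

```bash
# Open a psql shell inside the compose-managed Postgres container;
# \dt (or \dt <schema>.*) then lists the tables the migrations created.
docker compose -f docker-compose.local.yml exec postgres \
  psql -U postgres -d llm_engine
```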

#### Running individual components manually

If you prefer to set env vars yourself rather than use `make`:

```bash
export LOCAL=true
export GIT_TAG=local
export ML_INFRA_DATABASE_URL=postgresql://postgres:password@localhost:5432/llm_engine
export DEPLOY_SERVICE_CONFIG_PATH=$(pwd)/service_configs/service_config_local.yaml
export ML_INFRA_SERVICES_CONFIG_PATH=$(pwd)/model_engine_server/core/configs/default.yaml

# Gateway
start-fastapi-server --port 5000 --num-workers 1 --debug

# Database migration
bash model_engine_server/db/migrations/run_database_migration.sh
```

### Full End-to-End Local Flow (control plane + real inference pod)

This setup uses [kind](https://kind.sigs.k8s.io/) (Kubernetes in Docker) to run a real
local k8s cluster. The Service Builder creates actual Deployments in kind; the K8s Cacher
polls kind and updates Redis. No GPU required — we use the built-in echo server as the
inference container.

**Prerequisites:** Python 3.10+, Docker, [`kind`](https://kind.sigs.k8s.io/docs/user/quick-start/#installation)

#### One-time cluster + image setup

```bash
cd model-engine/

# Start Postgres + Redis (if not already running)
make dev-up

# Apply DB migrations (if not already done)
make dev-migrate

# Create kind cluster and the model-engine namespace
make kind-up

# Build model-engine:local and load it into kind
make kind-image # takes ~2-3 min on first build
```

#### Run the full stack (4 terminals)

```bash
# Terminal 1 — Gateway
make dev-server-full

# Terminal 2 — Service Builder (picks up endpoint creation tasks from Redis)
make dev-service-builder

# Terminal 3 — K8s Cacher (polls kind, writes endpoint status to Redis)
make dev-k8s-cacher
```

#### Create a test endpoint and watch it spin up

```python
# Terminal 4 — create a sync CPU endpoint using the echo server (launch-python-client)
import time
from launch import LaunchClient, EndpointRequest

# Any token works — LOCAL=true skips auth; the token becomes the user/owner ID
client = LaunchClient(api_key="test-user", endpoint="http://localhost:5000")

# Create the model bundle (echo server image loaded into kind via `make kind-image`)
bundle = client.create_model_bundle_from_runnable_image_v2(
    model_bundle_name="echo-bundle",
    repository="model-engine",
    tag="local",
    command=[
        "python", "-m",
        "model_engine_server.inference.forwarding.echo_server",
        "--port", "5005",
    ],
    predict_route="/predict",
    healthcheck_route="/healthz",
    readiness_initial_delay_seconds=15,
)

# Create a sync CPU endpoint
client.create_model_endpoint(
    endpoint_name="local-echo",
    model_bundle=bundle,
    endpoint_type="sync",
    cpus=0.25,
    memory="256Mi",
    min_workers=1,
    max_workers=1,
    per_worker=1,
)

# Poll until READY — transitions PENDING → UPDATE_PENDING → READY (~30-60 s)
while True:
    ep = client.get_model_endpoint("local-echo")
    print(f"status: {ep.status}")
    if ep.status == "READY":
        break
    time.sleep(5)

# Make a prediction against the echo server
response = ep.predict(request=EndpointRequest(args={"text": "hello"}))
print(response)
```

```bash
# Watch the pod come up in kind (separate terminal)
kubectl --context kind-llm-engine get pods -n model-engine -w
```

#### Tear down

```bash
make kind-down # delete kind cluster
make dev-down # stop Postgres + Redis
```

#### How the full flow works

| Component | Mode | What it does locally |
|---|---|---|
| Gateway (`dev-server-full`) | `cloud_provider=onprem` + `LOCAL=true` | Real Redis queue, fake Docker registry |
| Service Builder | `cloud_provider=onprem` + Redis broker | Creates real k8s Deployments in kind |
| K8s Cacher | `cloud_provider=onprem` | Polls kind, writes status to Redis |
| Inference pod | `model-engine:local` in kind | Runs echo server on port 5005 |
| Forwarder sidecar | `model-engine:local` in kind | HTTP forwarder proxies requests |

> **Note:** LLM endpoints (vLLM, TGI) require GPU hardware and pulling large images — use the generic sync endpoint with the echo server for local flow testing.
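
To watch the Gateway and Service Builder talking through the broker while an endpoint spins up, one low-tech option is Redis's command stream. A sketch, assuming the Redis container from `docker-compose.local.yml` on its default port:

```bash
# Stream every command hitting the local Redis broker; useful for watching
# Celery tasks get enqueued by the Gateway and consumed by the Service Builder.
docker compose -f docker-compose.local.yml exec redis redis-cli monitor
```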

### Testing the HTTP Forwarder

Start an endpoint on port 5005:
28 changes: 28 additions & 0 deletions model-engine/docker-compose.local.yml
@@ -0,0 +1,28 @@
services:
  postgres:
    image: postgres:15
    environment:
      POSTGRES_PASSWORD: password
      POSTGRES_DB: llm_engine
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5

  redis:
    image: redis:7
    ports:
      - "6379:6379"
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 5s
      retries: 5

volumes:
  postgres_data:
10 changes: 5 additions & 5 deletions model-engine/model_engine_server/api/dependencies.py
@@ -10,7 +10,7 @@
from model_engine_server.common.aioredis_pool import build_aioredis_pool
from model_engine_server.common.config import hmi_config
from model_engine_server.common.dtos.model_endpoints import BrokerType
-from model_engine_server.common.env_vars import CIRCLECI
+from model_engine_server.common.env_vars import CIRCLECI, LOCAL
from model_engine_server.core.auth.authentication_repository import AuthenticationRepository, User
from model_engine_server.core.auth.fake_authentication_repository import (
    FakeAuthenticationRepository,
@@ -241,7 +241,7 @@ def _get_external_interfaces(
    )

    queue_delegate: QueueEndpointResourceDelegate
-    if CIRCLECI:
+    if CIRCLECI or (LOCAL and infra_config().cloud_provider != "onprem"):
        queue_delegate = FakeQueueEndpointResourceDelegate()
    elif infra_config().cloud_provider == "onprem":
        queue_delegate = OnPremQueueEndpointResourceDelegate()
@@ -257,8 +257,8 @@

    inference_task_queue_gateway: TaskQueueGateway
    infra_task_queue_gateway: TaskQueueGateway
-    if CIRCLECI or infra_config().cloud_provider == "onprem":
-        # On-prem uses Redis-based task queues
+    if CIRCLECI or LOCAL or infra_config().cloud_provider == "onprem":
+        # On-prem and local dev use Redis-based task queues
        inference_task_queue_gateway = redis_24h_task_queue_gateway
        infra_task_queue_gateway = redis_task_queue_gateway
    elif infra_config().cloud_provider == "azure":
@@ -391,7 +391,7 @@ def _get_external_interfaces(
    registry_type = infra_config().docker_registry_type or infer_registry_type(
        infra_config().docker_repo_prefix
    )
-    if CIRCLECI:
+    if CIRCLECI or LOCAL:
        docker_repository = FakeDockerRepository()
    elif registry_type == "ecr":
        docker_repository = ECRDockerRepository()
4 changes: 2 additions & 2 deletions model-engine/model_engine_server/common/env_vars.py
@@ -93,6 +93,6 @@ def get_boolean_env_var(name: str) -> bool:
if LOCAL:
    logger.warning("LOCAL development & testing mode is ON")

-GIT_TAG: str = os.environ.get("GIT_TAG", "GIT_TAG_NOT_FOUND")
-if GIT_TAG == "GIT_TAG_NOT_FOUND" and "pytest" not in sys.modules:
+GIT_TAG: str = os.environ.get("GIT_TAG", "local" if LOCAL else "GIT_TAG_NOT_FOUND")
+if GIT_TAG == "GIT_TAG_NOT_FOUND" and "pytest" not in sys.modules and not LOCAL:
    raise ValueError("GIT_TAG environment variable must be set")
model-engine/model_engine_server/common/startup_tracing/correlation.py
@@ -12,6 +12,9 @@
try:
    from opentelemetry import trace
    from opentelemetry.context import Context
+    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (  # noqa: F401
+        OTLPMetricExporter,
+    )
Comment on lines +15 to +17:
**P1: OTLP exporter import tightens OTel availability requirement silently**

`OTLPMetricExporter` is placed in the same `try/except ImportError` block as the core SDK availability check. This means any environment that has `opentelemetry-api` + `opentelemetry-sdk` installed but NOT `opentelemetry-exporter-otlp-proto-grpc` will now get `OTEL_AVAILABLE = False`, and all trace correlation will silently be skipped. The exporter is only listed in the vLLM-specific requirements (`inference/vllm/requirements.txt`), not in the main `requirements.txt`, making this a fragile dependency for a shared utility. The import should be in its own nested `try/except`, or removed entirely if `OTLPMetricExporter` isn't actually instantiated in this file.
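
One way the suggested fix could look. A sketch only, not part of this PR; it assumes `OTEL_AVAILABLE` is the flag this import block sets:

```python
# Core availability check: only the SDK pieces that trace correlation
# actually requires decide OTEL_AVAILABLE.
try:
    from opentelemetry import trace
    from opentelemetry.context import Context
    from opentelemetry.sdk.trace import TracerProvider  # noqa: F401 - SDK availability check
    from opentelemetry.trace import NonRecordingSpan, SpanContext, TraceFlags

    OTEL_AVAILABLE = True
except ImportError:
    OTEL_AVAILABLE = False

# Optional exporter: a missing opentelemetry-exporter-otlp-proto-grpc package
# no longer disables trace correlation for environments without the vLLM extras.
try:
    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
except ImportError:
    OTLPMetricExporter = None
```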


    from opentelemetry.sdk.trace import TracerProvider  # noqa: F401 - SDK availability check
    from opentelemetry.trace import NonRecordingSpan, SpanContext, TraceFlags

Expand Down
16 changes: 16 additions & 0 deletions model-engine/model_engine_server/core/configs/local-full.yaml
@@ -0,0 +1,16 @@
cloud_provider: onprem
env: local
k8s_cluster_name: kind-llm-engine
dns_host_domain: localhost
default_region: us-east-1
ml_account_id: local
docker_repo_prefix: "localhost"
s3_bucket: local-bucket
redis_host: localhost
redis_port: 6379
celery_broker_type_redis: true
db_engine_pool_size: 5
db_engine_max_overflow: 5
db_engine_echo: false
db_engine_echo_pool: false
db_engine_disconnect_strategy: pessimistic
@@ -28,7 +28,9 @@
logger = make_logger(logger_name())
_cloud_provider = infra_config().cloud_provider
backend_protocol = (
-    "abs" if _cloud_provider == "azure" else ("redis" if _cloud_provider == "gcp" else "s3")
+    "abs"
+    if _cloud_provider == "azure"
+    else ("redis" if _cloud_provider in ("gcp", "onprem") else "s3")
)

celery_redis = celery_app(