diff --git a/charts/model-engine/values_sample.yaml b/charts/model-engine/values_sample.yaml
index b8dc6fb7e..3ba83d500 100644
--- a/charts/model-engine/values_sample.yaml
+++ b/charts/model-engine/values_sample.yaml
@@ -24,7 +24,7 @@ celery_broker_type_redis: null
 #   - ALL
 # tag [required] is the LLM Engine docker image tag
-tag: e360bfb1d21d9d4e7b7fcb6b29ca752095b4d0f4
+tag: 2e9d00786419ef44ec5c9d3305d8d6451d6aabfb
 # context is a user-specified deployment tag. Can be used to
 context: production
 image:
diff --git a/model-engine/Makefile b/model-engine/Makefile
new file mode 100644
index 000000000..b16349ecb
--- /dev/null
+++ b/model-engine/Makefile
@@ -0,0 +1,71 @@
+.PHONY: install dev-up dev-down dev-migrate dev-server test \
+	kind-up kind-down kind-image \
+	dev-service-builder dev-k8s-cacher dev-server-full
+
+MODEL_ENGINE_DIR := $(abspath .)
+DB_URL := postgresql://postgres:password@localhost:5432/llm_engine
+KIND_CLUSTER := llm-engine
+KUBE_CONTEXT := kind-$(KIND_CLUSTER)
+
+# ── Control-plane-only (no k8s, fake queue/docker) ──────────────────────────
+LOCAL_ENV := \
+	LOCAL=true \
+	GIT_TAG=local \
+	ML_INFRA_DATABASE_URL=$(DB_URL) \
+	DEPLOY_SERVICE_CONFIG_PATH=$(MODEL_ENGINE_DIR)/service_configs/service_config_local.yaml \
+	ML_INFRA_SERVICES_CONFIG_PATH=$(MODEL_ENGINE_DIR)/model_engine_server/core/configs/default.yaml
+
+# ── Full end-to-end (real k8s via kind, real Redis queue, fake docker) ───────
+FULL_LOCAL_ENV := \
+	LOCAL=true \
+	GIT_TAG=local \
+	ML_INFRA_DATABASE_URL=$(DB_URL) \
+	DEPLOY_SERVICE_CONFIG_PATH=$(MODEL_ENGINE_DIR)/service_configs/service_config_local.yaml \
+	ML_INFRA_SERVICES_CONFIG_PATH=$(MODEL_ENGINE_DIR)/model_engine_server/core/configs/local-full.yaml \
+	REDIS_HOST=localhost \
+	REDIS_PORT=6379
+
+# ── One-time setup ───────────────────────────────────────────────────────────
+install:
+	pip install -r requirements.txt -r requirements-test.txt -r requirements_override.txt
+	pip install -e .
+
+# ── Backing services (Postgres + Redis) ─────────────────────────────────────
+dev-up:
+	docker compose -f docker-compose.local.yml up -d --wait
+
+dev-down:
+	docker compose -f docker-compose.local.yml down
+
+dev-migrate:
+	$(LOCAL_ENV) bash model_engine_server/db/migrations/run_database_migration.sh
+
+# ── Control-plane-only server ────────────────────────────────────────────────
+dev-server:
+	$(LOCAL_ENV) start-fastapi-server --port 5000 --num-workers 1 --debug
+
+# ── kind cluster (for full end-to-end flow) ──────────────────────────────────
+kind-up:
+	kind create cluster --name $(KIND_CLUSTER)
+	kubectl --context $(KUBE_CONTEXT) create namespace model-engine --dry-run=client -o yaml | kubectl --context $(KUBE_CONTEXT) apply -f -
+
+kind-down:
+	kind delete cluster --name $(KIND_CLUSTER)
+
+kind-image:
+	docker build -t model-engine:local ..
+	kind load docker-image model-engine:local --name $(KIND_CLUSTER)
+
+# ── Full end-to-end processes (run each in a separate terminal) ──────────────
+dev-server-full:
+	$(FULL_LOCAL_ENV) start-fastapi-server --port 5000 --num-workers 1 --debug
+
+dev-service-builder:
+	$(FULL_LOCAL_ENV) celery -A model_engine_server.service_builder.celery worker --loglevel=info --concurrency=2
+
+dev-k8s-cacher:
+	$(FULL_LOCAL_ENV) python model_engine_server/entrypoints/k8s_cache.py --sleep-interval-seconds 5
+
+# ── Tests ─────────────────────────────────────────────────────────────────────
+test:
+	pytest tests/unit/
diff --git a/model-engine/README.md b/model-engine/README.md
index febdda782..1df093b65 100644
--- a/model-engine/README.md
+++ b/model-engine/README.md
@@ -129,6 +129,203 @@ For OpenAI-compatible V2 APIs, we generate Pydantic models from OpenAI's spec:
 
 ## Local Development
 
+### Control Plane Local Setup
+
+The control plane (Gateway API server, Service Builder, K8s Cache) can be run entirely
+locally without GPU hardware or cloud credentials. Endpoint creation calls succeed
+against a fake k8s/SQS/ECR backend, letting you iterate on control plane code quickly.
+
+**Prerequisites:** Python 3.10+, Docker
+
+#### One-time setup
+
+```bash
+cd model-engine/
+
+# Install Python dependencies
+make install
+
+# Start Postgres + Redis
+make dev-up
+
+# Apply database migrations
+make dev-migrate
+```
+
+#### Run the API server
+
+```bash
+make dev-server
+```
+
+The gateway starts at http://localhost:5000 with auto-reload on file changes.
+Authentication is skipped automatically (`SKIP_AUTH=true`) so any token works.
+
+#### Make API calls
+
+```bash
+# List model endpoints
+curl http://localhost:5000/v1/model-endpoints \
+  -H "Authorization: Bearer test-user"
+
+# Create an LLM endpoint (uses fake k8s — no real infra needed)
+curl -X POST http://localhost:5000/v1/llm/model-endpoints \
+  -H "Authorization: Bearer test-user" \
+  -H "Content-Type: application/json" \
+  -d '{"name":"local-test","model_name":"meta-llama/Meta-Llama-3.1-8B-Instruct","inference_framework":"vllm","min_workers":0,"max_workers":1,"gpus":1,"gpu_type":"nvidia-ampere-a10","endpoint_type":"sync"}'
+```
+
+#### Stop backing services
+
+```bash
+make dev-down
+```
+
+#### What `LOCAL=true` does
+
+Running with `LOCAL=true` (set automatically by `make dev-server` and `make dev-migrate`):
+
+- Skips the `GIT_TAG` env var requirement
+- Uses a **fake queue delegate** (no SQS/Azure Service Bus needed)
+- Uses a **fake Docker repository** (no ECR/ACR/GAR needed)
+- Auth is skipped when `identity_service_url` is absent from config (default)
+- Postgres and Redis are real local services (via docker-compose)
+
+This means you can create/update/delete endpoints via the API and see them reflected
+in Postgres, without any Kubernetes cluster or cloud account.
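+
+For reference, here is the selection logic this PR adds, condensed from the
+`model_engine_server/api/dependencies.py` changes in this diff:
+
+```python
+# Condensed sketch of the LOCAL=true wiring in _get_external_interfaces()
+if CIRCLECI or (LOCAL and infra_config().cloud_provider != "onprem"):
+    queue_delegate = FakeQueueEndpointResourceDelegate()  # no SQS/Azure Service Bus
+if CIRCLECI or LOCAL or infra_config().cloud_provider == "onprem":
+    inference_task_queue_gateway = redis_24h_task_queue_gateway  # Redis task queues
+if CIRCLECI or LOCAL:
+    docker_repository = FakeDockerRepository()  # no ECR/ACR/GAR
+```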
+
+#### Running individual components manually
+
+If you prefer to set env vars yourself rather than use `make`:
+
+```bash
+export LOCAL=true
+export GIT_TAG=local
+export ML_INFRA_DATABASE_URL=postgresql://postgres:password@localhost:5432/llm_engine
+export DEPLOY_SERVICE_CONFIG_PATH=$(pwd)/service_configs/service_config_local.yaml
+export ML_INFRA_SERVICES_CONFIG_PATH=$(pwd)/model_engine_server/core/configs/default.yaml
+
+# Gateway
+start-fastapi-server --port 5000 --num-workers 1 --debug
+
+# Database migration
+bash model_engine_server/db/migrations/run_database_migration.sh
+```
+
+### Full End-to-End Local Flow (control plane + real inference pod)
+
+This setup uses [kind](https://kind.sigs.k8s.io/) (Kubernetes in Docker) to run a real
+local k8s cluster. The Service Builder creates actual Deployments in kind; the K8s Cacher
+polls kind and updates Redis. No GPU required — we use the built-in echo server as the
+inference container.
+
+**Prerequisites:** Python 3.10+, Docker, [`kind`](https://kind.sigs.k8s.io/docs/user/quick-start/#installation)
+
+#### One-time cluster + image setup
+
+```bash
+cd model-engine/
+
+# Start Postgres + Redis (if not already running)
+make dev-up
+
+# Apply DB migrations (if not already done)
+make dev-migrate
+
+# Create kind cluster and the model-engine namespace
+make kind-up
+
+# Build model-engine:local and load it into kind
+make kind-image  # takes ~2-3 min on first build
+```
+
+#### Run the full stack (4 terminals)
+
+```bash
+# Terminal 1 — Gateway
+make dev-server-full
+
+# Terminal 2 — Service Builder (picks up endpoint creation tasks from Redis)
+make dev-service-builder
+
+# Terminal 3 — K8s Cacher (polls kind, writes endpoint status to Redis)
+make dev-k8s-cacher
+```
+
+#### Create a test endpoint and watch it spin up
+
+```python
+# Terminal 4 — create a sync CPU endpoint using the echo server (launch-python-client)
+import time
+from launch import LaunchClient, EndpointRequest
+
+# Any token works — LOCAL=true skips auth; the token becomes the user/owner ID
+client = LaunchClient(api_key="test-user", endpoint="http://localhost:5000")
+
+# Create the model bundle (echo server image loaded into kind via `make kind-image`)
+bundle = client.create_model_bundle_from_runnable_image_v2(
+    model_bundle_name="echo-bundle",
+    repository="model-engine",
+    tag="local",
+    command=[
+        "python", "-m",
+        "model_engine_server.inference.forwarding.echo_server",
+        "--port", "5005",
+    ],
+    predict_route="/predict",
+    healthcheck_route="/healthz",
+    readiness_initial_delay_seconds=15,
+)
+
+# Create a sync CPU endpoint
+client.create_model_endpoint(
+    endpoint_name="local-echo",
+    model_bundle=bundle,
+    endpoint_type="sync",
+    cpus=0.25,
+    memory="256Mi",
+    min_workers=1,
+    max_workers=1,
+    per_worker=1,
+)
+
+# Poll until READY — transitions PENDING → UPDATE_PENDING → READY (~30-60 s)
+while True:
+    ep = client.get_model_endpoint("local-echo")
+    print(f"status: {ep.status}")
+    if ep.status == "READY":
+        break
+    time.sleep(5)
+
+# Make a prediction against the echo server
+response = ep.predict(request=EndpointRequest(args={"text": "hello"}))
+print(response)
+```
+
+```bash
+# Watch the pod come up in kind (separate terminal)
+kubectl --context kind-llm-engine get pods -n model-engine -w
+```
+
+#### Tear down
+
+```bash
+make kind-down  # delete kind cluster
+make dev-down   # stop Postgres + Redis
+```
+
+#### How the full flow works
+
+| Component | Mode | What it does locally |
+|---|---|---|
+| Gateway (`dev-server-full`) | `cloud_provider=onprem` + `LOCAL=true` | Real Redis queue, fake Docker registry |
+| Service Builder | `cloud_provider=onprem` + Redis broker | Creates real k8s Deployments in kind |
+| K8s Cacher | `cloud_provider=onprem` | Polls kind, writes status to Redis |
+| Inference pod | `model-engine:local` in kind | Runs echo server on port 5005 |
+| Forwarder sidecar | `model-engine:local` in kind | HTTP forwarder proxies requests |
+
+> **Note:** LLM endpoints (vLLM, TGI) require GPU hardware and pulling large images — use the generic sync endpoint with the echo server for local flow testing.
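+
+To sanity-check the running stack, a few read-only probes (the Redis DB number comes
+from `cache_redis_onprem_url` in `service_config_local.yaml`; the exact cache key
+layout is an implementation detail, so treat the `redis-cli` line as exploratory):
+
+```bash
+# The endpoint should be listed by the gateway
+curl -s http://localhost:5000/v1/model-endpoints -H "Authorization: Bearer test-user"
+
+# The Deployment created by the Service Builder should exist in kind
+kubectl --context kind-llm-engine get deployments -n model-engine
+
+# Peek at what the K8s Cacher wrote to Redis DB 15
+docker compose -f docker-compose.local.yml exec redis redis-cli -n 15 keys '*'
+```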
+
 ### Testing the HTTP Forwarder
 
 Start an endpoint on port 5005:
diff --git a/model-engine/docker-compose.local.yml b/model-engine/docker-compose.local.yml
new file mode 100644
index 000000000..ff07c202d
--- /dev/null
+++ b/model-engine/docker-compose.local.yml
@@ -0,0 +1,28 @@
+services:
+  postgres:
+    image: postgres:15
+    environment:
+      POSTGRES_PASSWORD: password
+      POSTGRES_DB: llm_engine
+    ports:
+      - "5432:5432"
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U postgres"]
+      interval: 5s
+      timeout: 5s
+      retries: 5
+
+  redis:
+    image: redis:7
+    ports:
+      - "6379:6379"
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 5s
+      retries: 5
+
+volumes:
+  postgres_data:
diff --git a/model-engine/model_engine_server/api/dependencies.py b/model-engine/model_engine_server/api/dependencies.py
index f28425d88..4cc0bb7be 100644
--- a/model-engine/model_engine_server/api/dependencies.py
+++ b/model-engine/model_engine_server/api/dependencies.py
@@ -10,7 +10,7 @@
 from model_engine_server.common.aioredis_pool import build_aioredis_pool
 from model_engine_server.common.config import hmi_config
 from model_engine_server.common.dtos.model_endpoints import BrokerType
-from model_engine_server.common.env_vars import CIRCLECI
+from model_engine_server.common.env_vars import CIRCLECI, LOCAL
 from model_engine_server.core.auth.authentication_repository import AuthenticationRepository, User
 from model_engine_server.core.auth.fake_authentication_repository import (
     FakeAuthenticationRepository,
@@ -241,7 +241,7 @@ def _get_external_interfaces(
     )
 
     queue_delegate: QueueEndpointResourceDelegate
-    if CIRCLECI:
+    if CIRCLECI or (LOCAL and infra_config().cloud_provider != "onprem"):
         queue_delegate = FakeQueueEndpointResourceDelegate()
     elif infra_config().cloud_provider == "onprem":
         queue_delegate = OnPremQueueEndpointResourceDelegate()
@@ -257,8 +257,8 @@ def _get_external_interfaces(
 
     inference_task_queue_gateway: TaskQueueGateway
     infra_task_queue_gateway: TaskQueueGateway
-    if CIRCLECI or infra_config().cloud_provider == "onprem":
-        # On-prem uses Redis-based task queues
+    if CIRCLECI or LOCAL or infra_config().cloud_provider == "onprem":
+        # On-prem and local dev use Redis-based task queues
         inference_task_queue_gateway = redis_24h_task_queue_gateway
         infra_task_queue_gateway = redis_task_queue_gateway
     elif infra_config().cloud_provider == "azure":
@@ -391,7 +391,7 @@ def _get_external_interfaces(
     registry_type = infra_config().docker_registry_type or infer_registry_type(
         infra_config().docker_repo_prefix
     )
-    if CIRCLECI:
+    if CIRCLECI or LOCAL:
         docker_repository = FakeDockerRepository()
     elif registry_type == "ecr":
         docker_repository = ECRDockerRepository()
diff --git a/model-engine/model_engine_server/common/env_vars.py b/model-engine/model_engine_server/common/env_vars.py
index 7c4626d65..0a7907bd1 100644
--- a/model-engine/model_engine_server/common/env_vars.py
+++ b/model-engine/model_engine_server/common/env_vars.py
@@ -93,6 +93,6 @@ def get_boolean_env_var(name: str) -> bool:
 if LOCAL:
     logger.warning("LOCAL development & testing mode is ON")
 
-GIT_TAG: str = os.environ.get("GIT_TAG", "GIT_TAG_NOT_FOUND")
-if GIT_TAG == "GIT_TAG_NOT_FOUND" and "pytest" not in sys.modules:
+GIT_TAG: str = os.environ.get("GIT_TAG", "local" if LOCAL else "GIT_TAG_NOT_FOUND")
+if GIT_TAG == "GIT_TAG_NOT_FOUND" and "pytest" not in sys.modules and not LOCAL:
     raise ValueError("GIT_TAG environment variable must be set")
diff --git a/model-engine/model_engine_server/common/startup_tracing/correlation.py b/model-engine/model_engine_server/common/startup_tracing/correlation.py
index c09156a6b..220507a5d 100644
--- a/model-engine/model_engine_server/common/startup_tracing/correlation.py
+++ b/model-engine/model_engine_server/common/startup_tracing/correlation.py
@@ -12,6 +12,9 @@
 
 try:
     from opentelemetry import trace
     from opentelemetry.context import Context
+    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (  # noqa: F401
+        OTLPMetricExporter,
+    )
     from opentelemetry.sdk.trace import TracerProvider  # noqa: F401 - SDK availability check
     from opentelemetry.trace import NonRecordingSpan, SpanContext, TraceFlags
diff --git a/model-engine/model_engine_server/core/configs/local-full.yaml b/model-engine/model_engine_server/core/configs/local-full.yaml
new file mode 100644
index 000000000..cc0fb02f3
--- /dev/null
+++ b/model-engine/model_engine_server/core/configs/local-full.yaml
@@ -0,0 +1,16 @@
+cloud_provider: onprem
+env: local
+k8s_cluster_name: kind-llm-engine
+dns_host_domain: localhost
+default_region: us-east-1
+ml_account_id: local
+docker_repo_prefix: "localhost"
+s3_bucket: local-bucket
+redis_host: localhost
+redis_port: 6379
+celery_broker_type_redis: true
+db_engine_pool_size: 5
+db_engine_max_overflow: 5
+db_engine_echo: false
+db_engine_echo_pool: false
+db_engine_disconnect_strategy: pessimistic
diff --git a/model-engine/model_engine_server/infra/gateways/celery_task_queue_gateway.py b/model-engine/model_engine_server/infra/gateways/celery_task_queue_gateway.py
index 62eab4183..ac1d32380 100644
--- a/model-engine/model_engine_server/infra/gateways/celery_task_queue_gateway.py
+++ b/model-engine/model_engine_server/infra/gateways/celery_task_queue_gateway.py
@@ -28,7 +28,9 @@ logger = make_logger(logger_name())
 
 _cloud_provider = infra_config().cloud_provider
 backend_protocol = (
-    "abs" if _cloud_provider == "azure" else ("redis" if _cloud_provider == "gcp" else "s3")
+    "abs"
+    if _cloud_provider == "azure"
+    else ("redis" if _cloud_provider in ("gcp", "onprem") else "s3")
 )
 
 celery_redis = celery_app(
diff --git a/model-engine/model_engine_server/service_builder/celery.py b/model-engine/model_engine_server/service_builder/celery.py
index e9e09df4e..c6845b8af 100644
--- a/model-engine/model_engine_server/service_builder/celery.py
+++ b/model-engine/model_engine_server/service_builder/celery.py
@@ -31,7 +31,7 @@ def get_broker_type(cloud_provider: str, is_ci: bool, force_redis: bool) -> str:
     backend_protocol=(
         "abs"
         if infra_config().cloud_provider == "azure"
-        else ("redis" if infra_config().cloud_provider == "gcp" else "s3")
+        else ("redis" if infra_config().cloud_provider in ("gcp", "onprem") else "s3")
     ),
     # Add detailed task tracking for debugging
     task_track_started=True,
diff --git a/model-engine/service_configs/service_config_local.yaml b/model-engine/service_configs/service_config_local.yaml
new file mode 100644
index 000000000..541da4b3c
--- /dev/null
+++ b/model-engine/service_configs/service_config_local.yaml
@@ -0,0 +1,33 @@
+gateway_namespace: default
+endpoint_namespace: model-engine
+model_primitive_host: "none"
+
+# Local Redis (started via docker-compose.local.yml)
+# Use onprem_url so it's checked before cloud_provider assertions
+cache_redis_onprem_url: redis://localhost:6379/15
+
+sqs_profile: nonexistent_sqs_profile
+sqs_queue_policy_template: >
+  {
+    "Version": "2012-10-17",
+    "Statement": []
+  }
+sqs_queue_tag_template: >
+  {}
+
+billing_queue_arn: none
+cloud_file_llm_fine_tune_repository: "s3://local-bucket/fine_tune_repository/local"
+
+dd_trace_enabled: false
+istio_enabled: false
+sensitive_log_mode: false
+tgi_repository: "text-generation-inference"
+vllm_repository: "vllm"
+lightllm_repository: "lightllm"
+tensorrt_llm_repository: "tensorrt-llm"
+batch_inference_vllm_repository: "llm-engine/batch-infer-vllm"
+user_inference_base_repository: "launch/inference"
+user_inference_pytorch_repository: "hosted-model-inference/async-pytorch"
+user_inference_tensorflow_repository: "hosted-model-inference/async-tensorflow-cpu"
+docker_image_layer_cache_repository: "kaniko-cache"
+hf_user_fine_tuned_weights_prefix: "s3://local-bucket/model-weights"