diff --git a/.env.example b/.env.example index 5b151e342..73b8ab494 100644 --- a/.env.example +++ b/.env.example @@ -65,13 +65,14 @@ export P2P_BOOTSTRAP_NODES= export P2P_FILTER_ANNOUNCED_ADDRESSES= ## compute -# Each environment defines its own resources (CPU, RAM, disk, GPUs) with full configuration. -# CPU, RAM, and disk are per-env exclusive: inUse tracked only within the environment where the job runs. -# A global check ensures the aggregate usage across all environments does not exceed physical capacity. -# GPUs are shared-exclusive: if a job on envA uses gpu0, it shows as in-use on envB too. -# CPU cores are automatically partitioned across environments based on each env's cpu.total. -# CPU and RAM defaults are auto-detected from the system when not configured. -# export DOCKER_COMPUTE_ENVIRONMENTS='[{"socketPath":"/var/run/docker.sock","environments":[{"id":"envA","storageExpiry":604800,"maxJobDuration":3600,"minJobDuration":60,"resources":[{"id":"cpu","total":4,"max":4,"min":1,"type":"cpu"},{"id":"ram","total":16,"max":16,"min":1,"type":"ram"},{"id":"disk","total":500,"max":500,"min":10,"type":"disk"},{"id":"gpu0","total":1,"max":1,"min":0,"type":"gpu","init":{"deviceRequests":{"Driver":"nvidia","DeviceIDs":["0"],"Capabilities":[["gpu"]]}}}],"fees":{"1":[{"feeToken":"0x123","prices":[{"id":"cpu","price":1},{"id":"ram","price":0.1},{"id":"disk","price":0.01},{"id":"gpu0","price":5}]}]}}]}]' +# Resources are defined at the Docker-connection level (socketPath) and shared across all environments. +# cpu, ram, and disk are auto-detected from the host — omit them to use all available capacity, +# or include them to cap/reserve (e.g. limit an 8-core host to 6 cores for compute). +# GPUs and other hardware go in the connection-level "resources" array with kind:"discrete". +# Each environment references pool resources by id using lightweight refs {id, total?, min?, max?}. 
+# Dual-gate tracking for fungible resources: per-env ceiling (Gate 1) + engine-wide pool (Gate 2). +# Discrete resources (GPUs) are tracked globally — a GPU in use on envA shows as in-use on envB too. +# export DOCKER_COMPUTE_ENVIRONMENTS='[{"socketPath":"/var/run/docker.sock","resources":[{"id":"disk","total":500},{"id":"gpu0","kind":"discrete","type":"gpu","total":1,"description":"NVIDIA A100","platform":"nvidia","driverVersion":"570.195.03","init":{"deviceRequests":{"Driver":"nvidia","DeviceIDs":["GPU-uuid-a"],"Capabilities":[["gpu"]]}}}],"environments":[{"id":"envA","storageExpiry":604800,"maxJobDuration":3600,"minJobDuration":60,"resources":[{"id":"cpu"},{"id":"ram"},{"id":"disk","max":500},{"id":"gpu0"}],"fees":{"1":[{"feeToken":"0x123","prices":[{"id":"cpu","price":1},{"id":"ram","price":0.1},{"id":"disk","price":0.01},{"id":"gpu0","price":5}]}]}}]}]' export DOCKER_COMPUTE_ENVIRONMENTS= diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..5413fff37 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,27 @@ +# Claude Code Instructions for ocean-node + +## Node.js version + +This repo requires **Node.js 22** (see `.nvmrc`). + +**Always run `nvm use` before any test, build, or `npm` command**, or the wrong Node version will be active and commands will fail with errors like `Unexpected token 'with'` or missing `GLIBC_2.38`. 
+ +```bash +source ~/.nvm/nvm.sh && nvm use +``` + +If `sqlite3` native bindings fail after switching to Node 22, rebuild from source: + +```bash +npm_config_build_from_source=true npm rebuild sqlite3 +``` + +## Running tests + +```bash +# Unit tests (compute only — fast) +source ~/.nvm/nvm.sh && nvm use && npm run test:computeunit + +# All unit tests +source ~/.nvm/nvm.sh && nvm use && npm run test:unit +``` diff --git a/README.md b/README.md index c3d1d33ba..67492baac 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ This command will run you through the process of setting up the environmental va > [!NOTE] > The quickstart script attempts to automatically detect GPUs (NVIDIA via `nvidia-smi`, others via `lspci`) and appends them to your `DOCKER_COMPUTE_ENVIRONMENTS`. +> Detected GPUs are added to `DOCKER_COMPUTE_ENVIRONMENTS[0].resources` (the connection-level resource pool), and a lightweight ref is added to `DOCKER_COMPUTE_ENVIRONMENTS[0].environments[0].resources`. > If you choose to manually configure `DOCKER_COMPUTE_ENVIRONMENTS` before running the script (e.g. via environment variable), be aware that auto-detected GPUs will be **merged** into your configuration, which could lead to duplication if you already manually defined them. > For most users, it is recommended to let the script handle GPU detection automatically. 
diff --git a/config.json b/config.json index 42d364183..703783c67 100644 --- a/config.json +++ b/config.json @@ -97,15 +97,27 @@ "dockerComputeEnvironments": [ { "socketPath": "/var/run/docker.sock", + "resources": [ + { + "id": "disk", + "total": 1 + } + ], "environments": [ { "storageExpiry": 604800, "maxJobDuration": 3600, "minJobDuration": 60, "resources": [ + { + "id": "cpu" + }, + { + "id": "ram" + }, { "id": "disk", - "total": 1 + "max": 1 } ], "access": { diff --git a/docs/GPU.md b/docs/GPU.md index b89a7664f..e67af2a85 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -1,14 +1,26 @@ Supporting GPUs for c2d jobs comes down to: -- define gpu list for each c2d env -- pass docker args for each gpu -- set a price for each gpu (see [Compute pricing](compute-pricing.md) for pricing units and examples) +- define each GPU as a named resource at the **connection level** (same level as `socketPath`) +- pass docker device args inside each GPU's `init` block +- reference the GPU by id in the environment's `resources` list +- set a price for each GPU in the environment's `fees` (see [Compute pricing](compute-pricing.md)) -## Nvidia GPU Example +## Key rules -Start by installing nvidia cuda drivers (ie:https://docs.nvidia.com/cuda/cuda-installation-guide-linux/), then install nvidia container toolkit (https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) +- Each physical GPU is its own resource with a unique id and exactly **one** `DeviceID`. +- `kind: "discrete"` (non-fungible) is the default when `init` is present: only one job at a time can use the device, unless the resource is also marked `shareable: true` (see "Shareable Devices" below). +- `cpu`, `ram`, and `disk` are **auto-detected** from the host — you do not need to declare them unless you want to cap their totals. +- Environment `resources` are **lightweight refs** (`id` + optional `total`/`min`/`max`/`constraints`). Hardware details (`init`, `driverVersion`, `platform`, etc.) live only at connection level. 
-Once that is done, check if you can get gpu details by running 'nvidia-smi': +> **Security note**: `init.advanced` entries (`Binds`, `CapAdd`, `Devices`, `SecurityOpt`) apply to every job in every environment that references the resource. Review them carefully before adding to production configs. + +--- + +## NVIDIA GPU Example + +Install NVIDIA CUDA drivers (https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) and NVIDIA Container Toolkit (https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). + +Check your GPU details: ``` root@gpu-1:/repos/ocean/ocean-node# nvidia-smi @@ -18,7 +30,7 @@ Fri Apr 25 06:00:34 2025 |-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | +| | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA GeForce GTX 1060 3GB Off | 00000000:01:00.0 Off | N/A | | 0% 39C P8 6W / 120W | 2MiB / 3072MiB | 0% Default | @@ -34,7 +46,7 @@ Fri Apr 25 06:00:34 2025 +-----------------------------------------------------------------------------------------+ ``` -Now, time to get the id of the gpu: +Get the GPU UUID: ```bash root@gpu-1:/repos/ocean/ocean-node# nvidia-smi --query-gpu=name,uuid,driver_version,memory.total --format=csv @@ -42,40 +54,23 @@ name, uuid, driver version, memory total NVIDIA GeForce GTX 1060 3GB, GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81, 570.195.03, 3072 MiB ``` -Now, we can define the gpu for node: - -```json -{ - "id": "myGPU", - "description": "NVIDIA GeForce GTX 1060 3GB", - "type": "gpu", - "total": 1, - "init": { - "deviceRequests": { - "Driver": "nvidia", - "DeviceIDs": ["GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81"], - "Capabilities": [["gpu"]] - } - }, - "driverVersion": "570.195.03", - "memoryTotal": "3072 MiB" -} -``` - -Don't forget to 
add it to fees definition and free definition (if desired). - -Here is the full definition of DOCKER_COMPUTE_ENVIRONMENTS: +Full `DOCKER_COMPUTE_ENVIRONMENTS` configuration: ```json [ { "socketPath": "/var/run/docker.sock", + "resources": [ { - "id": "myGPU", - "description": "NVIDIA GeForce GTX 1060 3GB", + "id": "gpu0", + "kind": "discrete", "type": "gpu", "total": 1, + "description": "NVIDIA GeForce GTX 1060 3GB", + "platform": "nvidia", + "driverVersion": "570.195.03", + "memoryTotal": "3072 MiB", "init": { "deviceRequests": { "Driver": "nvidia", @@ -83,113 +78,66 @@ Here is the full definition of DOCKER_COMPUTE_ENVIRONMENTS: "Capabilities": [["gpu"]] } }, - "driverVersion": "570.195.03", - "memoryTotal": "3072 MiB" - }, - { "id": "disk", "total": 1 } + "constraints": [ + { "id": "ram", "min": 2 }, + { "id": "cpu", "min": 1 } + ] + } ], - "storageExpiry": 604800, - "maxJobDuration": 3600, - "minJobDuration": 60, - "fees": { - "1": [ - { - "feeToken": "0x123", - "prices": [ - { "id": "cpu", "price": 1 }, - { "id": "myGPU", "price": 3 } + + "environments": [ + { + "id": "gpu-env", + "description": "NVIDIA GPU environment", + "storageExpiry": 604800, + "maxJobDuration": 3600, + "minJobDuration": 60, + "enableNetwork": false, + "resources": [ + { "id": "cpu", "min": 1, "max": 4 }, + { "id": "ram", "min": 1, "max": 8 }, + { "id": "disk", "min": 1, "max": 50 }, + { "id": "gpu0" } + ], + "access": { "addresses": [], "accessLists": null }, + "fees": { + "1": [ + { + "feeToken": "0x967da4048cD07aB37855c090aAF366e4ce1b9F48", + "prices": [ + { "id": "cpu", "price": 1 }, + { "id": "gpu0", "price": 3 } + ] + } + ] + }, + "free": { + "maxJobDuration": 60, + "minJobDuration": 10, + "maxJobs": 3, + "access": { "addresses": [], "accessLists": null }, + "resources": [ + { "id": "cpu", "max": 1 }, + { "id": "ram", "max": 2 }, + { "id": "disk", "max": 5 }, + { "id": "gpu0" } ] } - ] - }, - "free": { - "maxJobDuration": 60, - "minJobDuration": 10, - "maxJobs": 3, - "resources": [ - 
{ "id": "cpu", "max": 1 }, - { "id": "ram", "max": 1 }, - { "id": "disk", "max": 1 }, - { "id": "myGPU", "max": 1 } - ] - } + } + ] } ] ``` -And you should have it in your compute envs: +Verify: ```bash -root@gpu-1:/repos/ocean/ocean-node# curl http://localhost:8000/api/services/computeEnvironments +curl http://localhost:8000/api/services/computeEnvironments ``` -```json -[ - { - "id": "0xd6b10b27aab01a72070a5164c07d0517755838b9cb9857e2d5649287ec3aaaa2-0x66073c81f833deaa2f8e2a508f69cf78f8a99b17ba1a64f369af921750f93914", - "runningJobs": 0, - "consumerAddress": "0x00", - "platform": { "architecture": "x86_64", "os": "Ubuntu 22.04.3 LTS" }, - "fees": { - "1": [ - { - "feeToken": "0x123", - "prices": [ - { "id": "cpu", "price": 1 }, - { "id": "myGPU", "price": 3 } - ] - } - ] - }, - "storageExpiry": 604800, - "maxJobDuration": 3600, - "minJobDuration": 60, - "resources": [ - { "id": "cpu", "total": 8, "max": 8, "min": 1, "inUse": 0 }, - { - "id": "ram", - "total": 23, - "max": 23, - "min": 1, - "inUse": 0 - }, - { - "id": "myGPU", - "description": "NVIDIA GeForce GTX 1060 3GB", - "type": "gpu", - "total": 1, - "init": { - "deviceRequests": { - "Driver": "nvidia", - "DeviceIDs": ["GPU-294c6802-bb2f-fedb-f9e0-a26b9142dd81"], - "Capabilities": [["gpu"]] - } - }, - "driverVersion": "570.195.03", - "memoryTotal": "3072 MiB", - "max": 1, - "min": 0, - "inUse": 0 - }, - { "id": "disk", "total": 1, "max": 1, "min": 0, "inUse": 0 } - ], - "free": { - "maxJobDuration": 60, - "minJobDuration": 10, - "maxJobs": 3, - "resources": [ - { "id": "cpu", "max": 1, "inUse": 0 }, - { "id": "ram", "max": 1, "inUse": 0 }, - { "id": "disk", "max": 1, "inUse": 0 }, - { "id": "myGPU", "max": 1, "inUse": 0 } - ] - }, - "runningfreeJobs": 0 - } -] -``` +The response includes `resources` with the GPU fully resolved (including `init`, `driverVersion`, etc.) and `inUse` counters. 
-Start a free job using: +Start a free GPU job: ```json { @@ -201,56 +149,40 @@ Start a free job using: "tag": "2.17.0-gpu", "entrypoint": "python $ALGO" }, - "rawcode": "import tensorflow as tf\nsess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))\nprint(\"Num GPUs Available: \", len(tf.config.list_physical_devices('GPU')))\ngpus = tf.config.list_physical_devices('GPU')\nfor gpu in gpus:\n\tprint('Name:', gpu.name, ' Type:', gpu.device_type)" + "rawcode": "import tensorflow as tf\nprint('Num GPUs Available:', len(tf.config.list_physical_devices('GPU')))" } }, "consumerAddress": "0x00", "signature": "123", "nonce": 1, - "environment": "0xd6b10b27aab01a72070a5164c07d0517755838b9cb9857e2d5649287ec3aaaa2-0x66073c81f833deaa2f8e2a508f69cf78f8a99b17ba1a64f369af921750f93914", + "environment": "", "resources": [ - { - "id": "cpu", - "amount": 1 - }, - { - "id": "myGPU", - "amount": 1 - } + { "id": "cpu", "amount": 1 }, + { "id": "gpu0", "amount": 1 } ] } ``` -And the output of `getComputeResult` should look like: - -```bash -2025-04-25 06:18:20.890217: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered -2025-04-25 06:18:21.192330: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered -2025-04-25 06:18:21.292230: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered -WARNING: All log messages before absl::InitializeLog() is called are written to STDERR -I0000 00:00:1745561915.985558 1 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. 
See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 -I0000 00:00:1745561915.993514 1 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 -I0000 00:00:1745561915.993799 1 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 -Num GPUs Available: 1 -Name: /physical_device:GPU:0 Type: GPU -``` - -## AMD Radeon 9070 XT ON WSL2 +--- -First, install ROCm (https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/wsl/install-radeon.html) +## AMD Radeon GPU Example -Then define DOCKER_COMPUTE_ENVIRONMENTS with +Install ROCm (https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/wsl/install-radeon.html). 
```json [ { "socketPath": "/var/run/docker.sock", + "resources": [ { - "id": "myGPU", - "description": "AMD Radeon RX 9070 XT", + "id": "gpu0", + "kind": "discrete", "type": "gpu", "total": 1, + "description": "AMD Radeon RX 9070 XT", + "driverVersion": "26.2.2", + "memoryTotal": "16384 MiB", "init": { "advanced": { "IpcMode": "host", @@ -261,202 +193,65 @@ Then define DOCKER_COMPUTE_ENVIRONMENTS with "/usr/lib/wsl/lib/libdxcore.so:/usr/lib/libdxcore.so", "/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1" ], - "SecurityOpt": { - "seccomp": "unconfined" - } + "SecurityOpt": { "seccomp": "unconfined" } } }, - "driverVersion": "26.2.2", - "memoryTotal": "16384 MiB" - }, - { - "id": "disk", - "total": 1 + "constraints": [ + { "id": "ram", "min": 4 }, + { "id": "cpu", "min": 2 } + ] } ], - "storageExpiry": 604800, - "maxJobDuration": 3600, - "minJobDuration": 60, - "fees": { - "1": [ - { - "feeToken": "0x123", - "prices": [ - { - "id": "cpu", - "price": 1 - }, + + "environments": [ + { + "id": "amd-gpu-env", + "description": "AMD Radeon GPU environment", + "storageExpiry": 604800, + "maxJobDuration": 3600, + "minJobDuration": 60, + "enableNetwork": false, + "resources": [ + { "id": "cpu", "min": 1, "max": 4 }, + { "id": "ram", "min": 1, "max": 16 }, + { "id": "disk", "min": 1, "max": 50 }, + { "id": "gpu0" } + ], + "access": { "addresses": [], "accessLists": null }, + "fees": { + "1": [ { - "id": "nyGPU", - "price": 3 + "feeToken": "0x967da4048cD07aB37855c090aAF366e4ce1b9F48", + "prices": [ + { "id": "cpu", "price": 1 }, + { "id": "gpu0", "price": 3 } + ] } ] - } - ] - }, - "free": { - "maxJobDuration": 60, - "minJobDuration": 10, - "maxJobs": 3, - "resources": [ - { - "id": "cpu", - "max": 1 - }, - { - "id": "ram", - "max": 1 }, - { - "id": "disk", - "max": 1 - }, - { - "id": "myGPU", - "max": 1 - } - ] - } - } -] -``` - -aka - -```bash -export 
DOCKER_COMPUTE_ENVIRONMENTS='[{"socketPath":"/var/run/docker.sock","resources":[{"id":"myGPU","description":"AMD Radeon RX 9070 XT","type":"gpu","total":1,"init":{"advanced":{"IpcMode":"host","ShmSize":8589934592,"CapAdd":["SYS_PTRACE"],"Devices":["/dev/dxg","/dev/dri/card0"],"Binds":["/usr/lib/wsl/lib/libdxcore.so:/usr/lib/libdxcore.so","/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1"],"SecurityOpt":{"seccomp":"unconfined"}}},"driverVersion":"26.2.2","memoryTotal":"16384 MiB"},{"id":"disk","total":1}],"storageExpiry":604800,"maxJobDuration":3600,"minJobDuration":60,"fees":{"1":[{"feeToken":"0x123","prices":[{"id":"cpu","price":1},{"id":"nyGPU","price":3}]}]},"free":{"maxJobDuration":60,"minJobDuration":10,"maxJobs":3,"resources":[{"id":"cpu","max":1},{"id":"ram","max":1},{"id":"disk","max":1},{"id":"myGPU","max":1}]}}]' -``` - -you should have it in your compute envs: - -```bash -root@gpu-1:/repos/ocean/ocean-node# curl http://localhost:8000/api/services/computeEnvironments -``` - -```json -[ - { - "id": "0xbb5773e734e1b188165dac88d9a3dc8ac28bc9f5624b45fa8bbd8fca043de7c1-0x2c2761f938cf186eeb81f71dee06ad7edb299493e39c316c390d0c0691e6585c", - "runningJobs": 0, - "consumerAddress": "0x00", - "platform": { - "architecture": "x86_64", - "os": "Ubuntu 24.04.2 LTS" - }, - "fees": { - "1": [ - { - "feeToken": "0x123", - "prices": [ - { - "id": "cpu", - "price": 1 - }, - { - "id": "nyGPU", - "price": 3 - } + "free": { + "maxJobDuration": 60, + "minJobDuration": 10, + "maxJobs": 3, + "access": { "addresses": [], "accessLists": null }, + "resources": [ + { "id": "cpu", "max": 1 }, + { "id": "ram", "max": 4 }, + { "id": "disk", "max": 5 }, + { "id": "gpu0" } ] } - ] - }, - "storageExpiry": 604800, - "maxJobDuration": 3600, - "minJobDuration": 60, - "resources": [ - { - "id": "cpu", - "total": 16, - "max": 16, - "min": 1, - "inUse": 0 - }, - { - "id": "ram", - "total": 31, - "max": 31, - "min": 1, - "inUse": 0 - }, - { - "id": "myGPU", - "description": 
"AMD Radeon RX 9070 XT", - "type": "gpu", - "total": 1, - "init": { - "advanced": { - "IpcMode": "host", - "CapAdd": ["CAP_SYS_PTRACE"], - "Devices": ["/dev/dxg", "/dev/dri/card0"], - "Binds": [ - "/usr/lib/wsl/lib/libdxcore.so:/usr/lib/libdxcore.so", - "/opt/rocm/lib/libhsa-runtime64.so.1:/opt/rocm/lib/libhsa-runtime64.so.1" - ], - "SecurityOpt": { - "seccomp": "unconfined" - } - } - }, - "driverVersion": "26.2.2", - "memoryTotal": "16384 MiB", - "max": 1, - "min": 0, - "inUse": 0 - }, - { - "id": "disk", - "total": 10, - "max": 10, - "min": 0, - "inUse": 0 } - ], - "free": { - "maxJobDuration": 60, - "minJobDuration": 10, - "maxJobs": 3, - "resources": [ - { - "id": "cpu", - "max": 1, - "inUse": 0 - }, - { - "id": "ram", - "max": 1, - "inUse": 0 - }, - { - "id": "disk", - "max": 1, - "inUse": 0 - }, - { - "id": "myGPU", - "max": 1, - "inUse": 0 - } - ] - }, - "runningfreeJobs": 0 + ] } ] ``` -Start a free job with +Start a free job: ```json { "command": "freeStartCompute", - "datasets": [ - { - "fileObject": { - "type": "url", - "url": "https://raw.githubusercontent.com/oceanprotocol/test-algorithm/master/javascript/algo.js", - "method": "get" - } - } - ], "algorithm": { "meta": { "container": { @@ -464,47 +259,25 @@ Start a free job with "tag": "rocm6.4-py3.12-tf2.18-dev", "entrypoint": "python $ALGO" }, - "rawcode": "import tensorflow as tf\nsess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))\nprint(\"Num GPUs Available: \", len(tf.config.list_physical_devices('GPU')))\ngpus = tf.config.list_physical_devices('GPU')\nfor gpu in gpus:\n\tprint('Name:', gpu.name, ' Type:', gpu.device_type)" + "rawcode": "import tensorflow as tf\nprint('Num GPUs Available:', len(tf.config.list_physical_devices('GPU')))" } }, "consumerAddress": "0x00", "signature": "123", "nonce": 1, - "environment": "0xbb5773e734e1b188165dac88d9a3dc8ac28bc9f5624b45fa8bbd8fca043de7c1-0x2c2761f938cf186eeb81f71dee06ad7edb299493e39c316c390d0c0691e6585c", + 
"environment": "", "resources": [ - { - "id": "cpu", - "amount": 1 - }, - { - "id": "myGPU", - "amount": 1 - } + { "id": "cpu", "amount": 1 }, + { "id": "gpu0", "amount": 1 } ] } ``` -and get the results - -```bash -2025-04-25 15:16:15.218050: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. -To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. -WARNING: All log messages before absl::InitializeLog() is called are written to STDERR -I0000 00:00:1745594260.720023 1 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2874 MB memory: -> device: 0, name: AMD Radeon RX 9070 XT, pci bus id: 0000:0d:00.0 -2025-04-25 15:17:44.018225: I tensorflow/core/common_runtime/direct_session.cc:378] Device mapping: -/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: AMD Radeon RX 9070 XT, pci bus id: 0000:0d:00.0 - -Num GPUs Available: 1 -Name: /physical_device:GPU:0 Type: GPU -Warning: Resource leak detected by SharedSignalPool, 385 Signals leaked. -pid:1 tid:0x7f4476ac1740 [~VaMgr] frag_map_ size is not 1. -``` +--- ## Intel Arc GPU Example -First, install Intel GPU drivers (https://dgpu-docs.intel.com/driver/installation.html), then install Intel container toolkit (https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/cmd/gpu_plugin) - -Once that is done, check if you can get gpu details by running `clinfo`: +Install Intel GPU drivers (https://dgpu-docs.intel.com/driver/installation.html). ```bash root@gpu-1:/repos/ocean/ocean-node# clinfo @@ -518,56 +291,22 @@ Number of platforms: 1 Device Topology (NV12): PCI[ B#3 D#0 F#0 ] Max compute units: 32 Max clock frequency: 2400 MHz - Device extensions: cl_khr_fp64 cl_khr_fp16 cl_intel_subgroups ... 
``` -Now, get the device UUID: - -```bash -root@gpu-1:/repos/ocean/ocean-node# lspci -D | grep VGA -0000:03:00.0 VGA compatible controller: Intel Corporation Arc Graphics -``` - -For container runtime, Intel Arc GPUs use `/dev/dri/renderD128` or similar: - -```bash -root@gpu-1:/repos/ocean/ocean-node# ls -la /dev/dri/ -crw-rw---- 1 root render 226, 0 Apr 25 10:00 card0 -crw-rw---- 1 root render 226, 128 Apr 25 10:00 renderD128 -``` - -Now, we can define the GPU for the node: - -```json -{ - "id": "intelGPU", - "description": "Intel Arc A770M Graphics", - "type": "gpu", - "total": 1, - "init": { - "advanced": { - "Devices": ["/dev/dri/renderD128", "/dev/dri/card0"], - "GroupAdd": ["video", "render"], - "CapAdd": ["SYS_ADMIN"] - } - }, - "driverVersion": "32.0.101.8531", - "memoryTotal": "16384 MiB" -} -``` - -Here is the full definition of DOCKER_COMPUTE_ENVIRONMENTS with Intel GPU: - ```json [ { "socketPath": "/var/run/docker.sock", + "resources": [ { - "id": "intelGPU", - "description": "Intel Arc A770M Graphics", + "id": "gpu0", + "kind": "discrete", "type": "gpu", "total": 1, + "description": "Intel Arc A770M Graphics", + "driverVersion": "32.0.101.8531", + "memoryTotal": "16384 MiB", "init": { "advanced": { "Devices": ["/dev/dri/renderD128", "/dev/dri/card0"], @@ -575,156 +314,249 @@ Here is the full definition of DOCKER_COMPUTE_ENVIRONMENTS with Intel GPU: "CapAdd": ["SYS_ADMIN"] } }, - "driverVersion": "32.0.101.8531", - "memoryTotal": "16384 MiB" - }, - { "id": "disk", "total": 1 } + "constraints": [ + { "id": "ram", "min": 2 }, + { "id": "cpu", "min": 1 } + ] + } ], - "storageExpiry": 604800, - "maxJobDuration": 3600, - "minJobDuration": 60, - "fees": { - "1": [ - { - "feeToken": "0x123", - "prices": [ - { "id": "cpu", "price": 1 }, - { "id": "intelGPU", "price": 2 } + + "environments": [ + { + "id": "intel-gpu-env", + "description": "Intel Arc GPU environment", + "storageExpiry": 604800, + "maxJobDuration": 3600, + "minJobDuration": 60, + "enableNetwork": 
false, + "resources": [ + { "id": "cpu", "min": 1, "max": 4 }, + { "id": "ram", "min": 1, "max": 8 }, + { "id": "disk", "min": 1, "max": 50 }, + { "id": "gpu0" } + ], + "access": { "addresses": [], "accessLists": null }, + "fees": { + "1": [ + { + "feeToken": "0x967da4048cD07aB37855c090aAF366e4ce1b9F48", + "prices": [ + { "id": "cpu", "price": 1 }, + { "id": "gpu0", "price": 2 } + ] + } + ] + }, + "free": { + "maxJobDuration": 60, + "minJobDuration": 10, + "maxJobs": 3, + "access": { "addresses": [], "accessLists": null }, + "resources": [ + { "id": "cpu", "max": 1 }, + { "id": "ram", "max": 2 }, + { "id": "disk", "max": 5 }, + { "id": "gpu0" } ] } - ] - }, - "free": { - "maxJobDuration": 60, - "minJobDuration": 10, - "maxJobs": 3, - "resources": [ - { "id": "cpu", "max": 1 }, - { "id": "ram", "max": 1 }, - { "id": "disk", "max": 1 }, - { "id": "intelGPU", "max": 1 } - ] - } + } + ] } ] ``` -Verify you have it in your compute environments: +--- -```bash -root@gpu-1:/repos/ocean/ocean-node# curl http://localhost:8000/api/services/computeEnvironments -``` +## Multiple GPUs — Shared Between Environments + +Each physical GPU is its own resource. Both environments can reference both GPUs; the engine tracks usage globally so no GPU is ever double-allocated. 
```json [ { - "id": "0xaa1234567890abcdef1234567890abcdef1234567890abcdef1234567890ab-0xbb0987654321fedcba0987654321fedcba0987654321fedcba0987654321fed", - "runningJobs": 0, - "consumerAddress": "0x00", - "platform": { "architecture": "x86_64", "os": "Ubuntu 22.04.3 LTS" }, - "fees": { - "1": [ - { - "feeToken": "0x123", - "prices": [ - { "id": "cpu", "price": 1 }, - { "id": "intelGPU", "price": 2 } - ] - } - ] - }, - "storageExpiry": 604800, - "maxJobDuration": 3600, - "minJobDuration": 60, + "socketPath": "/var/run/docker.sock", + "resources": [ - { "id": "cpu", "total": 16, "max": 16, "min": 1, "inUse": 0 }, { - "id": "ram", - "total": 32, - "max": 32, - "min": 1, - "inUse": 0 + "id": "gpu0", + "kind": "discrete", + "type": "gpu", + "total": 1, + "description": "NVIDIA A100 40GB (slot 0)", + "platform": "nvidia", + "driverVersion": "570.195.03", + "memoryTotal": "40960 MiB", + "init": { + "deviceRequests": { + "Driver": "nvidia", + "DeviceIDs": ["GPU-uuid-a"], + "Capabilities": [["gpu"]] + } + }, + "constraints": [ + { "id": "ram", "min": 8 }, + { "id": "cpu", "min": 2 } + ] }, { - "id": "intelGPU", - "description": "Intel Arc A770M Graphics", + "id": "gpu1", + "kind": "discrete", "type": "gpu", "total": 1, + "description": "NVIDIA A100 40GB (slot 1)", + "platform": "nvidia", + "driverVersion": "570.195.03", + "memoryTotal": "40960 MiB", "init": { - "advanced": { - "Devices": ["/dev/dri/renderD128", "/dev/dri/card0"], - "GroupAdd": ["video", "render"], - "CapAdd": ["SYS_ADMIN"] + "deviceRequests": { + "Driver": "nvidia", + "DeviceIDs": ["GPU-uuid-b"], + "Capabilities": [["gpu"]] } }, - "driverVersion": "32.0.101.8531", - "memoryTotal": "16384 MiB" - "max": 1, - "min": 0, - "inUse": 0 - }, - { "id": "disk", "total": 1, "max": 1, "min": 0, "inUse": 0 } + "constraints": [ + { "id": "ram", "min": 8 }, + { "id": "cpu", "min": 2 } + ] + } ], - "free": { - "maxJobDuration": 60, - "minJobDuration": 10, - "maxJobs": 3, - "resources": [ - { "id": "cpu", "max": 1, "inUse": 
0 }, - { "id": "ram", "max": 1, "inUse": 0 }, - { "id": "disk", "max": 1, "inUse": 0 }, - { "id": "intelGPU", "max": 1, "inUse": 0 } - ] - }, - "runningfreeJobs": 0 + + "environments": [ + { + "id": "premium", + "description": "Full GPU access", + "storageExpiry": 604800, + "maxJobDuration": 3600, + "minJobDuration": 60, + "enableNetwork": true, + "resources": [ + { "id": "cpu", "total": 16, "min": 1, "max": 8 }, + { "id": "ram", "total": 60, "min": 1, "max": 32 }, + { "id": "disk", "total": 200, "min": 1, "max": 100 }, + { "id": "gpu0" }, + { "id": "gpu1" } + ], + "access": { "addresses": [], "accessLists": null }, + "fees": { + "1": [ + { + "feeToken": "0x967da4048cD07aB37855c090aAF366e4ce1b9F48", + "prices": [ + { "id": "cpu", "price": 1 }, + { "id": "ram", "price": 0.5 }, + { "id": "gpu0", "price": 10 }, + { "id": "gpu1", "price": 10 } + ] + } + ] + } + }, + { + "id": "standard", + "description": "CPU only", + "storageExpiry": 604800, + "maxJobDuration": 1800, + "minJobDuration": 60, + "enableNetwork": false, + "resources": [ + { "id": "cpu", "total": 8, "min": 1, "max": 4 }, + { "id": "ram", "total": 16, "min": 1, "max": 8 }, + { "id": "disk", "total": 50, "min": 1, "max": 50 } + ], + "access": { "addresses": [], "accessLists": null }, + "fees": { + "1": [ + { + "feeToken": "0x967da4048cD07aB37855c090aAF366e4ce1b9F48", + "prices": [ + { "id": "cpu", "price": 0.5 }, + { "id": "ram", "price": 0.2 } + ] + } + ] + }, + "free": { + "maxJobDuration": 300, + "minJobDuration": 10, + "maxJobs": 3, + "access": { "addresses": [], "accessLists": null }, + "resources": [ + { "id": "cpu", "max": 1 }, + { "id": "ram", "max": 2 }, + { "id": "disk", "max": 5 } + ] + } + } + ] } ] ``` -Start a free job using Intel GPU with: +--- + +## Shareable Devices (NIC, TPM, HSM) + +Use `kind: "discrete"` and `shareable: true` for devices that multiple jobs may use simultaneously. The engine tracks `inUse` for visibility but never blocks allocation. 
```json { - "command": "freeStartCompute", - "algorithm": { - "meta": { - "container": { - "image": "intel/oneapi-runtime", - "tag": "2024.0-devel-ubuntu22.04", - "entrypoint": "python $ALGO" - }, - "rawcode": "import os\nprint('GPU device available:')\nos.system('clinfo')" - } - }, - "consumerAddress": "0x00", - "signature": "123", - "nonce": 1, - "environment": "0xaa1234567890abcdef1234567890abcdef1234567890abcdef1234567890ab-0xbb0987654321fedcba0987654321fedcba0987654321fedcba0987654321fed", - "resources": [ - { - "id": "cpu", - "amount": 1 - }, - { - "id": "intelGPU", - "amount": 1 + "id": "nic0", + "kind": "discrete", + "shareable": true, + "type": "network", + "total": 1, + "description": "SR-IOV NIC", + "init": { + "advanced": { + "Devices": [{ "PathOnHost": "/dev/net/tun", "PathInContainer": "/dev/net/tun" }] } + } +} +``` + +> `shareable: true` is **not** allowed on `type: "gpu"` or `type: "fpga"` — the node will refuse to start. GPUs and FPGAs require exclusive per-job access. + +--- + +## Resource Constraints + +Constraints on a GPU resource define minimum companion resources required per job. When a user requests the GPU, the engine automatically allocates at least the constrained amounts. This also prevents the GPU from being scheduled when fungible resources (RAM, CPU) are exhausted. 
+ +```json +{ + "id": "gpu0", + "kind": "discrete", + "constraints": [ + { "id": "ram", "min": 8 }, + { "id": "cpu", "min": 2 }, + { "id": "disk", "min": 10 } ] } ``` -And the output of `getComputeResult` should look like: +Environments can override pool-level constraints via the `EnvironmentResourceRef`: -```bash -Number of platforms: 1 - Platform #0: Intel(R) OpenCL Graphics - Number of devices: 1 - Device #0: Intel(R) Arc(TM) A770M Graphics - Board name: Intel Arc Graphics - Vendor ID: 0x8086 - Device ID: 0x56a0 - Device Topology (NV12): PCI[ B#3 D#0 F#0 ] - Max compute units: 32 - Max clock frequency: 2400 MHz - Device extensions: cl_khr_fp64 cl_khr_fp16 cl_intel_subgroups ... +```json +{ "id": "gpu0", "constraints": [{ "id": "ram", "min": 16 }, { "id": "cpu", "min": 4 }] } +``` + +Set `"constraints": []` to remove all constraints for a specific environment. + +--- + +## Migration from old format + +The old format placed hardware resources (`init`, `driverVersion`, etc.) inside environments. This is now a startup error. + +**Old (rejected):** +```json +"environments": [{ "resources": [{ "id": "myGPU", "total": 1, "init": {...} }] }] ``` + +**New:** +```json +"resources": [{ "id": "myGPU", "kind": "discrete", "total": 1, "init": {...} }], +"environments": [{ "resources": [{ "id": "myGPU" }] }] +``` + +Move all `init`, `description`, `driverVersion`, `platform`, `memoryTotal`, `type`, `kind`, and `constraints` fields to the connection-level `resources` array. Each environment's `resources` keeps only `id` and optionally `total`/`min`/`max`/`constraints`. diff --git a/docs/compute-pricing.md b/docs/compute-pricing.md index c6598e760..fc070654e 100644 --- a/docs/compute-pricing.md +++ b/docs/compute-pricing.md @@ -1,15 +1,17 @@ # Compute Environment Configuration and Pricing -This guide explains how to configure your node’s Docker compute environments and how to set prices for each resource. 
It covers the `DOCKER_COMPUTE_ENVIRONMENTS` variable (or equivalent config), the fee structure, pricing units, and examples for CPU, RAM, disk, and GPU. +This guide explains how to configure your node's Docker compute environments and how to set prices for each resource. It covers the `DOCKER_COMPUTE_ENVIRONMENTS` variable (or equivalent config), the fee structure, pricing units, and examples for CPU, RAM, disk, and GPU. ## Overview - **Configuration**: Define compute environments via the `DOCKER_COMPUTE_ENVIRONMENTS` environment variable (JSON) or via `config.json` under `dockerComputeEnvironments`. -- **Environment**: Is a group of resources, payment and accesslists. -- **Resources**: Each environment declares resources (e.g. `cpu`, `ram`, `disk`, and optionally GPUs). You must declare a `disk` resource. +- **Two-level layout**: Resources are defined at the **Docker-connection level** (`socketPath`) and referenced by each environment. This lets multiple environments share the same hardware (e.g. both a paid and a free environment can use the same GPU). +- **Auto-detection**: `cpu` and `ram` are automatically detected from the host at startup. `disk` is measured via `statfs`. You only need to declare them in `resources` if you want to cap/override the detected value. +- **Resources**: The connection-level `resources` array holds full hardware definitions. Each environment's `resources` array holds lightweight refs (`{ id, total?, min?, max? }`) pointing to those pool entries. +- **Dual-gate tracking** (fungible resources like CPU/RAM/disk): Gate 1 enforces the per-environment ceiling; Gate 2 enforces the engine-wide physical pool ceiling. Both must pass for a job to be admitted. - **Pricing**: For each chain and fee token, you set a `price` per resource. Cost is computed as **price × amount × duration (in minutes, rounded up)**. 
-- **Free**: Environments which does not require a payment for the resources, but most likley are very limited in terms of resources available and job duration. -- **Image building**: **Free jobs cannot build images** (Dockerfiles are not allowed). For **paid jobs**, **image build time counts toward billable duration** and also consumes the job’s `maxJobDuration`. +- **Free tier**: Environments can have a `free` block that permits jobs with no payment, but with tighter resource limits. +- **Image building**: Free jobs cannot build images (Dockerfiles are not allowed). For paid jobs, image build time counts toward billable duration. ## Pricing Units @@ -20,11 +22,6 @@ This guide explains how to configure your node’s Docker compute environments a | **Disk** | Gigabytes (GB) | Per GB per minute | `price × diskGB × ceil(duration/60)` | | **GPU** | Number of GPUs | Per GPU per minute | `price × gpus × ceil(duration/60)` | -So: - -- **CPU and GPU**: price is **per resource per minute** (e.g. 2 CPUs at price 1 for 90 minutes → 2 × 1 × 90 = 180). -- **Memory (RAM) and storage (disk)**: price is **per minute per gigabyte** (e.g. 4 GB RAM at price 0.5 for 60 minutes → 0.5 × 4 × 60 = 120). - Duration is always in seconds; it is converted to minutes with **ceil(duration / 60)** (e.g. 61 seconds → 2 minutes). --- @@ -32,33 +29,70 @@ Duration is always in seconds; it is converted to minutes with **ceil(duration / ## Where to Configure 1. **Environment variable** - Set `DOCKER_COMPUTE_ENVIRONMENTS` to a JSON string (array of compute environment objects). - Example: + Set `DOCKER_COMPUTE_ENVIRONMENTS` to a JSON string (array of Docker-connection objects). `export DOCKER_COMPUTE_ENVIRONMENTS='[{"socketPath":"/var/run/docker.sock",...}]'` 2. **Config file** Put the same array in your JSON config under the key `dockerComputeEnvironments`, and point the node to that file (e.g. via `CONFIG_PATH`). -If both are set, the environment variable typically overrides the config. 
See [Environmental Variables](env.md) and `ENVIRONMENT_VARIABLES` in `src/utils/constants.ts`. +If both are set, the environment variable overrides the config. See [env.md](env.md) for all available fields. --- -## Environment Structure (Summary) +## Configuration Layout -Each element of `DOCKER_COMPUTE_ENVIRONMENTS` is an object with at least: +``` +DOCKER_COMPUTE_ENVIRONMENTS +└── [ Docker connection ] ← socketPath, resources[], environments[] + ├── resources[] ← full hardware definitions (CPU, RAM, disk, GPU, …) + │ ├── { id: "cpu", total: 6 } (optional: caps auto-detected value) + │ ├── { id: "ram", total: 28 } (optional: caps auto-detected value) + │ ├── { id: "disk", total: 80 } (optional: caps auto-detected value) + │ └── { id: "gpu0", kind: "discrete", … } (required for custom hardware) + └── environments[] ← one or more compute environments + └── { id, fees, resources[], free? } + └── resources[] ← lightweight refs to the pool above + ├── { id: "cpu", total: 4, min: 1, max: 4 } + ├── { id: "ram", total: 16, min: 1, max: 8 } + ├── { id: "disk", max: 20 } + └── { id: "gpu0" } +``` -- **socketPath**: Docker socket (e.g. `"/var/run/docker.sock"`). -- **resources**: List of resources (see below). Must include `disk`. -- **storageExpiry**, **maxJobDuration**, **minJobDuration**: Required (seconds). -- **fees**: Per-chain, per-token pricing (see next section). -- **access** (optional): Who can run paid jobs (`addresses`, `accessLists`). -- **free** (optional): Limits and access for free jobs. +`cpu`, `ram`, and `disk` are **auto-detected** at startup — you do not need to declare them in the connection-level `resources` array. Include them only to cap the detected value (e.g. limit an 8-core host to 6 cores for compute). Custom hardware (GPUs, NICs) must always be declared. -### Resources +--- -- **cpu**, **ram**, **disk**: Standard resources. `disk` is mandatory. - **Disk** and **RAM** are in **GB** (e.g. `"total": 10` = 10 GB). 
-- **GPU**: Add a resource with `"type": "gpu"` and either `deviceRequests` (NVIDIA) or `advanced` (AMD/Intel). See [GPU.md](GPU.md) for full examples. +## Connection-level Resource Fields + +These fields go in the `resources` array at the Docker-connection level: + +| Field | Description | +|---|---| +| `id` | Unique identifier used in env refs and `fees.prices[].id` (e.g. `"cpu"`, `"gpu0"`) | +| `total` | Maximum units available in the pool. For `cpu`/`ram`/`disk`, caps the auto-detected value. | +| `kind` | `"fungible"` (CPU, RAM, disk — interchangeable units) or `"discrete"` (GPU, FPGA — named device). Auto-inferred if omitted: `"discrete"` when `init` is present, `"fungible"` otherwise. | +| `shareable` | `discrete` only. `true` → multiple jobs may use the device simultaneously (e.g. NIC, TPM). `false` (default) → exclusive per job (GPU, FPGA). | +| `min` | Minimum units per job request | +| `max` | Maximum units per job request (defaults to `total`) | +| `type` | Hint string: `"cpu"`, `"ram"`, `"disk"`, `"gpu"` | +| `description` | Human-readable label shown in `getComputeEnvironments` | +| `driverVersion` | GPU driver version string | +| `memoryTotal` | GPU VRAM string (e.g. `"40960 MiB"`) | +| `platform` | GPU vendor: `"nvidia"`, `"amd"`, `"intel"` | +| `init` | Docker container configuration (`deviceRequests` for NVIDIA, `advanced` for AMD/Intel). Makes `kind` default to `"discrete"`. | +| `constraints` | `[{ id, min?, max? }]` — companion resource requirements. When a job rents this resource, linked resources are auto-bumped to their minimums. | + +## Environment-level Resource Ref Fields + +These fields go in each environment's `resources` array (lightweight refs to the pool): + +| Field | Description | +|---|---| +| `id` | Must match a connection-level resource `id`. `cpu`, `ram`, `disk` are always valid (auto-detected). | +| `total` | Environment aggregate ceiling: maximum units all jobs in this environment can use simultaneously. 
Omit to default to the pool total. | +| `min` | Per-job minimum override for this environment | +| `max` | Per-job maximum override (capped to `total` if both are set) | +| `constraints` | Per-env override: replaces the pool resource's constraints entirely for this environment. Omit to inherit pool constraints. Set `[]` to remove all constraints for this env. | --- @@ -75,7 +109,7 @@ Each element of `DOCKER_COMPUTE_ENVIRONMENTS` is an object with at least: { "id": "cpu", "price": 1 }, { "id": "ram", "price": 0.5 }, { "id": "disk", "price": 0.2 }, - { "id": "myGPU", "price": 3 } + { "id": "gpu0", "price": 3 } ] } ] @@ -84,9 +118,7 @@ Each element of `DOCKER_COMPUTE_ENVIRONMENTS` is an object with at least: - **feeToken**: Token contract address used for payment on that chain. - **prices**: List of `{ "id": "<resourceId>", "price": <number> }`. - Only resources listed here are billable; omit a resource to offer it without charge (e.g. for free tier only). - -**Important**: The `id` in `prices` must match the resource `id` in `resources` and in `free.resources` (e.g. if the GPU resource is `"id": "myGPU"`, use `"id": "myGPU"` in `prices`, not `"nyGPU"`). + The `id` in `prices` must match a connection-level resource `id`. Only resources listed here are billable; omit a resource to offer it at no charge. --- @@ -111,44 +143,53 @@ Job: 2 CPUs, 4 GB RAM, 10 GB disk, 1 GPU, duration **125 seconds** (ceil = 3 min ## Example 1: CPU, RAM, and Disk with Prices +`cpu` and `ram` are auto-detected, so they don't need to appear in `resources`. Only `disk` is declared here to cap it at 10 GB. 
+ ```json [ { "socketPath": "/var/run/docker.sock", - "resources": [{ "id": "disk", "total": 10 }], - "storageExpiry": 604800, - "maxJobDuration": 3600, - "minJobDuration": 60, - "fees": { - "1": [ - { - "feeToken": "0x967da4048cD07aB37855c090aAF366e4ce1b9F48", - "prices": [ - { "id": "cpu", "price": 1 }, - { "id": "ram", "price": 0.5 }, - { "id": "disk", "price": 0.2 } + "resources": [ + { "id": "disk", "total": 10 } + ], + "environments": [ + { + "storageExpiry": 604800, + "maxJobDuration": 3600, + "minJobDuration": 60, + "resources": [ + { "id": "cpu" }, + { "id": "ram" }, + { "id": "disk", "max": 10 } + ], + "fees": { + "1": [ + { + "feeToken": "0x967da4048cD07aB37855c090aAF366e4ce1b9F48", + "prices": [ + { "id": "cpu", "price": 1 }, + { "id": "ram", "price": 0.5 }, + { "id": "disk", "price": 0.2 } + ] + } + ] + }, + "free": { + "maxJobDuration": 60, + "minJobDuration": 10, + "maxJobs": 3, + "resources": [ + { "id": "cpu", "max": 1 }, + { "id": "ram", "max": 1 }, + { "id": "disk", "max": 1 } ] } - ] - }, - "free": { - "maxJobDuration": 60, - "minJobDuration": 10, - "maxJobs": 3, - "resources": [ - { "id": "cpu", "max": 1 }, - { "id": "ram", "max": 1 }, - { "id": "disk", "max": 1 } - ] - } + } + ] } ] ``` -- **CPU**: 1 unit per CPU per minute. -- **RAM**: 0.5 units per GB per minute. -- **Disk**: 0.2 units per GB per minute. - --- ## Example 2: CPU + NVIDIA GPU @@ -159,18 +200,21 @@ Get the GPU UUID: nvidia-smi --query-gpu=name,uuid --format=csv ``` -Then define one GPU and set a price per GPU per minute (e.g. 3): +Define the GPU at the **connection level** with `kind: "discrete"` and a single `DeviceID`. Each physical GPU is its own resource entry. The environment references it by `id`. 
```json [ { "socketPath": "/var/run/docker.sock", "resources": [ + { "id": "disk", "total": 10 }, { - "id": "myGPU", - "description": "NVIDIA GeForce GTX 1060 3GB", + "id": "gpu0", + "kind": "discrete", "type": "gpu", "total": 1, + "description": "NVIDIA GeForce GTX 1060 3GB", + "platform": "nvidia", "init": { "deviceRequests": { "Driver": "nvidia", @@ -178,39 +222,47 @@ Then define one GPU and set a price per GPU per minute (e.g. 3): "Capabilities": [["gpu"]] } } - }, - { "id": "disk", "total": 1 } + } ], - "storageExpiry": 604800, - "maxJobDuration": 3600, - "minJobDuration": 60, - "fees": { - "1": [ - { - "feeToken": "0x123", - "prices": [ - { "id": "cpu", "price": 1 }, - { "id": "myGPU", "price": 3 } + "environments": [ + { + "storageExpiry": 604800, + "maxJobDuration": 3600, + "minJobDuration": 60, + "resources": [ + { "id": "cpu" }, + { "id": "ram" }, + { "id": "disk", "max": 10 }, + { "id": "gpu0" } + ], + "fees": { + "1": [ + { + "feeToken": "0x123", + "prices": [ + { "id": "cpu", "price": 1 }, + { "id": "gpu0", "price": 3 } + ] + } + ] + }, + "free": { + "maxJobDuration": 60, + "minJobDuration": 10, + "maxJobs": 3, + "resources": [ + { "id": "cpu", "max": 1 }, + { "id": "ram", "max": 1 }, + { "id": "disk", "max": 1 } ] } - ] - }, - "free": { - "maxJobDuration": 60, - "minJobDuration": 10, - "maxJobs": 3, - "resources": [ - { "id": "cpu", "max": 1 }, - { "id": "ram", "max": 1 }, - { "id": "disk", "max": 1 }, - { "id": "myGPU", "max": 1 } - ] - } + } + ] } ] ``` -Ensure the fee `id` matches the resource `id` (`myGPU`). Price 3 = 3 units per GPU per minute. +The `id` in `fees.prices` (`"gpu0"`) must match the connection-level resource `id`. Price 3 = 3 units per GPU per minute. 
--- @@ -224,16 +276,16 @@ You can support several chains and multiple fee tokens per chain: { "feeToken": "0xTokenA", "prices": [ - { "id": "cpu", "price": 1 }, - { "id": "ram", "price": 0.5 }, + { "id": "cpu", "price": 1 }, + { "id": "ram", "price": 0.5 }, { "id": "disk", "price": 0.2 } ] }, { "feeToken": "0xTokenB", "prices": [ - { "id": "cpu", "price": 2 }, - { "id": "ram", "price": 1 }, + { "id": "cpu", "price": 2 }, + { "id": "ram", "price": 1 }, { "id": "disk", "price": 0.5 } ] } @@ -242,8 +294,8 @@ You can support several chains and multiple fee tokens per chain: { "feeToken": "0xPolygonToken", "prices": [ - { "id": "cpu", "price": 1 }, - { "id": "ram", "price": 0.5 }, + { "id": "cpu", "price": 1 }, + { "id": "ram", "price": 0.5 }, { "id": "disk", "price": 0.2 } ] } @@ -257,23 +309,36 @@ Consumers choose chain and token when starting a job; the node uses the matching ## Example 4: AMD or Intel GPU -For **AMD (e.g. ROCm on WSL2)** or **Intel Arc**, use a GPU resource with `init.advanced` instead of `deviceRequests`. Still set a **per-GPU per-minute** price in `fees.prices` (same formula: price × gpus × ceil(duration/60)). +For **AMD (e.g. ROCm)** or **Intel Arc**, define the GPU at the connection level with `init.advanced` instead of `deviceRequests`. The environment references it by `id` just like any other resource. -- **AMD Radeon (WSL2/ROCm)**: See [GPU.md – AMD Radeon 9070 XT](GPU.md#amd-radeon-9070-xt-on-wsl2) for `advanced` (Devices, Binds, CapAdd, etc.). Use a consistent `id` (e.g. `myGPU`) in `resources` and in `fees.prices`. -- **Intel Arc**: See [GPU.md – Intel Arc GPU](GPU.md#intel-arc-gpu-example) for `advanced` (Devices, GroupAdd, CapAdd). Again, use the same `id` in `fees.prices` (e.g. `intelGPU`). +- **AMD Radeon**: See [GPU.md – AMD Radeon](GPU.md#amd-radeon-example) for the `advanced` block (Devices, Binds, CapAdd, etc.). 
+- **Intel Arc**: See [GPU.md – Intel Arc GPU](GPU.md#intel-arc-gpu-example) for the `advanced` block (Devices, GroupAdd, CapAdd). In all cases, the pricing rule is: **price × amount × ceil(duration/60)** with amount = number of GPUs. --- +## Dual-gate Availability (Fungible Resources) + +For `cpu`, `ram`, and `disk`, two independent checks must pass before a job is admitted: + +- **Gate 1 (per-environment ceiling)**: `env.total - env.inUse >= requested`. Controlled by `EnvironmentResourceRef.total` in the environment's `resources` array. Prevents one environment from starving others. +- **Gate 2 (engine-wide pool ceiling)**: The sum of in-use across all environments must not exceed the pool's `total`. Enforces the physical hardware limit. + +For discrete resources (GPU), only Gate 2 applies — and only for exclusive (`shareable: false`, the default) devices. + +--- + ## Checklist - [ ] `DOCKER_COMPUTE_ENVIRONMENTS` (or `dockerComputeEnvironments` in config) is a JSON array. -- [ ] Every environment has a **disk** resource (and optionally cpu, ram, GPU). +- [ ] GPUs and other custom hardware are defined in the **connection-level** `resources` array with `kind: "discrete"`. +- [ ] Each environment's `resources` array contains lightweight refs (`{ id, total?, min?, max? }`). +- [ ] `cpu`, `ram`, `disk` are auto-detected — only declare them in `resources` to cap/override the detected value. - [ ] **Disk** and **RAM** amounts are in **GB**. -- [ ] **fees** has an entry per chain ID; each entry has `feeToken` and `prices` with `id` matching resource ids. +- [ ] `fees.prices[].id` matches a connection-level resource `id`. - [ ] **CPU / GPU**: price = per resource per minute. - [ ] **RAM / Disk**: price = per GB per minute. - [ ] For free tier, list the same resource ids in `free.resources`; omit from `prices` if they should be free only. -For GPU setup details (NVIDIA, AMD, Intel), see [GPU.md](GPU.md). For other env vars and config options, see [env.md](env.md). 
+For GPU setup details (NVIDIA, AMD, Intel), see [GPU.md](GPU.md). For all env vars and config options, see [env.md](env.md). diff --git a/docs/env.md b/docs/env.md index dd8761849..3c1405295 100644 --- a/docs/env.md +++ b/docs/env.md @@ -129,114 +129,134 @@ Environmental variables are also tracked in `ENVIRONMENT_VARIABLES` within `src/ - `C2D_DOWNLOAD_TIMEOUT`: Timeout (in seconds) for pulling the algorithm docker image during a C2D job. If the pull exceeds this timeout, the job fails with `PullImageFailed` instead of getting stuck. Defaults to `900` (15 minutes). Example: `900` -The `DOCKER_COMPUTE_ENVIRONMENTS` environment variable is used to configure Docker-based compute environments in Ocean Node. This guide will walk you through the options available for defining `DOCKER_COMPUTE_ENVIRONMENTS` and how to set it up correctly. For configuring compute environments and setting prices for each resource (including pricing units and examples), see [Compute pricing](compute-pricing.md). +The `DOCKER_COMPUTE_ENVIRONMENTS` environment variable is used to configure Docker-based compute environments in Ocean Node. For GPU setup and examples see [GPU Guide](GPU.md). For pricing configuration see [Compute pricing](compute-pricing.md). -Example Configuration -The `DOCKER_COMPUTE_ENVIRONMENTS` environment variable should be a JSON array of objects, where each object represents a Docker compute environment configuration. Below is an example configuration: +`cpu`, `ram`, and `disk` resources are **auto-detected** from the host at startup. All resource values are expressed in natural units: CPU in cores, RAM and disk in GB. -`Disk` and `Ram` resources are always expressed in GB. 
+The config has a two-level structure: +- **Connection level** (`C2DDockerConfig`): Docker connection details + optional hardware resource pool (GPUs, NICs, or overrides for auto-detected cpu/ram/disk) +- **Environment level** (`C2DEnvironmentConfig`): per-environment business rules (fees, access, durations) + lightweight resource refs ```json [ { "socketPath": "/var/run/docker.sock", - "scanImages": true, - "enableNetwork": false, + "scanImages": false, "imageRetentionDays": 7, "imageCleanupInterval": 86400, + "paymentClaimInterval": 3600, + "resources": [ - { - "id": "disk", - "total": 10 - } + { "id": "cpu", "total": 6 }, + { "id": "disk", "total": 50 } ], - "storageExpiry": 604800, - "maxJobDuration": 3600, - "minJobDuration": 60, - "access": { - "addresses": ["0x123", "0x456"], - "accessLists": [] - }, - "fees": { - "1": [ - { - "feeToken": "0x123", - "prices": [ + + "environments": [ + { + "id": "default", + "description": "CPU compute environment", + "storageExpiry": 604800, + "maxJobDuration": 3600, + "minJobDuration": 60, + "enableNetwork": false, + "access": { + "addresses": ["0x123", "0x456"], + "accessLists": [] + }, + "fees": { + "1": [ { - "id": "cpu", - "price": 1 + "feeToken": "0x967da4048cD07aB37855c090aAF366e4ce1b9F48", + "prices": [{ "id": "cpu", "price": 1 }] } ] - } - ] - }, - "free": { - "maxJobDuration": 60, - "minJobDuration": 10, - "maxJobs": 3, - "access": { - "addresses": [], - "accessLists": ["0x789"] - }, - "resources": [ - { - "id": "cpu", - "max": 1 }, - { - "id": "ram", - "max": 1 - }, - { - "id": "disk", - "max": 1 + "resources": [ + { "id": "cpu", "min": 1, "max": 4 }, + { "id": "ram", "min": 1, "max": 8 }, + { "id": "disk", "min": 1, "max": 50 } + ], + "free": { + "maxJobDuration": 60, + "minJobDuration": 10, + "maxJobs": 3, + "access": { "addresses": [], "accessLists": [] }, + "resources": [ + { "id": "cpu", "max": 1 }, + { "id": "ram", "max": 1 }, + { "id": "disk", "max": 1 } + ] } - ] - } + } + ] } ] ``` -#### Configuration 
Options - -- **socketPath**: Path to the Docker socket (e.g., docker.sock). -- **scanImages**: Whether Docker images should be scanned for vulnerabilities using Trivy. If enabled and critical vulnerabilities are found, the C2D job is rejected. -- **scanImageDBUpdateInterval**: How often to update the vulnerability database, in seconds. Default: 43200 (12 hours) -- **enableNetwork**: Whether networking is enabled for algorithm containers. Default: false -- **imageRetentionDays** - how long docker images are kept, in days. Default: 7 -- **imageCleanupInterval** - how often to run cleanup for docker images, in seconds. Min: 3600 (1hour), Default: 86400 (24 hours) -- **paymentClaimInterval** - how often to run payment claiming, in seconds. Default: 3600 (1 hour) -- **enableBenchmark** - when set to `true`, the node will auto-create a benchmark compute environment at startup using the system's available resources (CPU, RAM, disk, GPUs). Default: `false` -- **storageExpiry**: Amount of seconds for storage expiry.(Mandatory) -- **maxJobDuration**: Maximum duration in seconds for a job.(Mandatory) -- **minJobDuration**: Minimum duration in seconds for a job.(Mandatory) -- **access**: Access control configuration for paid compute jobs. If both `addresses` and `accessLists` are empty, all addresses are allowed. - - **addresses**: Array of Ethereum addresses allowed to run compute jobs. If empty and no access lists are configured, all addresses are allowed. - - **accessLists**: Array of AccessList contract addresses. Users holding NFTs from these contracts can run compute jobs. Checked across all supported networks. -- **fees**: Fee structure for the compute environment. - - **feeToken**: Token address for the fee. - - **prices**: Array of resource pricing information. - - **id**: Resource type (e.g., `cpu`, `ram`, `disk`). - - **price**: Price per unit of the resource. -- **resources**: Array of resources available in the compute environment. 
- - **id**: Resource type (e.g., `cpu`, `ram`, `disk`). - - **total**: Total number of the resource available. - - **min**: Minimum number of the resource needed for a job. - - **max**: Maximum number of the resource for a job. -- **free**: Optional configuration for free jobs. - - **storageExpiry**: Amount of seconds for storage expiry for free jobs. - - **maxJobDuration**: Maximum duration in seconds for a free job. - - **minJobDuration**: Minimum duration in seconds for a free job. - - **maxJobs**: Maximum number of simultaneous free jobs. - - **allowImageBuild**: If building images is allowed on free envs. Default: false - - **access**: Access control configuration for free compute jobs. Works the same as the main `access` field. - - **addresses**: Array of Ethereum addresses allowed to run free compute jobs. - - **accessLists**: Array of AccessList contract addresses for free compute access control. - - **resources**: Array of resources available for free jobs. - - **id**: Resource type (e.g., `cpu`, `ram`, `disk`). - - **total**: Total number of the resource available. - - **min**: Minimum number of the resource needed for a job. - - **max**: Maximum number of the resource for a job. +#### Connection-level fields + +- **socketPath** / **host** / **port** / **protocol** / **caPath** / **certPath** / **keyPath**: Docker connection settings. +- **scanImages**: Scan algorithm images for vulnerabilities with Trivy. Default: `false` +- **scanImageDBUpdateInterval**: Vulnerability DB update interval in seconds. Default: `43200` (12 hours) +- **imageRetentionDays**: How long to keep Docker images, in days. Default: `7` +- **imageCleanupInterval**: Image cleanup interval in seconds. Min: `3600`, Default: `86400` +- **paymentClaimInterval**: Payment claim interval in seconds. Min: `60`, Default: `3600` +- **resources** *(optional)*: Hardware resource pool for this connection. 
`cpu`, `ram`, and `disk` are auto-detected from the host — include them only to cap their totals or to add custom resources (GPUs, NICs, etc.). + - **id**: Resource identifier. `cpu`, `ram`, `disk` are built-in; any other string defines a custom resource. + - **kind**: `"discrete"` (non-fungible device, e.g. GPU) or `"fungible"` (interchangeable units, e.g. CPU). Auto-inferred: `"discrete"` if `init` is present, `"fungible"` otherwise. + - **shareable** *(discrete only)*: `true` allows multiple jobs to use the device simultaneously (NIC, TPM). Default: `false`. **Not allowed** on `type: "gpu"` or `type: "fpga"`. + - **total**: Total units available. Capped at the physical host limit. + - **min** / **max**: Per-job minimum/maximum. + - **description**, **platform**, **driverVersion**, **memoryTotal**: Informational metadata. + - **init**: Docker device configuration (`deviceRequests` for NVIDIA, `advanced` for AMD/Intel). See [GPU Guide](GPU.md). + - **constraints**: Cross-resource requirements. `{ "id": "ram", "min": 4 }` means renting this resource also requires 4 GB RAM. + +#### Environment-level fields + +- **id** *(optional)*: Stable identifier for the environment. Used to compute the environment hash. +- **description**: Human-readable description. +- **storageExpiry**: Seconds before compute results expire. +- **maxJobDuration** / **minJobDuration**: Maximum/minimum job duration in seconds. +- **maxJobs**: Maximum simultaneous paid jobs. +- **enableNetwork**: Whether algorithm containers can make outbound network connections. Default: `false` +- **access**: Access control for paid jobs. + - **addresses**: Ethereum addresses allowed to submit jobs. Empty + no accessLists = open access. + - **accessLists**: AccessList NFT contract addresses. NFT holders can submit jobs. +- **fees**: Fee structure per chain. + - **feeToken**: ERC-20 token address for payment. + - **prices**: `[{ "id": "<resourceId>", "price": <number> }]` +- **resources**: Lightweight refs to the connection pool. 
`cpu`, `ram`, `disk` are always available. + - **id**: Must match a connection-level resource id, or `cpu` / `ram` / `disk`. + - **total**: Env aggregate ceiling — max units all running jobs in this env can use simultaneously. Omit for no per-env cap. + - **min** / **max**: Per-job limits for this environment (further restricted from pool values). + - **constraints**: Per-env override for pool-level constraints. Replaces (not merges) the pool constraints. Set `[]` to remove constraints for this env. +- **free** *(optional)*: Free tier configuration. + - **maxJobDuration** / **minJobDuration** / **maxJobs**: Free job limits. + - **allowImageBuild**: Allow image builds on free jobs. Default: `false` + - **access**: Same structure as the paid access field. + - **resources**: Same structure as environment resources — lightweight refs limiting what free jobs can request. + +> **Strict isolation**: If you need strict physical CPU isolation between environments (e.g., for regulated data), run each environment on a separate Docker connection. All environments on the same connection share the same CPU core pool dynamically. + +#### Migration from old format + +The old format placed hardware details (`init`, `driverVersion`, etc.) inside environments. This is now **a startup error**. + +**Old (rejected):** +```json +[{ "socketPath": "...", "environments": [{ "resources": [{ "id": "myGPU", "init": {...} }] }] }] +``` + +**New:** +```json +[{ + "socketPath": "...", + "resources": [{ "id": "myGPU", "kind": "discrete", "total": 1, "init": {...} }], + "environments": [{ "resources": [{ "id": "myGPU" }] }] +}] +``` + +Move all `init`, `driverVersion`, `platform`, `memoryTotal`, `type`, `kind`, and `constraints` fields to the connection-level `resources` array. Environment resources keep only `id` and optionally `total`/`min`/`max`/`constraints`. 
### Docker Registry Authentication diff --git a/scripts/ocean-node-quickstart.sh b/scripts/ocean-node-quickstart.sh index 5280c45f6..a67d08c4f 100755 --- a/scripts/ocean-node-quickstart.sh +++ b/scripts/ocean-node-quickstart.sh @@ -239,15 +239,27 @@ if [ -z "$DOCKER_COMPUTE_ENVIRONMENTS" ]; then export DOCKER_COMPUTE_ENVIRONMENTS='[ { "socketPath": "/var/run/docker.sock", + "resources": [ + { + "id": "disk", + "total": 10 + } + ], "environments": [ { "storageExpiry": 604800, "maxJobDuration": 36000, "minJobDuration": 60, "resources": [ + { + "id": "cpu" + }, + { + "id": "ram" + }, { "id": "disk", - "total": 10 + "max": 10 } ], "fees": { @@ -580,43 +592,48 @@ process_pci_line() { } | del(.. | select(. == null)) | del(.. | select(. == []))' } -# Function to get all GPUs in JSON array format +# Function to get all GPUs in JSON array format. +# Each physical GPU becomes its own resource entry (kind: "discrete", total: 1) +# with a single DeviceID — never multiple DeviceIDs in one resource. get_all_gpus_json() { ( get_nvidia_gpus get_generic_gpus ) | jq -s ' - group_by(.description) | map( + to_entries | map( { - id: (.[0].description | ascii_downcase | gsub("[^a-z0-9]"; "-") | gsub("-+"; "-") | sub("^-"; "") | sub("-$"; "")), - description: .[0].description, + id: ("gpu" + (.key | tostring)), + kind: "discrete", type: "gpu", - total: length, - driverVersion: (.[0].driverVersion // null), - memoryTotal: (.[0].memoryTotal // null), - platform: (if .[0].init.deviceRequests.Driver == "amdgpu" then "amd" else .[0].init.deviceRequests.Driver end), + total: 1, + description: .value.description, + driverVersion: .value.driverVersion, + memoryTotal: .value.memoryTotal, + platform: (if .value.init.deviceRequests.Driver == "amdgpu" then "amd" else .value.init.deviceRequests.Driver end), init: ( - if .[0].init.deviceRequests.Driver == "nvidia" then + if .value.init.deviceRequests.Driver == "nvidia" then { deviceRequests: { - Driver: .[0].init.deviceRequests.Driver, - DeviceIDs: 
(map(.init.deviceRequests.Devices[]?) | unique), + Driver: "nvidia", + DeviceIDs: .value.init.deviceRequests.Devices, Capabilities: [["gpu"]] } } else { - advanced: { - Driver: .[0].init.deviceRequests.Driver, - Devices: (map(.init.deviceRequests.Devices[]?) | unique), - Capabilities: [["gpu"]], - Binds: (map(.init.Binds[]?) | unique), - CapAdd: (map(.init.CapAdd[]?) | unique), - GroupAdd: (map(.init.GroupAdd[]?) | unique), - SecurityOpt: .[0].init.SecurityOpt, - ShmSize: .[0].init.ShmSize, - IpcMode: .[0].init.IpcMode - } | del(.. | select(. == null)) | del(.. | select(. == [])) + advanced: ( + { + Driver: .value.init.deviceRequests.Driver, + Devices: .value.init.deviceRequests.Devices, + Capabilities: [["gpu"]], + Binds: .value.init.Binds, + CapAdd: .value.init.CapAdd, + GroupAdd: .value.init.GroupAdd, + SecurityOpt: .value.init.SecurityOpt, + ShmSize: .value.init.ShmSize, + IpcMode: .value.init.IpcMode + } | del(.. | select(. == null)) | del(.. | select(. == [])) + ) } end ) @@ -632,8 +649,11 @@ if command -v jq &> /dev/null; then if [ "$GPU_COUNT" -gt 0 ]; then echo "Detected $GPU_COUNT GPU type(s). Updating configuration..." - DOCKER_COMPUTE_ENVIRONMENTS=$(echo "$DOCKER_COMPUTE_ENVIRONMENTS" | jq --argjson gpus "$DETECTED_GPUS" '.[0].environments[0].resources += $gpus') - echo "GPUs added to Compute Environment resources." + DOCKER_COMPUTE_ENVIRONMENTS=$(echo "$DOCKER_COMPUTE_ENVIRONMENTS" | jq --argjson gpus "$DETECTED_GPUS" ' + .[0].resources += $gpus | + .[0].environments[0].resources += ($gpus | map({ id: .id })) + ') + echo "GPUs added to connection-level resources; environment refs updated." else echo "No GPUs detected." 
fi diff --git a/src/@types/C2D/C2D.ts b/src/@types/C2D/C2D.ts index 9fe929356..eb2744089 100644 --- a/src/@types/C2D/C2D.ts +++ b/src/@types/C2D/C2D.ts @@ -22,6 +22,7 @@ export interface C2DClusterInfo { } export type ComputeResourceType = 'cpu' | 'ram' | 'disk' | any +export type ComputeResourceKind = 'discrete' | 'fungible' export interface ResourceConstraint { id: ComputeResourceType // the resource being constrained @@ -58,7 +59,10 @@ export interface ComputeResource { id: ComputeResourceType description?: string type?: string - kind?: string // discreet, named, etc + kind?: ComputeResourceKind // 'discrete' | 'fungible'. Auto-inferred if omitted. + shareable?: boolean // Only meaningful for kind:'discrete'. Default false. + // true → multiple jobs may share the device simultaneously (NIC, TPM, HSM) + // false → exclusive: only one job at a time (GPU, FPGA) total: number // total number of specific resource min: number // min number of resource needed for a job max: number // max number of resource for a job @@ -72,6 +76,15 @@ export interface ComputeResource { init?: dockerHwInit constraints?: ResourceConstraint[] // optional cross-resource constraints } +export interface EnvironmentResourceRef { + id: ComputeResourceType // must match a resource id in C2DDockerConfig.resources or auto-detected (cpu/ram/disk) + total?: number // env aggregate ceiling; if omitted → defaults to pool total (no per-env restriction) + min?: number // per-job minimum + max?: number // per-job maximum (capped to total if both present) + constraints?: ResourceConstraint[] // per-env override: replaces pool constraints entirely + // Omit to inherit pool constraints. Set [] to remove all constraints for this env. +} + export interface ComputeResourceRequest { id: string amount: number @@ -109,6 +122,19 @@ export interface ComputeEnvironmentFreeOptions { access: ComputeAccessList allowImageBuild?: boolean } + +// Config-time only — used in C2DEnvironmentConfig.free. 
+// resources are EnvironmentResourceRef[] (refs to pool) and resolved to ComputeResource[] at startup. +// Runtime free options live in ComputeEnvironmentFreeOptions (unchanged). +export interface C2DEnvironmentFreeConfig { + storageExpiry?: number + maxJobDuration?: number + minJobDuration?: number + maxJobs?: number + resources?: EnvironmentResourceRef[] + access?: ComputeAccessList + allowImageBuild?: boolean +} export interface ComputeEnvironmentBaseConfig { description?: string // v1 storageExpiry?: number // amount of seconds for storage @@ -151,8 +177,8 @@ export interface C2DEnvironmentConfig { maxJobs?: number fees?: ComputeEnvFeesStructure access?: ComputeAccessList - free?: ComputeEnvironmentFreeOptions - resources?: ComputeResource[] + free?: C2DEnvironmentFreeConfig // config-time only; resolved to ComputeEnvironmentFreeOptions at startup + resources?: EnvironmentResourceRef[] // lightweight refs to connection pool enableNetwork?: boolean // whether network is enabled for algorithm containers } @@ -169,6 +195,7 @@ export interface C2DDockerConfig { paymentClaimInterval?: number // Default: 3600 seconds (1 hours) scanImages?: boolean scanImageDBUpdateInterval?: number // Default: 12 hours + resources?: ComputeResource[] // optional: cpu/ram/disk auto-detected; include for GPUs/NICs or to cap auto-detected totals environments: C2DEnvironmentConfig[] } diff --git a/src/components/c2d/compute_engine_base.ts b/src/components/c2d/compute_engine_base.ts index 361599798..5d724fb56 100644 --- a/src/components/c2d/compute_engine_base.ts +++ b/src/components/c2d/compute_engine_base.ts @@ -364,10 +364,10 @@ export abstract class C2DEngine { for (const resource of job.resources) { const envRes = envResourceMap.get(resource.id) if (envRes) { - // GPUs are shared-exclusive: inUse tracked globally across all envs - // Everything else (cpu, ram, disk) is per-env exclusive - const isSharedExclusive = envRes.type === 'gpu' - if (!isSharedExclusive && !isThisEnv) 
continue + // discrete resources (GPUs, FPGAs, NICs) tracked globally across all envs + // fungible resources (cpu, ram, disk) are per-env exclusive + const isGloballyTracked = envRes.kind === 'discrete' + if (!isGloballyTracked && !isThisEnv) continue if (!(resource.id in usedResources)) usedResources[resource.id] = 0 usedResources[resource.id] += resource.amount if (job.isFree) { @@ -401,13 +401,21 @@ export abstract class C2DEngine { ) { let globalUsed = 0 let globalTotal = 0 + let discreteInUse: number | undefined for (const e of allEnvironments) { const res = this.getResource(e.resources, resourceId) if (res) { globalTotal += res.total || 0 - globalUsed += res.inUse || 0 + if (res.kind === 'discrete') { + // getUsedResources already aggregates discrete inUse globally across all envs, + // so each env carries the same global value — take the max to avoid N-fold counting. + discreteInUse = Math.max(discreteInUse ?? 0, res.inUse || 0) + } else { + globalUsed += res.inUse || 0 + } } } + if (discreteInUse !== undefined) globalUsed += discreteInUse const physicalLimit = this.physicalLimits.get(resourceId) if (physicalLimit !== undefined && globalTotal > physicalLimit) { globalTotal = physicalLimit @@ -434,12 +442,19 @@ export abstract class C2DEngine { for (const request of activeResources) { let envResource = this.getResource(env.resources, request.id) if (!envResource) throw new Error(`No such resource ${request.id}`) - if (envResource.total - envResource.inUse < request.amount) - throw new Error(`Not enough available ${request.id}`) - // Global check for non-GPU resources (cpu, ram, disk are per-env exclusive) - // GPUs are shared-exclusive so their inUse already reflects global usage - if (allEnvironments && envResource.type !== 'gpu') { + const isFungible = envResource.kind === 'fungible' + const isShareableDiscrete = + envResource.kind === 'discrete' && envResource.shareable === true + + // Gate 1 (per-env ceiling) — fungible resources only. 
+ // envResource.total = env aggregate ceiling (from EnvironmentResourceRef.total). + if (isFungible && envResource.total - (envResource.inUse ?? 0) < request.amount) + throw new Error(`Not enough available ${request.id} in this environment`) + + // Gate 2 (engine-wide pool ceiling) — fungible + exclusive discrete. + // shareable discrete: tracked for visibility but never blocks allocation. + if (!isShareableDiscrete && allEnvironments) { this.checkGlobalResourceAvailability(allEnvironments, request.id, request.amount) } diff --git a/src/components/c2d/compute_engine_docker.ts b/src/components/c2d/compute_engine_docker.ts index 95f188e9c..9af9a85b5 100755 --- a/src/components/c2d/compute_engine_docker.ts +++ b/src/components/c2d/compute_engine_docker.ts @@ -1,6 +1,5 @@ /* eslint-disable security/detect-non-literal-fs-filename */ import { Readable, PassThrough } from 'stream' -import os from 'os' import path from 'path' import { C2DStatusNumber, @@ -22,6 +21,7 @@ import type { ComputeResourceRequest, ComputeEnvFees, ComputeResource, + ComputeResourceKind, C2DEnvironmentConfig, ComputeResourcesPricingInfo } from '../../@types/C2D/C2D.js' @@ -61,7 +61,6 @@ import { getOceanTokenAddressForChain } from '../../utils/address.js' import { dockerRegistryAuth, OceanNodeConfig } from '../../@types/OceanNode.js' import { EncryptMethod } from '../../@types/fileObject.js' import { getAddress, ZeroAddress } from 'ethers' -import { AccessList } from '../../@types/AccessList.js' const C2D_CONTAINER_UID = 1000 const C2D_CONTAINER_GID = 1000 @@ -189,42 +188,137 @@ export class C2DEngineDocker extends C2DEngine { return this.getC2DConfig().tempFolder + this.getC2DConfig().hash } - private createBenchmarkEnvironment(sysinfo: any, envConfig: any): void { - const ramGB = this.physicalLimits.get('ram') || 0 + private resolveResourceKind(res: ComputeResource): ComputeResourceKind { + if (res.kind) return res.kind + if (res.init) return 'discrete' + return 'fungible' + } + + private 
resolveConnectionResourcePool( + sysinfo: any, + configResources: ComputeResource[] | undefined + ): Map { + const pool = new Map() + + const physicalCpu = sysinfo.NCPU + const physicalRamGB = Math.floor(sysinfo.MemTotal / 1024 / 1024 / 1024) const physicalDiskGB = this.physicalLimits.get('disk') || 0 - const gpuMap = new Map() - for (const env of envConfig.environments) { - if (env.resources) { - for (const res of env.resources) { - if (res.id !== 'cpu' && res.id !== 'ram' && res.id !== 'disk') { - if (!gpuMap.has(res.id)) { - gpuMap.set(res.id, res) - } - } + pool.set('cpu', { + id: 'cpu', + type: 'cpu', + kind: 'fungible', + total: physicalCpu, + max: physicalCpu, + min: 1 + }) + pool.set('ram', { + id: 'ram', + type: 'ram', + kind: 'fungible', + total: physicalRamGB, + max: physicalRamGB, + min: 1 + }) + pool.set('disk', { + id: 'disk', + type: 'disk', + kind: 'fungible', + total: physicalDiskGB, + max: physicalDiskGB, + min: 0 + }) + + for (const res of configResources ?? []) { + const resolvedKind = this.resolveResourceKind(res) + if (['cpu', 'ram', 'disk'].includes(res.id)) { + const base = pool.get(res.id) + const cap = this.physicalLimits.get(res.id) ?? res.total + if (res.total > cap) + CORE_LOGGER.warn( + `Resource "${res.id}": configured total ${res.total} exceeds physical ${cap}, capping` + ) + if (res.total !== undefined) base.total = Math.min(res.total, cap) + base.max = res.max !== undefined ? Math.min(res.max, base.total) : base.total + if (res.min !== undefined) base.min = res.min + if (res.constraints !== undefined) + base.constraints = structuredClone(res.constraints) + } else { + // Warn if a GPU resource has multiple DeviceIDs — each physical GPU should be its own resource. + if (res.init?.deviceRequests?.DeviceIDs?.length > 1) { + CORE_LOGGER.warn( + `Resource "${res.id}": DeviceIDs has ${res.init.deviceRequests.DeviceIDs.length} entries. 
` + + `Each physical GPU should be its own resource with a single DeviceID.` + ) } + const custom: ComputeResource = { + ...res, + kind: resolvedKind, + max: res.max ?? res.total, + min: res.min ?? 0 + } + pool.set(res.id, custom) + // Register in physicalLimits so checkGlobalResourceAvailability caps correctly. + this.physicalLimits.set(res.id, res.total) } } - const gpuResources: ComputeResource[] = Array.from(gpuMap.values()) + return pool + } + + private resolveEnvironmentResources( + envDef: C2DEnvironmentConfig, + pool: Map + ): ComputeResource[] { + const refs = envDef.resources || [] + const result: ComputeResource[] = [] + for (const ref of refs) { + const poolRes = pool.get(ref.id) + if (!poolRes) { + CORE_LOGGER.warn(`resource "${ref.id}" not in pool, skipping`) + continue + } + const resolved: ComputeResource = { + ...poolRes, + init: poolRes.init ? structuredClone(poolRes.init) : undefined, + constraints: poolRes.constraints + ? structuredClone(poolRes.constraints) + : undefined + } + + if (poolRes.kind === 'fungible') { + resolved.total = + ref.total !== undefined ? Math.min(ref.total, poolRes.total) : poolRes.total + } + if (ref.max !== undefined) resolved.max = Math.min(ref.max, resolved.total) + if (ref.min !== undefined) resolved.min = ref.min + if (ref.constraints !== undefined) + resolved.constraints = structuredClone(ref.constraints) + result.push(resolved) + } + return result + } + + private createBenchmarkEnvironment(sysinfo: any, envConfig: any): void { + // Collect all discrete accelerators (GPUs, FPGAs, etc.) from the connection-level resources. + const discreteResources: ComputeResource[] = (envConfig.resources ?? []).filter( + (res: ComputeResource) => this.resolveResourceKind(res) === 'discrete' + ) const benchmarkPrices: ComputeResourcesPricingInfo[] = - gpuResources.length > 0 ? [{ id: gpuResources[0].id, price: 1 }] : [] + discreteResources.length > 0 ? 
[{ id: discreteResources[0].id, price: 1 }] : [] const benchmarkFees: ComputeEnvFeesStructure = { [BASE_CHAIN_ID]: [{ feeToken: USDC_TOKEN_ADDRESS_BASE, prices: benchmarkPrices }] } + // Benchmark env uses resource refs: cpu/ram/disk are auto-detected; discrete accelerators listed by id. + const gpuRefs = discreteResources.map((r) => ({ id: r.id })) const benchmarkEnv: C2DEnvironmentConfig = { description: 'Auto-generated benchmark environment', storageExpiry: 604800, maxJobDuration: 180, minJobDuration: 0, - resources: [ - { id: 'cpu', total: sysinfo.NCPU, min: 1, max: sysinfo.NCPU }, - { id: 'ram', total: ramGB, min: 1, max: ramGB }, - { id: 'disk', total: physicalDiskGB, min: 0, max: physicalDiskGB }, - ...gpuResources - ], + resources: [{ id: 'cpu' }, { id: 'ram' }, { id: 'disk' }, ...gpuRefs], access: { addresses: [], accessLists: [ @@ -290,65 +384,16 @@ export class C2DEngineDocker extends C2DEngine { } } + const connectionPool = this.resolveConnectionResourcePool( + sysinfo, + envConfig.resources + ) + for (let envIdx = 0; envIdx < envConfig.environments.length; envIdx++) { const envDef: C2DEnvironmentConfig = envConfig.environments[envIdx] const fees = this.processFeesForEnvironment(envDef.fees, supportedChains) - - const envResources: ComputeResource[] = [] - const cpuResources = { - id: 'cpu', - type: 'cpu', - total: sysinfo.NCPU, - max: sysinfo.NCPU, - min: 1, - description: os.cpus()[0].model - } - const ramResources = { - id: 'ram', - type: 'ram', - total: Math.floor(sysinfo.MemTotal / 1024 / 1024 / 1024), - max: Math.floor(sysinfo.MemTotal / 1024 / 1024 / 1024), - min: 1 - } - const physicalDiskGB = this.physicalLimits.get('disk') || 0 - const diskResources = { - id: 'disk', - type: 'disk', - total: physicalDiskGB, - max: physicalDiskGB, - min: 0 - } - - if (envDef.resources) { - for (const res of envDef.resources) { - // allow user to add other resources - if (res.id === 'cpu') { - if (res.total) cpuResources.total = res.total - if (res.max) 
cpuResources.max = res.max - if (res.min) cpuResources.min = res.min - } - if (res.id === 'ram') { - if (res.total) ramResources.total = res.total - if (res.max) ramResources.max = res.max - if (res.min) ramResources.min = res.min - } - if (res.id === 'disk') { - if (res.total) diskResources.total = res.total - if (res.max) diskResources.max = res.max - if (res.min !== undefined) diskResources.min = res.min - } - - if (res.id !== 'cpu' && res.id !== 'ram' && res.id !== 'disk') { - if (!res.max) res.max = res.total - if (!res.min) res.min = 0 - envResources.push(res) - } - } - } - envResources.push(cpuResources) - envResources.push(ramResources) - envResources.push(diskResources) + const envResources = this.resolveEnvironmentResources(envDef, connectionPool) const env: ComputeEnvironment = { id: '', @@ -385,9 +430,15 @@ export class C2DEngineDocker extends C2DEngine { if (envDef.free.maxJobDuration !== undefined) env.free.maxJobDuration = envDef.free.maxJobDuration if (envDef.free.maxJobs !== undefined) env.free.maxJobs = envDef.free.maxJobs - if (envDef.free.resources) env.free.resources = envDef.free.resources if (envDef.free.allowImageBuild !== undefined) env.free.allowImageBuild = envDef.free.allowImageBuild + // Resolve free resource refs → full ComputeResource[] using the same connection pool. + if (envDef.free.resources) { + env.free.resources = this.resolveEnvironmentResources( + { resources: envDef.free.resources } as C2DEnvironmentConfig, + connectionPool + ) + } } const envIdSuffix = envDef.id || String(envIdx) @@ -402,36 +453,17 @@ export class C2DEngineDocker extends C2DEngine { ) } + // CPU affinity: all environments share the full physical core pool. + // allocateCpus() dynamically assigns free cores per job across all envs. 
const physicalCpuCount = this.physicalLimits.get('cpu') || 0 - let cpuOffset = 0 + const allCores = Array.from({ length: physicalCpuCount }, (_, i) => i) for (const env of this.envs) { const cpuRes = this.getResource(env.resources ?? [], 'cpu') if (cpuRes && cpuRes.total > 0) { - let isBenchmarkEnv = false - if (env.access?.accessLists) { - const baseAccessList = env.access?.accessLists?.[0] as AccessList - if (baseAccessList && baseAccessList[BASE_CHAIN_ID]) { - isBenchmarkEnv = baseAccessList[BASE_CHAIN_ID].includes( - getAddress('0xcb7Db55Ca9Aa9C3b25F5Bc266da63317fa02086a') - ) - } - } - - if (isBenchmarkEnv) { - const total = physicalCpuCount > 0 ? physicalCpuCount : cpuRes.total - const cores = Array.from({ length: total }, (_, i) => i) - this.envCpuCoresMap.set(env.id, cores) - CORE_LOGGER.info( - `CPU affinity: benchmark environment ${env.id} cores 0-${cores[cores.length - 1]}` - ) - } else { - const cores = Array.from({ length: cpuRes.total }, (_, i) => cpuOffset + i) - this.envCpuCoresMap.set(env.id, cores) - CORE_LOGGER.info( - `CPU affinity: environment ${env.id} cores ${cores[0]}-${cores[cores.length - 1]}` - ) - cpuOffset += cpuRes.total - } + this.envCpuCoresMap.set(env.id, allCores) + CORE_LOGGER.info( + `CPU affinity: environment ${env.id} shares pool of ${allCores.length} cores` + ) } } diff --git a/src/test/unit/compute.test.ts b/src/test/unit/compute.test.ts index 07d1bf605..22258a35a 100644 --- a/src/test/unit/compute.test.ts +++ b/src/test/unit/compute.test.ts @@ -9,6 +9,7 @@ import { ComputeAsset, ComputeEnvironment, ComputeJob, + ComputeResource, ComputeResourceRequest, DBComputeJob, RunningPlatform @@ -35,7 +36,11 @@ import { C2DEngine, omitDBComputeFieldsFromComputeJob } from '../../components/c2d/index.js' -import { checkManifestPlatform } from '../../components/c2d/compute_engine_docker.js' +import { + checkManifestPlatform, + C2DEngineDocker +} from '../../components/c2d/compute_engine_docker.js' +import { C2DDockerConfigSchema } from 
'../../utils/config/schemas.js' import { ValidateParams } from '../../components/httpRoutes/validateCommands.js' import { Readable } from 'stream' import sinon from 'sinon' @@ -49,6 +54,10 @@ class TestC2DEngine extends C2DEngine { super(null, null, null, null, null) } + setPhysicalLimits(limits: Map) { + this.physicalLimits = limits + } + async getComputeEnvironments(): Promise { return [] } @@ -342,9 +351,9 @@ describe('Compute Jobs Database', () => { }) const baseResources = [ - { id: 'cpu', total: 8, min: 1, max: 8, inUse: 0 }, - { id: 'ram', total: 32, min: 1, max: 32, inUse: 0 }, - { id: 'disk', total: 500, min: 10, max: 500, inUse: 0 } + { id: 'cpu', kind: 'fungible', total: 8, min: 1, max: 8, inUse: 0 }, + { id: 'ram', kind: 'fungible', total: 32, min: 1, max: 32, inUse: 0 }, + { id: 'disk', kind: 'fungible', total: 500, min: 10, max: 500, inUse: 0 } ] it('satisfies constraints exactly → passes without modification', async function () { @@ -445,6 +454,119 @@ describe('Compute Jobs Database', () => { expect(cpuEntry.amount).to.equal(1) // bumped to min expect(diskEntry.amount).to.equal(10) // bumped to min }) + + it('per-env constraint override: premium env uses 8 GB RAM, standard env uses 4 GB RAM', async function () { + // Pool default: gpu0 requires ram min:4. + // premium env overrides to ram min:8. + // standard env inherits pool default (ram min:4). 
+ const gpuWithOverrideConstraint = { + id: 'gpu0', + kind: 'discrete', + total: 1, + min: 0, + max: 1, + inUse: 0, + constraints: [{ id: 'ram', min: 8 }] // premium override + } + const gpuWithPoolConstraint = { + id: 'gpu0', + kind: 'discrete', + total: 1, + min: 0, + max: 1, + inUse: 0, + constraints: [{ id: 'ram', min: 4 }] // pool default inherited by standard + } + const premiumEnv = makeEnv([ + { id: 'ram', kind: 'fungible', total: 32, min: 1, max: 32, inUse: 0 }, + gpuWithOverrideConstraint + ]) + const standardEnv = makeEnv([ + { id: 'ram', kind: 'fungible', total: 32, min: 1, max: 32, inUse: 0 }, + gpuWithPoolConstraint + ]) + + const premiumReq: ComputeResourceRequest[] = [ + { id: 'ram', amount: 1 }, + { id: 'gpu0', amount: 1 } + ] + const standardReq: ComputeResourceRequest[] = [ + { id: 'ram', amount: 1 }, + { id: 'gpu0', amount: 1 } + ] + + const premiumResult = await engine.checkAndFillMissingResources( + premiumReq, + premiumEnv, + false + ) + const standardResult = await engine.checkAndFillMissingResources( + standardReq, + standardEnv, + false + ) + + expect(premiumResult.find((r) => r.id === 'ram').amount).to.equal(8) + expect(standardResult.find((r) => r.id === 'ram').amount).to.equal(4) + }) + + it('ref.constraints: [] removes all constraints for env — GPU job admitted with only resource min', async function () { + const gpuNoConstraints = { + id: 'gpu0', + kind: 'discrete', + total: 1, + min: 0, + max: 1, + inUse: 0, + constraints: [] as any[] // no constraints for this env + } + const env = makeEnv([ + { id: 'ram', kind: 'fungible', total: 32, min: 1, max: 32, inUse: 0 }, + gpuNoConstraints + ]) + const req: ComputeResourceRequest[] = [ + { id: 'ram', amount: 1 }, + { id: 'gpu0', amount: 1 } + ] + // No constraints → ram stays at 1 (not bumped to any min) + const result = await engine.checkAndFillMissingResources(req, env, false) + expect(result.find((r) => r.id === 'ram').amount).to.equal(1) + }) + + it('constraint-driven exhaustion: GPU 
becomes unrentable when RAM nearly depleted', async function () { + // gpu0 requires min:4 GB RAM. Env has 10 GB RAM total, 9 GB in use → only 1 GB remaining. + // Requesting gpu0 triggers checkAndFillMissingResources to bump RAM to 4 GB. + // checkIfResourcesAreAvailable should then reject at Gate 1 (only 1 GB remaining). + const resources = [ + { id: 'ram', kind: 'fungible', total: 10, min: 1, max: 10, inUse: 9 }, + { + id: 'gpu0', + kind: 'discrete', + total: 1, + min: 0, + max: 1, + inUse: 0, + constraints: [{ id: 'ram', min: 4 }] + } + ] + const env = makeEnv(resources) + const req: ComputeResourceRequest[] = [ + { id: 'ram', amount: 1 }, + { id: 'gpu0', amount: 1 } + ] + + // Step 1: auto-bump RAM from 1 to 4 (constraint min per gpu unit) + const filled = await engine.checkAndFillMissingResources(req, env, false) + expect(filled.find((r) => r.id === 'ram').amount).to.equal(4) + + // Step 2: 10 - 9 = 1 available < 4 requested → Gate 1 blocks + try { + await engine.checkIfResourcesAreAvailable(filled, env, false) + assert.fail('Expected error was not thrown') + } catch (err: any) { + expect(err.message).to.include('ram') + } + }) }) describe('testing checkIfResourcesAreAvailable', function () { @@ -452,13 +574,22 @@ describe('Compute Jobs Database', () => { before(function () { engine = new TestC2DEngine() + engine.setPhysicalLimits( + new Map([ + ['cpu', 10], + ['ram', 32], + ['disk', 100], + ['gpu0', 1], + ['nic0', 1] + ]) + ) }) it('resources within env limits → passes', async function () { const env = makeEnv([ - { id: 'cpu', total: 8, min: 1, max: 8, inUse: 2 }, - { id: 'ram', total: 32, min: 1, max: 32, inUse: 4 }, - { id: 'disk', total: 500, min: 10, max: 500, inUse: 50 } + { id: 'cpu', kind: 'fungible', total: 8, min: 1, max: 8, inUse: 2 }, + { id: 'ram', kind: 'fungible', total: 32, min: 1, max: 32, inUse: 4 }, + { id: 'disk', kind: 'fungible', total: 500, min: 10, max: 500, inUse: 50 } ]) const req: ComputeResourceRequest[] = [ { id: 'cpu', amount: 4 
}, @@ -471,9 +602,9 @@ describe('Compute Jobs Database', () => { it('resources exceed env availability → throws', async function () { const env = makeEnv([ - { id: 'cpu', total: 4, min: 1, max: 4, inUse: 3 }, - { id: 'ram', total: 32, min: 1, max: 32, inUse: 0 }, - { id: 'disk', total: 500, min: 10, max: 500, inUse: 0 } + { id: 'cpu', kind: 'fungible', total: 4, min: 1, max: 4, inUse: 3 }, + { id: 'ram', kind: 'fungible', total: 32, min: 1, max: 32, inUse: 0 }, + { id: 'disk', kind: 'fungible', total: 500, min: 10, max: 500, inUse: 0 } ]) const req: ComputeResourceRequest[] = [ { id: 'cpu', amount: 4 }, // only 1 available (4-3) @@ -491,15 +622,15 @@ describe('Compute Jobs Database', () => { it('free resource limit exceeded → throws', async function () { const env = makeEnv( [ - { id: 'cpu', total: 8, min: 1, max: 8, inUse: 0 }, - { id: 'ram', total: 32, min: 1, max: 32, inUse: 0 }, - { id: 'disk', total: 500, min: 10, max: 500, inUse: 0 } + { id: 'cpu', kind: 'fungible', total: 8, min: 1, max: 8, inUse: 0 }, + { id: 'ram', kind: 'fungible', total: 32, min: 1, max: 32, inUse: 0 }, + { id: 'disk', kind: 'fungible', total: 500, min: 10, max: 500, inUse: 0 } ], { freeResources: [ - { id: 'cpu', total: 2, min: 1, max: 2, inUse: 2 }, // fully used - { id: 'ram', total: 4, min: 1, max: 4, inUse: 0 }, - { id: 'disk', total: 20, min: 10, max: 20, inUse: 0 } + { id: 'cpu', kind: 'fungible', total: 2, min: 1, max: 2, inUse: 2 }, // fully used + { id: 'ram', kind: 'fungible', total: 4, min: 1, max: 4, inUse: 0 }, + { id: 'disk', kind: 'fungible', total: 20, min: 10, max: 20, inUse: 0 } ] } ) @@ -515,6 +646,207 @@ describe('Compute Jobs Database', () => { expect(err.message).to.include('cpu') } }) + + it('Gate 1 (per-env ceiling, fungible) blocks when env capacity exhausted', async function () { + const env = makeEnv([ + { id: 'cpu', kind: 'fungible', total: 6, min: 1, max: 6, inUse: 6 } + ]) + const req: ComputeResourceRequest[] = [{ id: 'cpu', amount: 1 }] + try { + await 
engine.checkIfResourcesAreAvailable(req, env, false) + assert.fail('Expected error was not thrown') + } catch (err: any) { + expect(err.message).to.include('Not enough available cpu') + expect(err.message).to.include('environment') + } + }) + + it('Gate 2 (engine-wide pool, fungible) blocks when global capacity exhausted across two envs', async function () { + // Pool: 10 physical CPUs. env1 uses 6, env2 uses 4 → 10 total in-use. + const env1 = makeEnv([ + { id: 'cpu', kind: 'fungible', total: 6, min: 1, max: 6, inUse: 6 } + ]) + env1.id = 'env1' + const env2 = makeEnv([ + { id: 'cpu', kind: 'fungible', total: 6, min: 1, max: 6, inUse: 4 } + ]) + env2.id = 'env2' + // env2 Gate 1: 6 - 4 = 2 >= 1 → passes. Gate 2: total 12 capped to 10, used 10, remaining 0 < 1 → blocks. + const req: ComputeResourceRequest[] = [{ id: 'cpu', amount: 1 }] + try { + await engine.checkIfResourcesAreAvailable(req, env2, false, [env1, env2]) + assert.fail('Expected error was not thrown') + } catch (err: any) { + expect(err.message).to.include('globally') + } + }) + + it('Gate 2 passes when global capacity is available (env1 partially used)', async function () { + const env1 = makeEnv([ + { id: 'cpu', kind: 'fungible', total: 6, min: 1, max: 6, inUse: 3 } + ]) + env1.id = 'env1' + const env2 = makeEnv([ + { id: 'cpu', kind: 'fungible', total: 6, min: 1, max: 6, inUse: 2 } + ]) + env2.id = 'env2' + // Gate 2: total 12 capped to 10, used 5, remaining 5 >= 1 → passes + const req: ComputeResourceRequest[] = [{ id: 'cpu', amount: 1 }] + await engine.checkIfResourcesAreAvailable(req, env2, false, [env1, env2]) + // no throw = pass + }) + + it('discrete exclusive (GPU) globally tracked — second job blocked when total:1 in use', async function () { + // gpu0 is a discrete exclusive resource (total:1). env1 has it in-use. 
+ const env1 = makeEnv([ + { + id: 'gpu0', + kind: 'discrete', + shareable: false, + total: 1, + min: 0, + max: 1, + inUse: 1 + } + ]) + env1.id = 'env1' + const env2 = makeEnv([ + { + id: 'gpu0', + kind: 'discrete', + shareable: false, + total: 1, + min: 0, + max: 1, + inUse: 0 + } + ]) + env2.id = 'env2' + // Gate 2: globalTotal = 2 capped to physicalLimits['gpu0']=1, globalUsed=1, remaining=0 < 1 → blocks + const req: ComputeResourceRequest[] = [{ id: 'gpu0', amount: 1 }] + try { + await engine.checkIfResourcesAreAvailable(req, env2, false, [env1, env2]) + assert.fail('Expected error was not thrown') + } catch (err: any) { + expect(err.message).to.include('gpu0') + expect(err.message).to.include('globally') + } + }) + + it('discrete shareable (NIC) never blocks allocation — both jobs admitted', async function () { + // nic0 is shareable discrete. env1 already has it in-use. + const env1 = makeEnv([ + { + id: 'nic0', + kind: 'discrete', + shareable: true, + total: 1, + min: 0, + max: 1, + inUse: 1 + } + ]) + env1.id = 'env1' + const env2 = makeEnv([ + { + id: 'nic0', + kind: 'discrete', + shareable: true, + total: 1, + min: 0, + max: 1, + inUse: 0 + } + ]) + env2.id = 'env2' + // isShareableDiscrete = true → Gate 2 skipped → no throw + const req: ComputeResourceRequest[] = [{ id: 'nic0', amount: 1 }] + await engine.checkIfResourcesAreAvailable(req, env2, false, [env1, env2]) + // no throw = pass + }) + + it('non-GPU discrete resource is globally tracked (kind drives tracking, not type)', async function () { + // An FPGA with kind:'discrete' but type:'fpga' must be globally tracked just like a GPU. 
+ const env1 = makeEnv([ + { + id: 'fpga0', + kind: 'discrete', + type: 'fpga', + total: 1, + min: 0, + max: 1, + inUse: 1 + } + ]) + env1.id = 'env1' + const env2 = makeEnv([ + { + id: 'fpga0', + kind: 'discrete', + type: 'fpga', + total: 1, + min: 0, + max: 1, + inUse: 0 + } + ]) + env2.id = 'env2' + ;(engine as any).physicalLimits.set('fpga0', 1) + const req: ComputeResourceRequest[] = [{ id: 'fpga0', amount: 1 }] + try { + await engine.checkIfResourcesAreAvailable(req, env2, false, [env1, env2]) + assert.fail('Expected error was not thrown') + } catch (err: any) { + expect(err.message).to.include('fpga0') + expect(err.message).to.include('globally') + } + }) + + it('discrete GPU — double-counting across envs does not block when capacity remains', async function () { + // Setup: 2 physical GPUs (physicalLimits gpu0=2), two environments each advertising + // total:2. A single job consumes 1 GPU on env1. getUsedResources aggregates discrete + // usage globally, so both env1 and env2 receive inUse:1. Without the max-vs-sum fix, + // checkGlobalResourceAvailability would compute globalUsed = 1+1 = 2, exhausting + // the physical pool and incorrectly blocking the next allocation. 
+ engine.setPhysicalLimits( + new Map([ + ['cpu', 10], + ['ram', 32], + ['disk', 100], + ['gpu0', 2], + ['nic0', 1] + ]) + ) + const env1 = makeEnv([ + { + id: 'gpu0', + kind: 'discrete', + shareable: false, + total: 2, + min: 0, + max: 2, + inUse: 1 + } + ]) + env1.id = 'env1' + // env2 carries the same global inUse value because getUsedResources tracks discrete globally + const env2 = makeEnv([ + { + id: 'gpu0', + kind: 'discrete', + shareable: false, + total: 2, + min: 0, + max: 2, + inUse: 1 + } + ]) + env2.id = 'env2' + // 1 GPU in use, 1 remaining — this request must succeed, not be double-blocked + const req: ComputeResourceRequest[] = [{ id: 'gpu0', amount: 1 }] + await engine.checkIfResourcesAreAvailable(req, env2, false, [env1, env2]) + // no throw = pass (double-counting would have thrown "Not enough gpu0 globally") + }) }) after(async () => { @@ -522,6 +854,353 @@ describe('Compute Jobs Database', () => { }) }) +describe('Schema validation (C2DDockerConfigSchema)', () => { + const validBase = { + socketPath: '/var/run/docker.sock', + environments: [ + { + storageExpiry: 604800, + maxJobDuration: 3600, + minJobDuration: 60, + fees: { '1': [{ feeToken: '0x123', prices: [{ id: 'cpu', price: 1 }] }] } + } + ] + } + + it('old format (env resources with init) is rejected — clean break enforced', function () { + const config = [ + { + ...validBase, + environments: [ + { + ...validBase.environments[0], + resources: [ + { + id: 'gpu0', + total: 1, + init: { + deviceRequests: { + Driver: 'nvidia', + DeviceIDs: ['uuid-a'], + Capabilities: [['gpu']] + } + } + } + ] + } + ] + } + ] + const result = C2DDockerConfigSchema.safeParse(config) + expect(result.success).to.equal(false) + const msgs = result.error?.issues.map((i) => i.message).join(' ') + expect(msgs).to.include('migration guide') + }) + + it('env ref pointing to unknown pool id is rejected', function () { + const config = [ + { + ...validBase, + environments: [ + { + ...validBase.environments[0], + 
resources: [{ id: 'unknown-gpu' }] + } + ] + } + ] + const result = C2DDockerConfigSchema.safeParse(config) + expect(result.success).to.equal(false) + const msgs = result.error?.issues.map((i) => i.message).join(' ') + expect(msgs).to.include('not found in connection-level resources') + }) + + it('shareable:true on type:gpu resource is rejected', function () { + const config = [ + { + ...validBase, + resources: [ + { + id: 'gpu0', + type: 'gpu', + kind: 'discrete', + total: 1, + shareable: true, + init: { + deviceRequests: { + Driver: 'nvidia', + DeviceIDs: ['uuid-a'], + Capabilities: [['gpu']] + } + } + } + ], + environments: [ + { + ...validBase.environments[0], + resources: [{ id: 'gpu0' }] + } + ] + } + ] + const result = C2DDockerConfigSchema.safeParse(config) + expect(result.success).to.equal(false) + const msgs = result.error?.issues.map((i) => i.message).join(' ') + expect(msgs).to.include('shareable:true is not allowed') + }) + + it('valid two-level config with GPU pool and env refs parses successfully', function () { + const config = [ + { + socketPath: '/var/run/docker.sock', + resources: [ + { + id: 'gpu0', + kind: 'discrete', + type: 'gpu', + total: 1, + init: { + deviceRequests: { + Driver: 'nvidia', + DeviceIDs: ['uuid-a'], + Capabilities: [['gpu']] + } + } + } + ], + environments: [ + { + storageExpiry: 604800, + maxJobDuration: 3600, + minJobDuration: 60, + resources: [{ id: 'cpu' }, { id: 'ram' }, { id: 'disk' }, { id: 'gpu0' }], + fees: { '1': [{ feeToken: '0x123', prices: [{ id: 'gpu0', price: 5 }] }] } + } + ] + } + ] + const result = C2DDockerConfigSchema.safeParse(config) + expect(result.success).to.equal(true) + }) +}) + +describe('resolveResourceKind / resolveConnectionResourcePool / resolveEnvironmentResources', () => { + let engine: any + + beforeEach(function () { + // Use Object.create to bypass the Docker-specific constructor while retaining the prototype chain. 
+    engine = Object.create(C2DEngineDocker.prototype)
+    engine.physicalLimits = new Map()
+  })
+
+  describe('resolveResourceKind()', function () {
+    it('explicit kind:"discrete" wins over init presence', function () {
+      const res: Partial<ComputeResource> = {
+        id: 'cpu',
+        kind: 'discrete',
+        init: undefined
+      }
+      expect(engine.resolveResourceKind(res)).to.equal('discrete')
+    })
+
+    it('explicit kind:"fungible" wins even when init is present', function () {
+      const res: Partial<ComputeResource> = {
+        id: 'cpu',
+        kind: 'fungible',
+        init: {
+          deviceRequests: { Driver: 'nvidia', DeviceIDs: ['x'], Capabilities: [['gpu']] }
+        }
+      }
+      expect(engine.resolveResourceKind(res)).to.equal('fungible')
+    })
+
+    it('no kind + init present → inferred as discrete', function () {
+      const res: Partial<ComputeResource> = {
+        id: 'gpu0',
+        init: {
+          deviceRequests: { Driver: 'nvidia', DeviceIDs: ['x'], Capabilities: [['gpu']] }
+        }
+      }
+      expect(engine.resolveResourceKind(res)).to.equal('discrete')
+    })
+
+    it('no kind, no init → inferred as fungible', function () {
+      const res: Partial<ComputeResource> = { id: 'cpu' }
+      expect(engine.resolveResourceKind(res)).to.equal('fungible')
+    })
+  })
+
+  describe('resolveConnectionResourcePool()', function () {
+    it('auto-detects cpu and ram from sysinfo; disk from physicalLimits', function () {
+      engine.physicalLimits.set('disk', 200)
+      const sysinfo = { NCPU: 8, MemTotal: 32 * 1024 * 1024 * 1024 } // 32 GB
+      const pool = engine.resolveConnectionResourcePool(sysinfo, [])
+      expect(pool.get('cpu').total).to.equal(8)
+      expect(pool.get('ram').total).to.equal(32)
+      expect(pool.get('disk').total).to.equal(200)
+      expect(pool.get('cpu').kind).to.equal('fungible')
+      expect(pool.get('ram').kind).to.equal('fungible')
+    })
+
+    it('configured total caps cpu at physical limit', function () {
+      engine.physicalLimits.set('cpu', 8)
+      engine.physicalLimits.set('disk', 100)
+      const sysinfo = { NCPU: 8, MemTotal: 32 * 1024 * 1024 * 1024 }
+      // Config requests 6 cores (cap below physical) → should use 6.
+      const pool = engine.resolveConnectionResourcePool(sysinfo, [
+        { id: 'cpu', total: 6, min: 1 }
+      ])
+      expect(pool.get('cpu').total).to.equal(6)
+    })
+
+    it('configured total exceeding physical is capped at physical', function () {
+      engine.physicalLimits.set('cpu', 8)
+      engine.physicalLimits.set('disk', 100)
+      const sysinfo = { NCPU: 8, MemTotal: 16 * 1024 * 1024 * 1024 }
+      // Config requests 20 cores on an 8-core host → capped to 8.
+      const pool = engine.resolveConnectionResourcePool(sysinfo, [
+        { id: 'cpu', total: 20 }
+      ])
+      expect(pool.get('cpu').total).to.equal(8)
+    })
+
+    it('custom GPU resource is added to pool and registered in physicalLimits', function () {
+      engine.physicalLimits.set('disk', 100)
+      const sysinfo = { NCPU: 4, MemTotal: 8 * 1024 * 1024 * 1024 }
+      const gpu = {
+        id: 'gpu0',
+        type: 'gpu',
+        total: 1,
+        init: {
+          deviceRequests: {
+            Driver: 'nvidia',
+            DeviceIDs: ['uuid-a'],
+            Capabilities: [['gpu']]
+          }
+        }
+      }
+      const pool = engine.resolveConnectionResourcePool(sysinfo, [gpu])
+      expect(pool.has('gpu0')).to.equal(true)
+      expect(pool.get('gpu0').kind).to.equal('discrete') // inferred from init
+      expect(pool.get('gpu0').total).to.equal(1)
+      expect(engine.physicalLimits.get('gpu0')).to.equal(1)
+    })
+  })
+
+  describe('resolveEnvironmentResources()', function () {
+    let pool: Map<string, ComputeResource>
+
+    beforeEach(function () {
+      pool = new Map([
+        ['cpu', { id: 'cpu', kind: 'fungible', type: 'cpu', total: 10, min: 1, max: 10 }],
+        ['ram', { id: 'ram', kind: 'fungible', type: 'ram', total: 32, min: 1, max: 32 }],
+        [
+          'disk',
+          { id: 'disk', kind: 'fungible', type: 'disk', total: 100, min: 1, max: 100 }
+        ],
+        [
+          'gpu0',
+          {
+            id: 'gpu0',
+            kind: 'discrete',
+            type: 'gpu',
+            total: 1,
+            min: 0,
+            max: 1,
+            constraints: [{ id: 'ram', min: 4 }],
+            init: {
+              deviceRequests: {
+                Driver: 'nvidia',
+                DeviceIDs: ['uuid-a'],
+                Capabilities: [['gpu']]
+              }
+            }
+          }
+        ]
+      ])
+    })
+
+    it('ref.total becomes env aggregate ceiling for fungible (capped at pool.total)', function () {
+      const envDef = { resources: [{ id: 'cpu', total: 6 }] }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      expect(result[0].total).to.equal(6)
+    })
+
+    it('ref.total exceeding pool.total is capped at pool.total', function () {
+      const envDef = { resources: [{ id: 'cpu', total: 999 }] }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      expect(result[0].total).to.equal(10) // pool.total = 10
+    })
+
+    it('omitting ref.total inherits pool total for fungible', function () {
+      const envDef = { resources: [{ id: 'cpu' }] }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      expect(result[0].total).to.equal(10)
+    })
+
+    it('ref.max is capped to resolved.total', function () {
+      const envDef = { resources: [{ id: 'cpu', total: 6, max: 99 }] }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      expect(result[0].max).to.equal(6) // capped to total
+    })
+
+    it('ref.min overrides pool min', function () {
+      const envDef = { resources: [{ id: 'cpu', min: 2 }] }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      expect(result[0].min).to.equal(2)
+    })
+
+    it('ref.constraints replaces pool constraints entirely', function () {
+      const envDef = {
+        resources: [
+          {
+            id: 'gpu0',
+            constraints: [
+              { id: 'ram', min: 8 },
+              { id: 'cpu', min: 4 }
+            ]
+          }
+        ]
+      }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      const gpuRes = result.find((r: ComputeResource) => r.id === 'gpu0')
+      expect(gpuRes.constraints).to.have.length(2)
+      expect(gpuRes.constraints[0]).to.deep.equal({ id: 'ram', min: 8 })
+    })
+
+    it('ref.constraints: [] removes all constraints for this env', function () {
+      const envDef = { resources: [{ id: 'gpu0', constraints: [] as any[] }] }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      const gpuRes = result.find((r: ComputeResource) => r.id === 'gpu0')
+      expect(gpuRes.constraints).to.deep.equal([])
+    })
+
+    it('omitting ref.constraints inherits pool constraints (deep-cloned)', function () {
+      const envDef = { resources: [{ id: 'gpu0' }] }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      const gpuRes = result.find((r: ComputeResource) => r.id === 'gpu0')
+      expect(gpuRes.constraints).to.deep.equal([{ id: 'ram', min: 4 }])
+      // Mutating the resolved constraints must not affect the pool
+      gpuRes.constraints[0].min = 99
+      expect(pool.get('gpu0').constraints[0].min).to.equal(4)
+    })
+
+    it('init is deep-cloned: mutating resolved.init does not corrupt pool', function () {
+      const envDef = { resources: [{ id: 'gpu0' }] }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      const gpuRes = result.find((r: ComputeResource) => r.id === 'gpu0')
+      gpuRes.init.deviceRequests.DeviceIDs[0] = 'mutated'
+      expect(pool.get('gpu0').init.deviceRequests.DeviceIDs[0]).to.equal('uuid-a')
+    })
+
+    it('unknown ref.id is skipped silently', function () {
+      const envDef = { resources: [{ id: 'nonexistent' }] }
+      const result = engine.resolveEnvironmentResources(envDef, pool)
+      expect(result).to.have.length(0)
+    })
+  })
+})
+
 describe('getAlgoChecksums', () => {
   let findDdoStub: sinon.SinonStub
   let loggerErrorSpy: sinon.SinonSpy
diff --git a/src/utils/config/schemas.ts b/src/utils/config/schemas.ts
index cfdf57ce5..46295f804 100644
--- a/src/utils/config/schemas.ts
+++ b/src/utils/config/schemas.ts
@@ -178,7 +178,8 @@ export const ComputeResourceSchema = z.object({
   total: z.number().optional(),
   description: z.string().optional(),
   type: z.string().optional(),
-  kind: z.string().optional(),
+  kind: z.enum(['discrete', 'fungible']).optional(),
+  shareable: z.boolean().optional(),
   min: z.number().optional(),
   max: z.number().optional(),
   inUse: z.number().optional(),
@@ -189,6 +190,16 @@ export const ComputeResourceSchema = z.object({
   constraints: z.array(ResourceConstraintSchema).optional()
 })
 
+export const EnvironmentResourceRefSchema = z
+  .object({
+    id: z.string(),
+    total: z.number().optional(),
+    min: z.number().optional(),
+    max: z.number().optional(),
+    constraints: z.array(ResourceConstraintSchema).optional()
+  })
+  .passthrough()
+
 export const ComputeResourcesPricingInfoSchema = z.object({
   id: z.string(),
   price: z.number()
@@ -216,6 +227,24 @@ export const ComputeEnvironmentFreeOptionsSchema = z.object({
   allowImageBuild: z.boolean().optional().default(false)
 })
 
+// Config-time schema for the free block — resources are refs, not full ComputeResource objects.
+export const C2DEnvironmentFreeConfigSchema = z.object({
+  minJobDuration: z.number().int().optional().default(60),
+  maxJobDuration: z.number().int().optional().default(3600),
+  maxJobs: z.number().int().optional().default(3),
+  resources: z.array(EnvironmentResourceRefSchema).optional(),
+  access: z
+    .object({
+      addresses: z.array(z.string()),
+      accessLists: z
+        .array(z.record(z.string(), z.array(z.string())))
+        .nullable()
+        .optional()
+    })
+    .optional(),
+  allowImageBuild: z.boolean().optional().default(false)
+})
+
 export const C2DEnvironmentConfigSchema = z
   .object({
     id: z.string().optional(),
@@ -234,8 +263,8 @@ export const C2DEnvironmentConfigSchema = z
           .optional()
       })
       .optional(),
-    free: ComputeEnvironmentFreeOptionsSchema.optional(),
-    resources: z.array(ComputeResourceSchema).optional(),
+    free: C2DEnvironmentFreeConfigSchema.optional(),
+    resources: z.array(EnvironmentResourceRefSchema).optional(),
     enableNetwork: z.boolean().optional().default(false)
   })
   .refine(
@@ -250,29 +279,82 @@ export const C2DEnvironmentConfigSchema = z
   .refine((data) => data.storageExpiry >= data.maxJobDuration, {
     message: '"storageExpiry" should be greater than "maxJobDuration"'
   })
-  .refine(
-    (data) => {
-      if (!data.resources) return false
-      return data.resources.some((r) => r.id === 'disk' && r.total)
-    },
-    { message: 'There is no "disk" resource configured. This is mandatory' }
-  )
 
 export const C2DDockerConfigSchema = z.array(
-  z.object({
-    socketPath: z.string().optional(),
-    protocol: z.string().optional(),
-    host: z.string().optional(),
-    port: z.number().optional(),
-    caPath: z.string().optional(),
-    certPath: z.string().optional(),
-    keyPath: z.string().optional(),
-    imageRetentionDays: z.number().int().min(1).optional().default(7),
-    imageCleanupInterval: z.number().int().min(3600).optional().default(86400), // min 1 hour, default 24 hours
-    scanImages: z.boolean().optional().default(false),
-    scanImageDBUpdateInterval: z.number().int().min(3600).optional().default(43200), // default 43200 (12 hours)
-    environments: z.array(C2DEnvironmentConfigSchema).min(1)
-  })
+  z
+    .object({
+      socketPath: z.string().optional(),
+      protocol: z.string().optional(),
+      host: z.string().optional(),
+      port: z.number().optional(),
+      caPath: z.string().optional(),
+      certPath: z.string().optional(),
+      keyPath: z.string().optional(),
+      imageRetentionDays: z.number().int().min(1).optional().default(7),
+      imageCleanupInterval: z.number().int().min(3600).optional().default(86400),
+      paymentClaimInterval: z.number().int().min(60).optional().default(3600),
+      scanImages: z.boolean().optional().default(false),
+      scanImageDBUpdateInterval: z.number().int().min(3600).optional().default(43200),
+      resources: z.array(ComputeResourceSchema).optional(),
+      environments: z.array(C2DEnvironmentConfigSchema).min(1)
+    })
+    .superRefine((dockerConfig, ctx) => {
+      // Reject old format: env-level resources with init/driverVersion/platform indicate full ComputeResource objects
+      // that should have been moved to connection-level resources.
+      dockerConfig.environments.forEach((env, envIdx) => {
+        ;(env.resources || []).forEach((ref, i) => {
+          // passthrough() keeps unknown keys, so legacy hardware fields survive parsing
+          const { init, driverVersion, platform } = ref as any
+          const isLegacy = [init, driverVersion, platform].some((v) => v !== undefined)
+          if (isLegacy) {
+            ctx.addIssue({
+              code: z.ZodIssueCode.custom,
+              message: `environments[${envIdx}].resources[${i}]: hardware fields (init, driverVersion, platform, etc.) must be defined at connection level in "resources", not inside an environment. See migration guide.`,
+              path: ['environments', envIdx, 'resources', i]
+            })
+          }
+        })
+      })
+
+      // Validate env resource refs point to known pool ids.
+      // cpu, ram, disk are always valid (auto-detected from host).
+      const autoDetected = new Set(['cpu', 'ram', 'disk'])
+      const poolIds = new Set([
+        ...autoDetected,
+        ...(dockerConfig.resources ?? []).map((r) => r.id)
+      ])
+      dockerConfig.environments.forEach((env, envIdx) => {
+        ;(env.resources || []).forEach((ref, i) => {
+          if (!poolIds.has(ref.id)) {
+            ctx.addIssue({
+              code: z.ZodIssueCode.custom,
+              message: `environments[${envIdx}].resources[${i}].id "${ref.id}" not found in connection-level resources`,
+              path: ['environments', envIdx, 'resources', i, 'id']
+            })
+          }
+        })
+      })
+
+      // Reject shareable:true on gpu/fpga type resources — these require exclusive access.
+      ;(dockerConfig.resources ?? []).forEach((res, i) => {
+        if (res.shareable === true && (res.type === 'gpu' || res.type === 'fpga')) {
+          ctx.addIssue({
+            code: z.ZodIssueCode.custom,
+            message: `Resource "${res.id}": shareable:true is not allowed for type "${res.type}" — GPUs and FPGAs require exclusive access per job`,
+            path: ['resources', i]
+          })
+        }
+      })
+
+      // Warn (not error) if shareable:true on a fungible resource — it has no effect.
+      ;(dockerConfig.resources ?? []).forEach((res) => {
+        if (res.shareable === true && res.kind === 'fungible') {
+          CONFIG_LOGGER.warn(
+            `Resource "${res.id}": shareable:true has no effect on fungible resources`
+          )
+        }
+      })
+    })
 )
 
 export const C2DClusterInfoSchema = z.object({