75 changes: 75 additions & 0 deletions .github/workflows/metal_workflow.yml
@@ -0,0 +1,75 @@
name: Metal MLX Job
on:
workflow_dispatch:
inputs:
run_id:
description: 'Unique identifier for this run'
required: true
type: string
payload:
description: 'Content of the user submission, as json string'
required: true
type: string
runner:
description: 'Metal runner to run workflow on'
required: true
default: "arc-metal-runner-set"
type: string
requirements:
description: 'Contents for a requirements.txt file'
required: false
type: string

run-name: 'Metal Job - ${{ github.event.inputs.run_id }}'

jobs:
run:
runs-on: ${{ github.event.inputs.runner }}
timeout-minutes: 20
steps:
- uses: actions/checkout@v3

- name: Create input files
shell: bash
run: |
# Extract the payload content without printing it
PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)

# Apply mask to the extracted content
echo "::add-mask::$PAYLOAD"

# Now write to file (won't be logged since it's masked)
echo "$PAYLOAD" > payload.json

- name: Setup Virtual Environment and Install Dependencies
shell: bash
run: |
pip install --break-system-packages --upgrade pip
pip install --break-system-packages -e .

- name: Install requirements
if: ${{ github.event.inputs.requirements != '' }}
shell: bash
run: |
echo "${{ github.event.inputs.requirements }}" > requirements.txt
pip install --break-system-packages -r requirements.txt

- name: Run script
shell: bash
run: |
python3 src/runners/github-runner.py

- name: Upload training artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: run-result
path: result.json

- name: Upload profiling artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: profile-data
path: profile_data/*
retention-days: 1
9 changes: 9 additions & 0 deletions examples/mlx.yaml
@@ -0,0 +1,9 @@
name: MLX Problem Set
deadline: "2026-05-01 03:59"
description: "Test MLX"
problems:
- directory: mlx/example
name: example_mlx
deadline: "2026-05-01 03:59"
gpus:
- M4_Max
133 changes: 133 additions & 0 deletions examples/mlx/example/eval.py
@@ -0,0 +1,133 @@
import math
import os
import re
import sys
import time
from pathlib import Path

import mlx.core as mx

from reference import check_implementation, generate_input
from submission import custom_kernel

WARMUP_ITERS = 10
BENCH_ITERS = 100


class PopcornOutput:
def __init__(self, fd: int):
self.file = os.fdopen(fd, "w")

def __enter__(self):
return self

def __exit__(self, exc_type, exc_val, exc_tb):
self.file.close()

def log(self, key, value):
print(f"{key}: {value}", file=self.file, flush=True)


def get_test_cases(file_name):
content = Path(file_name).read_text()
tests = []
pattern = r"\s*([a-zA-Z_]+):\s*([a-zA-Z_]+|[+-]?[0-9]+)\s*"
for line in content.splitlines():
if not line.strip():
continue
case = {}
for part in line.split(";"):
m = re.fullmatch(pattern, part)
if not m:
print(f"invalid test case: '{line}'", file=sys.stderr)
sys.exit(113)
key, val = m[1], m[2]
try:
val = int(val)
except ValueError:
pass
case[key] = val
tests.append(case)
return tests


def run_testing(logger, tests):
passed = True
logger.log("test-count", len(tests))
for idx, test in enumerate(tests):
logger.log(f"test.{idx}.spec", test)
data = generate_input(**test)
output = custom_kernel(data)
mx.eval(output)
error = check_implementation(data, output)
if error:
logger.log(f"test.{idx}.status", "fail")
logger.log(f"test.{idx}.error", error)
passed = False
else:
logger.log(f"test.{idx}.status", "pass")
logger.log("check", "pass" if passed else "fail")
return 0 if passed else 112


def run_benchmarking(logger, tests):
# warmup
data = generate_input(**tests[0])
for _ in range(WARMUP_ITERS):
mx.eval(custom_kernel(data))

passed = True
logger.log("benchmark-count", len(tests))
for idx, test in enumerate(tests):
logger.log(f"benchmark.{idx}.spec", test)
data = generate_input(**test)
mx.eval(data)

output = custom_kernel(data)
mx.eval(output)
error = check_implementation(data, output)
if error:
logger.log(f"benchmark.{idx}.status", "fail")
logger.log(f"benchmark.{idx}.error", error)
passed = False
continue

durations = []
for i in range(BENCH_ITERS):
start = time.perf_counter_ns()
mx.eval(custom_kernel(data))
durations.append(time.perf_counter_ns() - start)
if i > 1:
avg = sum(durations) / len(durations)
std = math.sqrt(sum((d - avg) ** 2 for d in durations) / (len(durations) - 1))
if std / math.sqrt(len(durations)) / avg < 0.01:
break

avg = sum(durations) / len(durations)
logger.log(f"benchmark.{idx}.runs", len(durations))
logger.log(f"benchmark.{idx}.mean", avg)

logger.log("check", "pass" if passed else "fail")
return 0 if passed else 112


def main():
fd = os.getenv("POPCORN_FD")
if not fd:
return 111
if len(sys.argv) < 3:
return 2

mode = sys.argv[1]
tests = get_test_cases(sys.argv[2])

with PopcornOutput(int(fd)) as logger:
if mode == "test":
return run_testing(logger, tests)
if mode in ("benchmark", "leaderboard"):
return run_benchmarking(logger, tests)
return 2


if __name__ == "__main__":
raise SystemExit(main())
29 changes: 29 additions & 0 deletions examples/mlx/example/reference.py
@@ -0,0 +1,29 @@
import mlx.core as mx


ATOL = 1e-3
RTOL = 1e-3


def generate_input(size, seed=42):
mx.random.seed(seed)
A = mx.random.normal(shape=(size, size)).astype(mx.float16)
B = mx.random.normal(shape=(size, size)).astype(mx.float16)
mx.eval(A, B)
return A, B


def reference_kernel(data):
A, B = data
return A + B


def check_implementation(data, output):
expected = reference_kernel(data)
mx.eval(expected)
if output.shape != expected.shape:
return f"shape mismatch: expected {expected.shape}, got {output.shape}"
if not mx.allclose(output, expected, atol=ATOL, rtol=RTOL).item():
max_diff = mx.max(mx.abs(output - expected)).item()
return f"mismatch found! max diff: {max_diff}"
return ""
6 changes: 6 additions & 0 deletions examples/mlx/example/submission.py
@@ -0,0 +1,6 @@
import mlx.core as mx


def custom_kernel(data):
A, B = data
return A + B
32 changes: 32 additions & 0 deletions examples/mlx/example/task.yml
@@ -0,0 +1,32 @@
files:
- {"name": "submission.py", "source": "@SUBMISSION@"}
- {"name": "reference.py", "source": "reference.py"}
- {"name": "eval.py", "source": "eval.py"}

lang: "py"

description: |
Implement a float16 vector addition kernel using MLX.

Input: tuple(mx.array, mx.array) with arrays of shape (N, N) and type mx.float16.
Output: mx.array of shape (N, N) and type mx.float16

config:
main: "eval.py"

tests:
- {"size": 128, "seed": 5236}
- {"size": 256, "seed": 5531}
- {"size": 512, "seed": 9173}

benchmarks:
- {"size": 1024, "seed": 31232}
- {"size": 4096, "seed": 2146}
- {"size": 16384, "seed": 54352}

test_timeout: 180
benchmark_timeout: 180
ranked_timeout: 180

gpus:
- M4_Max
73 changes: 73 additions & 0 deletions instructions.txt
@@ -0,0 +1,73 @@
## Changes Summary

### New files
- src/libkernelbot/launchers/local.py — LocalLauncher that runs submissions directly on the host machine via run_config(). Blocks CUDA submissions.
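The launcher itself is not shown in this diff view; the following is a minimal sketch of the shape it might take, assuming a `run_submission`-style entry point and that `run_config()` executes the prepared eval harness. The actual base class, method names, import path, and config fields in libkernelbot may differ.

```python
# Hypothetical sketch only — the real LocalLauncher lives in
# src/libkernelbot/launchers/local.py and its interface may differ.
from libkernelbot.run_eval import run_config  # assumed import path


class LocalLauncher:
    """Runs submissions directly on the host machine instead of
    dispatching them to a remote runner."""

    name = "Local"

    def run_submission(self, config: dict):
        # CUDA is not available on an Apple-silicon host, so reject it early.
        # The exact language tag ("cu") is an assumption for illustration.
        if config.get("lang") == "cu":
            raise ValueError("CUDA submissions are not supported by the local launcher")
        # Everything else is handed straight to the shared eval harness.
        return run_config(config)
```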

### Modified files — Adding Metal/MLX support

1. src/libkernelbot/consts.py (a sketch of these additions follows this list)
- Added MetalGPU enum (M4_Max)
- Registered it in _GPU_LOOKUP under "Local" runner
- Added M4_Max: None to GPU_TO_SM

2. src/libkernelbot/launchers/__init__.py — Exports LocalLauncher

3. src/kernelbot/main.py — Registers LocalLauncher() in create_backend()

4. src/kernelbot/cogs/admin_cog.py — Added MetalGPU to Discord GPU dropdowns
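As a rough illustration of the consts.py wiring from item 1, here is a sketch assuming MetalGPU is a plain Enum and that _GPU_LOOKUP maps runner names to GPU lists; the existing entries and exact types come from libkernelbot and may differ.

```python
# Sketch of the consts.py additions; existing CUDA/ROCm entries are elided.
from enum import Enum


class MetalGPU(Enum):
    M4_Max = "M4_Max"


_GPU_LOOKUP = {
    # ... existing runner -> GPU mappings ...
    "Local": list(MetalGPU),
}

GPU_TO_SM = {
    # ... existing CUDA entries ...
    MetalGPU.M4_Max: None,  # Metal GPUs have no CUDA SM version
}
```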

### Modified files — Bug fixes for macOS compatibility

5. src/libkernelbot/run_eval.py — Three fixes in make_system_info() (sketched after this list):
- Added MPS/Metal detection via torch.backends.mps
- Catch FileNotFoundError for nvidia-smi/rocm-smi (don't exist on macOS)
- Catch FileNotFoundError for /proc/cpuinfo (doesn't exist on macOS)

6. src/kernelbot/api/main.py — Replace / with _ in auto-derived dev leaderboard names so nested directories don't break API routing
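The run_eval.py fixes from item 5 boil down to three small guards; a hedged sketch is below. The real make_system_info() gathers more fields, and the dictionary keys here are placeholders. The api/main.py fix in item 6 is presumably a simple replace of "/" with "_" on the auto-derived leaderboard name.

```python
# Sketch of the macOS-compatibility guards in make_system_info();
# only the three guards mirror the actual fixes, field names are illustrative.
import subprocess

import torch


def make_system_info() -> dict:
    info = {}

    # Fix 1: detect Apple-silicon GPUs through the MPS backend.
    if torch.backends.mps.is_available():
        info["gpu"] = "Apple Metal (MPS)"

    # Fix 2: nvidia-smi / rocm-smi do not exist on macOS.
    try:
        info["nvidia_smi"] = subprocess.run(
            ["nvidia-smi"], capture_output=True, text=True
        ).stdout
    except FileNotFoundError:
        pass

    # Fix 3: /proc/cpuinfo does not exist on macOS.
    try:
        with open("/proc/cpuinfo") as f:
            info["cpu"] = f.read()
    except FileNotFoundError:
        pass

    return info
```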

---

## Manual Test Steps

# 1. Start Postgres (if not already running)
brew services start postgresql@14

# 2. Create DB and run migrations
export DATABASE_URL="postgresql://$(whoami)@localhost:5432/kernelbot"
createdb kernelbot # skip if already exists
cd /path/to/kernelbot
uv run yoyo apply --database "$DATABASE_URL" src/migrations/

# 3. Create test user
psql "$DATABASE_URL" -c "INSERT INTO leaderboard.user_info (id, user_name, cli_id, cli_valid)
VALUES ('999999', 'testuser', 'test-cli-id-123', true)
ON CONFLICT (id) DO UPDATE SET cli_id = 'test-cli-id-123', cli_valid = true;"

# 4. Install mlx
uv pip install mlx

# 5. Start the API server
cd src/kernelbot
export DATABASE_URL="postgresql://$(whoami)@localhost:5432/kernelbot"
export ADMIN_TOKEN="your-admin-token"
export PROBLEM_DEV_DIR="/path/to/kernelbot/examples"
export GITHUB_TOKEN="dummy"
export GITHUB_REPO="dummy/dummy"
export DISABLE_SSL=1
uv run python main.py --api-only

# 6. (In another terminal) Create the dev leaderboard
curl -X POST "http://localhost:8000/admin/leaderboards" \
-H "Authorization: Bearer your-admin-token" \
-H "Content-Type: application/json" \
-d '{"directory": "mlx/example"}'

# 7. Submit a test
curl -X POST "http://localhost:8000/mlx_example-dev/M4_Max/test" \
-H "X-Popcorn-Cli-Id: test-cli-id-123" \
-F "file=@examples/mlx/example/submission.py"

# 8. Submit a benchmark
curl -X POST "http://localhost:8000/mlx_example-dev/M4_Max/benchmark" \
-H "X-Popcorn-Cli-Id: test-cli-id-123" \
-F "file=@examples/mlx/example/submission.py"