diff --git a/.github/actions/run-inference/action.yml b/.github/actions/run-inference/action.yml
new file mode 100644
index 00000000..21cb9fb3
--- /dev/null
+++ b/.github/actions/run-inference/action.yml
@@ -0,0 +1,64 @@
+name: Run Inference
+description: Run one llama-tornado inference pass and write the metrics + sidecar files.
+
+inputs:
+  backend:
+    description: 'GPU backend (opencl or ptx)'
+    required: true
+  model_file:
+    description: 'Model filename inside $MODELS_DIR (e.g. Llama-3.2-1B-Instruct-F16.gguf)'
+    required: true
+  model:
+    description: 'Human-readable model name for the sidecar (e.g. Llama-3.2-1B-Instruct)'
+    required: true
+  quantization:
+    description: 'Quantization type (e.g. F16, Q8_0)'
+    required: true
+  configuration:
+    description: 'Configuration key for the sidecar (e.g. standard, prefill-decode)'
+    required: true
+  flags:
+    description: 'Extra CLI flags passed to llama-tornado (omit for standard run)'
+    required: false
+    default: ''
+  metrics_file:
+    description: 'Absolute path for the output metrics JSON file'
+    required: true
+  prompt:
+    description: 'Prompt to pass to the model'
+    required: false
+    default: 'Say hello'
+
+runs:
+  using: composite
+  steps:
+    - name: Run inference
+      shell: bash
+      working-directory: ${{ github.workspace }}
+      env:
+        JAVA_TOOL_OPTIONS: >-
+          -Dllama.metrics.format=json
+          -Dllama.metrics.output=file
+          -Dllama.metrics.file=${{ inputs.metrics_file }}
+      run: |
+        # Run inference and emit raw metrics JSON via JAVA_TOOL_OPTIONS
+        ./llama-tornado --gpu --${{ inputs.backend }} \
+          --model $MODELS_DIR/${{ inputs.model_file }} \
+          --prompt "${{ inputs.prompt }}" \
+          ${{ inputs.flags }}
+
+        # Write metadata sidecar so process_metrics.py can identify each metrics file.
+        # Resolve the expression into a shell variable first so bash's ${var%.json}
+        # suffix strip works (an expression cannot appear inside parameter expansion).
+        METRICS_FILE="${{ inputs.metrics_file }}"
+        SIDECAR="${METRICS_FILE%.json}.meta.json"
+        python3 scripts/write_metrics_sidecar.py \
+          --out "$SIDECAR" \
+          backend="${{ inputs.backend }}" \
+          task=llama-inference \
+          model_file=${{ inputs.model_file }} \
+          model=${{ inputs.model }} \
+          quantization=${{ inputs.quantization }} \
+          configuration=${{ inputs.configuration }} \
+          "flags=${{ inputs.flags }}" \
+          prompt="${{ inputs.prompt }}"
diff --git a/.github/actions/setup-tornadovm/action.yml b/.github/actions/setup-tornadovm/action.yml
new file mode 100644
index 00000000..a5512658
--- /dev/null
+++ b/.github/actions/setup-tornadovm/action.yml
@@ -0,0 +1,93 @@
+name: Setup TornadoVM
+description: Clone (or restore from cache), build, and configure TornadoVM. Exports TORNADOVM_HOME and updates PATH for all subsequent steps.
+
+inputs:
+  backend:
+    description: 'TornadoVM backend to build (opencl or ptx)'
+    required: true
+
+runs:
+  using: composite
+  steps:
+    - name: Get TornadoVM HEAD SHA
+      id: tornado_sha
+      shell: bash
+      run: |
+        SHA=$(git ls-remote https://github.com/beehive-lab/TornadoVM HEAD | cut -f1)
+        echo "sha=$SHA" >> $GITHUB_OUTPUT
+
+    # actions/cache `path:` is an expression — it cannot access the calling
+    # workflow's env vars via ${{ env.* }}, so we resolve TORNADO_ROOT here.
+    - name: Resolve TORNADO_ROOT for cache path
+      id: paths
+      shell: bash
+      run: echo "tornado_root=$TORNADO_ROOT" >> $GITHUB_OUTPUT
+
+    - name: Restore TornadoVM cache
+      id: cache
+      uses: actions/cache@v4
+      with:
+        path: ${{ steps.paths.outputs.tornado_root }}
+        key: tornadovm-${{ inputs.backend }}-${{ steps.tornado_sha.outputs.sha }}
+
+    - name: Clone TornadoVM master
+      if: steps.cache.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        git clone --depth 1 --branch master \
+          https://github.com/beehive-lab/TornadoVM.git \
+          $TORNADO_ROOT
+
+    - name: Set up Python venv for TornadoVM
+      if: steps.cache.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        python3 -m venv $TORNADO_ROOT/venv
+        source $TORNADO_ROOT/venv/bin/activate
+        python --version
+
+    - name: Build TornadoVM
+      if: steps.cache.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        cd $TORNADO_ROOT
+        mkdir -p graalJars && cp $GRAAL_JARS/* graalJars/
+        source venv/bin/activate
+        echo "=== Building TornadoVM ==="
+        make BACKEND=${{ inputs.backend }}
+
+        echo "=== Verifying TornadoVM SDK directory ==="
+        SDK_DIR=$(find dist -type d -maxdepth 3 -path "*/tornadovm-*-${{ inputs.backend }}" | head -n 1)
+        if [ -z "$SDK_DIR" ]; then
+          echo "::error::Could not locate TornadoVM SDK directory!"
+          find dist -maxdepth 5 -type d
+          exit 1
+        fi
+
+    # Runs on both cache hit and miss — sets TORNADOVM_HOME and PATH for all
+    # subsequent steps in the calling workflow.
+    - name: Configure TornadoVM environment
+      shell: bash
+      run: |
+        cd $TORNADO_ROOT
+        SDK_DIR=$(find dist -type d -maxdepth 3 -path "*/tornadovm-*-${{ inputs.backend }}" | head -n 1)
+        if [ -z "$SDK_DIR" ]; then
+          echo "::error::Could not locate TornadoVM SDK directory!"
+          find dist -maxdepth 5 -type d
+          exit 1
+        fi
+        FULL_SDK="${PWD}/${SDK_DIR}"
+        echo "Detected TornadoVM SDK: $FULL_SDK"
+
+        # Export for current shell session
+        export TORNADOVM_HOME="$FULL_SDK"
+        export PATH="$FULL_SDK/bin:$JAVA_HOME/bin:$PATH"
+
+        # Save for subsequent steps
+        echo "TORNADOVM_HOME=$FULL_SDK" >> $GITHUB_ENV
+        echo "$FULL_SDK/bin" >> $GITHUB_PATH
+        echo "$JAVA_HOME/bin" >> $GITHUB_PATH
+
+        echo "=== Checking tornado CLI ==="
+        which tornado || { echo "::error::tornado not in PATH"; exit 1; }
+        tornado --devices
diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml
index a17c9228..982c4cfc 100644
--- a/.github/workflows/build-and-run.yml
+++ b/.github/workflows/build-and-run.yml
@@ -48,480 +48,206 @@ jobs:
       - name: Checkout GPULlama3
         uses: actions/checkout@v4
 
-      - name: Clone TornadoVM master
-        run: |
-          git clone --depth 1 --branch master \
-            https://github.com/beehive-lab/TornadoVM.git \
-            $TORNADO_ROOT
-
-      - name: Set up Python venv for TornadoVM
-        run: |
-          python3 -m venv $TORNADO_ROOT/venv
-          source $TORNADO_ROOT/venv/bin/activate
-          python --version
-
-      - name: Build TornadoVM
-        run: |
-          cd $TORNADO_ROOT
-          mkdir -p graalJars && cp $GRAAL_JARS/* graalJars/
-          source venv/bin/activate
-          echo "=== Building TornadoVM ==="
-
-          make BACKEND=${{ matrix.backend.name }}
-
-          echo "=== Searching for TornadoVM SDK directory ==="
-          SDK_DIR=$(find dist -type d -maxdepth 3 -path "*/tornadovm-*-${{ matrix.backend.name }}" | head -n 1)
-          if [ -z "$SDK_DIR" ]; then
-            echo "::error::Could not locate TornadoVM SDK directory!"
-            find dist -maxdepth 5 -type d
-            exit 1
-          fi
-          FULL_SDK="${PWD}/${SDK_DIR}"
-          echo "Detected TornadoVM SDK: $FULL_SDK"
-
-          # Export for current shell session
-          export TORNADOVM_HOME="$FULL_SDK"
-          export PATH="$FULL_SDK/bin:$JAVA_HOME/bin:$PATH"
-
-          # Save for subsequent steps
-          echo "TORNADOVM_HOME=$FULL_SDK" >> $GITHUB_ENV
-          echo "PATH=$PATH" >> $GITHUB_ENV
-
-          echo "=== Checking tornado CLI ==="
-          which tornado || { echo "::error::tornado not in PATH"; exit 1; }
-          tornado --devices
+      - name: Setup TornadoVM
+        uses: ./.github/actions/setup-tornadovm
+        with:
+          backend: ${{ matrix.backend.name }}
 
       - name: Build GPULlama3.java
         run: |
           cd ${{ github.workspace }}
           echo "Using TORNADOVM_HOME=$TORNADOVM_HOME"
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
           tornado --version
           ./mvnw clean package -DskipTests
 
+      # ── Llama-3.2-1B: standard + prefill-decode variants, all backends ──────────
       - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Standard
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Llama-3.2-1B-Instruct-F16.gguf \
-            model=Llama-3.2-1B-Instruct \
-            quantization=F16 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Llama-3.2-1B-Instruct-F16.gguf
+          model: Llama-3.2-1B-Instruct
+          quantization: F16
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.json
 
       - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
-            --prompt "Say hello" \
-            --with-prefill-decode
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Llama-3.2-1B-Instruct-F16.gguf \
-            model=Llama-3.2-1B-Instruct \
-            quantization=F16 \
-            configuration=prefill-decode \
-            "flags=--with-prefill-decode" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Llama-3.2-1B-Instruct-F16.gguf
+          model: Llama-3.2-1B-Instruct
+          quantization: F16
+          configuration: prefill-decode
+          flags: --with-prefill-decode
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.json
 
       - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
-            --prompt "Say hello" \
-            --with-prefill-decode --batch-prefill-size 32
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Llama-3.2-1B-Instruct-F16.gguf \
-            model=Llama-3.2-1B-Instruct \
-            quantization=F16 \
-            configuration=batch-prefill-decode \
-            "flags=--with-prefill-decode --batch-prefill-size 32" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Llama-3.2-1B-Instruct-F16.gguf
+          model: Llama-3.2-1B-Instruct
+          quantization: F16
+          configuration: batch-prefill-decode
+          flags: --with-prefill-decode --batch-prefill-size 32
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json
 
       # ── PTX-only: CUDA-graph variants ────────────────────────────────────────
       - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs
         if: matrix.backend.name == 'ptx'
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --ptx \
-            --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
-            --prompt "Say hello" \
-            --with-prefill-decode \
-            --cuda-graphs
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.meta.json" \
-            backend=ptx \
-            task=llama-inference \
-            model_file=Llama-3.2-1B-Instruct-F16.gguf \
-            model=Llama-3.2-1B-Instruct \
-            quantization=F16 \
-            configuration=prefill-decode-cuda-graphs \
-            "flags=--with-prefill-decode --cuda-graphs" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ptx
+          model_file: Llama-3.2-1B-Instruct-F16.gguf
+          model: Llama-3.2-1B-Instruct
+          quantization: F16
+          configuration: prefill-decode-cuda-graphs
+          flags: --with-prefill-decode --cuda-graphs
+          metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.json
 
       - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode-CUDA-Graphs
         if: matrix.backend.name == 'ptx'
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --ptx \
-            --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
-            --prompt "Say hello" \
-            --with-prefill-decode --batch-prefill-size 32 \
-            --cuda-graphs
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.meta.json" \
-            backend=ptx \
-            task=llama-inference \
-            model_file=Llama-3.2-1B-Instruct-F16.gguf \
-            model=Llama-3.2-1B-Instruct \
-            quantization=F16 \
-            configuration=batch-prefill-decode-cuda-graphs \
-            "flags=--with-prefill-decode --batch-prefill-size 32 --cuda-graphs" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ptx
+          model_file: Llama-3.2-1B-Instruct-F16.gguf
+          model: Llama-3.2-1B-Instruct
+          quantization: F16
+          configuration: batch-prefill-decode-cuda-graphs
+          flags: --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
+          metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json
 
       # ── Additional models — standard inference, all backends ─────────────────
       - name: FP16 - Run Qwen3-4B-f16.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Qwen3-4B-f16.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Qwen3-4B-f16.gguf \
-            model=Qwen3-4B \
-            quantization=F16 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-4B-f16.gguf
+          model: Qwen3-4B
+          quantization: F16
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.json
 
       - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.fp16.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Mistral-7B-Instruct-v0.3.fp16.gguf \
-            model=Mistral-7B-Instruct-v0.3 \
-            quantization=F16 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Mistral-7B-Instruct-v0.3.fp16.gguf
+          model: Mistral-7B-Instruct-v0.3
+          quantization: F16
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.json
 
       - name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/qwen2.5-1.5b-instruct-fp16.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=qwen2.5-1.5b-instruct-fp16.gguf \
-            model=Qwen2.5-1.5B-Instruct \
-            quantization=F16 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: qwen2.5-1.5b-instruct-fp16.gguf
+          model: Qwen2.5-1.5B-Instruct
+          quantization: F16
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.json
 
       - name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Phi-3-mini-4k-instruct-fp16.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Phi-3-mini-4k-instruct-fp16.gguf \
-            model=Phi-3-mini-4k-instruct \
-            quantization=F16 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Phi-3-mini-4k-instruct-fp16.gguf
+          model: Phi-3-mini-4k-instruct
+          quantization: F16
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.json
 
       - name: FP16 - Run Granite-3.2-2b-instruct-f16.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/granite-3.2-2b-instruct-f16.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=granite-3.2-2b-instruct-f16.gguf \
-            model=Granite-3.2-2B-Instruct \
-            quantization=F16 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: granite-3.2-2b-instruct-f16.gguf
+          model: Granite-3.2-2B-Instruct
+          quantization: F16
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.json
 
       - name: FP16 - Run Granite-4.0-1b-F16.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/granite-4.0-1b-F16.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=granite-4.0-1b-F16.gguf \
-            model=Granite-4.0-1B \
-            quantization=F16 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: granite-4.0-1b-F16.gguf
+          model: Granite-4.0-1B
+          quantization: F16
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.json
 
       - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
-            model=Llama-3.2-1B-Instruct \
-            quantization=Q8_0 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Llama-3.2-1B-Instruct-Q8_0.gguf
+          model: Llama-3.2-1B-Instruct
+          quantization: Q8_0
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.json
 
       - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Qwen3-0.6B-Q8_0.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Qwen3-0.6B-Q8_0.gguf \
-            model=Qwen3-0.6B \
-            quantization=Q8_0 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Qwen3-0.6B-Q8_0.gguf
+          model: Qwen3-0.6B
+          quantization: Q8_0
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.json
 
       - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Phi-3-mini-4k-instruct-Q8_0.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Phi-3-mini-4k-instruct-Q8_0.gguf \
-            model=Phi-3-mini-4k-instruct \
-            quantization=Q8_0 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Phi-3-mini-4k-instruct-Q8_0.gguf
+          model: Phi-3-mini-4k-instruct
+          quantization: Q8_0
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.json
 
       - name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/qwen2.5-1.5b-instruct-q8_0.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=qwen2.5-1.5b-instruct-q8_0.gguf \
-            model=Qwen2.5-1.5B-Instruct \
-            quantization=Q8_0 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
-
-      - name: Q8 - Mistral-7B-Instruct-v0.3.Q8_0.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.Q8_0.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=Mistral-7B-Instruct-v0.3.Q8_0.gguf \
-            model=Mistral-7B-Instruct-v0.3 \
-            quantization=Q8_0 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: qwen2.5-1.5b-instruct-q8_0.gguf
+          model: Qwen2.5-1.5B-Instruct
+          quantization: Q8_0
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.json
+
+      - name: Q8 - Run Mistral-7B-Instruct-v0.3.Q8_0.gguf
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: Mistral-7B-Instruct-v0.3.Q8_0.gguf
+          model: Mistral-7B-Instruct-v0.3
+          quantization: Q8_0
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.json
 
       - name: Q8 - Run Granite-3.2-2b-instruct-Q8_0.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/granite-3.2-2b-instruct-Q8_0.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=granite-3.2-2b-instruct-Q8_0.gguf \
-            model=Granite-3.2-2B-Instruct \
-            quantization=Q8_0 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: granite-3.2-2b-instruct-Q8_0.gguf
+          model: Granite-3.2-2B-Instruct
+          quantization: Q8_0
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.json
 
       - name: Q8 - Run Granite-4.0-1b-Q8_0.gguf
-        env:
-          JAVA_TOOL_OPTIONS: >-
-            -Dllama.metrics.format=json
-            -Dllama.metrics.output=file
-            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json
-        run: |
-          cd ${{ github.workspace }}
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
-          ./llama-tornado --gpu --${{ matrix.backend.name }} \
-            --model $MODELS_DIR/granite-4.0-1b-Q8_0.gguf \
-            --prompt "Say hello"
-          python3 scripts/write_metrics_sidecar.py \
-            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.meta.json" \
-            backend="${{ matrix.backend.name }}" \
-            task=llama-inference \
-            model_file=granite-4.0-1b-Q8_0.gguf \
-            model=Granite-4.0-1B \
-            quantization=Q8_0 \
-            configuration=standard \
-            flags="" \
-            prompt="Say hello"
+        uses: ./.github/actions/run-inference
+        with:
+          backend: ${{ matrix.backend.name }}
+          model_file: granite-4.0-1b-Q8_0.gguf
+          model: Granite-4.0-1B
+          quantization: Q8_0
+          configuration: standard
+          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json
 
       # ── Upload metrics for the publish job ────────────────────────────────────
       - name: Upload metrics artifacts
diff --git a/.github/workflows/integration-quarkus-langchain4j.yml b/.github/workflows/integration-quarkus-langchain4j.yml
index 8ee6e900..d4a4291e 100644
--- a/.github/workflows/integration-quarkus-langchain4j.yml
+++ b/.github/workflows/integration-quarkus-langchain4j.yml
@@ -43,53 +43,16 @@ jobs:
         uses: actions/checkout@v4
 
       # Step 1: Clone and build TornadoVM
-      - name: Clone TornadoVM master
-        run: |
-          git clone --depth 1 --branch master \
-            https://github.com/beehive-lab/TornadoVM.git \
-            $TORNADO_ROOT
-      - name: Set up Python venv for TornadoVM
-        run: |
-          python3 -m venv $TORNADO_ROOT/venv
-          source $TORNADO_ROOT/venv/bin/activate
-          python --version
-      - name: Build TornadoVM
-        run: |
-          cd $TORNADO_ROOT
-          mkdir -p graalJars && cp $GRAAL_JARS/* graalJars/
-          source venv/bin/activate
-          echo "=== Building TornadoVM ==="
-
-          make BACKEND=${{ matrix.backend.name }}
-
-          echo "=== Searching for TornadoVM SDK directory ==="
-          SDK_DIR=$(find dist -type d -maxdepth 3 -path "*/tornadovm-*-${{ matrix.backend.name }}" | head -n 1)
-          if [ -z "$SDK_DIR" ]; then
-            echo "::error::Could not locate TornadoVM SDK directory!"
-            find dist -maxdepth 5 -type d
-            exit 1
-          fi
-          FULL_SDK="${PWD}/${SDK_DIR}"
-          echo "Detected TornadoVM SDK: $FULL_SDK"
-
-          # Export for current shell session
-          export TORNADOVM_HOME="$FULL_SDK"
-          export PATH="$FULL_SDK/bin:$JAVA_HOME/bin:$PATH"
-
-          # Save for subsequent steps
-          echo "TORNADOVM_HOME=$FULL_SDK" >> $GITHUB_ENV
-          echo "PATH=$PATH" >> $GITHUB_ENV
-
-          echo "=== Checking tornado CLI ==="
-          which tornado || { echo "::error::tornado not in PATH"; exit 1; }
-          tornado --devices
+      - name: Setup TornadoVM
+        uses: ./.github/actions/setup-tornadovm
+        with:
+          backend: ${{ matrix.backend.name }}
 
       # Step 2: Build GPULlama3.java
       - name: Build GPULlama3.java
         run: |
           cd ${{ github.workspace }}
           echo "Using TORNADOVM_HOME=$TORNADOVM_HOME"
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
           tornado --version
 
       # Append SNAPSHOT to GPULlama3 version
@@ -108,13 +71,12 @@
       - name: Clone Quarkus LangChain4j
         run: |
           cd ${{ github.workspace }}
-          git clone https://github.com/quarkiverse/quarkus-langchain4j.git
+          git clone --depth 1 https://github.com/quarkiverse/quarkus-langchain4j.git
 
       # Step 4: Build Quarkus LangChain4j with current GPULlama3.java
       - name: Build Quarkus LangChain4j
         run: |
           cd ${{ github.workspace }}/quarkus-langchain4j
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
 
           # Update the GPULlama3 version used by quarkus-langchain4j
           POM="pom.xml"
@@ -136,7 +98,6 @@
       - name: Start Quarkus Application and Wait for Startup
         run: |
           cd ${{ github.workspace }}/quarkus-langchain4j/integration-tests/gpu-llama3
-          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
 
           echo "Starting Quarkus application on port $QUARKUS_PORT..."
 
@@ -146,7 +107,8 @@
             -Dquarkus.http.port=$QUARKUS_PORT \
             -jar target/quarkus-app/quarkus-run.jar &
           APP_PID=$!
-
+          echo "APP_PID=$APP_PID" >> $GITHUB_ENV
+
           if [ -z "$APP_PID" ]; then
             echo "ERROR: Failed to start Quarkus application"
             exit 1
diff --git a/.github/workflows/rerun-workflow.yml b/.github/workflows/rerun-workflow.yml
index e4b14e75..62eec7d3 100644
--- a/.github/workflows/rerun-workflow.yml
+++ b/.github/workflows/rerun-workflow.yml
@@ -47,8 +47,30 @@
             core.setOutput('is_help', 'false');
           }
 
-      - name: Get PR SHA
+      - name: Check commenter permissions
+        id: check_permission
         if: steps.help.outputs.is_help != 'true'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const { data: perm } = await github.rest.repos.getCollaboratorPermissionLevel({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              username: context.payload.comment.user.login
+            });
+            const authorized = ['write', 'admin'].includes(perm.permission);
+            if (!authorized) {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body: `@${context.payload.comment.user.login} You need write permission to trigger workflow reruns.`
+              });
+            }
+            core.setOutput('authorized', authorized ? 'true' : 'false');
+
+      - name: Get PR SHA
+        if: steps.help.outputs.is_help != 'true' && steps.check_permission.outputs.authorized == 'true'
         id: pr
         uses: actions/github-script@v7
         with:
@@ -64,7 +86,7 @@
           console.log(`PR head ref: ${pr.head.ref}`);
 
       - name: Add reaction to comment
-        if: steps.help.outputs.is_help != 'true'
+        if: steps.help.outputs.is_help != 'true' && steps.check_permission.outputs.authorized == 'true'
         uses: actions/github-script@v7
         with:
           script: |
@@ -76,7 +98,7 @@
           });
 
       - name: Post start comment
-        if: steps.help.outputs.is_help != 'true'
+        if: steps.help.outputs.is_help != 'true' && steps.check_permission.outputs.authorized == 'true'
         uses: actions/github-script@v7
         with:
           script: |
@@ -92,7 +114,7 @@
           });
 
       - name: Rerun failed workflows
-        if: steps.help.outputs.is_help != 'true'
+        if: steps.help.outputs.is_help != 'true' && steps.check_permission.outputs.authorized == 'true'
        uses: actions/github-script@v7
        with:
          script: |
@@ -167,7 +189,7 @@
           console.log(`Reran ${rerunCount} workflow(s)`);
 
       - name: Post completion comment
-        if: always() && steps.help.outputs.is_help != 'true'
+        if: always() && steps.help.outputs.is_help != 'true' && steps.check_permission.outputs.authorized == 'true'
         uses: actions/github-script@v7
         with:
           script: |
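
Note on the sidecar contract the new run-inference action depends on: it calls
scripts/write_metrics_sidecar.py with --out <path> followed by positional
key=value pairs, and process_metrics.py later matches each metrics-*.json with
the *.meta.json written next to it. The script's implementation is not part of
this diff; the following is a minimal sketch of a writer with that assumed
interface, not the repository's actual code:

    #!/usr/bin/env python3
    # Hypothetical sketch of the write_metrics_sidecar.py interface assumed by
    # the composite action: --out <path> followed by key=value metadata pairs.
    import argparse
    import json


    def main() -> None:
        parser = argparse.ArgumentParser(description="Write a metrics metadata sidecar.")
        parser.add_argument("--out", required=True, help="path of the .meta.json file to write")
        parser.add_argument("pairs", nargs="+", metavar="key=value", help="metadata entries")
        args = parser.parse_args()

        meta = {}
        for pair in args.pairs:
            key, sep, value = pair.partition("=")
            if not sep:
                parser.error(f"expected key=value, got {pair!r}")
            meta[key] = value  # empty values (e.g. flags="") are kept on purpose

        with open(args.out, "w", encoding="utf-8") as fh:
            json.dump(meta, fh, indent=2)
            fh.write("\n")


    if __name__ == "__main__":
        main()

Under that assumption, `--out m.meta.json backend=ptx task=llama-inference`
would produce {"backend": "ptx", "task": "llama-inference"}. It also explains
why the workflow quotes the whole "flags=${{ inputs.flags }}" argument: the
value may contain spaces but must reach the script as a single argv entry.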