beehive-lab · orionpapadakis · May 26, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml
@@ -385,6 +385,107 @@ jobs:
             flags="" \
             prompt="Say hello"
 
+      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Prefill-Decode
+        env:
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-prefill-decode.json
+        run: |
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --${{ matrix.backend.name }} \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-prefill-decode.meta.json" \
+            backend="${{ matrix.backend.name }}" \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=prefill-decode \
+            "flags=--with-prefill-decode" \
+            prompt="Say hello"
+
+      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Batch-Prefill-Decode
+        env:
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-batch-prefill-decode.json
+        run: |
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --${{ matrix.backend.name }} \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode --batch-prefill-size 32
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-batch-prefill-decode.meta.json" \
+            backend="${{ matrix.backend.name }}" \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=batch-prefill-decode \
+            "flags=--with-prefill-decode --batch-prefill-size 32" \
+            prompt="Say hello"
+
+      # ── PTX-only: CUDA-graph variants ────────────────────────────────────────
+      - name: PTX - Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        env:
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-q8-prefill-decode-cuda-graphs.json
+        run: |
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --ptx \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode \
+            --cuda-graphs
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-ptx-llama-1b-q8-prefill-decode-cuda-graphs.meta.json" \
+            backend=ptx \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=prefill-decode-cuda-graphs \
+            "flags=--with-prefill-decode --cuda-graphs" \
+            prompt="Say hello"
+
+      - name: PTX - Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Batch-Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        env:
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-q8-batch-prefill-decode-cuda-graphs.json
+        run: |
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --ptx \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode --batch-prefill-size 32 \
+            --cuda-graphs
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-ptx-llama-1b-q8-batch-prefill-decode-cuda-graphs.meta.json" \
+            backend=ptx \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=batch-prefill-decode-cuda-graphs \
+            "flags=--with-prefill-decode --batch-prefill-size 32 --cuda-graphs" \
+            prompt="Say hello"
+
       - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
         env:
           JAVA_TOOL_OPTIONS: >-

diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceCore.java b/src/main/java/org/beehive/gpullama3/inference/InferenceCore.java
@@ -814,7 +814,7 @@ public static FloatArray forwardTornadoVM(Model model, State state, int token, i
             default -> throw new IllegalArgumentException("Unsupported weight type: " + weights.getWeightType());
         }
 
-        return tornadoVMMasterPlan.tornadoVMForwardExecuteLayered(position);
+        return tornadoVMMasterPlan.tornadoVMExecuteForward(position);
     }
 
 }
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceCoreBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceCoreBatchPrefillDecode.java
@@ -7,7 +7,7 @@
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.tensor.standard.ArrayFloatTensor;
 import org.beehive.gpullama3.tensor.standard.FloatTensor;
-import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithBatchPrefillDecode;
+import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanBatchPrefillDecode;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
 
 /**
@@ -21,9 +21,9 @@
  *       prompt tokens in one pass using batch matmul, avoiding redundant weight
  *       traversals. Only the KV cache is populated; logits are intentionally omitted.</li>
  *   <li>{@link #batchForwardTornadoVMPrefill} — GPU batch prefill: delegates the chunk
- *       to {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardBatchPrefill}.</li>
+ *       to {@link TornadoVMMasterPlanBatchPrefillDecode#tornadoVMForwardBatchPrefill}.</li>
  *   <li>{@link #forwardTornadoVMDecode} — GPU decode: delegates a single decode step to
- *       {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardDecode}, which
+ *       {@link TornadoVMMasterPlanBatchPrefillDecode#tornadoVMForwardDecode}, which
  *       handles the embedding copy and runs the full decode + logits graphs.</li>
  * </ul>
  */
@@ -165,7 +165,7 @@ public static void batchForwardJavaPrefill(Model model, State state, int[] token
      * GPU batched prefill forward pass (Phase 4).
      *
      * <p>Delegates the full chunk to
-     * {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardBatchPrefill},
+     * {@link TornadoVMMasterPlanBatchPrefillDecode#tornadoVMForwardBatchPrefill},
      * which handles embedding lookup and GPU execution internally.</p>
      *
      * @param model     the LLaMA model
@@ -175,15 +175,15 @@ public static void batchForwardJavaPrefill(Model model, State state, int[] token
      * @param plan      the batched prefill/decode GPU plan
      */
     public static void batchForwardTornadoVMPrefill(Model model, int[] tokens, int startPos, int chunkSize,
-            TornadoVMMasterPlanWithBatchPrefillDecode plan) {
+            TornadoVMMasterPlanBatchPrefillDecode plan) {
         plan.tornadoVMForwardBatchPrefill(tokens, startPos, model, chunkSize);
     }
 
     /**
      * GPU decode forward pass (Phase 4).
      *
      * <p>Delegates a single-token decode step to
-     * {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardDecode},
+     * {@link TornadoVMMasterPlanBatchPrefillDecode#tornadoVMForwardDecode},
      * which copies the token embedding and runs the decode + logits graphs.</p>
      *
      * @param model    the LLaMA model
@@ -193,7 +193,7 @@ public static void batchForwardTornadoVMPrefill(Model model, int[] tokens, int s
      * @return logits array for token sampling
      */
     public static FloatArray forwardTornadoVMDecode(Model model, int token, int position,
-            TornadoVMMasterPlanWithBatchPrefillDecode plan) {
+            TornadoVMMasterPlanBatchPrefillDecode plan) {
         return plan.tornadoVMForwardDecode(token, position, model);
     }
 }
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceCoreWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceCoreWithPrefillDecode.java
@@ -7,7 +7,7 @@
 import org.beehive.gpullama3.model.Configuration;
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.tensor.standard.FloatTensor;
-import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithPrefillDecode;
+import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanPrefillDecode;
 
 import java.lang.foreign.MemorySegment;
 
@@ -131,7 +131,7 @@ public static void forwardJavaPrefill(Model model, State state, int token, int p
      *
      * <p>Copies the token embedding into {@code state.embeddingX} (same as
      * {@link InferenceCore#forwardTornadoVM}) then delegates to
-     * {@link TornadoVMMasterPlanWithPrefillDecode#tornadoVMForwardPrefill},
+     * {@link TornadoVMMasterPlanPrefillDecode#tornadoVMForwardPrefill},
      * which executes preprocessing + layer graphs but skips the logits graph.</p>
      *
-     * @param model       the LLaMA model (must carry {@link TornadoWeights}, FP16 only)
+     * @param model       the LLaMA model (must carry {@link TornadoWeights}; FP16 or Q8_0)
@@ -142,7 +142,7 @@ public static void forwardJavaPrefill(Model model, State state, int token, int p
-     * @throws UnsupportedOperationException if the model uses Q8_0 weights
+     * @throws IllegalArgumentException if the model's weight type is not supported
      */
     public static void forwardTornadoVMPrefill(Model model, State state, int token, int position,
-            TornadoVMMasterPlanWithPrefillDecode prefillPlan) {
+            TornadoVMMasterPlanPrefillDecode prefillPlan) {
         final Configuration configuration = model.configuration();
         final TornadoWeights weights = (TornadoWeights) model.weights();
 
@@ -153,9 +153,13 @@ public static void forwardTornadoVMPrefill(Model model, State state, int token,
                 MemorySegment.copy(tokenEmbeddings, (long) token * configuration.dim() * bytes,
                         state.embeddingX.getSegment(), 0, (long) configuration.dim() * bytes);
             }
-            case Q8_0 -> throw new UnsupportedOperationException(
-                    // TODO Phase 4: implement Q8_0 GPU batched prefill kernels
-                    "GPU prefill/decode path not yet implemented for Q8_0 weights");
+            case Q8_0 -> {
+                MemorySegment tokenEmbeddings = weights.getTokenEmbeddingTable().asByteArray().getSegment();
+                int blocksPerToken = (configuration.dim() + 31) / 32;
+                long bytesPerToken = (long) blocksPerToken * 34;
+                MemorySegment.copy(tokenEmbeddings, (long) token * bytesPerToken,
+                        state.embeddingX.getSegment(), 0, bytesPerToken);
+            }
             default -> throw new IllegalArgumentException("Unsupported weight type: " + weights.getWeightType());
         }
 

diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java
@@ -7,7 +7,7 @@
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.tokenizer.Tokenizer;
 import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;
-import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithBatchPrefillDecode;
+import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanBatchPrefillDecode;
 
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -163,8 +163,8 @@ public static List<Integer> generateTokensGPULlama(
                 ? config.contextLength() : maxTokens;
         final int batchSize = TornadoVMMasterPlan.PREFILL_BATCH_SIZE;
 
-        TornadoVMMasterPlanWithBatchPrefillDecode plan =
-                (TornadoVMMasterPlanWithBatchPrefillDecode) tornadoVMPlan;
+        TornadoVMMasterPlanBatchPrefillDecode plan =
+                (TornadoVMMasterPlanBatchPrefillDecode) tornadoVMPlan;
 
         List<Integer> generatedTokens = new ArrayList<>();
 

diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java
@@ -7,7 +7,7 @@
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.tokenizer.Tokenizer;
 import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;
-import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithPrefillDecode;
+import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanPrefillDecode;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -135,8 +135,8 @@ public static List<Integer> generateTokensGPULlama(
         int actualMaxTokens = (maxTokens < 0 || config.contextLength() < maxTokens)
                 ? config.contextLength() : maxTokens;
 
-        TornadoVMMasterPlanWithPrefillDecode prefillPlan =
-                (TornadoVMMasterPlanWithPrefillDecode) tornadoVMPlan;
+        TornadoVMMasterPlanPrefillDecode prefillPlan =
+                (TornadoVMMasterPlanPrefillDecode) tornadoVMPlan;
 
         List<Integer> generatedTokens = new ArrayList<>();
 

diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java
@@ -10,20 +10,20 @@
  *
  * <p>Three concrete implementations exist:</p>
  * <ul>
- *   <li>{@link TornadoVMMasterPlanStandard} — baseline single-token forward pass
+ *   <li>{@link TornadoVMMasterPlanSingleToken} — baseline single-token forward pass
  *       (preprocessing + N layers + logits).</li>
- *   <li>{@link TornadoVMMasterPlanWithPrefillDecode} — sequential prefill/decode separation;
+ *   <li>{@link TornadoVMMasterPlanPrefillDecode} — sequential prefill/decode separation;
  *       reuses the same N layer graphs for both phases, skipping logits during prefill.</li>
- *   <li>{@link TornadoVMMasterPlanWithBatchPrefillDecode} — batched prefill + single-token
+ *   <li>{@link TornadoVMMasterPlanBatchPrefillDecode} — batched prefill + single-token
  *       decode; holds 2N+3 graphs in one plan to keep the KV cache on device across phases.</li>
  * </ul>
  *
  * <p>The {@link #initializeTornadoVMPlan} factory selects the implementation based on
  * {@code llama.withPrefillDecode} and {@code llama.prefillBatchSize}:</p>
  * <ul>
- *   <li>{@code withPrefillDecode=false} → {@link TornadoVMMasterPlanStandard}</li>
- *   <li>{@code withPrefillDecode=true}, {@code prefillBatchSize=1} → {@link TornadoVMMasterPlanWithPrefillDecode}</li>
- *   <li>{@code withPrefillDecode=true}, {@code prefillBatchSize>1} → {@link TornadoVMMasterPlanWithBatchPrefillDecode}</li>
+ *   <li>{@code withPrefillDecode=false} → {@link TornadoVMMasterPlanSingleToken}</li>
+ *   <li>{@code withPrefillDecode=true}, {@code prefillBatchSize=1} → {@link TornadoVMMasterPlanPrefillDecode}</li>
+ *   <li>{@code withPrefillDecode=true}, {@code prefillBatchSize>1} → {@link TornadoVMMasterPlanBatchPrefillDecode}</li>
  * </ul>
  */
 public interface TornadoVMMasterPlan {
@@ -43,8 +43,8 @@ public interface TornadoVMMasterPlan {
      * Factory: creates, JIT-compiles, and warms up the appropriate TornadoVMMasterPlan.
      *
      * <p>When {@code llama.withPrefillDecode=true} and {@code llama.prefillBatchSize > 1},
-     * a {@link TornadoVMMasterPlanWithBatchPrefillDecode} is returned.
-     * Otherwise a {@link TornadoVMMasterPlanStandard} is returned (used for the baseline
-     * path and the sequential prefill/decode path when batch size is 1).</p>
+     * a {@link TornadoVMMasterPlanBatchPrefillDecode} is returned. With prefill/decode enabled but
+     * no batching, a {@link TornadoVMMasterPlanPrefillDecode} is returned. Otherwise a
+     * {@link TornadoVMMasterPlanSingleToken} is returned (the baseline path).</p>
      *
      * @param state the model state
@@ -56,13 +56,13 @@ static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) {
 
         if (WITH_PREFILL_DECODE && PREFILL_BATCH_SIZE > 1) {
             // GPU path with batched prefill/decode
-            plan = new TornadoVMMasterPlanWithBatchPrefillDecode(state, model);
+            plan = new TornadoVMMasterPlanBatchPrefillDecode(state, model);
         } else if (WITH_PREFILL_DECODE) {
             // GPU path with simple prefill/decode
-            plan = new TornadoVMMasterPlanWithPrefillDecode(state, model);
+            plan = new TornadoVMMasterPlanPrefillDecode(state, model);
         } else {
             // GPU path with no prefill/decode
-            plan = new TornadoVMMasterPlanStandard(state, model);
+            plan = new TornadoVMMasterPlanSingleToken(state, model);
         }
         model.setTornadoVMPlan(plan);
         return plan;
@@ -76,7 +76,7 @@ static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) {
 
     void forceCopyInReadOnlyData();
 
-    FloatArray tornadoVMForwardExecuteLayered(int position);
+    FloatArray tornadoVMExecuteForward(int position);
 
     /** Releases all device memory held by this plan. */
     void freeTornadoExecutionPlan();