Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions .github/workflows/build-and-run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,107 @@ jobs:
flags="" \
prompt="Say hello"

# Runs llama-tornado on the matrix backend with the Q8_0 Llama-3.2-1B model and
# the --with-prefill-decode flag. JAVA_TOOL_OPTIONS sets the llama.metrics.*
# system properties (format=json, output=file, and a per-backend file path under
# runner.temp). Afterwards scripts/write_metrics_sidecar.py is invoked with
# --out pointing at the matching .meta.json and key=value descriptors of the run.
- name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Prefill-Decode
env:
JAVA_TOOL_OPTIONS: >-
-Dllama.metrics.format=json
-Dllama.metrics.output=file
-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-prefill-decode.json
run: |
cd ${{ github.workspace }}
export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
./llama-tornado --gpu --${{ matrix.backend.name }} \
--model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
--prompt "Say hello" \
--with-prefill-decode
python3 scripts/write_metrics_sidecar.py \
--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-prefill-decode.meta.json" \
backend="${{ matrix.backend.name }}" \
task=llama-inference \
model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
model=Llama-3.2-1B-Instruct \
quantization=Q8_0 \
configuration=prefill-decode \
"flags=--with-prefill-decode" \
prompt="Say hello"

# Same Q8_0 Llama-3.2-1B run as the prefill-decode configuration, but with
# --batch-prefill-size 32 added to --with-prefill-decode. The llama.metrics.*
# properties and the sidecar --out path use the "batch-prefill-decode" file
# name, and the sidecar records the flags string passed to llama-tornado.
- name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Batch-Prefill-Decode
env:
JAVA_TOOL_OPTIONS: >-
-Dllama.metrics.format=json
-Dllama.metrics.output=file
-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-batch-prefill-decode.json
run: |
cd ${{ github.workspace }}
export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
./llama-tornado --gpu --${{ matrix.backend.name }} \
--model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
--prompt "Say hello" \
--with-prefill-decode --batch-prefill-size 32
python3 scripts/write_metrics_sidecar.py \
--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-batch-prefill-decode.meta.json" \
backend="${{ matrix.backend.name }}" \
task=llama-inference \
model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
model=Llama-3.2-1B-Instruct \
quantization=Q8_0 \
configuration=batch-prefill-decode \
"flags=--with-prefill-decode --batch-prefill-size 32" \
prompt="Say hello"

# ── PTX-only: CUDA-graph variants ────────────────────────────────────────
# Executes only when matrix.backend.name == 'ptx' (see the `if:` guard).
# Runs llama-tornado with --ptx, --with-prefill-decode and --cuda-graphs on the
# Q8_0 Llama-3.2-1B model; the backend name is hard-coded as "ptx" in the
# metrics file name and in the sidecar arguments instead of using the matrix value.
- name: PTX - Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Prefill-Decode-CUDA-Graphs
if: matrix.backend.name == 'ptx'
env:
JAVA_TOOL_OPTIONS: >-
-Dllama.metrics.format=json
-Dllama.metrics.output=file
-Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-q8-prefill-decode-cuda-graphs.json
run: |
cd ${{ github.workspace }}
export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
./llama-tornado --gpu --ptx \
--model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
--prompt "Say hello" \
--with-prefill-decode \
--cuda-graphs
python3 scripts/write_metrics_sidecar.py \
--out "${{ runner.temp }}/metrics-ptx-llama-1b-q8-prefill-decode-cuda-graphs.meta.json" \
backend=ptx \
task=llama-inference \
model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
model=Llama-3.2-1B-Instruct \
quantization=Q8_0 \
configuration=prefill-decode-cuda-graphs \
"flags=--with-prefill-decode --cuda-graphs" \
prompt="Say hello"

# Executes only when matrix.backend.name == 'ptx' (see the `if:` guard).
# Runs llama-tornado with --ptx, --with-prefill-decode, --batch-prefill-size 32
# and --cuda-graphs on the Q8_0 Llama-3.2-1B model, then writes the sidecar
# .meta.json whose flags entry repeats the same three options.
- name: PTX - Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Batch-Prefill-Decode-CUDA-Graphs
if: matrix.backend.name == 'ptx'
env:
JAVA_TOOL_OPTIONS: >-
-Dllama.metrics.format=json
-Dllama.metrics.output=file
-Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-q8-batch-prefill-decode-cuda-graphs.json
run: |
cd ${{ github.workspace }}
export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
./llama-tornado --gpu --ptx \
--model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
--prompt "Say hello" \
--with-prefill-decode --batch-prefill-size 32 \
--cuda-graphs
python3 scripts/write_metrics_sidecar.py \
--out "${{ runner.temp }}/metrics-ptx-llama-1b-q8-batch-prefill-decode-cuda-graphs.meta.json" \
backend=ptx \
task=llama-inference \
model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
model=Llama-3.2-1B-Instruct \
quantization=Q8_0 \
configuration=batch-prefill-decode-cuda-graphs \
"flags=--with-prefill-decode --batch-prefill-size 32 --cuda-graphs" \
prompt="Say hello"

- name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
env:
JAVA_TOOL_OPTIONS: >-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,7 @@ public static FloatArray forwardTornadoVM(Model model, State state, int token, i
default -> throw new IllegalArgumentException("Unsupported weight type: " + weights.getWeightType());
}

return tornadoVMMasterPlan.tornadoVMForwardExecuteLayered(position);
return tornadoVMMasterPlan.tornadoVMExecuteForward(position);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import org.beehive.gpullama3.model.Model;
import org.beehive.gpullama3.tensor.standard.ArrayFloatTensor;
import org.beehive.gpullama3.tensor.standard.FloatTensor;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithBatchPrefillDecode;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanBatchPrefillDecode;
import uk.ac.manchester.tornado.api.types.arrays.FloatArray;

/**
Expand All @@ -21,9 +21,9 @@
* prompt tokens in one pass using batch matmul, avoiding redundant weight
* traversals. Only the KV cache is populated; logits are intentionally omitted.</li>
* <li>{@link #batchForwardTornadoVMPrefill} — GPU batch prefill: delegates the chunk
* to {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardBatchPrefill}.</li>
* to {@link TornadoVMMasterPlanBatchPrefillDecode#tornadoVMForwardBatchPrefill}.</li>
* <li>{@link #forwardTornadoVMDecode} — GPU decode: delegates a single decode step to
* {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardDecode}, which
* {@link TornadoVMMasterPlanBatchPrefillDecode#tornadoVMForwardDecode}, which
* handles the embedding copy and runs the full decode + logits graphs.</li>
* </ul>
*/
Expand Down Expand Up @@ -165,7 +165,7 @@ public static void batchForwardJavaPrefill(Model model, State state, int[] token
* GPU batched prefill forward pass (Phase 4).
*
* <p>Delegates the full chunk to
* {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardBatchPrefill},
* {@link TornadoVMMasterPlanBatchPrefillDecode#tornadoVMForwardBatchPrefill},
* which handles embedding lookup and GPU execution internally.</p>
*
* @param model the LLaMA model
Expand All @@ -175,15 +175,15 @@ public static void batchForwardJavaPrefill(Model model, State state, int[] token
* @param plan the batched prefill/decode GPU plan
*/
public static void batchForwardTornadoVMPrefill(Model model, int[] tokens, int startPos, int chunkSize,
TornadoVMMasterPlanWithBatchPrefillDecode plan) {
TornadoVMMasterPlanBatchPrefillDecode plan) {
plan.tornadoVMForwardBatchPrefill(tokens, startPos, model, chunkSize);
}

/**
* GPU decode forward pass (Phase 4).
*
* <p>Delegates a single-token decode step to
* {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardDecode},
* {@link TornadoVMMasterPlanBatchPrefillDecode#tornadoVMForwardDecode},
* which copies the token embedding and runs the decode + logits graphs.</p>
*
* @param model the LLaMA model
Expand All @@ -193,7 +193,7 @@ public static void batchForwardTornadoVMPrefill(Model model, int[] tokens, int s
* @return logits array for token sampling
*/
public static FloatArray forwardTornadoVMDecode(Model model, int token, int position,
TornadoVMMasterPlanWithBatchPrefillDecode plan) {
TornadoVMMasterPlanBatchPrefillDecode plan) {
return plan.tornadoVMForwardDecode(token, position, model);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import org.beehive.gpullama3.model.Configuration;
import org.beehive.gpullama3.model.Model;
import org.beehive.gpullama3.tensor.standard.FloatTensor;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithPrefillDecode;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanPrefillDecode;

import java.lang.foreign.MemorySegment;

Expand Down Expand Up @@ -131,7 +131,7 @@ public static void forwardJavaPrefill(Model model, State state, int token, int p
*
* <p>Copies the token embedding into {@code state.embeddingX} (same as
* {@link InferenceCore#forwardTornadoVM}) then delegates to
* {@link TornadoVMMasterPlanWithPrefillDecode#tornadoVMForwardPrefill},
* {@link TornadoVMMasterPlanPrefillDecode#tornadoVMForwardPrefill},
* which executes preprocessing + layer graphs but skips the logits graph.</p>
*
* @param model the LLaMA model (must carry {@link TornadoWeights}; FP16 or Q8_0)
Expand All @@ -142,7 +142,7 @@ public static void forwardJavaPrefill(Model model, State state, int token, int p
* @throws IllegalArgumentException if the model's weight type is not supported
*/
public static void forwardTornadoVMPrefill(Model model, State state, int token, int position,
TornadoVMMasterPlanWithPrefillDecode prefillPlan) {
TornadoVMMasterPlanPrefillDecode prefillPlan) {
final Configuration configuration = model.configuration();
final TornadoWeights weights = (TornadoWeights) model.weights();

Expand All @@ -153,9 +153,13 @@ public static void forwardTornadoVMPrefill(Model model, State state, int token,
MemorySegment.copy(tokenEmbeddings, (long) token * configuration.dim() * bytes,
state.embeddingX.getSegment(), 0, (long) configuration.dim() * bytes);
}
case Q8_0 -> throw new UnsupportedOperationException(
// TODO Phase 4: implement Q8_0 GPU batched prefill kernels
"GPU prefill/decode path not yet implemented for Q8_0 weights");
case Q8_0 -> {
MemorySegment tokenEmbeddings = weights.getTokenEmbeddingTable().asByteArray().getSegment();
int blocksPerToken = (configuration.dim() + 31) / 32;
long bytesPerToken = (long) blocksPerToken * 34;
MemorySegment.copy(tokenEmbeddings, (long) token * bytesPerToken,
state.embeddingX.getSegment(), 0, bytesPerToken);
}
Comment on lines +157 to +162
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this should be a method of its own. Same for the case above.

default -> throw new IllegalArgumentException("Unsupported weight type: " + weights.getWeightType());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import org.beehive.gpullama3.model.Model;
import org.beehive.gpullama3.tokenizer.Tokenizer;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithBatchPrefillDecode;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanBatchPrefillDecode;

import java.util.ArrayList;
import java.util.Arrays;
Expand Down Expand Up @@ -163,8 +163,8 @@ public static List<Integer> generateTokensGPULlama(
? config.contextLength() : maxTokens;
final int batchSize = TornadoVMMasterPlan.PREFILL_BATCH_SIZE;

TornadoVMMasterPlanWithBatchPrefillDecode plan =
(TornadoVMMasterPlanWithBatchPrefillDecode) tornadoVMPlan;
TornadoVMMasterPlanBatchPrefillDecode plan =
(TornadoVMMasterPlanBatchPrefillDecode) tornadoVMPlan;

List<Integer> generatedTokens = new ArrayList<>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import org.beehive.gpullama3.model.Model;
import org.beehive.gpullama3.tokenizer.Tokenizer;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithPrefillDecode;
import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanPrefillDecode;

import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -135,8 +135,8 @@ public static List<Integer> generateTokensGPULlama(
int actualMaxTokens = (maxTokens < 0 || config.contextLength() < maxTokens)
? config.contextLength() : maxTokens;

TornadoVMMasterPlanWithPrefillDecode prefillPlan =
(TornadoVMMasterPlanWithPrefillDecode) tornadoVMPlan;
TornadoVMMasterPlanPrefillDecode prefillPlan =
(TornadoVMMasterPlanPrefillDecode) tornadoVMPlan;

List<Integer> generatedTokens = new ArrayList<>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,20 @@
*
* <p>Three concrete implementations exist:</p>
* <ul>
* <li>{@link TornadoVMMasterPlanStandard} — baseline single-token forward pass
* <li>{@link TornadoVMMasterPlanSingleToken} — baseline single-token forward pass
* (preprocessing + N layers + logits).</li>
* <li>{@link TornadoVMMasterPlanWithPrefillDecode} — sequential prefill/decode separation;
* <li>{@link TornadoVMMasterPlanPrefillDecode} — sequential prefill/decode separation;
* reuses the same N layer graphs for both phases, skipping logits during prefill.</li>
* <li>{@link TornadoVMMasterPlanWithBatchPrefillDecode} — batched prefill + single-token
* <li>{@link TornadoVMMasterPlanBatchPrefillDecode} — batched prefill + single-token
* decode; holds 2N+3 graphs in one plan to keep the KV cache on device across phases.</li>
* </ul>
*
* <p>The {@link #initializeTornadoVMPlan} factory selects the implementation based on
* {@code llama.withPrefillDecode} and {@code llama.prefillBatchSize}:</p>
* <ul>
* <li>{@code withPrefillDecode=false} → {@link TornadoVMMasterPlanStandard}</li>
* <li>{@code withPrefillDecode=true}, {@code prefillBatchSize=1} → {@link TornadoVMMasterPlanWithPrefillDecode}</li>
* <li>{@code withPrefillDecode=true}, {@code prefillBatchSize>1} → {@link TornadoVMMasterPlanWithBatchPrefillDecode}</li>
* <li>{@code withPrefillDecode=false} → {@link TornadoVMMasterPlanSingleToken}</li>
* <li>{@code withPrefillDecode=true}, {@code prefillBatchSize=1} → {@link TornadoVMMasterPlanPrefillDecode}</li>
* <li>{@code withPrefillDecode=true}, {@code prefillBatchSize>1} → {@link TornadoVMMasterPlanBatchPrefillDecode}</li>
* </ul>
*/
public interface TornadoVMMasterPlan {
Expand All @@ -43,8 +43,8 @@ public interface TornadoVMMasterPlan {
* Factory: creates, JIT-compiles, and warms up the appropriate TornadoVMMasterPlan.
*
* <p>When {@code llama.withPrefillDecode=true} and {@code llama.prefillBatchSize > 1},
* a {@link TornadoVMMasterPlanWithBatchPrefillDecode} is returned.
* Otherwise a {@link TornadoVMMasterPlanStandard} is returned (used for the baseline
* a {@link TornadoVMMasterPlanBatchPrefillDecode} is returned.
* When only {@code llama.withPrefillDecode=true} is set (batch size 1), a
* {@link TornadoVMMasterPlanPrefillDecode} is returned (sequential prefill/decode path);
* otherwise a {@link TornadoVMMasterPlanSingleToken} is returned (baseline path).</p>
*
* @param state the model state
Expand All @@ -56,13 +56,13 @@ static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) {

if (WITH_PREFILL_DECODE && PREFILL_BATCH_SIZE > 1) {
// GPU path with batched prefill/decode
plan = new TornadoVMMasterPlanWithBatchPrefillDecode(state, model);
plan = new TornadoVMMasterPlanBatchPrefillDecode(state, model);
} else if (WITH_PREFILL_DECODE) {
// GPU path with simple prefill/decode
plan = new TornadoVMMasterPlanWithPrefillDecode(state, model);
plan = new TornadoVMMasterPlanPrefillDecode(state, model);
} else {
// GPU path with no prefill/decode
plan = new TornadoVMMasterPlanStandard(state, model);
plan = new TornadoVMMasterPlanSingleToken(state, model);
}
model.setTornadoVMPlan(plan);
return plan;
Expand All @@ -76,7 +76,7 @@ static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) {

void forceCopyInReadOnlyData();

FloatArray tornadoVMForwardExecuteLayered(int position);
FloatArray tornadoVMExecuteForward(int position);

/** Releases all device memory held by this plan. */
void freeTornadoExecutionPlan();
Expand Down
Loading
Loading