csrc/models/llama/llama_attention.cpp: 29 additions & 9 deletions
@@ -4,6 +4,7 @@
#include "infinicore/nn/linear.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/ops/mha_kvcache.hpp"
#include "infinicore/ops/mha_varlen.hpp"
#include "infinicore/ops/mul.hpp"

@@ -331,16 +332,35 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
                 scaling_);
         }
     } else {
-        infinicore::op::paged_attention_(
-            attn_output,
-            q_reshaped,
-            k_total,
-            v_total,
-            block_tables.value(),
-            total_sequence_lengths.value(),
-            std::nullopt,
-            scaling_);
+        if (attention_backend_ == backends::AttentionBackend::FlashAttn) {
+            // FA2 decode path: flash::mha_fwd_kvcache.
+            // In paged-attn mode, seq_len equals the actual batch_size (one query token per sequence).
+            // q_reshaped: [seq_len, num_heads, head_dim] → [seq_len, 1, num_heads, head_dim]
+            // k/v cache:  [num_blocks, num_kv_heads, block_size, head_dim]
+            //   → permute {0, 2, 1, 3} → [num_blocks, block_size, num_kv_heads, head_dim]
+            auto q_for_fa = q_reshaped->view({seq_len, 1, num_attention_heads_, head_dim_});
+            auto attn_out_4d = infinicore::op::mha_kvcache(
+                q_for_fa,
+                k_total->permute({0, 2, 1, 3}), // [num_blocks, block_size, num_kv_heads, head_dim]
+                v_total->permute({0, 2, 1, 3}),
+                total_sequence_lengths.value(), // [seq_len] int32 (one entry per sequence)
+                block_tables.value(),           // [seq_len, max_num_blocks_per_seq] int32
+                std::nullopt,
+                scaling_);
+            attn_output = attn_out_4d->view({seq_len, num_attention_heads_, head_dim_});
+        } else {
+            infinicore::op::paged_attention_(
+                attn_output,
+                q_reshaped,
+                k_total,
+                v_total,
+                block_tables.value(),
+                total_sequence_lengths.value(),
+                std::nullopt,
+                scaling_);
+        }
     }
 
+
     // 7. Project output
     attn_output
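
For reviewers tracing the layout juggling in the FA2 branch, the following standalone sketch walks through the same shape bookkeeping. It is plain C++ with no infinicore dependency; the concrete dimension values are illustrative assumptions, not taken from this PR.

#include <array>
#include <cstdio>

int main() {
    // Illustrative decode-step shapes (assumed values, not from the PR):
    const int seq_len = 4;       // batch of 4 sequences, one query token each
    const int num_heads = 32;    // num_attention_heads_
    const int head_dim = 128;    // head_dim_
    const int num_blocks = 64;   // KV-cache blocks
    const int num_kv_heads = 8;  // grouped-query attention
    const int block_size = 16;   // tokens per cache block

    // q_reshaped [seq_len, num_heads, head_dim] viewed as
    // [seq_len, 1, num_heads, head_dim]: the element count is unchanged,
    // so the view is metadata-only.
    std::array<int, 3> q3 = {seq_len, num_heads, head_dim};
    std::array<int, 4> q4 = {seq_len, 1, num_heads, head_dim};
    std::printf("q elements: %d -> %d\n",
                q3[0] * q3[1] * q3[2],
                q4[0] * q4[1] * q4[2] * q4[3]);

    // The k/v cache [num_blocks, num_kv_heads, block_size, head_dim]
    // permuted with {0, 2, 1, 3} swaps axes 1 and 2, producing the
    // [num_blocks, block_size, num_kv_heads, head_dim] layout the
    // FA2 kvcache kernel expects.
    std::array<int, 4> kv = {num_blocks, num_kv_heads, block_size, head_dim};
    std::array<int, 4> perm = {0, 2, 1, 3};
    std::array<int, 4> kv_out{};
    for (int i = 0; i < 4; ++i) kv_out[i] = kv[perm[i]];
    std::printf("kv layout: [%d, %d, %d, %d] -> [%d, %d, %d, %d]\n",
                kv[0], kv[1], kv[2], kv[3],
                kv_out[0], kv_out[1], kv_out[2], kv_out[3]);
    return 0;
}

The key point is that the view to [seq_len, 1, num_heads, head_dim] only reinterprets metadata, while permute({0, 2, 1, 3}) swaps the num_kv_heads and block_size axes so the paged cache matches the layout mha_kvcache consumes; the final view back to [seq_len, num_attention_heads_, head_dim_] restores the 3-D shape the output projection expects.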