diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
index 65cb8ee8e..21c44f2d8 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
@@ -24,6 +24,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -42,6 +43,7 @@ vllm serve $MODEL --port $PORT \
 --max-model-len $MAX_MODEL_LEN \
 --block-size=32 \
 --no-enable-prefix-caching \
+--attention-backend "ROCM_AITER_FA" \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 684d40dcc..22143c7f8 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3068,3 +3068,9 @@
   description:
     - "Bump image to rocm/sgl-dev:rocm720-mi35x-8c3b5aa-20260521-DSv4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1548
+
+- config-keys:
+    - minimaxm2.5-fp8-mi300x-vllm
+  description:
+    - "Add VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + --attention-backend ROCM_AITER_FA (match AMD-recommended AITER recipe pattern used on mi355x)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1550