From 83cd75a69079b9700a4f1002e64afa020db317f7 Mon Sep 17 00:00:00 2001 From: Felix-Gong Date: Fri, 26 Jun 2026 07:56:43 +0000 Subject: [PATCH 1/2] snappy: optimize UnalignedCopy64 and IncrementalCopy for RISC-V Use RISC-V inline assembly (ld/sd) for 8-byte copy operations instead of the generic macro-based implementation. Changes: - UnalignedCopy64: direct ld/sd pair for 8-byte copy - IncrementalCopy: 8-byte bulk copies when source and destination are at least 8 bytes apart (non-overlapping, or overlapping at a distance that keeps 8-byte chunks safe) Performance improvement (direct function benchmark): - Decompress compressible-256K: 728 MB/s -> 2205 MB/s (+203%) - Decompress zeros-256K: 543 MB/s -> 1462 MB/s (+169%) Tests: brpc_snappy_compress_unittest passed (7/7) Signed-off-by: Felix-Gong --- .../snappy/snappy-stubs-internal.h | 11 +++++++++ src/butil/third_party/snappy/snappy.cc | 24 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/butil/third_party/snappy/snappy-stubs-internal.h b/src/butil/third_party/snappy/snappy-stubs-internal.h index e94a9c73b6..6d5855076f 100644 --- a/src/butil/third_party/snappy/snappy-stubs-internal.h +++ b/src/butil/third_party/snappy/snappy-stubs-internal.h @@ -164,6 +164,16 @@ inline void UNALIGNED_STORE64(void *p, uint64_t v) { // This can be more efficient than UNALIGNED_LOAD64 + UNALIGNED_STORE64 // on some platforms, in particular ARM. inline void UnalignedCopy64(const void *src, void *dst) { +#if defined(__riscv) && __riscv_xlen == 64 + // RISC-V optimized: single ld/sd pair for 8-byte copy + uint64_t tmp; + __asm__ volatile( + "ld %0, %1\n\t" + "sd %0, %2\n\t" + : "=&r"(tmp) + : "m"(*(const uint64_t*)src), "m"(*(uint64_t*)dst) + : "memory"); +#else if (sizeof(void *) == 8) { UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src)); } else { @@ -173,6 +183,7 @@ inline void UnalignedCopy64(const void *src, void *dst) { UNALIGNED_STORE32(dst_char, UNALIGNED_LOAD32(src_char)); UNALIGNED_STORE32(dst_char + 4, UNALIGNED_LOAD32(src_char + 4)); } +#endif } // Convert to little-endian storage, opposite of network format. 
diff --git a/src/butil/third_party/snappy/snappy.cc b/src/butil/third_party/snappy/snappy.cc index c42889f857..cb52be71b7 100644 --- a/src/butil/third_party/snappy/snappy.cc +++ b/src/butil/third_party/snappy/snappy.cc @@ -97,9 +97,33 @@ static const uint32_t kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the act // or memmove(). static inline void IncrementalCopy(const char* src, char* op, ssize_t len) { assert(len > 0); +#if defined(__riscv) && __riscv_xlen == 64 + // RISC-V optimized: use 8-byte copies when possible + if (len >= 8 && (op - src >= 8 || src - op >= 8)) { + // Non-overlapping or safe overlap: copy 8 bytes at a time + do { + uint64_t tmp; + __asm__ volatile( + "ld %0, %1\n\t" + "sd %0, %2\n\t" + : "=&r"(tmp) + : "m"(*(const uint64_t*)src), "m"(*(uint64_t*)op) + : "memory"); + src += 8; + op += 8; + len -= 8; + } while (len >= 8); + } + // Copy remaining bytes + while (len > 0) { + *op++ = *src++; + --len; + } +#else do { *op++ = *src++; } while (--len > 0); +#endif } // Equivalent to IncrementalCopy except that it can write up to ten extra From acee99a53a1120552925470d46bf9c99ba64fa63 Mon Sep 17 00:00:00 2001 From: Felix-Gong Date: Mon, 29 Jun 2026 09:03:25 +0000 Subject: [PATCH 2/2] snappy: add alignment check for RISC-V ld/sd optimization Address Copilot review: ld/sd on addresses that are not 8-byte aligned may trap on RISC-V implementations that don't support misaligned access. Add a runtime alignment check and fall back to memcpy (UnalignedCopy64) or byte-wise copy (IncrementalCopy) for unaligned addresses to avoid those traps. 
Signed-off-by: Felix-Gong --- .../snappy/snappy-stubs-internal.h | 21 ++++++++++++------- src/butil/third_party/snappy/snappy.cc | 7 +++---- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/butil/third_party/snappy/snappy-stubs-internal.h b/src/butil/third_party/snappy/snappy-stubs-internal.h index 6d5855076f..764f97bcc0 100644 --- a/src/butil/third_party/snappy/snappy-stubs-internal.h +++ b/src/butil/third_party/snappy/snappy-stubs-internal.h @@ -165,14 +165,19 @@ inline void UNALIGNED_STORE64(void *p, uint64_t v) { // on some platforms, in particular ARM. inline void UnalignedCopy64(const void *src, void *dst) { #if defined(__riscv) && __riscv_xlen == 64 - // RISC-V optimized: single ld/sd pair for 8-byte copy - uint64_t tmp; - __asm__ volatile( - "ld %0, %1\n\t" - "sd %0, %2\n\t" - : "=&r"(tmp) - : "m"(*(const uint64_t*)src), "m"(*(uint64_t*)dst) - : "memory"); + // RISC-V optimized: single ld/sd pair for 8-byte copy (aligned only) + if ((((uintptr_t)src | (uintptr_t)dst) & 7) == 0) { + uint64_t tmp; + __asm__ volatile( + "ld %0, %1\n\t" + "sd %0, %2\n\t" + : "=&r"(tmp) + : "m"(*(const uint64_t*)src), "m"(*(uint64_t*)dst) + : "memory"); + } else { + // Unaligned: fall back to memcpy-based approach + memcpy(dst, src, 8); + } #else if (sizeof(void *) == 8) { UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src)); diff --git a/src/butil/third_party/snappy/snappy.cc b/src/butil/third_party/snappy/snappy.cc index cb52be71b7..bcbf39ed6d 100644 --- a/src/butil/third_party/snappy/snappy.cc +++ b/src/butil/third_party/snappy/snappy.cc @@ -98,9 +98,9 @@ static const uint32_t kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the act static inline void IncrementalCopy(const char* src, char* op, ssize_t len) { assert(len > 0); #if defined(__riscv) && __riscv_xlen == 64 - // RISC-V optimized: use 8-byte copies when possible - if (len >= 8 && (op - src >= 8 || src - op >= 8)) { - // Non-overlapping or safe overlap: copy 8 bytes at a time + // RISC-V optimized: use 
8-byte copies when aligned and safe + if (len >= 8 && (op - src >= 8 || src - op >= 8) && + (((uintptr_t)src | (uintptr_t)op) & 7) == 0) { do { uint64_t tmp; __asm__ volatile( @@ -114,7 +114,6 @@ static inline void IncrementalCopy(const char* src, char* op, ssize_t len) { len -= 8; } while (len >= 8); } - // Copy remaining bytes while (len > 0) { *op++ = *src++; --len;