diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_memcpy_tolower.h new file mode 100644 index 00000000000..3c93559ae5e --- /dev/null +++ b/include/tscore/ink_memcpy_tolower.h @@ -0,0 +1,163 @@ +/** @file + + SIMD-accelerated bulk ASCII tolower copy. + + Used on the URL canonicalization fast path for cache-key digests + (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast). The scalar loop is + the bottleneck for hosts and schemes long enough to vectorize; for + shorter inputs the scalar tail handles them with no SIMD overhead. + + Semantics match a byte-at-a-time loop using ParseRules::ink_tolower(): + + - Bytes in 'A'..'Z' (0x41..0x5A) have bit 5 set, mapping them to + 'a'..'z'. All other bytes (including 0x80..0xFF) pass through + unchanged. There is no UTF-8 case folding. + + - The destination is written byte-for-byte; src and dst must point + to non-overlapping regions of size at least @n bytes. + + Implementation note: the bodies are stacked widest-first and each + drains its block size before falling through to the next. A build + with AVX-512BW gets the 64-byte body as the main loop, then at most + one 32-byte AVX2 iteration and one 16-byte SSE2 iteration to drain + the remainder before the scalar tail handles 0-15 bytes. Builds + without the wider ISAs simply skip those blocks. Selection is purely + compile-time; no runtime dispatch. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +#pragma once + +#include <cstddef> +#include <cstdint> + +#if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__) +#include <immintrin.h> +#elif defined(__ARM_NEON) || defined(__aarch64__) +#include <arm_neon.h> +#endif + +namespace ts +{ + +inline void +memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept +{ +#if defined(__AVX512BW__) + // AVX-512BW: 64 bytes per iteration with two key optimizations over the + // narrower paths: + // - _mm512_mask_add_epi8 fuses the "+0x20 where upper" into a single + // op (no separate maskz_set1 + or). + // - A masked load/store handles the 1..63-byte tail in a single SIMD + // pass, so we don't need to cascade to AVX2/SSE2 to drain the + // remainder. + // + // The masked tail does carry ~7 ns of fixed setup cost, which loses to + // the cascade on short inputs. Gating the whole block on n >= 64 means + // tiny inputs fall through to the AVX2/SSE2 path below, where they keep + // the speedup that path already provides. + // + // Inspired by Tony Finch's copytolower64.c + // (https://dotat.at/cgi/git/vectolower.git/). 
+ if (n >= 64) { + const __m512i A_vec = _mm512_set1_epi8('A'); + const __m512i Z_vec = _mm512_set1_epi8('Z'); + const __m512i delta = _mm512_set1_epi8('a' - 'A'); + do { + __m512i bytes = _mm512_loadu_epi8(src); + __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec); + _mm512_storeu_epi8(dst, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta)); + src += 64; + dst += 64; + n -= 64; + } while (n >= 64); + if (n != 0) { + auto len_mask = static_cast<__mmask64>((~0ULL) >> (64 - n)); + __m512i bytes = _mm512_maskz_loadu_epi8(len_mask, src); + __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec); + _mm512_mask_storeu_epi8(dst, len_mask, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta)); + } + return; + } +#endif + +#if defined(__AVX2__) + // 32 bytes per iteration. Same compare-and-OR pattern as SSE2. + { + const __m256i a_minus_one = _mm256_set1_epi8('A' - 1); + const __m256i z_plus_one = _mm256_set1_epi8('Z' + 1); + const __m256i bit5 = _mm256_set1_epi8(0x20); + while (n >= 32) { + __m256i bytes = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); + __m256i ge_A = _mm256_cmpgt_epi8(bytes, a_minus_one); + __m256i le_Z = _mm256_cmpgt_epi8(z_plus_one, bytes); + __m256i mask = _mm256_and_si256(_mm256_and_si256(ge_A, le_Z), bit5); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), _mm256_or_si256(bytes, mask)); + src += 32; + dst += 32; + n -= 32; + } + } +#endif + +#if defined(__SSE2__) + // 16 bytes per iteration. Signed compare works for ASCII A-Z because all + // letters live below 0x80; high bytes (0x80..0xFF) compare as negative + // and correctly miss the [A,Z] range so they pass through unchanged. 
+ { + const __m128i a_minus_one = _mm_set1_epi8('A' - 1); + const __m128i z_plus_one = _mm_set1_epi8('Z' + 1); + const __m128i bit5 = _mm_set1_epi8(0x20); + while (n >= 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)); + __m128i ge_A = _mm_cmpgt_epi8(bytes, a_minus_one); + __m128i le_Z = _mm_cmpgt_epi8(z_plus_one, bytes); + __m128i mask = _mm_and_si128(_mm_and_si128(ge_A, le_Z), bit5); + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(bytes, mask)); + src += 16; + dst += 16; + n -= 16; + } + } +#elif defined(__ARM_NEON) || defined(__aarch64__) + // 16 bytes per iteration; unsigned compare available natively. + { + const uint8x16_t a_minus_one = vdupq_n_u8('A' - 1); + const uint8x16_t z_plus_one = vdupq_n_u8('Z' + 1); + const uint8x16_t bit5 = vdupq_n_u8(0x20); + while (n >= 16) { + uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(src)); + uint8x16_t ge_A = vcgtq_u8(bytes, a_minus_one); + uint8x16_t le_Z = vcltq_u8(bytes, z_plus_one); + uint8x16_t mask = vandq_u8(vandq_u8(ge_A, le_Z), bit5); + vst1q_u8(reinterpret_cast<uint8_t *>(dst), vorrq_u8(bytes, mask)); + src += 16; + dst += 16; + n -= 16; + } + } +#endif + + while (n--) { + auto c = static_cast<unsigned char>(*src++); + *dst++ = (c >= 'A' && c <= 'Z') ? 
static_cast<char>(c | 0x20) : static_cast<char>(c); + } +} + +} // namespace ts diff --git a/src/proxy/hdrs/URL.cc b/src/proxy/hdrs/URL.cc index 68d84b5d481..c8eff4dd69f 100644 --- a/src/proxy/hdrs/URL.cc +++ b/src/proxy/hdrs/URL.cc @@ -25,6 +25,7 @@ #include <new> #include "tscore/ink_platform.h" #include "tscore/ink_memory.h" +#include "tscore/ink_memcpy_tolower.h" #include "proxy/hdrs/URL.h" #include "proxy/hdrs/MIME.h" #include "proxy/hdrs/HTTP.h" @@ -1684,16 +1685,6 @@ url_describe(HdrHeapObjImpl *raw, bool /* recurse ATS_UNUSED */) * * ***********************************************************************/ -static inline void -memcpy_tolower(char *d, const char *s, int n) -{ - while (n--) { - *d = ParseRules::ink_tolower(*s); - s++; - d++; - } -} - // fast path for CryptoHash, HTTP, no user/password/params/query, // no buffer overflow, no unescaping needed @@ -1704,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash char *p; p = buffer; - memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme); + ts::memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme); p += url->m_len_scheme; *p++ = ':'; *p++ = '/'; @@ -1713,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash *p++ = ':'; // no password *p++ = '@'; - memcpy_tolower(p, url->m_ptr_host, url->m_len_host); + ts::memcpy_tolower(p, url->m_ptr_host, url->m_len_host); p += url->m_len_host; *p++ = '/'; memcpy(p, url->m_ptr_path, url->m_len_path); diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt index 49f25fad1c1..84fc111925e 100644 --- a/tools/benchmark/CMakeLists.txt +++ b/tools/benchmark/CMakeLists.txt @@ -36,6 +36,9 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li add_executable(benchmark_Random benchmark_Random.cc) target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore) +add_executable(benchmark_memcpy_tolower benchmark_memcpy_tolower.cc) 
+target_link_libraries(benchmark_memcpy_tolower PRIVATE Catch2::Catch2WithMain ts::tscore) + add_executable(benchmark_HostDB benchmark_HostDB.cc) target_link_libraries( benchmark_HostDB diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_memcpy_tolower.cc new file mode 100644 index 00000000000..e2b45b48b5d --- /dev/null +++ b/tools/benchmark/benchmark_memcpy_tolower.cc @@ -0,0 +1,167 @@ +/** @file + + Micro benchmark for ts::memcpy_tolower against a byte-at-a-time scalar + loop equivalent to the prior URL.cc::memcpy_tolower definition. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +#define CATCH_CONFIG_ENABLE_BENCHMARKING + +#include <catch2/benchmark/catch_benchmark.hpp> +#include <catch2/catch_test_macros.hpp> + +#include "tscore/ink_memcpy_tolower.h" +#include "tscore/ParseRules.h" + +#include <array> +#include <cstddef> +#include <cstdint> +#include <iostream> +#include <random> +#include <string> +#include <vector> + +namespace +{ + +// Sizes chosen to mirror the URL.cc hot path: +// 4-8B - common HTTP scheme strings ("http", "https") +// 16-32B - typical host names +// 64-256B - long host names / cache-key segments +// 1024B - stress the inner loop +constexpr std::array<std::size_t, 8> kSizes{4, 8, 16, 24, 32, 64, 256, 1024}; + +// Same character distribution we expect from URL/host input: ASCII letters +// (mixed case), digits, and the small set of non-alpha bytes that legitimately +// appear in URLs. +std::vector<char> +make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL) +{ + std::mt19937_64 rng(seed); + std::vector<char> v(n); + for (std::size_t i = 0; i < n; ++i) { + auto r = static_cast<unsigned>(rng() & 0x3FU); + if (r < 26U) { + v[i] = static_cast<char>('A' + r); + } else if (r < 52U) { + v[i] = static_cast<char>('a' + (r - 26U)); + } else { + static constexpr char kNonAlpha[] = "0123456789-_./:"; + v[i] = kNonAlpha[r % (sizeof(kNonAlpha) - 1U)]; + } + } + return v; +} + +// Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here +// as the baseline the SIMD path is expected to beat. +inline void +memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept +{ + while (n--) { + *d = ParseRules::ink_tolower(*s); + ++s; + ++d; + } +} + +} // namespace + +TEST_CASE("active SIMD configuration", "[tolower][config]") +{ + // Print the compile-time ISA path so the benchmark output makes the + // selected configuration obvious. Cascades stack: AVX-512BW builds also + // emit the AVX2 and SSE2 drain loops; AVX2 builds emit the SSE2 drain. 
std::cout << "ts::memcpy_tolower compiled with: "; +#if defined(__AVX512BW__) + std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade"; +#elif defined(__AVX2__) + std::cout << "AVX2 (32B body) + SSE2 (16B drain)"; +#elif defined(__SSE2__) + std::cout << "SSE2 (16B body)"; +#elif defined(__ARM_NEON) || defined(__aarch64__) + std::cout << "NEON (16B body)"; +#else + std::cout << "scalar only"; +#endif + std::cout << '\n'; + SUCCEED(); +} + +TEST_CASE("ts::memcpy_tolower matches scalar reference", "[tolower][correctness]") +{ + // Cover sizes that bracket the 16-byte SIMD body: smaller-than, equal-to, + // a couple of multiples, and several offsets between multiples. + for (std::size_t sz : std::array<std::size_t, 12>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 64, 257}) { + auto input = make_mixed_case_ascii(sz, 0xC0FFEE + sz); + std::vector<char> expected(sz); + std::vector<char> actual(sz); + + memcpy_tolower_scalar(expected.data(), input.data(), sz); + ts::memcpy_tolower(actual.data(), input.data(), sz); + + CAPTURE(sz); + REQUIRE(actual == expected); + } +} + +TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[tolower][correctness]") +{ + // Every byte value from 0..255 should round-trip unchanged unless it is in + // 'A'..'Z', in which case it should map to 'a'..'z'. This catches anyone + // who later tries to "speed things up" by widening the range to Latin-1. + std::array<unsigned char, 256> input; + for (std::size_t i = 0; i < 256; ++i) { + input[i] = static_cast<unsigned char>(i); + } + std::array<char, 256> output; + ts::memcpy_tolower(output.data(), reinterpret_cast<const char *>(input.data()), input.size()); + + for (std::size_t i = 0; i < 256; ++i) { + auto in = static_cast<unsigned char>(i); + auto out = static_cast<unsigned char>(output[i]); + auto exp = (in >= 'A' && in <= 'Z') ? 
static_cast<unsigned char>(in | 0x20) : in; + CAPTURE(i); + REQUIRE(out == exp); + } +} + +TEST_CASE("memcpy_tolower throughput", "[bench][tolower]") +{ + for (std::size_t sz : kSizes) { + auto input = make_mixed_case_ascii(sz); + std::vector<char> output_scalar(sz); + std::vector<char> output_simd(sz); + + std::string scalar_name = "scalar " + std::to_string(sz) + "B"; + BENCHMARK(scalar_name.c_str()) + { + memcpy_tolower_scalar(output_scalar.data(), input.data(), sz); + return output_scalar[0]; + }; + + std::string simd_name = "ts::mct " + std::to_string(sz) + "B"; + BENCHMARK(simd_name.c_str()) + { + ts::memcpy_tolower(output_simd.data(), input.data(), sz); + return output_simd[0]; + }; + } +}