Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions include/tscore/ink_memcpy_tolower.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
/** @file

SIMD-accelerated bulk ASCII tolower copy.

Used on the URL canonicalization fast path for cache-key digests
(src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast). The scalar loop is
the bottleneck for hosts and schemes long enough to vectorize; for
shorter inputs the scalar tail handles them with no SIMD overhead.

Semantics match a byte-at-a-time loop using ParseRules::ink_tolower():

- Bytes in 'A'..'Z' (0x41..0x5A) have bit 5 set, mapping them to
'a'..'z'. All other bytes (including 0x80..0xFF) pass through
unchanged. There is no UTF-8 case folding.

- The destination is written byte-for-byte; src and dst must point
to non-overlapping regions of size at least @n bytes.

Implementation note: the bodies are stacked widest-first. On an
AVX-512BW build, inputs of 64 bytes or more are handled entirely by
the 64-byte body plus a masked-store tail, which returns without
touching the narrower paths; shorter inputs skip that block and fall
through to the AVX2/SSE2 cascade, where each body drains its block
size before the scalar tail handles the last 0-15 bytes. Builds
without the wider ISAs simply skip those blocks. Selection is purely
compile-time; no runtime dispatch.

@section license License

Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#pragma once

#include <cstddef>
#include <cstdint>

#if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__)
#include <immintrin.h>
#elif defined(__ARM_NEON) || defined(__aarch64__)
#include <arm_neon.h>
#endif

namespace ts
{

inline void
memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
{
#if defined(__AVX512BW__)
  // AVX-512BW: 64 bytes per pass, with two advantages over the narrower
  // paths:
  // - _mm512_mask_add_epi8 folds the "+0x20 where upper-case" step into a
  //   single masked op (no separate maskz_set1 + or).
  // - A masked load/store finishes any 1..63-byte remainder in one SIMD
  //   pass, so the AVX2/SSE2 drain loops below are never reached from here.
  //
  // The masked tail carries ~7 ns of fixed setup cost, which loses to the
  // cascade on short inputs. Gating the whole block on n >= 64 sends tiny
  // inputs straight to the AVX2/SSE2 path, which keeps the speedup it
  // already provides for them.
  //
  // Inspired by Tony Finch's copytolower64.c
  // (https://dotat.at/cgi/git/vectolower.git/).
  if (n >= 64) {
    const __m512i lo_A   = _mm512_set1_epi8('A');
    const __m512i hi_Z   = _mm512_set1_epi8('Z');
    const __m512i to_low = _mm512_set1_epi8('a' - 'A');
    while (n >= 64) {
      const __m512i   block = _mm512_loadu_epi8(src);
      const __mmask64 upper = _mm512_cmpge_epi8_mask(block, lo_A) & _mm512_cmple_epi8_mask(block, hi_Z);
      _mm512_storeu_epi8(dst, _mm512_mask_add_epi8(block, upper, block, to_low));
      src += 64;
      dst += 64;
      n -= 64;
    }
    if (n != 0) {
      // Low n bits set: load/store touch exactly the remaining bytes.
      const auto      keep  = static_cast<__mmask64>((~0ULL) >> (64 - n));
      const __m512i   block = _mm512_maskz_loadu_epi8(keep, src);
      const __mmask64 upper = _mm512_cmpge_epi8_mask(block, lo_A) & _mm512_cmple_epi8_mask(block, hi_Z);
      _mm512_mask_storeu_epi8(dst, keep, _mm512_mask_add_epi8(block, upper, block, to_low));
    }
    return;
  }
#endif

#if defined(__AVX2__)
  // AVX2: 32 bytes per pass, same compare-and-OR pattern as SSE2 below.
  {
    const __m256i below_A  = _mm256_set1_epi8('A' - 1);
    const __m256i above_Z  = _mm256_set1_epi8('Z' + 1);
    const __m256i case_bit = _mm256_set1_epi8(0x20);
    for (; n >= 32; src += 32, dst += 32, n -= 32) {
      const __m256i block  = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
      const __m256i is_ge  = _mm256_cmpgt_epi8(block, below_A);
      const __m256i is_le  = _mm256_cmpgt_epi8(above_Z, block);
      const __m256i setbit = _mm256_and_si256(_mm256_and_si256(is_ge, is_le), case_bit);
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), _mm256_or_si256(block, setbit));
    }
  }
#endif

#if defined(__SSE2__)
  // SSE2: 16 bytes per pass. Signed compares are safe for ASCII A-Z since
  // every letter sits below 0x80; bytes 0x80..0xFF read as negative, miss
  // the [A,Z] window, and pass through unchanged.
  {
    const __m128i below_A  = _mm_set1_epi8('A' - 1);
    const __m128i above_Z  = _mm_set1_epi8('Z' + 1);
    const __m128i case_bit = _mm_set1_epi8(0x20);
    for (; n >= 16; src += 16, dst += 16, n -= 16) {
      const __m128i block  = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
      const __m128i is_ge  = _mm_cmpgt_epi8(block, below_A);
      const __m128i is_le  = _mm_cmpgt_epi8(above_Z, block);
      const __m128i setbit = _mm_and_si128(_mm_and_si128(is_ge, is_le), case_bit);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(block, setbit));
    }
  }
#elif defined(__ARM_NEON) || defined(__aarch64__)
  // NEON: 16 bytes per pass; unsigned compares are available natively.
  {
    const uint8x16_t below_A  = vdupq_n_u8('A' - 1);
    const uint8x16_t above_Z  = vdupq_n_u8('Z' + 1);
    const uint8x16_t case_bit = vdupq_n_u8(0x20);
    for (; n >= 16; src += 16, dst += 16, n -= 16) {
      const uint8x16_t block  = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
      const uint8x16_t is_ge  = vcgtq_u8(block, below_A);
      const uint8x16_t is_le  = vcltq_u8(block, above_Z);
      const uint8x16_t setbit = vandq_u8(vandq_u8(is_ge, is_le), case_bit);
      vst1q_u8(reinterpret_cast<uint8_t *>(dst), vorrq_u8(block, setbit));
    }
  }
#endif

  // Scalar tail: on SIMD builds at most 15 bytes remain (the AVX-512 path
  // returns before reaching here); on scalar-only builds this is the whole
  // copy. Same A-Z check and bit-5 OR as the vector bodies.
  for (; n != 0; --n) {
    const auto byte = static_cast<unsigned char>(*src++);
    *dst++ = (byte >= 'A' && byte <= 'Z') ? static_cast<char>(byte | 0x20) : static_cast<char>(byte);
  }
}

} // namespace ts
15 changes: 3 additions & 12 deletions src/proxy/hdrs/URL.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <new>
#include "tscore/ink_platform.h"
#include "tscore/ink_memory.h"
#include "tscore/ink_memcpy_tolower.h"
#include "proxy/hdrs/URL.h"
#include "proxy/hdrs/MIME.h"
#include "proxy/hdrs/HTTP.h"
Expand Down Expand Up @@ -1684,16 +1685,6 @@ url_describe(HdrHeapObjImpl *raw, bool /* recurse ATS_UNUSED */)
* *
***********************************************************************/

static inline void
memcpy_tolower(char *d, const char *s, int n)
{
while (n--) {
*d = ParseRules::ink_tolower(*s);
s++;
d++;
}
}

// fast path for CryptoHash, HTTP, no user/password/params/query,
// no buffer overflow, no unescaping needed

Expand All @@ -1704,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
char *p;

p = buffer;
memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
ts::memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
p += url->m_len_scheme;
*p++ = ':';
*p++ = '/';
Expand All @@ -1713,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
*p++ = ':';
// no password
*p++ = '@';
memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
ts::memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
p += url->m_len_host;
*p++ = '/';
memcpy(p, url->m_ptr_path, url->m_len_path);
Expand Down
3 changes: 3 additions & 0 deletions tools/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li
add_executable(benchmark_Random benchmark_Random.cc)
target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore)

add_executable(benchmark_memcpy_tolower benchmark_memcpy_tolower.cc)
target_link_libraries(benchmark_memcpy_tolower PRIVATE Catch2::Catch2WithMain ts::tscore)

add_executable(benchmark_HostDB benchmark_HostDB.cc)
target_link_libraries(
benchmark_HostDB
Expand Down
167 changes: 167 additions & 0 deletions tools/benchmark/benchmark_memcpy_tolower.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/** @file

Micro benchmark for ts::memcpy_tolower against a byte-at-a-time scalar
loop equivalent to the prior URL.cc::memcpy_tolower definition.

@section license License

Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#define CATCH_CONFIG_ENABLE_BENCHMARKING

#include <catch2/catch_test_macros.hpp>
#include <catch2/benchmark/catch_benchmark.hpp>

#include "tscore/ink_memcpy_tolower.h"
#include "tscore/ParseRules.h"

#include <array>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <random>
#include <string>
#include <vector>

namespace
{

// Sizes chosen to mirror the URL.cc hot path:
// 4-8B - common HTTP scheme strings ("http", "https")
// 16-32B - typical host names
// 64-256B - long host names / cache-key segments
// 1024B - stress the inner loop
// NOTE: the std::array count template argument must match the number of
// initializers; update both when adding a size.
constexpr std::array<std::size_t, 8> kSizes{4, 8, 16, 24, 32, 64, 256, 1024};

// Same character distribution we expect from URL/host input: ASCII letters
// (mixed case), digits, and the small set of non-alpha bytes that legitimately
// appear in URLs.
// Build a deterministic pseudo-random buffer with the character mix we
// expect from URL/host input: ASCII letters (both cases), digits, and the
// small set of non-alpha bytes that legitimately appear in URLs.
//
// @param n    number of bytes to generate.
// @param seed RNG seed; same seed and n always yield the same bytes.
// @return vector of exactly n characters.
std::vector<char>
make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL)
{
  // Non-alpha characters valid in URLs; trailing NUL excluded from the pool.
  static constexpr char kOther[] = "0123456789-_./:";
  std::mt19937_64      gen(seed);
  std::vector<char>    out;
  out.reserve(n);
  while (out.size() < n) {
    // One RNG draw per output byte, folded into a 6-bit bucket.
    const auto pick = static_cast<unsigned>(gen() & 0x3FU);
    char       c;
    if (pick < 26U) {
      c = static_cast<char>('A' + pick);
    } else if (pick < 52U) {
      c = static_cast<char>('a' + (pick - 26U));
    } else {
      c = kOther[pick % (sizeof(kOther) - 1U)];
    }
    out.push_back(c);
  }
  return out;
}

// Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here
// as the baseline the SIMD path is expected to beat.
// Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here
// as the baseline the SIMD path is expected to beat.
//
// @param d destination buffer of at least n bytes.
// @param s source buffer of at least n bytes.
// @param n byte count.
inline void
memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept
{
  for (std::size_t i = 0; i < n; ++i) {
    d[i] = ParseRules::ink_tolower(s[i]);
  }
}

} // namespace

TEST_CASE("active SIMD configuration", "[tolower][config]")
{
  // Print the compile-time ISA path so the benchmark output makes the
  // selected configuration obvious. Cascades stack: AVX-512BW builds also
  // emit the AVX2 and SSE2 drain loops; AVX2 builds emit the SSE2 drain.
  std::cout << "ts::memcpy_tolower compiled with: ";
#if defined(__AVX512BW__)
  std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade";
#elif defined(__AVX2__)
  std::cout << "AVX2 (32B body) + SSE2 (16B drain)";
#elif defined(__SSE2__)
  std::cout << "SSE2 (16B body)";
#elif defined(__ARM_NEON) || defined(__aarch64__)
  std::cout << "NEON (16B body)";
#else
  std::cout << "scalar only";
#endif
  std::cout << '\n';
  // This TEST_CASE exists only for its console output; mark it passed
  // explicitly since it has no assertions of its own.
  SUCCEED();
}

TEST_CASE("ts::memcpy_tolower matches scalar reference", "[tolower][correctness]")
{
  // Sizes straddle the 16-byte SIMD body: below it, equal to it, a couple
  // of multiples, and several offsets between multiples.
  constexpr std::size_t cases[] = {0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 64, 257};
  for (const auto len : cases) {
    const auto        src = make_mixed_case_ascii(len, 0xC0FFEE + len);
    std::vector<char> want(len);
    std::vector<char> got(len);

    memcpy_tolower_scalar(want.data(), src.data(), len);
    ts::memcpy_tolower(got.data(), src.data(), len);

    CAPTURE(len);
    REQUIRE(got == want);
  }
}

TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[tolower][correctness]")
{
  // Every byte value 0..255 must round-trip unchanged unless it is in
  // 'A'..'Z', where it must gain bit 5 ('a'..'z'). This catches anyone who
  // later tries to "speed things up" by widening the range to Latin-1.
  std::array<unsigned char, 256> src;
  std::array<char, 256>          dst;
  for (std::size_t b = 0; b < src.size(); ++b) {
    src[b] = static_cast<unsigned char>(b);
  }

  ts::memcpy_tolower(dst.data(), reinterpret_cast<const char *>(src.data()), src.size());

  for (std::size_t b = 0; b < src.size(); ++b) {
    const auto got  = static_cast<unsigned char>(dst[b]);
    const auto want = (src[b] >= 'A' && src[b] <= 'Z') ? static_cast<unsigned char>(src[b] | 0x20) : src[b];
    CAPTURE(b);
    REQUIRE(got == want);
  }
}

TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
{
  // Benchmark the scalar baseline and the SIMD implementation side by side
  // at each size in kSizes; labels carry the byte count for readability.
  for (const auto len : kSizes) {
    const auto        src = make_mixed_case_ascii(len);
    std::vector<char> dst_scalar(len);
    std::vector<char> dst_simd(len);

    const std::string scalar_label = "scalar " + std::to_string(len) + "B";
    BENCHMARK(scalar_label.c_str())
    {
      memcpy_tolower_scalar(dst_scalar.data(), src.data(), len);
      // Return a byte so the optimizer cannot discard the copy.
      return dst_scalar[0];
    };

    const std::string simd_label = "ts::mct " + std::to_string(len) + "B";
    BENCHMARK(simd_label.c_str())
    {
      ts::memcpy_tolower(dst_simd.data(), src.data(), len);
      return dst_simd[0];
    };
  }
}