Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions include/tscore/ink_memcpy_tolower.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
/** @file

SIMD-accelerated bulk ASCII tolower copy.

Used on the URL canonicalization fast path for cache-key digests
(src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast). The scalar loop is
the bottleneck for hosts and schemes long enough to vectorize; for
shorter inputs the scalar tail handles them with no SIMD overhead.

Semantics match a byte-at-a-time loop using ParseRules::ink_tolower():

- Bytes in 'A'..'Z' (0x41..0x5A) have bit 5 set, mapping them to
'a'..'z'. All other bytes (including 0x80..0xFF) pass through
unchanged. There is no UTF-8 case folding.

- The destination is written byte-for-byte; src and dst must point
to non-overlapping regions of size at least @n bytes.

Implementation note: the bodies are stacked widest-first. On an
AVX-512BW build, inputs of 64 bytes or more are handled entirely by
the 64-byte body plus a masked-store tail, which returns without
touching the narrower paths; shorter inputs skip that block and fall
through to the AVX2/SSE2 cascade, where each body drains its block
size before the scalar tail handles the last 0-15 bytes. Builds
without the wider ISAs simply skip those blocks. Selection is purely
compile-time; no runtime dispatch.

@section license License

Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#pragma once

#include <cstddef>
#include <cstdint>

#if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__)
#include <immintrin.h>
#elif defined(__ARM_NEON) || defined(__aarch64__)
#include <arm_neon.h>
#endif

namespace ts
{

inline void
memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
{
#if defined(__AVX512BW__)
  // AVX-512BW: 64 bytes per pass, with two advantages over the narrower
  // paths:
  // - _mm512_mask_add_epi8 folds the "+0x20 where upper-case" step into a
  //   single masked op (no separate maskz_set1 + or).
  // - A masked load/store finishes any 1..63-byte remainder in one SIMD
  //   pass, so the AVX2/SSE2 drain loops below are never reached from here.
  //
  // The masked tail carries ~7 ns of fixed setup cost, which loses to the
  // cascade on short inputs. Gating the whole block on n >= 64 sends tiny
  // inputs straight to the AVX2/SSE2 path, which keeps the speedup it
  // already provides for them.
  //
  // Inspired by Tony Finch's copytolower64.c
  // (https://dotat.at/cgi/git/vectolower.git/).
  if (n >= 64) {
    const __m512i lo_A   = _mm512_set1_epi8('A');
    const __m512i hi_Z   = _mm512_set1_epi8('Z');
    const __m512i to_low = _mm512_set1_epi8('a' - 'A');
    while (n >= 64) {
      const __m512i   block = _mm512_loadu_epi8(src);
      const __mmask64 upper = _mm512_cmpge_epi8_mask(block, lo_A) & _mm512_cmple_epi8_mask(block, hi_Z);
      _mm512_storeu_epi8(dst, _mm512_mask_add_epi8(block, upper, block, to_low));
      src += 64;
      dst += 64;
      n -= 64;
    }
    if (n != 0) {
      // Low n bits set: load/store touch exactly the remaining bytes.
      const auto      keep  = static_cast<__mmask64>((~0ULL) >> (64 - n));
      const __m512i   block = _mm512_maskz_loadu_epi8(keep, src);
      const __mmask64 upper = _mm512_cmpge_epi8_mask(block, lo_A) & _mm512_cmple_epi8_mask(block, hi_Z);
      _mm512_mask_storeu_epi8(dst, keep, _mm512_mask_add_epi8(block, upper, block, to_low));
    }
    return;
  }
#endif

#if defined(__AVX2__)
  // AVX2: 32 bytes per pass, same compare-and-OR pattern as SSE2 below.
  {
    const __m256i below_A  = _mm256_set1_epi8('A' - 1);
    const __m256i above_Z  = _mm256_set1_epi8('Z' + 1);
    const __m256i case_bit = _mm256_set1_epi8(0x20);
    for (; n >= 32; src += 32, dst += 32, n -= 32) {
      const __m256i block  = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
      const __m256i is_ge  = _mm256_cmpgt_epi8(block, below_A);
      const __m256i is_le  = _mm256_cmpgt_epi8(above_Z, block);
      const __m256i setbit = _mm256_and_si256(_mm256_and_si256(is_ge, is_le), case_bit);
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), _mm256_or_si256(block, setbit));
    }
  }
#endif

#if defined(__SSE2__)
  // SSE2: 16 bytes per pass. Signed compares are safe for ASCII A-Z since
  // every letter sits below 0x80; bytes 0x80..0xFF read as negative, miss
  // the [A,Z] window, and pass through unchanged.
  {
    const __m128i below_A  = _mm_set1_epi8('A' - 1);
    const __m128i above_Z  = _mm_set1_epi8('Z' + 1);
    const __m128i case_bit = _mm_set1_epi8(0x20);
    for (; n >= 16; src += 16, dst += 16, n -= 16) {
      const __m128i block  = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
      const __m128i is_ge  = _mm_cmpgt_epi8(block, below_A);
      const __m128i is_le  = _mm_cmpgt_epi8(above_Z, block);
      const __m128i setbit = _mm_and_si128(_mm_and_si128(is_ge, is_le), case_bit);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(block, setbit));
    }
  }
#elif defined(__ARM_NEON) || defined(__aarch64__)
  // NEON: 16 bytes per pass; unsigned compares are available natively.
  {
    const uint8x16_t below_A  = vdupq_n_u8('A' - 1);
    const uint8x16_t above_Z  = vdupq_n_u8('Z' + 1);
    const uint8x16_t case_bit = vdupq_n_u8(0x20);
    for (; n >= 16; src += 16, dst += 16, n -= 16) {
      const uint8x16_t block  = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
      const uint8x16_t is_ge  = vcgtq_u8(block, below_A);
      const uint8x16_t is_le  = vcltq_u8(block, above_Z);
      const uint8x16_t setbit = vandq_u8(vandq_u8(is_ge, is_le), case_bit);
      vst1q_u8(reinterpret_cast<uint8_t *>(dst), vorrq_u8(block, setbit));
    }
  }
#endif

  // Scalar tail: on SIMD builds at most 15 bytes remain (the AVX-512 path
  // returns before reaching here); on scalar-only builds this is the whole
  // copy. Same A-Z check and bit-5 OR as the vector bodies.
  for (; n != 0; --n) {
    const auto byte = static_cast<unsigned char>(*src++);
    *dst++ = (byte >= 'A' && byte <= 'Z') ? static_cast<char>(byte | 0x20) : static_cast<char>(byte);
  }
}

} // namespace ts
15 changes: 3 additions & 12 deletions src/proxy/hdrs/URL.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <new>
#include "tscore/ink_platform.h"
#include "tscore/ink_memory.h"
#include "tscore/ink_memcpy_tolower.h"
#include "proxy/hdrs/URL.h"
#include "proxy/hdrs/MIME.h"
#include "proxy/hdrs/HTTP.h"
Expand Down Expand Up @@ -1684,16 +1685,6 @@ url_describe(HdrHeapObjImpl *raw, bool /* recurse ATS_UNUSED */)
* *
***********************************************************************/

static inline void
memcpy_tolower(char *d, const char *s, int n)
{
while (n--) {
*d = ParseRules::ink_tolower(*s);
s++;
d++;
}
}

// fast path for CryptoHash, HTTP, no user/password/params/query,
// no buffer overflow, no unescaping needed

Expand All @@ -1704,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
char *p;

p = buffer;
memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
ts::memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
p += url->m_len_scheme;
*p++ = ':';
*p++ = '/';
Expand All @@ -1713,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
*p++ = ':';
// no password
*p++ = '@';
memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
ts::memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
p += url->m_len_host;
*p++ = '/';
memcpy(p, url->m_ptr_path, url->m_len_path);
Expand Down
3 changes: 3 additions & 0 deletions tools/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li
add_executable(benchmark_Random benchmark_Random.cc)
target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore)

add_executable(benchmark_memcpy_tolower benchmark_memcpy_tolower.cc)
target_link_libraries(benchmark_memcpy_tolower PRIVATE Catch2::Catch2WithMain ts::tscore)

add_executable(benchmark_HostDB benchmark_HostDB.cc)
target_link_libraries(
benchmark_HostDB
Expand Down
167 changes: 167 additions & 0 deletions tools/benchmark/benchmark_memcpy_tolower.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/** @file

Micro benchmark for ts::memcpy_tolower against a byte-at-a-time scalar
loop equivalent to the prior URL.cc::memcpy_tolower definition.

@section license License

Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#define CATCH_CONFIG_ENABLE_BENCHMARKING

#include <catch2/catch_test_macros.hpp>
#include <catch2/benchmark/catch_benchmark.hpp>

#include "tscore/ink_memcpy_tolower.h"
#include "tscore/ParseRules.h"

#include <array>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <random>
#include <string>
#include <vector>

namespace
{

// Sizes chosen to mirror the URL.cc hot path:
// 4-8B - common HTTP scheme strings ("http", "https")
// 16-32B - typical host names
// 64-256B - long host names / cache-key segments
// 1024B - stress the inner loop
// NOTE: the std::array count template argument must match the number of
// initializers; update both when adding a size.
constexpr std::array<std::size_t, 8> kSizes{4, 8, 16, 24, 32, 64, 256, 1024};

// Same character distribution we expect from URL/host input: ASCII letters
// (mixed case), digits, and the small set of non-alpha bytes that legitimately
// appear in URLs.
// Build a deterministic pseudo-random buffer with the character mix we
// expect from URL/host input: ASCII letters (both cases), digits, and the
// small set of non-alpha bytes that legitimately appear in URLs.
//
// @param n    number of bytes to generate.
// @param seed RNG seed; same seed and n always yield the same bytes.
// @return vector of exactly n characters.
std::vector<char>
make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL)
{
  // Non-alpha characters valid in URLs; trailing NUL excluded from the pool.
  static constexpr char kOther[] = "0123456789-_./:";
  std::mt19937_64      gen(seed);
  std::vector<char>    out;
  out.reserve(n);
  while (out.size() < n) {
    // One RNG draw per output byte, folded into a 6-bit bucket.
    const auto pick = static_cast<unsigned>(gen() & 0x3FU);
    char       c;
    if (pick < 26U) {
      c = static_cast<char>('A' + pick);
    } else if (pick < 52U) {
      c = static_cast<char>('a' + (pick - 26U));
    } else {
      c = kOther[pick % (sizeof(kOther) - 1U)];
    }
    out.push_back(c);
  }
  return out;
}

// Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here
// as the baseline the SIMD path is expected to beat.
// Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here
// as the baseline the SIMD path is expected to beat.
//
// @param d destination buffer of at least n bytes.
// @param s source buffer of at least n bytes.
// @param n byte count.
inline void
memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept
{
  for (std::size_t i = 0; i < n; ++i) {
    d[i] = ParseRules::ink_tolower(s[i]);
  }
}

} // namespace

TEST_CASE("active SIMD configuration", "[tolower][config]")
{
  // Print the compile-time ISA path so the benchmark output makes the
  // selected configuration obvious. Cascades stack: AVX-512BW builds also
  // emit the AVX2 and SSE2 drain loops; AVX2 builds emit the SSE2 drain.
  std::cout << "ts::memcpy_tolower compiled with: ";
#if defined(__AVX512BW__)
  std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade";
#elif defined(__AVX2__)
  std::cout << "AVX2 (32B body) + SSE2 (16B drain)";
#elif defined(__SSE2__)
  std::cout << "SSE2 (16B body)";
#elif defined(__ARM_NEON) || defined(__aarch64__)
  std::cout << "NEON (16B body)";
#else
  std::cout << "scalar only";
#endif
  std::cout << '\n';
  // This TEST_CASE exists only for its console output; mark it passed
  // explicitly since it has no assertions of its own.
  SUCCEED();
}

TEST_CASE("ts::memcpy_tolower matches scalar reference", "[tolower][correctness]")
{
  // Sizes straddle the 16-byte SIMD body: below it, equal to it, a couple
  // of multiples, and several offsets between multiples.
  constexpr std::size_t cases[] = {0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 64, 257};
  for (const auto len : cases) {
    const auto        src = make_mixed_case_ascii(len, 0xC0FFEE + len);
    std::vector<char> want(len);
    std::vector<char> got(len);

    memcpy_tolower_scalar(want.data(), src.data(), len);
    ts::memcpy_tolower(got.data(), src.data(), len);

    CAPTURE(len);
    REQUIRE(got == want);
  }
}

TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[tolower][correctness]")
{
  // Every byte value 0..255 must round-trip unchanged unless it is in
  // 'A'..'Z', where it must gain bit 5 ('a'..'z'). This catches anyone who
  // later tries to "speed things up" by widening the range to Latin-1.
  std::array<unsigned char, 256> src;
  std::array<char, 256>          dst;
  for (std::size_t b = 0; b < src.size(); ++b) {
    src[b] = static_cast<unsigned char>(b);
  }

  ts::memcpy_tolower(dst.data(), reinterpret_cast<const char *>(src.data()), src.size());

  for (std::size_t b = 0; b < src.size(); ++b) {
    const auto got  = static_cast<unsigned char>(dst[b]);
    const auto want = (src[b] >= 'A' && src[b] <= 'Z') ? static_cast<unsigned char>(src[b] | 0x20) : src[b];
    CAPTURE(b);
    REQUIRE(got == want);
  }
}

TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
{
  // Benchmark the scalar baseline and the SIMD implementation side by side
  // at each size in kSizes; labels carry the byte count for readability.
  for (const auto len : kSizes) {
    const auto        src = make_mixed_case_ascii(len);
    std::vector<char> dst_scalar(len);
    std::vector<char> dst_simd(len);

    const std::string scalar_label = "scalar " + std::to_string(len) + "B";
    BENCHMARK(scalar_label.c_str())
    {
      memcpy_tolower_scalar(dst_scalar.data(), src.data(), len);
      // Return a byte so the optimizer cannot discard the copy.
      return dst_scalar[0];
    };

    const std::string simd_label = "ts::mct " + std::to_string(len) + "B";
    BENCHMARK(simd_label.c_str())
    {
      ts::memcpy_tolower(dst_simd.data(), src.data(), len);
      return dst_simd[0];
    };
  }
}