From 5b978770465a5e757c36af650545261b13d69dc7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 19 Apr 2026 07:00:20 +0000 Subject: [PATCH 1/6] Optimize CmSketch AVX block access Agent-Logs-Url: https://github.com/bitfaster/BitFaster.Caching/sessions/7289ae1d-0fa0-43fd-bbe1-2511d41eb3d9 Co-authored-by: bitfaster <12851828+bitfaster@users.noreply.github.com> --- BitFaster.Caching/Lfu/CmSketchCore.cs | 81 +++++++++++++++++++++------ 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 9764c51b..a9c93d28 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -299,11 +299,22 @@ private unsafe int EstimateFrequencyAvx(T value) int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); - Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + int h0 = counterHash; + int h1 = counterHash >>> 8; + int h2 = counterHash >>> 16; + int h3 = counterHash >>> 24; + + int index0 = ((h0 >>> 1) & 15) << 2; + int index1 = ((h1 >>> 1) & 15) << 2; + int index2 = ((h2 >>> 1) & 15) << 2; + int index3 = ((h3 >>> 1) & 15) << 2; - Vector256 indexLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); + int lane0 = h0 & 1; + int lane1 = (h1 & 1) + 2; + int lane2 = h2 & 1; + int lane3 = (h3 & 1) + 2; + + Vector256 index = Vector256.Create((ulong)index0, (ulong)index1, (ulong)index2, (ulong)index3); #if NET6_0_OR_GREATER long* tablePtr = tableAddr; @@ -311,7 +322,19 @@ private unsafe int EstimateFrequencyAvx(T value) fixed (long* tablePtr = table) #endif { - Vector128 count = Avx2.PermuteVar8x32(Avx2.And(Avx2.ShiftRightLogicalVariable(Avx2.GatherVector256(tablePtr, blockOffset, 8), indexLong), Vector256.Create(0xfL)).AsInt32(), Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7)) + Vector256 lower = Avx.LoadVector256(tablePtr + block); + Vector256 upper = Avx.LoadVector256(tablePtr + block + 4); + + ref long lowerRef = ref Unsafe.As, long>(ref lower); + ref long upperRef = ref Unsafe.As, long>(ref upper); + + Vector256 countVector = Vector256.Create( + (ulong)Unsafe.Add(ref lowerRef, lane0), + (ulong)Unsafe.Add(ref lowerRef, lane1), + (ulong)Unsafe.Add(ref upperRef, lane2), + (ulong)Unsafe.Add(ref upperRef, lane3)); + + Vector128 count = Avx2.PermuteVar8x32(Avx2.And(Avx2.ShiftRightLogicalVariable(countVector, index), Vector256.Create(0xfUL)).AsInt32(), Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7)) .GetLower() .AsUInt16(); @@ -333,12 +356,23 @@ private unsafe void IncrementAvx(T value) int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - Vector128 h = Avx2.ShiftRightLogicalVariable(Vector128.Create(counterHash).AsUInt32(), Vector128.Create(0U, 8U, 16U, 24U)).AsInt32(); - Vector128 index = Avx2.ShiftLeftLogical(Avx2.And(Avx2.ShiftRightLogical(h, 1), Vector128.Create(15)), 2); - Vector128 blockOffset = Avx2.Add(Avx2.Add(Vector128.Create(block), Avx2.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6)); + int h0 = counterHash; + int h1 = counterHash >>> 8; + int h2 = counterHash >>> 16; + int h3 = counterHash >>> 24; + + int index0 = ((h0 >>> 1) & 15) << 2; + int index1 = ((h1 >>> 1) & 15) << 2; + int index2 = ((h2 >>> 1) & 15) << 2; + int index3 = ((h3 >>> 1) & 15) << 2; + + int lane0 = h0 & 1; + int lane1 = (h1 & 1) + 2; + int lane2 = h2 & 1; + int lane3 = (h3 & 1) + 2; - Vector256 offsetLong = Avx2.PermuteVar8x32(Vector256.Create(index, Vector128.Zero), Vector256.Create(0, 4, 1, 5, 2, 5, 3, 7)).AsUInt64(); - Vector256 mask = Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), offsetLong); + Vector256 index = Vector256.Create((ulong)index0, (ulong)index1, (ulong)index2, (ulong)index3); + Vector256 mask = Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), index); #if NET6_0_OR_GREATER long* tablePtr = tableAddr; @@ -346,18 +380,33 @@ private unsafe void IncrementAvx(T value) fixed (long* tablePtr = table) #endif { + Vector256 lower = Avx.LoadVector256(tablePtr + block); + Vector256 upper = Avx.LoadVector256(tablePtr + block + 4); + + ref long lowerRef = ref Unsafe.As, long>(ref lower); + ref long upperRef = ref Unsafe.As, long>(ref upper); + + Vector256 count = Vector256.Create( + Unsafe.Add(ref lowerRef, lane0), + Unsafe.Add(ref lowerRef, lane1), + Unsafe.Add(ref upperRef, lane2), + Unsafe.Add(ref upperRef, lane3)); + // Note masked is 'equal' - therefore use AndNot below - Vector256 masked = Avx2.CompareEqual(Avx2.And(Avx2.GatherVector256(tablePtr, blockOffset, 8), mask), mask); + Vector256 masked = Avx2.CompareEqual(Avx2.And(count, mask), mask); // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) - Vector256 inc = Avx2.AndNot(masked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offsetLong)); + Vector256 inc = Avx2.AndNot(masked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), index)); bool wasInc = Avx2.MoveMask(Avx2.CompareEqual(masked.AsByte(), Vector256.Zero).AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); - tablePtr[blockOffset.GetElement(0)] += inc.GetElement(0); - tablePtr[blockOffset.GetElement(1)] += inc.GetElement(1); - tablePtr[blockOffset.GetElement(2)] += inc.GetElement(2); - tablePtr[blockOffset.GetElement(3)] += inc.GetElement(3); + Unsafe.Add(ref lowerRef, lane0) = count.GetElement(0) + inc.GetElement(0); + Unsafe.Add(ref lowerRef, lane1) = count.GetElement(1) + inc.GetElement(1); + Unsafe.Add(ref upperRef, lane2) = count.GetElement(2) + inc.GetElement(2); + Unsafe.Add(ref upperRef, lane3) = count.GetElement(3) + inc.GetElement(3); + + Avx.Store(tablePtr + block, lower); + Avx.Store(tablePtr + block + 4, upper); if (wasInc && (++size == sampleSize)) { From 5735a0b1d2dd3fe9bba9c6c18bce9ef2c322ebbb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 19 Apr 2026 07:06:32 +0000 Subject: [PATCH 2/6] Refine CmSketch AVX vector updates Agent-Logs-Url: https://github.com/bitfaster/BitFaster.Caching/sessions/7289ae1d-0fa0-43fd-bbe1-2511d41eb3d9 Co-authored-by: bitfaster <12851828+bitfaster@users.noreply.github.com> --- BitFaster.Caching/Lfu/CmSketchCore.cs | 46 +++++++++++++++++---------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index a9c93d28..d3085d38 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -325,14 +325,11 @@ private unsafe int EstimateFrequencyAvx(T value) Vector256 lower = Avx.LoadVector256(tablePtr + block); Vector256 upper = Avx.LoadVector256(tablePtr + block + 4); - ref long lowerRef = ref Unsafe.As, long>(ref lower); - ref long upperRef = ref Unsafe.As, long>(ref upper); - Vector256 countVector = Vector256.Create( - (ulong)Unsafe.Add(ref lowerRef, lane0), - (ulong)Unsafe.Add(ref lowerRef, lane1), - (ulong)Unsafe.Add(ref upperRef, lane2), - (ulong)Unsafe.Add(ref upperRef, lane3)); + (ulong)lower.GetElement(lane0), + (ulong)lower.GetElement(lane1), + (ulong)upper.GetElement(lane2), + (ulong)upper.GetElement(lane3)); Vector128 count = Avx2.PermuteVar8x32(Avx2.And(Avx2.ShiftRightLogicalVariable(countVector, index), Vector256.Create(0xfUL)).AsInt32(), Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7)) .GetLower() @@ -383,14 +380,21 @@ private unsafe void IncrementAvx(T value) Vector256 lower = Avx.LoadVector256(tablePtr + block); Vector256 upper = Avx.LoadVector256(tablePtr + block + 4); - ref long lowerRef = ref Unsafe.As, long>(ref lower); - ref long upperRef = ref Unsafe.As, long>(ref upper); + long lower0 = lower.GetElement(0); + long lower1 = lower.GetElement(1); + long lower2 = lower.GetElement(2); + long lower3 = lower.GetElement(3); + + long upper0 = upper.GetElement(0); + long upper1 = upper.GetElement(1); + long upper2 = upper.GetElement(2); + long upper3 = upper.GetElement(3); Vector256 count = Vector256.Create( - Unsafe.Add(ref lowerRef, lane0), - Unsafe.Add(ref lowerRef, lane1), - Unsafe.Add(ref upperRef, lane2), - Unsafe.Add(ref upperRef, lane3)); + lane0 == 0 ? lower0 : lower1, + lane1 == 2 ? lower2 : lower3, + lane2 == 0 ? upper0 : upper1, + lane3 == 2 ? upper2 : upper3); // Note masked is 'equal' - therefore use AndNot below Vector256 masked = Avx2.CompareEqual(Avx2.And(count, mask), mask); @@ -400,10 +404,18 @@ private unsafe void IncrementAvx(T value) bool wasInc = Avx2.MoveMask(Avx2.CompareEqual(masked.AsByte(), Vector256.Zero).AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); - Unsafe.Add(ref lowerRef, lane0) = count.GetElement(0) + inc.GetElement(0); - Unsafe.Add(ref lowerRef, lane1) = count.GetElement(1) + inc.GetElement(1); - Unsafe.Add(ref upperRef, lane2) = count.GetElement(2) + inc.GetElement(2); - Unsafe.Add(ref upperRef, lane3) = count.GetElement(3) + inc.GetElement(3); + long nextLower0 = lane0 == 0 ? count.GetElement(0) + inc.GetElement(0) : lower0; + long nextLower1 = lane0 == 1 ? count.GetElement(0) + inc.GetElement(0) : lower1; + long nextLower2 = lane1 == 2 ? count.GetElement(1) + inc.GetElement(1) : lower2; + long nextLower3 = lane1 == 3 ? count.GetElement(1) + inc.GetElement(1) : lower3; + + long nextUpper0 = lane2 == 0 ? count.GetElement(2) + inc.GetElement(2) : upper0; + long nextUpper1 = lane2 == 1 ? count.GetElement(2) + inc.GetElement(2) : upper1; + long nextUpper2 = lane3 == 2 ? count.GetElement(3) + inc.GetElement(3) : upper2; + long nextUpper3 = lane3 == 3 ? count.GetElement(3) + inc.GetElement(3) : upper3; + + lower = Vector256.Create(nextLower0, nextLower1, nextLower2, nextLower3); + upper = Vector256.Create(nextUpper0, nextUpper1, nextUpper2, nextUpper3); Avx.Store(tablePtr + block, lower); Avx.Store(tablePtr + block + 4, upper); From e1c44bba3aa2ae25309c18c974703fef8558687a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Apr 2026 01:53:01 +0000 Subject: [PATCH 3/6] Use aligned CmSketch AVX block access Agent-Logs-Url: https://github.com/bitfaster/BitFaster.Caching/sessions/68029771-3d41-4b5f-8733-4bb8c9d8ab7c Co-authored-by: bitfaster <12851828+bitfaster@users.noreply.github.com> --- BitFaster.Caching/Lfu/CmSketchCore.cs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index d3085d38..c35a3094 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -152,7 +152,7 @@ private void EnsureCapacity(long maximumSize) table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); tableAddr = (long*)Unsafe.AsPointer(ref table[0]); - tableAddr = (long*)((long)tableAddr + (long)tableAddr % 64); + tableAddr = (long*)(((nuint)tableAddr + 63u) & ~(nuint)63); blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; } @@ -322,8 +322,13 @@ private unsafe int EstimateFrequencyAvx(T value) fixed (long* tablePtr = table) #endif { +#if NET6_0_OR_GREATER + Vector256 lower = Avx.LoadAlignedVector256(tablePtr + block); + Vector256 upper = Avx.LoadAlignedVector256(tablePtr + block + 4); +#else Vector256 lower = Avx.LoadVector256(tablePtr + block); Vector256 upper = Avx.LoadVector256(tablePtr + block + 4); +#endif Vector256 countVector = Vector256.Create( (ulong)lower.GetElement(lane0), @@ -377,8 +382,13 @@ private unsafe void IncrementAvx(T value) fixed (long* tablePtr = table) #endif { +#if NET6_0_OR_GREATER + Vector256 lower = Avx.LoadAlignedVector256(tablePtr + block); + Vector256 upper = Avx.LoadAlignedVector256(tablePtr + block + 4); +#else Vector256 lower = Avx.LoadVector256(tablePtr + block); Vector256 upper = Avx.LoadVector256(tablePtr + block + 4); +#endif long lower0 = lower.GetElement(0); long lower1 = lower.GetElement(1); @@ -417,8 +427,13 @@ private unsafe void IncrementAvx(T value) lower = Vector256.Create(nextLower0, nextLower1, nextLower2, nextLower3); upper = Vector256.Create(nextUpper0, nextUpper1, nextUpper2, nextUpper3); +#if NET6_0_OR_GREATER + Avx.StoreAligned(tablePtr + block, lower); + Avx.StoreAligned(tablePtr + block + 4, upper); +#else Avx.Store(tablePtr + block, lower); Avx.Store(tablePtr + block + 4, upper); +#endif if (wasInc && (++size == sampleSize)) { From cfbd9ac63f8cbb16dd1355409d89e5e1ca002a37 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Apr 2026 01:55:58 +0000 Subject: [PATCH 4/6] Clean up CmSketch AVX alignment constants Agent-Logs-Url: https://github.com/bitfaster/BitFaster.Caching/sessions/68029771-3d41-4b5f-8733-4bb8c9d8ab7c Co-authored-by: bitfaster <12851828+bitfaster@users.noreply.github.com> --- BitFaster.Caching/Lfu/CmSketchCore.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index c35a3094..d433c6e3 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -37,6 +37,7 @@ public unsafe class CmSketchCore { private const long ResetMask = 0x7777777777777777L; private const long OneMask = 0x1111111111111111L; + private const nuint CacheLineAlignmentMask = 63; private long[] table; #if NET6_0_OR_GREATER @@ -152,7 +153,7 @@ private void EnsureCapacity(long maximumSize) table = GC.AllocateArray(Math.Max(BitOps.CeilingPowerOfTwo(maximum), 8) + pad, pinned); tableAddr = (long*)Unsafe.AsPointer(ref table[0]); - tableAddr = (long*)(((nuint)tableAddr + 63u) & ~(nuint)63); + tableAddr = (long*)(((nuint)tableAddr + CacheLineAlignmentMask) & ~CacheLineAlignmentMask); blockMask = (int)((uint)(table.Length - pad) >> 3) - 1; } From 07f4d281e762b2ffc5d1abda014b2dee15643a7e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 20 Apr 2026 02:03:22 +0000 Subject: [PATCH 5/6] Vectorize CmSketch AVX counter updates Agent-Logs-Url: https://github.com/bitfaster/BitFaster.Caching/sessions/c76dc857-c67e-4ce6-a05f-2ebb894fcc0b Co-authored-by: bitfaster <12851828+bitfaster@users.noreply.github.com> --- BitFaster.Caching/Lfu/CmSketchCore.cs | 64 +++++++++++++-------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index d433c6e3..4bce5007 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -374,8 +374,29 @@ private unsafe void IncrementAvx(T value) int lane2 = h2 & 1; int lane3 = (h3 & 1) + 2; - Vector256 index = Vector256.Create((ulong)index0, (ulong)index1, (ulong)index2, (ulong)index3); - Vector256 mask = Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), index); + Vector256 lowerIndex = Vector256.Create( + lane0 == 0 ? (ulong)index0 : 0UL, + lane0 == 1 ? (ulong)index0 : 0UL, + lane1 == 2 ? (ulong)index1 : 0UL, + lane1 == 3 ? (ulong)index1 : 0UL); + + Vector256 upperIndex = Vector256.Create( + lane2 == 0 ? (ulong)index2 : 0UL, + lane2 == 1 ? (ulong)index2 : 0UL, + lane3 == 2 ? (ulong)index3 : 0UL, + lane3 == 3 ? (ulong)index3 : 0UL); + + Vector256 lowerLaneMask = Vector256.Create( + lane0 == 0 ? -1L : 0L, + lane0 == 1 ? -1L : 0L, + lane1 == 2 ? -1L : 0L, + lane1 == 3 ? -1L : 0L); + + Vector256 upperLaneMask = Vector256.Create( + lane2 == 0 ? -1L : 0L, + lane2 == 1 ? -1L : 0L, + lane3 == 2 ? -1L : 0L, + lane3 == 3 ? -1L : 0L); #if NET6_0_OR_GREATER long* tablePtr = tableAddr; @@ -391,42 +412,21 @@ private unsafe void IncrementAvx(T value) Vector256 upper = Avx.LoadVector256(tablePtr + block + 4); #endif - long lower0 = lower.GetElement(0); - long lower1 = lower.GetElement(1); - long lower2 = lower.GetElement(2); - long lower3 = lower.GetElement(3); - - long upper0 = upper.GetElement(0); - long upper1 = upper.GetElement(1); - long upper2 = upper.GetElement(2); - long upper3 = upper.GetElement(3); - - Vector256 count = Vector256.Create( - lane0 == 0 ? lower0 : lower1, - lane1 == 2 ? lower2 : lower3, - lane2 == 0 ? upper0 : upper1, - lane3 == 2 ? upper2 : upper3); + Vector256 lowerMask = Avx2.And(Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), lowerIndex), lowerLaneMask); + Vector256 upperMask = Avx2.And(Avx2.ShiftLeftLogicalVariable(Vector256.Create(0xfL), upperIndex), upperLaneMask); // Note masked is 'equal' - therefore use AndNot below - Vector256 masked = Avx2.CompareEqual(Avx2.And(count, mask), mask); + Vector256 lowerMasked = Avx2.CompareEqual(Avx2.And(lower, lowerMask), lowerMask); + Vector256 upperMasked = Avx2.CompareEqual(Avx2.And(upper, upperMask), upperMask); // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters) - Vector256 inc = Avx2.AndNot(masked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), index)); - - bool wasInc = Avx2.MoveMask(Avx2.CompareEqual(masked.AsByte(), Vector256.Zero).AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); - - long nextLower0 = lane0 == 0 ? count.GetElement(0) + inc.GetElement(0) : lower0; - long nextLower1 = lane0 == 1 ? count.GetElement(0) + inc.GetElement(0) : lower1; - long nextLower2 = lane1 == 2 ? count.GetElement(1) + inc.GetElement(1) : lower2; - long nextLower3 = lane1 == 3 ? count.GetElement(1) + inc.GetElement(1) : lower3; + Vector256 lowerInc = Avx2.And(Avx2.AndNot(lowerMasked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), lowerIndex)), lowerLaneMask); + Vector256 upperInc = Avx2.And(Avx2.AndNot(upperMasked, Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), upperIndex)), upperLaneMask); - long nextUpper0 = lane2 == 0 ? count.GetElement(2) + inc.GetElement(2) : upper0; - long nextUpper1 = lane2 == 1 ? count.GetElement(2) + inc.GetElement(2) : upper1; - long nextUpper2 = lane3 == 2 ? count.GetElement(3) + inc.GetElement(3) : upper2; - long nextUpper3 = lane3 == 3 ? count.GetElement(3) + inc.GetElement(3) : upper3; + bool wasInc = Avx2.MoveMask(Avx2.CompareEqual(Avx2.Or(lowerInc, upperInc).AsByte(), Vector256.Zero).AsByte()) != unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); - lower = Vector256.Create(nextLower0, nextLower1, nextLower2, nextLower3); - upper = Vector256.Create(nextUpper0, nextUpper1, nextUpper2, nextUpper3); + lower = Avx2.Add(lower, lowerInc); + upper = Avx2.Add(upper, upperInc); #if NET6_0_OR_GREATER Avx.StoreAligned(tablePtr + block, lower); From aabff690569eb7de58764bdd3910a9998c6fa644 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 21 Apr 2026 07:22:02 +0000 Subject: [PATCH 6/6] Remove branches from CmSketch AVX lane masks Agent-Logs-Url: https://github.com/bitfaster/BitFaster.Caching/sessions/cdf1657f-7869-4fc8-a0a6-43ccc27a3f87 Co-authored-by: bitfaster <12851828+bitfaster@users.noreply.github.com> --- BitFaster.Caching/Lfu/CmSketchCore.cs | 34 +++++++++------------------ 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 4bce5007..3a659c64 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -374,29 +374,17 @@ private unsafe void IncrementAvx(T value) int lane2 = h2 & 1; int lane3 = (h3 & 1) + 2; - Vector256 lowerIndex = Vector256.Create( - lane0 == 0 ? (ulong)index0 : 0UL, - lane0 == 1 ? (ulong)index0 : 0UL, - lane1 == 2 ? (ulong)index1 : 0UL, - lane1 == 3 ? (ulong)index1 : 0UL); - - Vector256 upperIndex = Vector256.Create( - lane2 == 0 ? (ulong)index2 : 0UL, - lane2 == 1 ? (ulong)index2 : 0UL, - lane3 == 2 ? (ulong)index3 : 0UL, - lane3 == 3 ? (ulong)index3 : 0UL); - - Vector256 lowerLaneMask = Vector256.Create( - lane0 == 0 ? -1L : 0L, - lane0 == 1 ? -1L : 0L, - lane1 == 2 ? -1L : 0L, - lane1 == 3 ? -1L : 0L); - - Vector256 upperLaneMask = Vector256.Create( - lane2 == 0 ? -1L : 0L, - lane2 == 1 ? -1L : 0L, - lane3 == 2 ? -1L : 0L, - lane3 == 3 ? -1L : 0L); + Vector256 laneOffsets = Vector256.Create(0L, 1L, 2L, 3L); + Vector256 lowerIndex = Vector256.Create((ulong)index0, (ulong)index0, (ulong)index1, (ulong)index1); + Vector256 upperIndex = Vector256.Create((ulong)index2, (ulong)index2, (ulong)index3, (ulong)index3); + + Vector256 lowerLaneMask = Avx2.CompareEqual( + laneOffsets, + Vector256.Create((long)lane0, (long)lane0, (long)lane1, (long)lane1)); + + Vector256 upperLaneMask = Avx2.CompareEqual( + laneOffsets, + Vector256.Create((long)lane2, (long)lane2, (long)lane3, (long)lane3)); #if NET6_0_OR_GREATER long* tablePtr = tableAddr;