diff --git a/benchmark/CDT.Benchmarks/PredicatesBenchmarks.cs b/benchmark/CDT.Benchmarks/PredicatesBenchmarks.cs new file mode 100644 index 0000000..942eb7d --- /dev/null +++ b/benchmark/CDT.Benchmarks/PredicatesBenchmarks.cs @@ -0,0 +1,418 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Diagnosers; +using BenchmarkDotNet.Running; +using CDT.Predicates; + +// --------------------------------------------------------------------------- +// Predicates microbenchmarks +// --------------------------------------------------------------------------- + +/// +/// Microbenchmarks that compare and +/// head-to-head for every predicate variant. +/// +/// +/// Two input scenarios are covered for each method: +/// +/// General – random coordinates in general position. +/// The adaptive fast-path (Stage A) returns immediately without any +/// expansion arithmetic, so these numbers reflect the absolute best +/// case for . +/// NearDegenerate – nearly-collinear / nearly-co-circular +/// points that force the adaptive predicate through Stage B–D (and +/// therefore all the way to the exact expansion code). These numbers +/// show the cost of the slow path and are comparable to what +/// always pays. +/// +/// +/// +[MemoryDiagnoser] +[EventPipeProfiler(EventPipeProfile.CpuSampling)] +[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)] +[CategoriesColumn] +[ShortRunJob] +public class PredicatesBenchmarks +{ + // Number of predicate calls per benchmark iteration – large enough to + // dominate timer resolution, small enough for a short-run job. + private const int N = 4_000; + + // ── General-position input arrays ────────────────────────────────────── + // Orient2d uses 3 points (6 doubles / 6 floats) per call. 
+ // InCircle uses 4 points (8 doubles / 8 floats) per call. + private double[] _orient2dD = null!; + private float[] _orient2dF = null!; + private double[] _inCircleD = null!; + private float[] _inCircleF = null!; + + // ── Near-degenerate input arrays ─────────────────────────────────────── + // Orient2d: nearly-collinear triples. + // InCircle: points nearly on a common circumcircle. + private double[] _orient2dDeg = null!; + private float[] _orient2dFDeg = null!; + private double[] _inCircleDeg = null!; + private float[] _inCircleFDeg = null!; + + [GlobalSetup] + public void Setup() + { + var rng = new Random(unchecked((int)0xDEAD_BEEF)); + + // ── general-position ─────────────────────────────────────────────── + _orient2dD = FillDoubles(rng, 6 * N, scale: 1000.0); + _orient2dF = ToFloats(_orient2dD); + _inCircleD = FillDoubles(rng, 8 * N, scale: 1000.0); + _inCircleF = ToFloats(_inCircleD); + + // ── near-degenerate ──────────────────────────────────────────────── + // Orient2d: triple (A, B, C) where C is nearly on line AB. + // We construct A=(0,0), B=(t,t), C=(2t, 2t+ε) for varying t and ε≈1e-14. + _orient2dDeg = BuildNearCollinear(N); + _orient2dFDeg = ToFloats(_orient2dDeg); + + // InCircle: quadruple (A,B,C,D) where D is nearly on the + // circumcircle of A, B, C. 
+ _inCircleDeg = BuildNearCircle(N); + _inCircleFDeg = ToFloats(_inCircleDeg); + } + + // ========================================================================= + // Orient2d – double + // ========================================================================= + + [Benchmark(Description = "Adaptive – Orient2d double (general)")] + [BenchmarkCategory("Orient2d-double")] + public double Adaptive_Orient2d_Double_General() + { + double acc = 0.0; + double[] pts = _orient2dD; + for (int i = 0; i < N; i++) + { + int j = i * 6; + acc += PredicatesAdaptive.Orient2d( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], pts[j + 4], pts[j + 5]); + } + return acc; + } + + [Benchmark(Description = "Exact – Orient2d double (general)")] + [BenchmarkCategory("Orient2d-double")] + public double Exact_Orient2d_Double_General() + { + double acc = 0.0; + double[] pts = _orient2dD; + for (int i = 0; i < N; i++) + { + int j = i * 6; + acc += PredicatesExact.Orient2d( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], pts[j + 4], pts[j + 5]); + } + return acc; + } + + [Benchmark(Description = "Adaptive – Orient2d double (near-degenerate)")] + [BenchmarkCategory("Orient2d-double-hard")] + public double Adaptive_Orient2d_Double_NearDegenerate() + { + double acc = 0.0; + double[] pts = _orient2dDeg; + for (int i = 0; i < N; i++) + { + int j = i * 6; + acc += PredicatesAdaptive.Orient2d( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], pts[j + 4], pts[j + 5]); + } + return acc; + } + + [Benchmark(Description = "Exact – Orient2d double (near-degenerate)")] + [BenchmarkCategory("Orient2d-double-hard")] + public double Exact_Orient2d_Double_NearDegenerate() + { + double acc = 0.0; + double[] pts = _orient2dDeg; + for (int i = 0; i < N; i++) + { + int j = i * 6; + acc += PredicatesExact.Orient2d( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], pts[j + 4], pts[j + 5]); + } + return acc; + } + + // ========================================================================= + // Orient2d – float + // 
========================================================================= + + [Benchmark(Description = "Adaptive – Orient2d float (general)")] + [BenchmarkCategory("Orient2d-float")] + public float Adaptive_Orient2d_Float_General() + { + float acc = 0f; + float[] pts = _orient2dF; + for (int i = 0; i < N; i++) + { + int j = i * 6; + acc += PredicatesAdaptive.Orient2d( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], pts[j + 4], pts[j + 5]); + } + return acc; + } + + [Benchmark(Description = "Exact – Orient2d float (general)")] + [BenchmarkCategory("Orient2d-float")] + public float Exact_Orient2d_Float_General() + { + float acc = 0f; + float[] pts = _orient2dF; + for (int i = 0; i < N; i++) + { + int j = i * 6; + acc += PredicatesExact.Orient2d( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], pts[j + 4], pts[j + 5]); + } + return acc; + } + + [Benchmark(Description = "Adaptive – Orient2d float (near-degenerate)")] + [BenchmarkCategory("Orient2d-float-hard")] + public float Adaptive_Orient2d_Float_NearDegenerate() + { + float acc = 0f; + float[] pts = _orient2dFDeg; + for (int i = 0; i < N; i++) + { + int j = i * 6; + acc += PredicatesAdaptive.Orient2d( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], pts[j + 4], pts[j + 5]); + } + return acc; + } + + [Benchmark(Description = "Exact – Orient2d float (near-degenerate)")] + [BenchmarkCategory("Orient2d-float-hard")] + public float Exact_Orient2d_Float_NearDegenerate() + { + float acc = 0f; + float[] pts = _orient2dFDeg; + for (int i = 0; i < N; i++) + { + int j = i * 6; + acc += PredicatesExact.Orient2d( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], pts[j + 4], pts[j + 5]); + } + return acc; + } + + // ========================================================================= + // InCircle – double + // ========================================================================= + + [Benchmark(Description = "Adaptive – InCircle double (general)")] + [BenchmarkCategory("InCircle-double")] + public double 
Adaptive_InCircle_Double_General() + { + double acc = 0.0; + double[] pts = _inCircleD; + for (int i = 0; i < N; i++) + { + int j = i * 8; + acc += PredicatesAdaptive.InCircle( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], + pts[j + 4], pts[j + 5], pts[j + 6], pts[j + 7]); + } + return acc; + } + + [Benchmark(Description = "Exact – InCircle double (general)")] + [BenchmarkCategory("InCircle-double")] + public double Exact_InCircle_Double_General() + { + double acc = 0.0; + double[] pts = _inCircleD; + for (int i = 0; i < N; i++) + { + int j = i * 8; + acc += PredicatesExact.InCircle( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], + pts[j + 4], pts[j + 5], pts[j + 6], pts[j + 7]); + } + return acc; + } + + [Benchmark(Description = "Adaptive – InCircle double (near-degenerate)")] + [BenchmarkCategory("InCircle-double-hard")] + public double Adaptive_InCircle_Double_NearDegenerate() + { + double acc = 0.0; + double[] pts = _inCircleDeg; + for (int i = 0; i < N; i++) + { + int j = i * 8; + acc += PredicatesAdaptive.InCircle( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], + pts[j + 4], pts[j + 5], pts[j + 6], pts[j + 7]); + } + return acc; + } + + [Benchmark(Description = "Exact – InCircle double (near-degenerate)")] + [BenchmarkCategory("InCircle-double-hard")] + public double Exact_InCircle_Double_NearDegenerate() + { + double acc = 0.0; + double[] pts = _inCircleDeg; + for (int i = 0; i < N; i++) + { + int j = i * 8; + acc += PredicatesExact.InCircle( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], + pts[j + 4], pts[j + 5], pts[j + 6], pts[j + 7]); + } + return acc; + } + + // ========================================================================= + // InCircle – float + // ========================================================================= + + [Benchmark(Description = "Adaptive – InCircle float (general)")] + [BenchmarkCategory("InCircle-float")] + public float Adaptive_InCircle_Float_General() + { + float acc = 0f; + float[] pts = _inCircleF; + for (int i = 0; 
i < N; i++) + { + int j = i * 8; + acc += PredicatesAdaptive.InCircle( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], + pts[j + 4], pts[j + 5], pts[j + 6], pts[j + 7]); + } + return acc; + } + + [Benchmark(Description = "Exact – InCircle float (general)")] + [BenchmarkCategory("InCircle-float")] + public float Exact_InCircle_Float_General() + { + float acc = 0f; + float[] pts = _inCircleF; + for (int i = 0; i < N; i++) + { + int j = i * 8; + acc += PredicatesExact.InCircle( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], + pts[j + 4], pts[j + 5], pts[j + 6], pts[j + 7]); + } + return acc; + } + + [Benchmark(Description = "Adaptive – InCircle float (near-degenerate)")] + [BenchmarkCategory("InCircle-float-hard")] + public float Adaptive_InCircle_Float_NearDegenerate() + { + float acc = 0f; + float[] pts = _inCircleFDeg; + for (int i = 0; i < N; i++) + { + int j = i * 8; + acc += PredicatesAdaptive.InCircle( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], + pts[j + 4], pts[j + 5], pts[j + 6], pts[j + 7]); + } + return acc; + } + + [Benchmark(Description = "Exact – InCircle float (near-degenerate)")] + [BenchmarkCategory("InCircle-float-hard")] + public float Exact_InCircle_Float_NearDegenerate() + { + float acc = 0f; + float[] pts = _inCircleFDeg; + for (int i = 0; i < N; i++) + { + int j = i * 8; + acc += PredicatesExact.InCircle( + pts[j], pts[j + 1], pts[j + 2], pts[j + 3], + pts[j + 4], pts[j + 5], pts[j + 6], pts[j + 7]); + } + return acc; + } + + // ========================================================================= + // Helpers + // ========================================================================= + + private static double[] FillDoubles(Random rng, int count, double scale) + { + var arr = new double[count]; + for (int i = 0; i < arr.Length; i++) + arr[i] = (rng.NextDouble() - 0.5) * 2.0 * scale; + return arr; + } + + private static float[] ToFloats(double[] src) + { + var arr = new float[src.Length]; + for (int i = 0; i < src.Length; i++) + arr[i] = 
(float)src[i]; + return arr; + } + + /// + /// Builds an array of nearly-collinear Orient2d triples. + /// A=(0,0), B=(t,t), C=(2t, 2t+ε) – nearly collinear because ε is tiny. + /// Varies t across [1,100] and ε across [1e-14, 1e-13] to keep results + /// diverse while reliably triggering Stage B/C of the adaptive predicate. + /// + private static double[] BuildNearCollinear(int n) + { + var arr = new double[6 * n]; + for (int i = 0; i < n; i++) + { + double t = 1.0 + i * (99.0 / n); // t in [1, 100] + double eps = 1e-14 * (1.0 + i % 10); // ε in [1e-14, 1e-13] + int j = i * 6; + arr[j] = 0.0; arr[j + 1] = 0.0; // A + arr[j + 2] = t; arr[j + 3] = t; // B + arr[j + 4] = 2 * t; arr[j + 5] = 2 * t + eps; // C (nearly collinear) + } + return arr; + } + + /// + /// Builds nearly-co-circular InCircle quadruples. + /// A, B, C are vertices of an equilateral triangle inscribed in a circle of + /// radius r ∈ [100, 1000]; D is placed at radius r + ε so it is just barely outside – + /// but close enough that the fast floating-point test cannot decide. 
+ /// + private static double[] BuildNearCircle(int n) + { + var arr = new double[8 * n]; + for (int i = 0; i < n; i++) + { + double r = 100.0 + i * (900.0 / n); // radius in [100, 1000] + double eps = 1e-11 * r * (1.0 + i % 5); // ε scaled to radius + + double ax = r, ay = 0.0; + double bx = r * Math.Cos(2 * Math.PI / 3), + by = r * Math.Sin(2 * Math.PI / 3); + double cx = r * Math.Cos(4 * Math.PI / 3), + cy = r * Math.Sin(4 * Math.PI / 3); + double dx = r + eps, dy = 0.0; // D just outside + + int j = i * 8; + arr[j] = ax; arr[j + 1] = ay; + arr[j + 2] = bx; arr[j + 3] = by; + arr[j + 4] = cx; arr[j + 5] = cy; + arr[j + 6] = dx; arr[j + 7] = dy; + } + return arr; + } +} diff --git a/src/CDT.Core/Predicates/PredicatesAdaptive.cs b/src/CDT.Core/Predicates/PredicatesAdaptive.cs index 41b19d3..153b709 100644 --- a/src/CDT.Core/Predicates/PredicatesAdaptive.cs +++ b/src/CDT.Core/Predicates/PredicatesAdaptive.cs @@ -6,6 +6,8 @@ // artem-ogre/CDT (predicates.h, predicates::adaptive namespace). using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; namespace CDT.Predicates; @@ -59,7 +61,8 @@ public static class PredicatesAdaptive /// zero if collinear, or a negative value if to the right. /// /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] // opt-15: aggressive JIT opt for fast-path Stage A + [SkipLocalsInit] public static double Orient2d( double ax, double ay, double bx, double by, double cx, double cy) { @@ -192,7 +195,8 @@ public static float Orient2d( /// zero if on, or a negative value if outside. 
/// /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] // opt-16: keep only AggressiveInlining (AggressiveOptimization inflated the Stage-B frame and hurt Stage-A) + [SkipLocalsInit] public static double InCircle( double ax, double ay, double bx, double by, double cx, double cy, double dx, double dy) @@ -367,8 +371,14 @@ internal static (double aHi, double aLo) Split(double a) [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static double MultTail(double a, double b, double p) { - var (aHi, aLo) = Split(a); - var (bHi, bLo) = Split(b); + double c = SplitterD * a; + double aBig = c - a; + double aHi = c - aBig; + double aLo = a - aHi; + c = SplitterD * b; + double bBig = c - b; + double bHi = c - bBig; + double bLo = b - bHi; double y = p - aHi * bHi; y -= aLo * bHi; y -= aHi * bLo; @@ -379,6 +389,8 @@ internal static double MultTail(double a, double b, double p) /// Exact expansion of ax*by - ay*bx (up to 4 non-zero terms). /// Matches Lenthe ExpansionBase::TwoTwoDiff. /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [SkipLocalsInit] // opt-11: x0..x3 are all unconditionally computed before conditional write internal static int TwoTwoDiff(double ax, double by, double ay, double bx, Span h) { double axby1 = ax * by; @@ -408,6 +420,8 @@ internal static int TwoTwoDiff(double ax, double by, double ay, double bx, Span< /// Matches Lenthe ExpansionBase::ScaleExpansion. /// Output has up to 2*elen terms. /// + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] // opt-7, opt-18 + [SkipLocalsInit] // opt-8: locals (hIdx, Q, hh, Ti, ti, Qi) are all written before read internal static int ScaleExpansion(Span e, int elen, double b, Span h) { if (elen == 0 || b == 0.0) @@ -416,31 +430,40 @@ internal static int ScaleExpansion(Span e, int elen, double b, Spane*s*s + e*t*t as an expansion (two ScaleExpansion calls each, then sum). 
/// Max output: 32 terms for 4-term input (used for InCircle Stage B lift terms). /// + [SkipLocalsInit] // keep SkipLocalsInit; remove AggressiveInlining (inlining 3× into AdaptiveInCircle bloats Stage-A JIT frame) internal static int ScaleExpansionSum(Span e, int elen, double s, double t, Span h) { Span es = stackalloc double[8]; @@ -470,63 +494,148 @@ internal static int ScaleExpansionSum(Span e, int elen, double s, double /// Merge-then-accumulate two expansions. Matches Lenthe ExpansionBase::ExpansionSum: /// std::merge by |value| (stable), then sequential grow-expansion accumulation. /// + [MethodImpl(MethodImplOptions.AggressiveOptimization)] // opt-20: aggressive JIT optimization for this hot method + [SkipLocalsInit] internal static int ExpansionSum(Span e, int elen, Span f, int flen, Span h) { if (elen == 0 && flen == 0) { return 0; } - if (elen == 0) { f[..flen].CopyTo(h); return flen; } - if (flen == 0) { e[..elen].CopyTo(h); return elen; } + if (elen == 0) + { + // opt-5: Unsafe.CopyBlockUnaligned replaces Span.CopyTo (eliminates Memmove call overhead) + Unsafe.CopyBlockUnaligned( + ref Unsafe.As(ref MemoryMarshal.GetReference(h)), + ref Unsafe.As(ref MemoryMarshal.GetReference(f)), + (uint)(flen * sizeof(double))); + return flen; + } + if (flen == 0) + { + // opt-5: same as above for flen==0 fast path + Unsafe.CopyBlockUnaligned( + ref Unsafe.As(ref MemoryMarshal.GetReference(h)), + ref Unsafe.As(ref MemoryMarshal.GetReference(e)), + (uint)(elen * sizeof(double))); + return elen; + } int total = elen + flen; - // Merge sorted by |value| into temporary buffer. - // Maximum merged size for InCircle Stage D is 192+192=384 ≤ 400, so - // the stackalloc path is always taken for that call site. - Span merged = total <= 400 ? stackalloc double[400] : new double[total]; + // opt-1: Tiered stackalloc — allocate only as much as the actual input size requires. 
+ // Each size tier stackallocs its scratch buffer in its own branch and passes it + // straight to ExpansionSumCore, since a stack-allocated Span cannot be assigned + // to a variable declared in an outer scope (it could then outlive its stack frame). + if (total <= 16) + { + Span merged16 = stackalloc double[16]; + return ExpansionSumCore(e, elen, f, flen, h, merged16); + } + if (total <= 64) + { + Span merged64 = stackalloc double[64]; + return ExpansionSumCore(e, elen, f, flen, h, merged64); + } + if (total <= 400) + { + Span merged400 = stackalloc double[400]; + return ExpansionSumCore(e, elen, f, flen, h, merged400); + } + return ExpansionSumCore(e, elen, f, flen, h, new double[total]); + } + + // opt-2, opt-3, opt-4: Core merge+accumulate logic — receives a pre-sized scratch buffer. + // All span accesses use MemoryMarshal.GetReference + Unsafe.Add to eliminate bounds checks. + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] + private static int ExpansionSumCore( + Span e, int elen, Span f, int flen, Span h, Span merged) + { + ref double eRef = ref MemoryMarshal.GetReference(e); + ref double fRef = ref MemoryMarshal.GetReference(f); + ref double mRef = ref MemoryMarshal.GetReference(merged); + int ei = 0, fi = 0, mi = 0; while (ei < elen && fi < flen) { - if (Math.Abs(f[fi]) < Math.Abs(e[ei])) + double eVal = Unsafe.Add(ref eRef, ei); + double fVal = Unsafe.Add(ref fRef, fi); + if (Math.Abs(fVal) < Math.Abs(eVal)) { - merged[mi++] = f[fi++]; + Unsafe.Add(ref mRef, mi++) = fVal; + fi++; } else { - merged[mi++] = e[ei++]; + Unsafe.Add(ref mRef, mi++) = eVal; + ei++; } } - while (ei < elen) { merged[mi++] = e[ei++]; } - while (fi < flen) { merged[mi++] = f[fi++]; } + // opt-4: tail copy loops → Unsafe.CopyBlockUnaligned + if (ei < elen) + { + int rem = elen - ei; + Unsafe.CopyBlockUnaligned( + ref Unsafe.As(ref Unsafe.Add(ref mRef, mi)), + ref Unsafe.As(ref Unsafe.Add(ref eRef, ei)), + (uint)(rem * sizeof(double))); + mi += rem; + } + if (fi < 
flen) + { + int rem = flen - fi; + Unsafe.CopyBlockUnaligned( + ref Unsafe.As(ref Unsafe.Add(ref mRef, mi)), + ref Unsafe.As(ref Unsafe.Add(ref fRef, fi)), + (uint)(rem * sizeof(double))); + mi += rem; + } - // Sequential accumulation + // opt-3: bounds-check-free accumulation loop using ref locals + ref double hRef = ref MemoryMarshal.GetReference(h); int hIdx = 0; - double Q = merged[0]; - double Qnew = merged[1] + Q; - double hh = FastPlusTail(merged[1], Q, Qnew); + double Q = Unsafe.Add(ref mRef, 0); + double m1 = Unsafe.Add(ref mRef, 1); + double Qnew = m1 + Q; + double hh = FastPlusTail(m1, Q, Qnew); Q = Qnew; - if (hh != 0.0) { h[hIdx++] = hh; } + if (hh != 0.0) { Unsafe.Add(ref hRef, hIdx++) = hh; } for (int g = 2; g < mi; g++) { - Qnew = Q + merged[g]; - hh = PlusTail(Q, merged[g], Qnew); + double mg = Unsafe.Add(ref mRef, g); + Qnew = Q + mg; + hh = PlusTail(Q, mg, Qnew); Q = Qnew; - if (hh != 0.0) { h[hIdx++] = hh; } + if (hh != 0.0) { Unsafe.Add(ref hRef, hIdx++) = hh; } } - if (Q != 0.0) { h[hIdx++] = Q; } + if (Q != 0.0) { Unsafe.Add(ref hRef, hIdx++) = Q; } return hIdx; } [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static double Estimate(Span e, int elen) { - double sum = 0.0; - for (int i = 0; i < elen; i++) { sum += e[i]; } - return sum; + if (Vector256.IsHardwareAccelerated && elen >= 4) + { + ref double eRef = ref MemoryMarshal.GetReference(e); + Vector256 acc = Vector256.Zero; + int i = 0; + for (; i <= elen - 4; i += 4) + acc = Vector256.Add(acc, Vector256.LoadUnsafe(ref eRef, (nuint)i)); + double sum = Vector256.Sum(acc); + // opt-13: bounds-check-free scalar tail using Unsafe.Add + for (; i < elen; i++) sum += Unsafe.Add(ref eRef, i); + return sum; + } + + // opt-13: bounds-check-free scalar loop + ref double sRef = ref MemoryMarshal.GetReference(e); + double s = 0.0; + for (int i = 0; i < elen; i++) { s += Unsafe.Add(ref sRef, i); } + return s; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] + 
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] // opt-12 internal static double MostSignificant(Span e, int elen) { for (int i = elen - 1; i >= 0; i--) @@ -536,8 +645,21 @@ internal static double MostSignificant(Span e, int elen) return 0.0; } + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] // opt-14 internal static void NegateInto(Span src, int len, Span dst) { + if (Vector256.IsHardwareAccelerated && len >= 4) + { + ref double srcRef = ref MemoryMarshal.GetReference(src); + ref double dstRef = ref MemoryMarshal.GetReference(dst); + int i = 0; + for (; i <= len - 4; i += 4) + Vector256.Negate(Vector256.LoadUnsafe(ref srcRef, (nuint)i)) + .StoreUnsafe(ref dstRef, (nuint)i); + for (; i < len; i++) dst[i] = -src[i]; + return; + } + for (int i = 0; i < len; i++) { dst[i] = -src[i]; } } } diff --git a/src/CDT.Core/Predicates/PredicatesExact.cs b/src/CDT.Core/Predicates/PredicatesExact.cs index b99ceb1..11f7360 100644 --- a/src/CDT.Core/Predicates/PredicatesExact.cs +++ b/src/CDT.Core/Predicates/PredicatesExact.cs @@ -4,6 +4,8 @@ // // Geometric predicates — exact (arbitrary-precision) paths. +using System.Runtime.CompilerServices; + namespace CDT.Predicates; /// @@ -41,6 +43,8 @@ public static class PredicatesExact /// zero if collinear, or a negative value if to the right. /// /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] // opt-17: inline into Adaptive.Orient2d Stage D + [SkipLocalsInit] public static double Orient2d( double ax, double ay, double bx, double by, double cx, double cy) { @@ -122,6 +126,8 @@ public static float Orient2d( /// zero if on, or a negative value if outside. 
/// /// + [MethodImpl(MethodImplOptions.AggressiveOptimization)] // opt-20: aggressive JIT opt for this hot method + [SkipLocalsInit] public static double InCircle( double ax, double ay, double bx, double by, double cx, double cy, double dx, double dy) @@ -172,22 +178,22 @@ public static double InCircle( int adetLen = PredicatesAdaptive.ScaleExpansionSum(bcd, bcdLen, ax, ay, adet); // bdet = -(cda*bx*bx + cda*by*by) - Span bdetPos = stackalloc double[96]; - int bdetPosLen = PredicatesAdaptive.ScaleExpansionSum(cda, cdaLen, bx, by, bdetPos); + // opt-18: eliminate bdetPos[96] by computing ScaleExpansionSum directly into bdet, + // then negating in-place. Saves 768 bytes of stack per call. Span bdet = stackalloc double[96]; - PredicatesAdaptive.NegateInto(bdetPos, bdetPosLen, bdet); - int bdetLen = bdetPosLen; + int bdetLen = PredicatesAdaptive.ScaleExpansionSum(cda, cdaLen, bx, by, bdet); + PredicatesAdaptive.NegateInto(bdet, bdetLen, bdet); // cdet = dab*cx*cx + dab*cy*cy Span cdet = stackalloc double[96]; int cdetLen = PredicatesAdaptive.ScaleExpansionSum(dab, dabLen, cx, cy, cdet); // ddet = -(abc*dx*dx + abc*dy*dy) - Span ddetPos = stackalloc double[96]; - int ddetPosLen = PredicatesAdaptive.ScaleExpansionSum(abc, abcLen, dx, dy, ddetPos); + // opt-19: same as opt-18 — eliminate ddetPos[96] stackalloc with in-place negate. + // Saves another 768 bytes of stack per call (total savings: 1 536 bytes). Span ddet = stackalloc double[96]; - PredicatesAdaptive.NegateInto(ddetPos, ddetPosLen, ddet); - int ddetLen = ddetPosLen; + int ddetLen = PredicatesAdaptive.ScaleExpansionSum(abc, abcLen, dx, dy, ddet); + PredicatesAdaptive.NegateInto(ddet, ddetLen, ddet); // deter = (adet + bdet) + (cdet + ddet) Span ab2 = stackalloc double[192];