diff --git a/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java b/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java index 55929c73f51..10c1e57efd7 100644 --- a/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java +++ b/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java @@ -101,7 +101,6 @@ private static class State { String version; String telemetryProxyEndpoint; Set peerTags = emptySet(); - String orgPropagationMarker; long lastTimeDiscovered; } @@ -317,8 +316,6 @@ private boolean processInfoResponse(State newState, String response) { ? unmodifiableSet(new HashSet<>((List) peer_tags)) : emptySet(); } - Object opm = map.get("org_prop_marker"); - newState.orgPropagationMarker = (opm instanceof String) ? (String) opm : null; try { newState.state = Strings.sha256(response); } catch (Throwable ex) { @@ -406,10 +403,6 @@ public Set peerTags() { return discoveryState.peerTags; } - public String getOrgPropagationMarker() { - return discoveryState.orgPropagationMarker; - } - public String getMetricsEndpoint() { return discoveryState.metricsEndpoint; } diff --git a/communication/src/test/groovy/datadog/communication/ddagent/DDAgentFeaturesDiscoveryTest.groovy b/communication/src/test/groovy/datadog/communication/ddagent/DDAgentFeaturesDiscoveryTest.groovy index ff97ef6d3a2..8bcda6eb811 100644 --- a/communication/src/test/groovy/datadog/communication/ddagent/DDAgentFeaturesDiscoveryTest.groovy +++ b/communication/src/test/groovy/datadog/communication/ddagent/DDAgentFeaturesDiscoveryTest.groovy @@ -50,7 +50,6 @@ class DDAgentFeaturesDiscoveryTest extends DDSpecification { static final String INFO_WITH_LONG_RUNNING_SPANS = loadJsonFile("agent-info-with-long-running-spans.json") static final String INFO_WITH_TELEMETRY_PROXY_RESPONSE = loadJsonFile("agent-info-with-telemetry-proxy.json") static final String INFO_WITH_OLD_EVP_PROXY = loadJsonFile("agent-info-with-old-evp-proxy.json") - static final String INFO_WITH_OPM = loadJsonFile("agent-info-with-opm.json") static final String PROBE_STATE = "probestate" def "test parse /info response"() { @@ -210,34 +209,6 @@ class DDAgentFeaturesDiscoveryTest extends DDSpecification { 0 * _ } - def "test parse /info response with org propagation marker"() { - setup: - OkHttpClient client = Mock(OkHttpClient) - DDAgentFeaturesDiscovery features = new DDAgentFeaturesDiscovery(client, monitoring, agentUrl, V0_5, true, false) - - when: "/info available" - features.discover() - - then: - 1 * client.newCall(_) >> { Request request -> infoResponse(request, INFO_WITH_OPM) } - features.getOrgPropagationMarker() == "abc123def0" - 0 * _ - } - - def "test parse /info response without org propagation marker"() { - setup: - OkHttpClient client = Mock(OkHttpClient) - DDAgentFeaturesDiscovery features = new DDAgentFeaturesDiscovery(client, monitoring, agentUrl, V0_5, true, false) - - when: "/info available" - features.discover() - - then: - 1 * client.newCall(_) >> { Request request -> infoResponse(request, INFO_RESPONSE) } - features.getOrgPropagationMarker() == null - 0 * _ - } - def "test fallback when /info empty"() { setup: OkHttpClient client = Mock(OkHttpClient) diff --git a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java index 79d3861d8ab..c9963516f1a 100644 --- a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java +++ b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java @@ -79,7 +79,6 @@ public final class GeneralConfig { public static final String TRACER_METRICS_MAX_PENDING = "trace.tracer.metrics.max.pending"; public static final String TRACER_METRICS_IGNORED_RESOURCES = "trace.tracer.metrics.ignored.resources"; - public static final String AZURE_APP_SERVICES = "azure.app.services"; public static final String INTERNAL_EXIT_ON_FAILURE = "trace.internal.exit.on.failure"; diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java index 634dea23358..01ba90097a4 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java @@ -60,7 +60,7 @@ @Fork(value = 1) public class AdversarialMetricsBenchmark { - private ConflatingMetricsAggregator aggregator; + private ClientStatsAggregator aggregator; private CountingHealthMetrics health; @State(Scope.Thread) @@ -72,13 +72,13 @@ public static class ThreadState { public void setup() { this.health = new CountingHealthMetrics(); this.aggregator = - new ConflatingMetricsAggregator( + new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), - new ConflatingMetricsAggregatorBenchmark.FixedAgentFeaturesDiscovery( + new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery( Collections.singleton("peer.hostname"), Collections.emptySet()), this.health, - new ConflatingMetricsAggregatorBenchmark.NullSink(), + new ClientStatsAggregatorBenchmark.NullSink(), 2048, 2048, false); diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorBenchmark.java similarity index 95% rename from dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBenchmark.java rename to dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorBenchmark.java index b9a2f7f8c54..b9d72eaf3ab 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorBenchmark.java @@ -34,12 +34,12 @@ @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(MICROSECONDS) @Fork(value = 1) -public class ConflatingMetricsAggregatorBenchmark { +public class ClientStatsAggregatorBenchmark { private final DDAgentFeaturesDiscovery featuresDiscovery = new FixedAgentFeaturesDiscovery( Collections.singleton("peer.hostname"), Collections.emptySet()); - private final ConflatingMetricsAggregator aggregator = - new ConflatingMetricsAggregator( + private final ClientStatsAggregator aggregator = + new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), featuresDiscovery, diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDDSpanBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorDDSpanBenchmark.java similarity index 87% rename from dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDDSpanBenchmark.java rename to dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorDDSpanBenchmark.java index 89059857d9c..0453b8888db 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDDSpanBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ClientStatsAggregatorDDSpanBenchmark.java @@ -28,8 +28,8 @@ import org.openjdk.jmh.infra.Blackhole; /** - * Parallels {@link ConflatingMetricsAggregatorBenchmark} but uses real {@link DDSpan} instances - * instead of the lightweight {@code SimpleSpan} mock, so the JIT exercises the production {@link + * Parallels {@link ClientStatsAggregatorBenchmark} but uses real {@link DDSpan} instances instead + * of the lightweight {@code SimpleSpan} mock, so the JIT exercises the production {@link * CoreSpan#isKind} path (cached span.kind ordinal + bit-test) rather than the groovy mock's * dispatch. * @@ -50,21 +50,21 @@ @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(MICROSECONDS) @Fork(value = 1) -public class ConflatingMetricsAggregatorDDSpanBenchmark { +public class ClientStatsAggregatorDDSpanBenchmark { private static final CoreTracer TRACER = CoreTracer.builder().writer(new NoopWriter()).strictTraceWrites(false).build(); private final DDAgentFeaturesDiscovery featuresDiscovery = - new ConflatingMetricsAggregatorBenchmark.FixedAgentFeaturesDiscovery( + new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery( Collections.singleton("peer.hostname"), Collections.emptySet()); - private final ConflatingMetricsAggregator aggregator = - new ConflatingMetricsAggregator( + private final ClientStatsAggregator aggregator = + new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), featuresDiscovery, HealthMetrics.NO_OP, - new ConflatingMetricsAggregatorBenchmark.NullSink(), + new ClientStatsAggregatorBenchmark.NullSink(), 2048, 2048, false); diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityPeerMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityPeerMetricsBenchmark.java index 67caaca6ced..8fc45b4acd8 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityPeerMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityPeerMetricsBenchmark.java @@ -50,7 +50,7 @@ @Fork(value = 1) public class HighCardinalityPeerMetricsBenchmark { - private ConflatingMetricsAggregator aggregator; + private ClientStatsAggregator aggregator; private CountingHealthMetrics health; @State(Scope.Thread) @@ -62,13 +62,13 @@ public static class ThreadState { public void setup() { this.health = new CountingHealthMetrics(); this.aggregator = - new ConflatingMetricsAggregator( + new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), - new ConflatingMetricsAggregatorBenchmark.FixedAgentFeaturesDiscovery( + new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery( Collections.singleton("peer.hostname"), Collections.emptySet()), this.health, - new ConflatingMetricsAggregatorBenchmark.NullSink(), + new ClientStatsAggregatorBenchmark.NullSink(), 2048, 2048, false); diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityResourceMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityResourceMetricsBenchmark.java index 5ae8c3a715f..90c1f088935 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityResourceMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/HighCardinalityResourceMetricsBenchmark.java @@ -46,7 +46,7 @@ @Fork(value = 1) public class HighCardinalityResourceMetricsBenchmark { - private ConflatingMetricsAggregator aggregator; + private ClientStatsAggregator aggregator; private CountingHealthMetrics health; @State(Scope.Thread) @@ -58,13 +58,13 @@ public static class ThreadState { public void setup() { this.health = new CountingHealthMetrics(); this.aggregator = - new ConflatingMetricsAggregator( + new ClientStatsAggregator( new WellKnownTags("", "", "", "", "", ""), Collections.emptySet(), - new ConflatingMetricsAggregatorBenchmark.FixedAgentFeaturesDiscovery( + new ClientStatsAggregatorBenchmark.FixedAgentFeaturesDiscovery( Collections.singleton("peer.hostname"), Collections.emptySet()), this.health, - new ConflatingMetricsAggregatorBenchmark.NullSink(), + new ClientStatsAggregatorBenchmark.NullSink(), 2048, 2048, false); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java index 5bc985491de..1ce3399bffa 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -1,131 +1,102 @@ package datadog.trace.common.metrics; -import static datadog.trace.bootstrap.instrumentation.api.UTF8BytesString.EMPTY; - import datadog.metrics.api.Histogram; -import datadog.trace.api.Pair; -import datadog.trace.api.cache.DDCache; -import datadog.trace.api.cache.DDCaches; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.Hashtable; import datadog.trace.util.LongHashingUtils; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.function.Function; import javax.annotation.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Hashtable entry for the consumer-side aggregator. Holds the UTF8-encoded label fields that {@link - * SerializingMetricWriter} writes to the wire plus the mutable counter/histogram state for the key. - * - *

{@link #matches(SpanSnapshot)} compares the entry's stored UTF8 forms against the snapshot's - * raw {@code CharSequence}/{@code String}/{@code String[]} fields via content-equality, so {@code - * String} vs {@code UTF8BytesString} mixing on the same logical key collapses into one entry - * instead of splitting. - * - *

The static UTF8 caches that used to live on {@code MetricKey} and {@code - * ConflatingMetricsAggregator} are consolidated here. - * - *

Deliberate cohesion. This class concentrates five responsibilities -- the static UTF8 - * caches, the canonicalized label fields, the raw {@code peerTagNames}/{@code peerTagValues} arrays - * used by {@link #matches}, the pre-encoded {@code peerTags} list used by the serializer, and the - * mutable counter/histogram aggregate state -- on a single object. The prior design split the label - * fields and aggregate state across separate {@code MetricKey} and {@code AggregateMetric} - * instances, allocating both per unique key on miss; folding them yields one allocation per unique - * key. The class is wider than its predecessors as a result, but that's the trade we explicitly - * chose. - * - *

Required vs optional field absence. Required label fields ({@code resource}, {@code - * service}, {@code operationName}, {@code type}, {@code spanKind}) canonicalize a {@code null} - * snapshot value into {@link UTF8BytesString#EMPTY} via {@link #canonicalize} -- they are never - * {@code null} on a constructed entry. Optional label fields ({@code serviceSource}, {@code - * httpMethod}, {@code httpEndpoint}, {@code grpcStatusCode}) stay {@code null} on the entry when - * the snapshot value was {@code null}; the serializer uses {@code != null} to decide whether to - * emit them on the wire. {@link #contentEquals} treats {@code null} and length-0 as equivalent so - * {@link #matches} works against either form. + * {@link datadog.trace.util.Hashtable} entry used by the aggregator thread. * - *

Not thread-safe. Counter and histogram updates are performed by the single aggregator - * thread; producer threads tag durations via {@link #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits and - * hand them off through the snapshot inbox. + *

Stores the canonical UTF8 label values that identify one aggregate row, plus the mutable + * counter and histogram state for that row. Labels are canonicalized before hashing, so overflow + * values replaced with the sentinel use the same hash and map to the same row. * - *

Single-writer invariant relies on convention. The aggregator thread is the only mutator - * of this class and of {@link AggregateTable}. Nothing enforces this at runtime -- a stray mutation - * from a different thread (e.g. an HTTP-client callback) would corrupt counters or hashtable chains - * silently. The {@code ClearSignal} routing in {@link Aggregator} is the explicit mechanism for - * funneling cross-thread requests (e.g. {@code disable()}) back onto the aggregator thread; any new - * entry point that mutates aggregate state must do the same. + *

Not thread-safe — all mutation is on the aggregator thread. Tests must call {@link + * #resetCardinalityHandlers()} in setup to avoid cross-test handler pollution (handlers are + * static); tests using {@link PeerTagSchema} must also call {@link + * PeerTagSchema#resetHandlers(HealthMetrics)} on the schema instance. */ final class AggregateEntry extends Hashtable.Entry { + private static final Logger log = LoggerFactory.getLogger(AggregateEntry.class); + static final long ERROR_TAG = 0x8000000000000000L; static final long TOP_LEVEL_TAG = 0x4000000000000000L; - // UTF8 caches consolidated from the previous MetricKey + ConflatingMetricsAggregator split. - private static final DDCache RESOURCE_CACHE = - DDCaches.newFixedSizeCache(32); - private static final DDCache SERVICE_CACHE = - DDCaches.newFixedSizeCache(32); - private static final DDCache OPERATION_CACHE = - DDCaches.newFixedSizeCache(64); - private static final DDCache SERVICE_SOURCE_CACHE = - DDCaches.newFixedSizeCache(16); - private static final DDCache TYPE_CACHE = DDCaches.newFixedSizeCache(8); - private static final DDCache SPAN_KIND_CACHE = - DDCaches.newFixedSizeCache(16); - private static final DDCache HTTP_METHOD_CACHE = - DDCaches.newFixedSizeCache(8); - private static final DDCache HTTP_ENDPOINT_CACHE = - DDCaches.newFixedSizeCache(32); - private static final DDCache GRPC_STATUS_CODE_CACHE = - DDCaches.newFixedSizeCache(32); - - /** - * Outer cache keyed by peer-tag name, with an inner per-name cache keyed by value. The inner - * cache produces the "name:value" encoded form the serializer writes. - */ - private static final DDCache< - String, Pair, Function>> - PEER_TAGS_CACHE = DDCaches.newFixedSizeCache(64); - - private static final Function< - String, Pair, Function>> - PEER_TAGS_CACHE_ADDER = - key -> - Pair.of( - DDCaches.newFixedSizeCache(512), - value -> UTF8BytesString.create(key + ":" + value)); - - private final UTF8BytesString resource; - private final UTF8BytesString service; - private final UTF8BytesString operationName; - @Nullable private final UTF8BytesString serviceSource; - private final UTF8BytesString type; - private final UTF8BytesString spanKind; - @Nullable private final UTF8BytesString httpMethod; - @Nullable private final UTF8BytesString httpEndpoint; - @Nullable private final UTF8BytesString grpcStatusCode; - private final short httpStatusCode; - - /** Whether the root span carried the {@code synthetics} origin tag (synthetic-monitoring run). */ - private final boolean synthetic; - - /** Whether this span is the trace root ({@code parentId == 0}). */ - private final boolean traceRoot; - - // Peer tags carried in two forms: parallel String[] arrays mirroring the snapshot's (schema + - // values) shape for matches(), and pre-encoded List ("name:value") for the - // serializer. peerTagNames is the schema's names array (shared by-reference when the schema - // hasn't been replaced); peerTagValues is the per-span String[] parallel to it. - // - // Package-private so the in-package test helper (AggregateEntryTestUtils) can compare entries - // by raw layout; production access comes from this class's own matches() + constructor. - @Nullable final String[] peerTagNames; - @Nullable final String[] peerTagValues; - private final List peerTags; - - // Mutable aggregate state -- single-thread (consumer/aggregator) writer. + private static final UTF8BytesString[] EMPTY_TAGS = new UTF8BytesString[0]; + + // Sentinel substitution is disabled until per-component config is wired in a follow-up PR. + // Tests that need sentinel mode should pass useBlockedSentinel=true explicitly. + static final boolean LIMITS_ENABLED = false; + + // Per-field cardinality handlers. Limits live on MetricCardinalityLimits -- see that class for + // per-field rationale. + static final PropertyCardinalityHandler RESOURCE_HANDLER = + new PropertyCardinalityHandler("resource", MetricCardinalityLimits.RESOURCE, LIMITS_ENABLED); + static final PropertyCardinalityHandler SERVICE_HANDLER = + new PropertyCardinalityHandler("service", MetricCardinalityLimits.SERVICE, LIMITS_ENABLED); + static final PropertyCardinalityHandler OPERATION_HANDLER = + new PropertyCardinalityHandler( + "operation", MetricCardinalityLimits.OPERATION, LIMITS_ENABLED); + static final PropertyCardinalityHandler SERVICE_SOURCE_HANDLER = + new PropertyCardinalityHandler( + "service_source", MetricCardinalityLimits.SERVICE_SOURCE, LIMITS_ENABLED); + static final PropertyCardinalityHandler TYPE_HANDLER = + new PropertyCardinalityHandler("type", MetricCardinalityLimits.TYPE, LIMITS_ENABLED); + static final PropertyCardinalityHandler SPAN_KIND_HANDLER = + new PropertyCardinalityHandler( + "span_kind", MetricCardinalityLimits.SPAN_KIND, LIMITS_ENABLED); + static final PropertyCardinalityHandler HTTP_METHOD_HANDLER = + new PropertyCardinalityHandler( + "http_method", MetricCardinalityLimits.HTTP_METHOD, LIMITS_ENABLED); + static final PropertyCardinalityHandler HTTP_ENDPOINT_HANDLER = + new PropertyCardinalityHandler( + "http_endpoint", MetricCardinalityLimits.HTTP_ENDPOINT, LIMITS_ENABLED); + static final PropertyCardinalityHandler GRPC_STATUS_CODE_HANDLER = + new PropertyCardinalityHandler( + "grpc_status_code", MetricCardinalityLimits.GRPC_STATUS_CODE, LIMITS_ENABLED); + + // Single authoritative list used by resetCardinalityHandlers(). populateFrom() and hashOf() keep + // named access for readability and to avoid per-span iteration overhead; this array ensures the + // reset site stays in sync even if a new field is added without updating the loop by name. + private static final PropertyCardinalityHandler[] FIELD_HANDLERS = { + RESOURCE_HANDLER, + SERVICE_HANDLER, + OPERATION_HANDLER, + SERVICE_SOURCE_HANDLER, + TYPE_HANDLER, + SPAN_KIND_HANDLER, + HTTP_METHOD_HANDLER, + HTTP_ENDPOINT_HANDLER, + GRPC_STATUS_CODE_HANDLER, + }; + + final UTF8BytesString resource; + final UTF8BytesString service; + final UTF8BytesString operationName; + // Optional fields use UTF8BytesString.EMPTY as the "absent" sentinel rather than null. The + // cardinality handlers map null inputs to EMPTY, and createUtf8 does the same for the of(...) + // factory, so callers don't need to special-case absence. + final UTF8BytesString serviceSource; + final UTF8BytesString type; + final UTF8BytesString spanKind; + final UTF8BytesString httpMethod; + final UTF8BytesString httpEndpoint; + final UTF8BytesString grpcStatusCode; + final short httpStatusCode; + final boolean synthetic; + final boolean traceRoot; + final List peerTags; + + // Mutable aggregate state -- single-thread (aggregator) writer. private final Histogram okLatencies = Histogram.newHistogram(); /** @@ -141,31 +112,46 @@ final class AggregateEntry extends Hashtable.Entry { private int topLevelCount; private long duration; - /** Hot-path constructor for the producer/consumer flow. Builds UTF8 fields via the caches. */ - AggregateEntry(SpanSnapshot s, long keyHash) { + /** + * Field-bearing constructor. Package-private so {@link AggregateEntryTestUtils} can build + * expected entries. + */ + AggregateEntry( + long keyHash, + UTF8BytesString resource, + UTF8BytesString service, + UTF8BytesString operationName, + UTF8BytesString serviceSource, + UTF8BytesString type, + UTF8BytesString spanKind, + UTF8BytesString httpMethod, + UTF8BytesString httpEndpoint, + UTF8BytesString grpcStatusCode, + short httpStatusCode, + boolean synthetic, + boolean traceRoot, + List peerTags) { super(keyHash); - this.resource = canonicalize(RESOURCE_CACHE, s.resourceName); - this.service = canonicalize(SERVICE_CACHE, s.serviceName); - this.operationName = canonicalize(OPERATION_CACHE, s.operationName); - this.serviceSource = canonicalizeOptional(SERVICE_SOURCE_CACHE, s.serviceNameSource); - this.type = canonicalize(TYPE_CACHE, s.spanType); - this.spanKind = canonicalize(SPAN_KIND_CACHE, s.spanKind); - this.httpMethod = canonicalizeOptional(HTTP_METHOD_CACHE, s.httpMethod); - this.httpEndpoint = canonicalizeOptional(HTTP_ENDPOINT_CACHE, s.httpEndpoint); - this.grpcStatusCode = canonicalizeOptional(GRPC_STATUS_CODE_CACHE, s.grpcStatusCode); - this.httpStatusCode = s.httpStatusCode; - this.synthetic = s.synthetic; - this.traceRoot = s.traceRoot; - this.peerTagNames = s.peerTagSchema == null ? null : s.peerTagSchema.names; - this.peerTagValues = s.peerTagValues; - this.peerTags = materializePeerTags(this.peerTagNames, this.peerTagValues); + this.resource = resource; + this.service = service; + this.operationName = operationName; + this.serviceSource = serviceSource; + this.type = type; + this.spanKind = spanKind; + this.httpMethod = httpMethod; + this.httpEndpoint = httpEndpoint; + this.grpcStatusCode = grpcStatusCode; + this.httpStatusCode = httpStatusCode; + this.synthetic = synthetic; + this.traceRoot = traceRoot; + this.peerTags = peerTags; } /** * Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link * #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in. */ - void recordOneDuration(long tagAndDuration) { + AggregateEntry recordOneDuration(long tagAndDuration) { ++hitCount; if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { tagAndDuration ^= TOP_LEVEL_TAG; @@ -179,6 +165,7 @@ void recordOneDuration(long tagAndDuration) { okLatencies.accept(tagAndDuration); } duration += tagAndDuration; + return this; } int getErrorCount() { @@ -223,10 +210,10 @@ private Histogram errorLatenciesForWrite() { /** * Resets the per-cycle counters and histograms. Label fields ({@code resource}, {@code service}, - * ..., {@code peerTagNames}, {@code peerTagValues}) are deliberately left intact -- they're the - * entry's bucket identity and must persist so a subsequent snapshot with the same key reuses this - * entry instead of allocating a fresh one. Entries that stay at {@code hitCount == 0} across a - * cycle are reaped by {@link AggregateTable#expungeStaleAggregates}. + * ..., {@code peerTags}) are deliberately left intact -- they're the entry's bucket identity and + * must persist so a subsequent snapshot with the same key reuses this entry instead of allocating + * a fresh one. Entries that stay at {@code hitCount == 0} across a cycle are reaped by {@link + * AggregateTable#expungeStaleAggregates}. */ void clear() { this.errorCount = 0; @@ -240,70 +227,71 @@ void clear() { } } - boolean matches(SpanSnapshot s) { - String[] snapshotNames = s.peerTagSchema == null ? null : s.peerTagSchema.names; - return httpStatusCode == s.httpStatusCode - && synthetic == s.synthetic - && traceRoot == s.traceRoot - && contentEquals(resource, s.resourceName) - && contentEquals(service, s.serviceName) - && contentEquals(operationName, s.operationName) - && contentEquals(serviceSource, s.serviceNameSource) - && contentEquals(type, s.spanType) - && contentEquals(spanKind, s.spanKind) - && Arrays.equals(peerTagNames, snapshotNames) - && Arrays.equals(peerTagValues, s.peerTagValues) - && contentEquals(httpMethod, s.httpMethod) - && contentEquals(httpEndpoint, s.httpEndpoint) - && contentEquals(grpcStatusCode, s.grpcStatusCode); + /** Resets the static per-field cardinality handlers. Does not cover {@link PeerTagSchema}. */ + static void resetCardinalityHandlers() { + resetCardinalityHandlers(HealthMetrics.NO_OP); } - /** - * Pre-checks {@link #keyHash} against {@code keyHash} before delegating to {@link - * #matches(SpanSnapshot)}. The hash check is cheap and rules out most mismatches without touching - * the field-by-field comparison. - */ - boolean matches(long keyHash, SpanSnapshot s) { - return this.keyHash == keyHash && matches(s); + static void resetCardinalityHandlers(HealthMetrics healthMetrics) { + for (PropertyCardinalityHandler handler : FIELD_HANDLERS) { + reportIfBlocked(healthMetrics, handler); + } + PeerTagSchema.INTERNAL.resetHandlers(healthMetrics); + } + + private static void reportIfBlocked( + HealthMetrics healthMetrics, PropertyCardinalityHandler handler) { + long blocked = handler.reset(); + if (blocked > 0) { + log.warn( + "Cardinality limit reached for stats field '{}'; further values will be reported as tracer_blocked_value", + handler.name); + healthMetrics.onTagCardinalityBlocked(handler.statsDTag(), blocked); + } } /** - * Computes the 64-bit lookup hash for a {@link SpanSnapshot}. Chained per-field calls -- no - * varargs / Object[] allocation, no autoboxing on primitive overloads. The constructor's - * super({@code hashOf(s)}) call uses the same function so an entry built from a snapshot hashes - * to the same bucket the snapshot itself looks up. + * 64-bit lookup hash, computed over UTF8-encoded fields so that cardinality-blocked values (which + * all canonicalize to the same sentinel {@link UTF8BytesString}) collide in the same bucket. + * {@link UTF8BytesString#hashCode()} returns the underlying String hash, so entries built via + * {@link #of} produce the same hash as entries built from a snapshot with matching content. * - *

Hashes are content-stable across {@code String} / {@code UTF8BytesString}: {@link - * UTF8BytesString#hashCode()} returns the underlying {@code String}'s hash. + *

Field order intentionally mirrors {@link Canonical#matches} -- UTF8 fields first (highest + * cardinality first for matches' short-circuit benefit), then the peer-tag list, then the + * primitives. The hash itself is order-stable across all callers; the lockstep ordering is purely + * for readability when reasoning about lookup and equality in tandem. */ - static long hashOf(SpanSnapshot s) { + static long hashOf( + UTF8BytesString resource, + UTF8BytesString service, + UTF8BytesString operationName, + UTF8BytesString serviceSource, + UTF8BytesString type, + UTF8BytesString spanKind, + UTF8BytesString httpMethod, + UTF8BytesString httpEndpoint, + UTF8BytesString grpcStatusCode, + short httpStatusCode, + boolean synthetic, + boolean traceRoot, + UTF8BytesString[] peerTags, + int peerTagCount) { long h = 0; - h = LongHashingUtils.addToHash(h, s.resourceName); - h = LongHashingUtils.addToHash(h, s.serviceName); - h = LongHashingUtils.addToHash(h, s.operationName); - h = LongHashingUtils.addToHash(h, s.serviceNameSource); - h = LongHashingUtils.addToHash(h, s.spanType); - h = LongHashingUtils.addToHash(h, s.httpStatusCode); - h = LongHashingUtils.addToHash(h, s.synthetic); - h = LongHashingUtils.addToHash(h, s.traceRoot); - h = LongHashingUtils.addToHash(h, s.spanKind); - // Always mix in both the schema's content hash and the values' content hash, unconditionally - // (no null-skip). Arrays.hashCode is content-based for both String[]s; the default - // Object[].hashCode is identity-based, which would let two snapshots with content-equal but - // distinct PeerTagSchema instances hash to different buckets. Null inputs hash to 0 here, - // distinct from {@code Arrays.hashCode(empty)} = 1 or any non-empty array. - // - // peerTagValues is gated by peerTagSchema: the slot's peerTagValues is a reusable scratch - // buffer that may carry stale contents from a prior tag-firing publish when this publish had - // no peer tags. Hash it only when the schema says it's meaningful, matching the matches() - // contract. - h = LongHashingUtils.addToHash(h, s.peerTagSchema == null ? 0 : s.peerTagSchema.namesHash); - h = - LongHashingUtils.addToHash( - h, s.peerTagSchema == null ? 0 : Arrays.hashCode(s.peerTagValues)); - h = LongHashingUtils.addToHash(h, s.httpMethod); - h = LongHashingUtils.addToHash(h, s.httpEndpoint); - h = LongHashingUtils.addToHash(h, s.grpcStatusCode); + h = LongHashingUtils.addToHash(h, resource); + h = LongHashingUtils.addToHash(h, service); + h = LongHashingUtils.addToHash(h, operationName); + h = LongHashingUtils.addToHash(h, serviceSource); + h = LongHashingUtils.addToHash(h, type); + h = LongHashingUtils.addToHash(h, spanKind); + h = LongHashingUtils.addToHash(h, httpMethod); + h = LongHashingUtils.addToHash(h, httpEndpoint); + h = LongHashingUtils.addToHash(h, grpcStatusCode); + for (int i = 0; i < peerTagCount; i++) { + h = LongHashingUtils.addToHash(h, peerTags[i]); + } + h = LongHashingUtils.addToHash(h, httpStatusCode); + h = LongHashingUtils.addToHash(h, synthetic); + h = LongHashingUtils.addToHash(h, traceRoot); return h; } @@ -320,11 +308,20 @@ UTF8BytesString getOperationName() { return operationName; } - @Nullable UTF8BytesString getServiceSource() { return serviceSource; } + /** + * Whether the snapshot carried a service-source value. Encapsulates the EMPTY-as-absent + * convention: optional fields use {@link UTF8BytesString#EMPTY} as the sentinel for "no value + * captured" (see field comment) -- callers that need a presence check should go through this + * predicate rather than comparing against {@code EMPTY} directly. + */ + boolean hasServiceSource() { + return serviceSource.length() > 0; + } + UTF8BytesString getType() { return type; } @@ -333,21 +330,40 @@ UTF8BytesString getSpanKind() { return spanKind; } - @Nullable UTF8BytesString getHttpMethod() { return httpMethod; } - @Nullable + /** + * Whether the snapshot carried an HTTP method. See {@link #hasServiceSource} for the contract. + */ + boolean hasHttpMethod() { + return httpMethod.length() > 0; + } + UTF8BytesString getHttpEndpoint() { return httpEndpoint; } - @Nullable + /** + * Whether the snapshot carried an HTTP endpoint. See {@link #hasServiceSource} for the contract. + */ + boolean hasHttpEndpoint() { + return httpEndpoint.length() > 0; + } + UTF8BytesString getGrpcStatusCode() { return grpcStatusCode; } + /** + * Whether the snapshot carried a gRPC status code. See {@link #hasServiceSource} for the + * contract. + */ + boolean hasGrpcStatusCode() { + return grpcStatusCode.length() > 0; + } + int getHttpStatusCode() { return httpStatusCode; } @@ -365,101 +381,184 @@ List getPeerTags() { } // Production AggregateEntry intentionally has no equals/hashCode override -- AggregateTable - // bucketing uses keyHash + matches(SpanSnapshot) directly and never invokes Object.equals. - // For tests that need value-equality (Spock argument matchers), use AggregateEntryTestUtils in - // src/test, which provides equals/hashCode helpers without exposing the contract in production. - - // ----- helpers ----- - - private static UTF8BytesString canonicalize( - DDCache cache, CharSequence charSeq) { - if (charSeq == null) { - return EMPTY; - } - if (charSeq instanceof UTF8BytesString) { - return (UTF8BytesString) charSeq; - } - return cache.computeIfAbsent(charSeq.toString(), UTF8BytesString::create); - } + // bucketing uses keyHash + Canonical.matches and never invokes Object.equals. For tests that + // need value-equality (Spock argument matchers), use AggregateEntryTestUtils in src/test. /** - * Like {@link #canonicalize} but returns {@code null} for a {@code null} input (rather than - * {@link UTF8BytesString#EMPTY}). Used for the four optional fields so the serializer can - * distinguish "absent" via a {@code != null} check and elide the field on the wire. + * Reusable scratch buffer for canonicalizing a {@link SpanSnapshot} into UTF8 fields, computing + * its lookup hash, comparing against existing entries, and building a fresh entry on miss. * - *

The {@code instanceof UTF8BytesString} short-circuit is dead code for {@link - * SpanSnapshot#httpMethod}/{@code httpEndpoint}/{@code grpcStatusCode} (statically {@code - * String}) but live for {@link SpanSnapshot#serviceNameSource} ({@link CharSequence}); keeping a - * single helper keeps the constructor consistent. + *

One instance is held by an {@link AggregateTable} and reused on every {@code findOrInsert} + * call. Single-threaded use only. Fields are deliberately mutable -- this is a hot-path scratch + * area, not a value class. */ - @Nullable - private static UTF8BytesString canonicalizeOptional( - DDCache cache, @Nullable CharSequence charSeq) { - if (charSeq == null) { - return null; + static final class Canonical { + UTF8BytesString resource; + UTF8BytesString service; + UTF8BytesString operationName; + UTF8BytesString serviceSource; + UTF8BytesString type; + UTF8BytesString spanKind; + UTF8BytesString httpMethod; + UTF8BytesString httpEndpoint; + UTF8BytesString grpcStatusCode; + short httpStatusCode; + boolean synthetic; + boolean traceRoot; + + /** + * Reusable buffer of canonicalized peer-tag UTF8 forms. Cleared and refilled in {@link + * #populate}; on miss, {@link #createEntry} copies it into an immutable list for the entry to + * own. Zero allocation on the hit path. Sized lazily to the schema's tag count; resized if the + * schema grows. + */ + UTF8BytesString[] peerTagsBuffer = EMPTY_TAGS; + + int peerTagsSize = 0; + + long keyHash; + + /** Canonicalize all fields from {@code s} through the handlers into this buffer. */ + void populateFrom(SpanSnapshot s) { + this.resource = RESOURCE_HANDLER.register(s.resourceName); + this.service = SERVICE_HANDLER.register(s.serviceName); + this.operationName = OPERATION_HANDLER.register(s.operationName); + this.serviceSource = SERVICE_SOURCE_HANDLER.register(s.serviceNameSource); + this.type = TYPE_HANDLER.register(s.spanType); + this.spanKind = SPAN_KIND_HANDLER.register(s.spanKind); + this.httpMethod = HTTP_METHOD_HANDLER.register(s.httpMethod); + this.httpEndpoint = HTTP_ENDPOINT_HANDLER.register(s.httpEndpoint); + this.grpcStatusCode = GRPC_STATUS_CODE_HANDLER.register(s.grpcStatusCode); + this.httpStatusCode = s.httpStatusCode; + this.synthetic = s.synthetic; + this.traceRoot = s.traceRoot; + populatePeerTags(s.peerTagSchema, s.peerTagValues); + this.keyHash = + hashOf( + resource, + service, + operationName, + serviceSource, + type, + spanKind, + httpMethod, + httpEndpoint, + grpcStatusCode, + httpStatusCode, + synthetic, + traceRoot, + peerTagsBuffer, + peerTagsSize); } - if (charSeq instanceof UTF8BytesString) { - return (UTF8BytesString) charSeq; - } - return cache.computeIfAbsent(charSeq.toString(), UTF8BytesString::create); - } - /** - * UTF8 vs raw CharSequence content-equality, no allocation in the common (String) case. - * - *

Treats {@code null} and empty (length 0) as equivalent on either side. This matches the - * canonicalization semantics: {@link #canonicalize} maps a {@code null} input to {@link - * UTF8BytesString#EMPTY}, so an entry built from a snapshot with a null field needs to match a - * subsequent snapshot whose field is still null. {@code intHash(null) == 0 == "".hashCode()}, so - * the hash already agrees with this view. - */ - private static boolean contentEquals(UTF8BytesString a, CharSequence b) { - if (a == null || a.length() == 0) { - return b == null || b.length() == 0; + /** + * Replaces the current peer-tag buffer contents with the canonical UTF8 forms for this + * snapshot. + * + *

Each non-null value is canonicalized with the handler at the same schema index. Null + * values are skipped because they would canonicalize to {@link UTF8BytesString#EMPTY} and be + * filtered out anyway. Producer-side {@code capturePeerTagValues} produces sparse-null arrays, + * so the skip pays off whenever a span carries only a subset of the configured peer tags. + */ + private void populatePeerTags(PeerTagSchema schema, String[] values) { + peerTagsSize = 0; + if (schema == null || values == null) { + return; + } + int n = Math.min(schema.size(), values.length); + if (peerTagsBuffer.length < n) { + peerTagsBuffer = new UTF8BytesString[n]; + } + for (int i = 0; i < n; i++) { + String value = values[i]; + if (value == null) { + continue; + } + peerTagsBuffer[peerTagsSize++] = schema.register(i, value); + } } - // UTF8BytesString.toString() returns the underlying String -- O(1), no allocation. - return b != null && a.toString().contentEquals(b); - } - /** - * Encodes the per-span peer-tag values into the {@code List} the serializer - * consumes. Reads name/value pairs at the same index from the schema's names and the snapshot's - * values; null value slots are skipped (the span didn't set that peer tag). Counts hits once for - * exact-size allocation and preserves the singletonList fast path for the common one-entry case - * (e.g. internal-kind base.service). - */ - private static List materializePeerTags( - @Nullable String[] names, @Nullable String[] values) { - if (names == null || values == null) { - return Collections.emptyList(); + /** + * Whether this canonicalized snapshot matches the given entry. Compares UTF8 fields via + * content-equality (so an entry surviving a handler reset still matches a freshly-canonicalized + * snapshot of the same content). + * + *

Field order is cardinality-tuned: resource / service / operationName first because they + * vary most across collisions, then the remaining UTF8 fields, then the peer-tag list + * comparison (slowest), then the primitives. All UTF8 fields are non-null by the EMPTY- + * sentinel invariant (see field comments above), so direct {@code a.equals(b)} is safe. + */ + boolean matches(AggregateEntry e) { + return resource.equals(e.resource) + && service.equals(e.service) + && operationName.equals(e.operationName) + && serviceSource.equals(e.serviceSource) + && type.equals(e.type) + && spanKind.equals(e.spanKind) + && httpMethod.equals(e.httpMethod) + && httpEndpoint.equals(e.httpEndpoint) + && grpcStatusCode.equals(e.grpcStatusCode) + && peerTagsEqual(peerTagsBuffer, peerTagsSize, e.peerTags) + && httpStatusCode == e.httpStatusCode + && synthetic == e.synthetic + && traceRoot == e.traceRoot; } - int n = names.length; - int firstHit = -1; - int hitCount = 0; - for (int i = 0; i < n; i++) { - if (values[i] != null) { - if (hitCount == 0) firstHit = i; - hitCount++; + + private static boolean peerTagsEqual(UTF8BytesString[] a, int aSize, List b) { + if (aSize != b.size()) { + return false; } + for (int i = 0; i < aSize; i++) { + if (!a[i].equals(b.get(i))) { + return false; + } + } + return true; } - if (hitCount == 0) { - return Collections.emptyList(); - } - if (hitCount == 1) { - return Collections.singletonList(encodePeerTag(names[firstHit], values[firstHit])); - } - List tags = new ArrayList<>(hitCount); - for (int i = firstHit; i < n; i++) { - if (values[i] != null) { - tags.add(encodePeerTag(names[i], values[i])); + + /** + * Build a new entry from the currently-populated canonical fields. The peer-tag buffer is + * copied into an immutable list so the entry's reference stays stable across subsequent {@link + * #populate} calls. + */ + AggregateEntry createEntry() { + List snapshottedPeerTags; + int n = peerTagsSize; + if (n == 0) { + snapshottedPeerTags = Collections.emptyList(); + } else if (n == 1) { + snapshottedPeerTags = Collections.singletonList(peerTagsBuffer[0]); + } else { + snapshottedPeerTags = Arrays.asList(Arrays.copyOf(peerTagsBuffer, n)); } + return new AggregateEntry( + keyHash, + resource, + service, + operationName, + serviceSource, + type, + spanKind, + httpMethod, + httpEndpoint, + grpcStatusCode, + httpStatusCode, + synthetic, + traceRoot, + snapshottedPeerTags); } - return tags; } - private static UTF8BytesString encodePeerTag(String name, String value) { - final Pair, Function> - cacheAndCreator = PEER_TAGS_CACHE.computeIfAbsent(name, PEER_TAGS_CACHE_ADDER); - return cacheAndCreator.getLeft().computeIfAbsent(value, cacheAndCreator.getRight()); + // ----- helpers ----- + + /** Direct {@link UTF8BytesString} creation that bypasses the cardinality handlers. */ + static UTF8BytesString createUtf8(CharSequence cs) { + if (cs == null) { + return UTF8BytesString.EMPTY; + } + if (cs instanceof UTF8BytesString) { + return (UTF8BytesString) cs; + } + return UTF8BytesString.create(cs.toString()); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java index abadc7e5f17..fcd6c5fcdb4 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java @@ -2,24 +2,23 @@ import datadog.trace.util.Hashtable; import datadog.trace.util.Hashtable.MutatingTableIterator; -import datadog.trace.util.Hashtable.Support; import java.util.function.BiConsumer; import java.util.function.Consumer; /** - * Consumer-side {@link AggregateEntry} store, keyed on the raw fields of a {@link SpanSnapshot}. + * The {@link AggregateEntry} store of the consuming aggregator thread, keyed on the canonical + * UTF8-encoded labels of a {@link SpanSnapshot}. * - *

Replaces the prior {@code LRUCache}. The win is on the - * steady-state hit path: a snapshot lookup is a 64-bit hash compute + bucket walk + field-wise - * {@code matches}, with no per-snapshot {@link AggregateEntry} allocation and no UTF8 cache - * lookups. The UTF8-encoded forms (formerly held on {@code MetricKey}) and the mutable counters - * (formerly held on {@code AggregateMetric}) both live on the {@link AggregateEntry} now, built - * once per unique key at insert time. + *

{@link #findOrInsert} canonicalizes the snapshot's fields through the cardinality handlers (so + * cardinality-blocked values share a sentinel and collapse into one entry) and then computes the + * lookup hash from that canonical form. Canonicalization runs into a reusable {@link + * AggregateEntry.Canonical} scratch buffer; on a hit nothing is allocated, on a miss the buffer's + * references are copied into a fresh entry and the buffer is overwritten on the next call. * *

Not thread-safe. The aggregator thread is the sole writer of both this table and its * contained {@link AggregateEntry} state. Any cross-thread request that needs to mutate -- e.g. - * {@link ConflatingMetricsAggregator#disable()} -- must funnel onto the aggregator thread via the - * inbox (see the {@code ClearSignal} routing in {@link Aggregator}). The invariant is convention- + * {@link ClientStatsAggregator#disable()} -- must funnel onto the aggregator thread via the inbox + * (see the {@code ClearSignal} routing in {@link Aggregator}). The invariant is convention- * enforced; nothing here checks the calling thread at runtime, so a wrong-thread call would corrupt * bucket chains silently. */ @@ -27,6 +26,7 @@ final class AggregateTable { private final Hashtable.Entry[] buckets; private final int maxAggregates; + private final AggregateEntry.Canonical canonical = new AggregateEntry.Canonical(); private int size; /** @@ -37,7 +37,7 @@ final class AggregateTable { private int evictCursor; AggregateTable(int maxAggregates) { - this.buckets = Support.create(maxAggregates, Support.MAX_RATIO); + this.buckets = Hashtable.Support.create(maxAggregates, Hashtable.Support.MAX_RATIO); this.maxAggregates = maxAggregates; } @@ -55,33 +55,39 @@ boolean isEmpty() { * caller should drop the data point in that case. */ AggregateEntry findOrInsert(SpanSnapshot snapshot) { - long keyHash = AggregateEntry.hashOf(snapshot); - for (AggregateEntry candidate = Support.bucket(buckets, keyHash); + canonical.populateFrom(snapshot); + long keyHash = canonical.keyHash; + for (AggregateEntry candidate = Hashtable.Support.bucket(buckets, keyHash); candidate != null; candidate = candidate.next()) { - if (candidate.matches(keyHash, snapshot)) { + if (candidate.keyHash == keyHash && canonical.matches(candidate)) { return candidate; } } if (size >= maxAggregates && !evictOneStale()) { return null; } - AggregateEntry entry = new AggregateEntry(snapshot, keyHash); - Support.insertHeadEntry(buckets, keyHash, entry); + AggregateEntry entry = canonical.createEntry(); + Hashtable.Support.insertHeadEntry(buckets, keyHash, entry); size++; return entry; } /** * Unlinks the first entry whose {@code getHitCount() == 0}, resuming the scan from {@link - * #evictCursor} so back-to-back evictions amortize to O(1) per call. Worst case for a single call + * #evictCursor} so consecutive evictions amortize to O(1) per call. Worst case for a single call * is still O(N) when nearly every entry is hot, but a sustained eviction stream never re-scans * the hot prefix more than twice across N evictions. * - *

The semantic intent: at cap with all entries live, drop the new key (reported via {@code - * onStatsAggregateDropped}) rather than evicting an established one. Cap is sized to the - * steady-state working set, so eviction is rare; this cursor optimization handles the - * pathological "persistently at cap" case. + *

If the table is full and every entry was used in this cycle, drop the new key (reported via + * {@code onStatsAggregateDropped}) rather than evicting an established one. Cap is sized to the + * steady-state working set, so eviction is rare in the common case. + * + *

How often this fires depends on {@link AggregateEntry#LIMITS_ENABLED}. With limits enabled, + * over-cap values for a given field collapse into a shared {@code blocked_by_tracer} bucket, so + * the table itself rarely reaches {@code maxAggregates}. With limits disabled (the default), + * over-cap values flow to distinct buckets and {@code maxAggregates} becomes the load-bearing + * backstop -- the cursor-resumed scan was added specifically for this regime. */ private boolean evictOneStale() { // Two passes -- [cursor, length) then [0, cursor) -- using the half-open-range iterator. The @@ -93,7 +99,7 @@ private boolean evictOneStale() { /** Scans {@code [startBucket, endBucket)} for the first stale entry and unlinks it. */ private boolean evictOneStaleInRange(int startBucket, int endBucket) { MutatingTableIterator iter = - Support.mutatingTableIterator(buckets, startBucket, endBucket); + Hashtable.Support.mutatingTableIterator(buckets, startBucket, endBucket); while (iter.hasNext()) { AggregateEntry e = iter.next(); if (e.getHitCount() == 0) { @@ -108,7 +114,7 @@ private boolean evictOneStaleInRange(int startBucket, int endBucket) { } void forEach(Consumer consumer) { - Support.forEach(buckets, consumer); + Hashtable.Support.forEach(buckets, consumer); } /** @@ -117,12 +123,13 @@ void forEach(Consumer consumer) { * plus whatever side-band state it needs as {@code context}. */ void forEach(T context, BiConsumer consumer) { - Support.forEach(buckets, context, consumer); + Hashtable.Support.forEach(buckets, context, consumer); } /** Removes entries whose {@code getHitCount() == 0}. */ void expungeStaleAggregates() { - for (MutatingTableIterator iter = Support.mutatingTableIterator(buckets); + for (MutatingTableIterator iter = + Hashtable.Support.mutatingTableIterator(buckets); iter.hasNext(); ) { AggregateEntry e = iter.next(); if (e.getHitCount() == 0) { @@ -133,7 +140,7 @@ void expungeStaleAggregates() { } void clear() { - Support.clear(buckets); + Hashtable.Support.clear(buckets); size = 0; evictCursor = 0; } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index d809d452522..8a33d3f1ea7 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -30,10 +30,10 @@ final class Aggregator implements Runnable { /** * Per-cycle hook run on the aggregator thread at the start of each report cycle, before the - * flush. Used by {@link ConflatingMetricsAggregator} to reconcile its cached peer-tag schema - * against {@link datadog.communication.ddagent.DDAgentFeaturesDiscovery}; running before the - * flush guarantees that any test awaiting {@code writer.finishBucket()} observes the schema in - * its post-reconcile state. May be {@code null}. + * flush. Used by {@link ClientStatsAggregator} to reconcile its cached peer-tag schema against + * {@link datadog.communication.ddagent.DDAgentFeaturesDiscovery}; running before the flush + * guarantees that any test awaiting {@code writer.finishBucket()} observes the schema in its + * post-reconcile state. May be {@code null}. */ private final Runnable onReportCycle; diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java similarity index 75% rename from dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java rename to dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java index 895ee434854..5b571b2a661 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ClientStatsAggregator.java @@ -4,7 +4,6 @@ import static datadog.trace.api.DDSpanTypes.RPC; import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_ENDPOINT; import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_METHOD; -import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; import static datadog.trace.common.metrics.AggregateEntry.ERROR_TAG; import static datadog.trace.common.metrics.AggregateEntry.TOP_LEVEL_TAG; import static datadog.trace.common.metrics.SignalItem.ClearSignal.CLEAR; @@ -40,14 +39,14 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public final class ConflatingMetricsAggregator implements MetricsAggregator, EventListener { +public final class ClientStatsAggregator implements MetricsAggregator, EventListener { - private static final Logger log = LoggerFactory.getLogger(ConflatingMetricsAggregator.class); + private static final Logger log = LoggerFactory.getLogger(ClientStatsAggregator.class); private static final Map DEFAULT_HEADERS = Collections.singletonMap(DDAgentApi.DATADOG_META_TRACER_VERSION, DDTraceCoreInfo.VERSION); - private static final CharSequence SYNTHETICS_ORIGIN = "synthetics"; + private static final String SYNTHETICS_ORIGIN = "synthetics"; private static final SpanKindFilter METRICS_ELIGIBLE_KINDS = SpanKindFilter.builder() @@ -75,25 +74,26 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve private final boolean includeEndpointInMetrics; /** - * Cached peer-aggregation schema. Producers read this reference once per trace and pass it - * through to the consumer in {@link SpanSnapshot}; they never inspect the schema's discovery - * state or rebuild it. Reconciliation is the aggregator thread's job: {@link - * #reconcilePeerTagSchema()} compares the schema's {@link PeerTagSchema#state} against {@link - * DDAgentFeaturesDiscovery#state()} once per reporting cycle and either updates the state in - * place (when the tag set is unchanged) or swaps in a freshly-built schema. + * Cached peer-aggregation schema read by producer threads. * - *

{@code null} only on the bootstrap window before {@link #bootstrapPeerTagSchema()} runs on - * the first publish. + *

Producers read this reference once per trace and store it on each {@link SpanSnapshot}. They + * do not inspect the schema's discovery state or rebuild it. The aggregator thread reconciles the + * schema once per reporting cycle in {@link #resetCardinalityHandlers()}: if feature discovery + * has a new state but the tag set is unchanged, it updates {@link PeerTagSchema#state} in-place + * to preserve the warm cardinality handlers; otherwise it swaps in a freshly-built schema. * - *

{@code volatile} so the consumer's reconcile-time replacement is visible to producer - * threads; the schema's own internal mutable state ({@link PeerTagSchema#state}) is exercised + *

An empty schema (size 0) represents the "peer tags unconfigured" state; {@code null} only on + * the bootstrap window before {@link #bootstrapPeerTagSchema()} runs on the first publish. + * + *

{@code volatile} so producer threads always see the latest replacement. The schema's own + * internal mutable state (handlers, block counters, {@link PeerTagSchema#state}) is exercised * only on the aggregator thread. */ private volatile PeerTagSchema cachedPeerTagSchema; private volatile AgentTaskScheduler.Scheduled cancellation; - public ConflatingMetricsAggregator( + public ClientStatsAggregator( Config config, SharedCommunicationObjects sharedCommunicationObjects, HealthMetrics healthMetrics) { @@ -114,7 +114,7 @@ public ConflatingMetricsAggregator( config.isTraceResourceRenamingEnabled()); } - ConflatingMetricsAggregator( + ClientStatsAggregator( WellKnownTags wellKnownTags, Set ignoredResources, DDAgentFeaturesDiscovery features, @@ -136,7 +136,7 @@ public ConflatingMetricsAggregator( includeEndpointInMetrics); } - ConflatingMetricsAggregator( + ClientStatsAggregator( WellKnownTags wellKnownTags, Set ignoredResources, DDAgentFeaturesDiscovery features, @@ -160,7 +160,7 @@ public ConflatingMetricsAggregator( includeEndpointInMetrics); } - ConflatingMetricsAggregator( + ClientStatsAggregator( Set ignoredResources, DDAgentFeaturesDiscovery features, HealthMetrics healthMetric, @@ -185,7 +185,7 @@ public ConflatingMetricsAggregator( reportingInterval, timeUnit, healthMetric, - this::reconcilePeerTagSchema); + this::resetCardinalityHandlers); this.thread = newAgentThread(METRICS_AGGREGATOR, aggregator); this.reportingInterval = reportingInterval; this.reportingIntervalTimeUnit = timeUnit; @@ -267,17 +267,26 @@ public boolean publish(List> trace) { boolean forceKeep = false; int counted = 0; if (features.supportsMetrics()) { + // Producer-side fast path: one volatile read and use whatever schema is currently cached. + // The aggregator thread keeps this schema in sync with feature discovery in + // resetCardinalityHandlers(). The only producer-side rebuild is the one-time bootstrap on + // the first publish. + PeerTagSchema peerTagSchema = cachedPeerTagSchema; + if (peerTagSchema == null) { + peerTagSchema = bootstrapPeerTagSchema(); + } for (CoreSpan span : trace) { boolean isTopLevel = span.isTopLevel(); if (shouldComputeMetric(span, isTopLevel)) { final CharSequence resourceName = span.getResourceName(); - if (resourceName != null && ignoredResources.contains(resourceName.toString())) { + if (resourceName != null + && !ignoredResources.isEmpty() + && ignoredResources.contains(resourceName.toString())) { // skip publishing all children - forceKeep = false; break; } counted++; - forceKeep |= publish(span, isTopLevel); + forceKeep |= publish(span, isTopLevel, peerTagSchema); } } healthMetrics.onClientStatTraceComputed(counted, trace.size(), !forceKeep); @@ -292,20 +301,13 @@ private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { && span.getDurationNano() > 0; } - private boolean publish(CoreSpan span, boolean isTopLevel) { - // Error decision drives force-keep sampling regardless of whether the snapshot gets queued. + private boolean publish(CoreSpan span, boolean isTopLevel, PeerTagSchema peerTagSchema) { boolean error = span.getError() > 0; - - // Fast-path the inbox-full case before any tag extraction or snapshot allocation. size() is - // approximate on jctools' MPSC queue but that's fine: if we under-estimate, we fall through - // and let inbox.offer be the source of truth (existing behavior); if we over-estimate, we - // drop a snapshot that would have fit -- acceptable, onStatsInboxFull was going to fire - // imminently anyway. + // size() is approximate on jctools MPSC queues but good enough for a fast-path overflow check. if (inbox.size() >= inbox.capacity()) { healthMetrics.onStatsInboxFull(); return error; } - // Extract HTTP method and endpoint only if the feature is enabled String httpMethod = null; String httpEndpoint = null; @@ -322,20 +324,23 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { Object grpcStatusObj = span.unsafeGetTag(InstrumentationTags.GRPC_STATUS_CODE); grpcStatusCode = grpcStatusObj != null ? grpcStatusObj.toString() : null; } - // CharSequence default keeps unsafeGetTag's generic at CharSequence so UTF8BytesString - // tag values don't trigger a ClassCastException on the String assignment. - final String spanKind = span.unsafeGetTag(SPAN_KIND, (CharSequence) "").toString(); + // DDSpan resolves this from a cached span.kind ordinal via a small lookup array, skipping a + // tag-map lookup. Other CoreSpan impls fall back to the tag map by default. + String spanKind = span.getSpanKindString(); + if (spanKind == null) { + spanKind = ""; + } long tagAndDuration = span.getDurationNano() | (error ? ERROR_TAG : 0L) | (isTopLevel ? TOP_LEVEL_TAG : 0L); - PeerTagSchema peerTagSchema = peerTagSchemaFor(span); + PeerTagSchema spanPeerTagSchema = peerTagSchemaFor(spanKind, peerTagSchema); String[] peerTagValues = - peerTagSchema == null ? null : capturePeerTagValues(span, peerTagSchema); + spanPeerTagSchema == null ? null : capturePeerTagValues(span, spanPeerTagSchema); if (peerTagValues == null) { - // No tags fired -- drop the schema reference so the consumer doesn't bother iterating an - // all-null array. - peerTagSchema = null; + // capture returned no non-null values -- drop the schema reference so the consumer doesn't + // bother iterating an all-null array. + spanPeerTagSchema = null; } SpanSnapshot snapshot = @@ -349,7 +354,7 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { isSynthetic(span), span.getParentId() == 0, spanKind, - peerTagSchema, + spanPeerTagSchema, peerTagValues, httpMethod, httpEndpoint, @@ -362,26 +367,6 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { return error; } - /** - * Picks the peer-tag schema for a span. For internal-kind spans we always use the static {@link - * PeerTagSchema#INTERNAL} singleton (one entry for {@code base.service}); for {@code - * client}/{@code producer}/{@code consumer} kinds we use the cached peer-aggregation schema - * synced from {@link DDAgentFeaturesDiscovery#peerTags()}. Other kinds get {@code null}. - */ - private PeerTagSchema peerTagSchemaFor(CoreSpan span) { - if (span.isKind(PEER_AGGREGATION_KINDS)) { - PeerTagSchema schema = cachedPeerTagSchema; - if (schema == null) { - schema = bootstrapPeerTagSchema(); - } - return schema.size() > 0 ? schema : null; - } - if (span.isKind(INTERNAL_KIND)) { - return PeerTagSchema.INTERNAL; - } - return null; - } - /** * One-time producer-side bootstrap of {@link #cachedPeerTagSchema}. Synchronized double-check * guards against two producers racing on the very first publish; after this returns, {@code @@ -416,14 +401,32 @@ private PeerTagSchema buildPeerTagSchema() { return PeerTagSchema.of(names == null ? Collections.emptySet() : names, state); } + /** + * Single reset hook invoked on the aggregator thread at the end of each report cycle. Reconciles + * the cached peer-tag schema against the latest feature discovery, then resets all cardinality + * state in lockstep: the static property handlers + {@code PeerTagSchema.INTERNAL} (via {@link + * AggregateEntry#resetCardinalityHandlers()}) and the cached peer-tag schema (with whatever + * reconciliation just produced). New handlers added anywhere in this pipeline should be reset + * from here. + */ + private void resetCardinalityHandlers() { + reconcilePeerTagSchema(); + AggregateEntry.resetCardinalityHandlers(healthMetrics); + PeerTagSchema schema = cachedPeerTagSchema; + if (schema != null) { + schema.resetHandlers(healthMetrics); + } + } + /** * Reconciles {@link #cachedPeerTagSchema} with the latest feature discovery. Runs on the * aggregator thread once per reporting cycle via the reset hook passed to {@link Aggregator}. * Cheap fast path: an equality check against the cached schema's embedded {@link * DDAgentFeaturesDiscovery#state()} hash short-circuits when discovery's response hasn't changed * since the schema was built. On mismatch, a set compare distinguishes "discovery response - * changed but peer tags are the same" (just update the cached state in place) from "tags actually - * changed" (build a new schema and swap the volatile reference). + * changed but peer tags are the same" (just update the cached state in place to preserve the warm + * cardinality handlers) from "tags actually changed" (build a new schema and swap the volatile + * reference). */ private void reconcilePeerTagSchema() { PeerTagSchema cached = cachedPeerTagSchema; @@ -436,19 +439,37 @@ private void reconcilePeerTagSchema() { return; } Set latestNames = features.peerTags(); - Set normalized = latestNames == null ? Collections.emptySet() : latestNames; + Set normalized = latestNames == null ? Collections.emptySet() : latestNames; if (cached.hasSameTagsAs(normalized)) { cached.state = latestState; } else { + // Tags actually changed: flush the outgoing schema's accumulated block telemetry before + // discarding it, otherwise the partial-cycle blockedCounts would silently disappear. + cached.resetHandlers(healthMetrics); cachedPeerTagSchema = PeerTagSchema.of(normalized, latestState); } } /** - * Captures the span's peer-tag values into a {@code String[]} parallel to {@code schema.names}. - * Slots remain {@code null} for tags the span didn't set; the array itself is lazily allocated on - * the first hit so spans that fire no peer tags pay zero allocation. Returns {@code null} when - * none of the configured peer tags are set on the span. + * Picks the peer-tag schema for a span. The {@code peerTagSchema} argument is the per-trace + * cached schema (read once in {@link #publish(List)} via the volatile {@link + * #cachedPeerTagSchema}, with {@link #bootstrapPeerTagSchema()} taking care of the first-publish + * window) -- always non-null but possibly empty when peer tags are unconfigured. For + * internal-kind spans the static {@link PeerTagSchema#INTERNAL} schema is used regardless. + */ + private static PeerTagSchema peerTagSchemaFor(String spanKind, PeerTagSchema peerTagSchema) { + if (peerTagSchema.size() > 0 && PEER_AGGREGATION_KINDS.matches(spanKind)) { + return peerTagSchema; + } + if (INTERNAL_KIND.matches(spanKind)) { + return PeerTagSchema.INTERNAL; + } + return null; + } + + /** + * Captures the span's peer tag values into a {@code String[]} parallel to {@code schema.names}. + * Returns {@code null} when none of the configured peer tags are set on the span. */ private static String[] capturePeerTagValues(CoreSpan span, PeerTagSchema schema) { String[] names = schema.names; @@ -467,7 +488,8 @@ private static String[] capturePeerTagValues(CoreSpan span, PeerTagSchema sch } private static boolean isSynthetic(CoreSpan span) { - return span.getOrigin() != null && SYNTHETICS_ORIGIN.equals(span.getOrigin().toString()); + CharSequence origin = span.getOrigin(); + return origin != null && SYNTHETICS_ORIGIN.contentEquals(origin); } public void stop() { @@ -526,11 +548,10 @@ private void disable() { } } - private static final class ReportTask - implements AgentTaskScheduler.Task { + private static final class ReportTask implements AgentTaskScheduler.Task { @Override - public void run(ConflatingMetricsAggregator target) { + public void run(ClientStatsAggregator target) { target.report(); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java new file mode 100644 index 00000000000..58dcd3a70a1 --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricCardinalityLimits.java @@ -0,0 +1,73 @@ +package datadog.trace.common.metrics; + +/** + * Per-field limits for distinct metric-key values seen during one reporting cycle. When a field + * exceeds its limit, additional values are replaced with {@code tracer_blocked_value} so they share + * one aggregate row instead of creating more rows in the table. + * + *

Values are sized to the typical-service workload with headroom; "typical" estimates are noted + * inline. Raise if a workload routinely hits the sentinel; lower carries proportional memory + * savings but risks suppressing legitimate distinctions. + */ +final class MetricCardinalityLimits { + private MetricCardinalityLimits() {} + + /** + * Distinct {@code resource.name} values per cycle. Highest-cardinality field by far: DB-query + * obfuscations, HTTP route templates, custom resources. Typical service: 30-200 unique. + */ + static final int RESOURCE = 256; + + /** + * Distinct {@code service.name} values per cycle. Local service plus downstream peer-service + * names. Microservice meshes typically reference 10-50 distinct services. + */ + static final int SERVICE = 32; + + /** + * Distinct {@code operation.name} values per cycle. Names like {@code http.request}, {@code + * db.query}, etc. Typical service: 10-30 across integrations. + */ + static final int OPERATION = 64; + + /** + * Distinct {@code _dd.base_service} override values per cycle. Used rarely; usually empty or one + * of a handful per service. + */ + static final int SERVICE_SOURCE = 16; + + /** + * Distinct {@code span.type} values per cycle. {@code DDSpanTypes} catalog is ~30; a single + * service usually spans 5-10 integration types. + */ + static final int TYPE = 16; + + /** + * Distinct {@code span.kind} values per cycle. OTel defines exactly 5 (server/client/producer/ + * consumer/internal); 8 still leaves 60% headroom in case a producer invents new kinds. + */ + static final int SPAN_KIND = 8; + + /** + * Distinct HTTP method values per cycle. Standard verbs are 7-9; WebDAV/custom adds a few more. + */ + static final int HTTP_METHOD = 16; + + /** + * Distinct {@code http.endpoint} values per cycle. Path templates -- same shape as {@code + * RESOURCE} for HTTP-heavy services. Only used when {@code includeEndpointInMetrics} is enabled. + */ + static final int HTTP_ENDPOINT = 64; + + /** + * Distinct gRPC status code values per cycle. gRPC spec defines exactly 17 codes (0-16); 24 + * leaves headroom for unknown-code edge cases without wasting space. + */ + static final int GRPC_STATUS_CODE = 24; + + /** + * Distinct values per peer-tag name (e.g. distinct {@code peer.hostname} values). Each configured + * peer tag gets its own handler at this limit. + */ + static final int PEER_TAG_VALUE = 512; +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricsAggregatorFactory.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricsAggregatorFactory.java index 09464310113..b9530871763 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricsAggregatorFactory.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricsAggregatorFactory.java @@ -15,7 +15,7 @@ public static MetricsAggregator createMetricsAggregator( HealthMetrics healthMetrics) { if (config.isTracerMetricsEnabled()) { log.debug("tracer metrics enabled"); - return new ConflatingMetricsAggregator(config, sharedCommunicationObjects, healthMetrics); + return new ClientStatsAggregator(config, sharedCommunicationObjects, healthMetrics); } log.debug("tracer metrics disabled"); return NoOpMetricsAggregator.INSTANCE; diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index d3a3d47d65a..e9eb7a6b823 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -3,17 +3,20 @@ import static datadog.trace.api.DDTags.BASE_SERVICE; import datadog.communication.ddagent.DDAgentFeaturesDiscovery; +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import datadog.trace.core.monitor.HealthMetrics; import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Names of the peer-tags eligible for client-stats aggregation, packed into a flat {@code String[]} - * for parallel-array access by producers and the aggregator thread. + * Parallel arrays of peer-tag names and their {@link TagCardinalityHandler}s, using matching + * indexes. * - *

This is the minimal carrier shape used by {@link SpanSnapshot}: the producer captures per-span - * values into a {@code String[]} parallel to {@link #names}, and the aggregator reconstructs the - * encoded {@code tag:value} pairs from the same name index. It replaces the prior "flat pairs" - * {@code [name0, value0, name1, value1, ...]} layout, which forced a worst-case allocation + - * trim-and-copy on every span. + *

Each schema stores peer-tag names and their cardinality handlers by index. Producers capture + * span tag values into a {@code String[]} with the same ordering as {@link #names}. Consumers pass + * the index and captured value to {@link #register(int, String)} to canonicalize it through the + * per-tag cardinality handler. * *

Two schemas exist: * @@ -21,35 +24,39 @@ *

  • {@link #INTERNAL} -- a singleton with one entry for {@code base.service}, used for * internal-kind spans where only the base service is aggregated. *
  • A peer-aggregation schema built via {@link #of(Set, String)} for {@code client}/{@code - * producer}/{@code consumer} spans. {@link ConflatingMetricsAggregator} caches the most - * recently built schema and reconciles it on the aggregator thread once per reporting cycle - * by comparing {@link #state} against {@link DDAgentFeaturesDiscovery#state()}. + * producer}/{@code consumer} spans. {@link ClientStatsAggregator} caches the most recently + * built schema and reconciles it on the aggregator thread once per reporting cycle by + * comparing {@link #state} against {@link DDAgentFeaturesDiscovery#state()}. * * - *

    This class deliberately has no cardinality limiters -- callers that need those layer them on - * top. + *

    Cardinality blocks are counted inside each {@link TagCardinalityHandler} and flushed once per + * cycle (with a warn log) in {@link #resetHandlers(HealthMetrics)}. * - *

    Thread-safety: {@link #names} is final and safe to read from any thread. {@link #state} - * is exercised only on the aggregator thread (read and updated in reconciliation); producer threads - * access the schema only through the volatile {@code cachedPeerTagSchema} reference in {@link - * ConflatingMetricsAggregator}. + *

    Each {@link SpanSnapshot} captures its own schema reference so producer and consumer agree on + * the indexing even if the current schema is replaced between capture and consumption. + * + *

    Thread-safety: the aggregator thread is the only thread that mutates this schema, + * including its {@link TagCardinalityHandler}s and {@link #state}. Producer threads may read {@link + * #names} and {@link #handlers} because they are final and published through the volatile {@code + * cachedPeerTagSchema} reference in {@link ClientStatsAggregator}. */ final class PeerTagSchema { - /** Singleton schema for internal-kind spans -- only {@code base.service}. */ - static final PeerTagSchema INTERNAL = - // INTERNAL is never reconciled, so the state value is irrelevant. - new PeerTagSchema(new String[] {BASE_SERVICE}, null); - - final String[] names; + private static final Logger log = LoggerFactory.getLogger(PeerTagSchema.class); /** - * Precomputed {@code Arrays.hashCode(names)}. The schema is shared across many publishes so - * recomputing it on the aggregator hot path (per-publish call to {@code AggregateEntry.hashOf}) - * was waste -- it showed up as a top aggregator-thread sample. Cached here, computed once at - * construction. + * Sentinel {@link #state} for schemas that are never reconciled against feature discovery: the + * {@link #INTERNAL} singleton and test-built schemas. A {@code null} state always mismatches a + * real discovery hash, so a schema built with it would rebuild on first reconcile -- but neither + * of these schemas takes that path. */ - final int namesHash; + static final String NO_STATE = null; + + /** Singleton schema for internal-kind spans -- only {@code base.service}. */ + static final PeerTagSchema INTERNAL = new PeerTagSchema(new String[] {BASE_SERVICE}, NO_STATE); + + final String[] names; + final TagCardinalityHandler[] handlers; /** * The {@code DDAgentFeaturesDiscovery.state()} hash this schema was built from. The aggregator @@ -60,30 +67,27 @@ final class PeerTagSchema { */ String state; - private PeerTagSchema(String[] names, String state) { - this.names = names; - this.namesHash = java.util.Arrays.hashCode(names); - this.state = state; - } - /** Builds a schema for the given peer-tag names. Order is determined by the {@link Set}. */ - static PeerTagSchema of(Set tags, String state) { - return new PeerTagSchema(tags.toArray(new String[0]), state); + static PeerTagSchema of(Set names, String state) { + return new PeerTagSchema(names.toArray(new String[0]), state); } - /** - * Test-only factory that takes the names array directly so tests can build a schema in a specific - * order without going through a {@link Set}. - */ - static PeerTagSchema testSchema(String[] names) { - return new PeerTagSchema(names, null); + PeerTagSchema(String[] names, String state) { + this.names = names; + this.state = state; + this.handlers = new TagCardinalityHandler[names.length]; + for (int i = 0; i < names.length; i++) { + this.handlers[i] = + new TagCardinalityHandler( + names[i], MetricCardinalityLimits.PEER_TAG_VALUE, AggregateEntry.LIMITS_ENABLED); + } } /** - * Whether this schema's tag names exactly match {@code other}. Used by the aggregator's reconcile - * path: when a feature discovery refresh changes {@link DDAgentFeaturesDiscovery#state()} but the - * resulting set is unchanged, the aggregator can keep this schema and just update {@link #state} - * instead of rebuilding. + * Whether this schema contains exactly the same tag names as {@code other}. Used during + * reconciliation: if feature discovery has a new {@link DDAgentFeaturesDiscovery#state()} but the + * peer-tag set is unchanged, the aggregator can reuse this schema and update {@link #state} + * instead of rebuilding the handlers. */ boolean hasSameTagsAs(Set other) { if (this.names.length != other.size()) { @@ -97,7 +101,38 @@ boolean hasSameTagsAs(Set other) { return true; } + /** + * Canonicalizes the peer-tag value at slot {@code i}. Returns {@link UTF8BytesString#EMPTY} for + * null inputs and the handler's {@code ":tracer_blocked_value"} sentinel when the per-tag + * cardinality budget is exhausted. + */ + UTF8BytesString register(int i, String value) { + return handlers[i].register(value); + } + + /** + * Resets every {@link TagCardinalityHandler}'s working set, flushes accumulated per-tag block + * counts to {@link HealthMetrics}, and emits a warn log for each tag that hit its limit this + * cycle. Must be called on the aggregator thread; handlers are not thread-safe. + */ + void resetHandlers(HealthMetrics healthMetrics) { + for (int i = 0; i < handlers.length; i++) { + long blocked = handlers[i].reset(); + if (blocked > 0) { + log.warn( + "Cardinality limit reached for peer tag '{}'; further values are reported as" + + " 'tracer_blocked_value' until the next reporting cycle", + names[i]); + healthMetrics.onTagCardinalityBlocked(handlers[i].statsDTag(), blocked); + } + } + } + int size() { return names.length; } + + String name(int i) { + return names[i]; + } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyCardinalityHandler.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyCardinalityHandler.java new file mode 100644 index 00000000000..ca86f0be8f6 --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PropertyCardinalityHandler.java @@ -0,0 +1,181 @@ +package datadog.trace.common.metrics; + +import datadog.trace.api.internal.VisibleForTesting; +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import java.util.Arrays; + +/** + * Cardinality-capped UTF8 encoder and cache for one aggregate-key field ({@code value} → + * {@code UTF8(value)}). + * + *

    Each reporting cycle (interval between client-stats flushes) has its own cardinality budget. + * Once the budget is exhausted, new values get the {@code tracer_blocked_value} sentinel (or a + * fresh allocation when sentinel mode is disabled). A prior-cycle table preserves {@link + * UTF8BytesString} instances across the reset, so stable workloads pay zero allocations after the + * first cycle. + * + *

    Dual role -- limiter and cache. Prior versions ran a per-field {@code DDCache} for UTF8 + * reuse with a separate global cardinality cap on top. Under high load that wasn't enough to stave + * off long GC cycles: every miss still concatenated / UTF8-encoded the value before the cache could + * store it. A cardinality limiter and a recent-value cache are both sets of recently used + * values, so this class collapses them into one structure. Cardinality limiting happens first, + * which lets the blocked path skip the concatenation and encoding entirely. + * + *

    A pure limiter would fully reset each reporting cycle and destroy the cache. To preserve UTF8 + * reuse across resets, the handler keeps the previous cycle's entries verbatim in a parallel table + * and reuses any matching {@link UTF8BytesString} when a value first appears in the new cycle. + */ +final class PropertyCardinalityHandler { + // Upper bound prevents int overflow in the (cardinalityLimit * 2 - 1) capacity calculation. + // Practical limits are 8..512; this cap is well beyond any realistic configuration. + private static final int MAX_CARDINALITY_LIMIT = 1 << 29; + + final String name; + private final int cardinalityLimit; + private final int capacityMask; + + /** + * Whether to substitute the {@code tracer_blocked_value} sentinel when the per-cycle budget is + * exhausted. With limits enabled (sentinel mode), overflow values collapse to one bucket; with + * limits disabled, the cache size is still bounded by {@link #cardinalityLimit} but over-budget + * values get freshly-allocated {@link UTF8BytesString}s instead, so the wire format carries the + * real value and entries don't collapse. Prior-cycle reuse runs in either mode. + */ + private final boolean useBlockedSentinel; + + // Single open-addressed table per cycle. The stored UTF8BytesString IS the slot identity -- + // equality is checked by comparing its underlying String against the incoming CharSequence. + private UTF8BytesString[] curValues; + // Values from the previous reporting cycle, kept so values that persist across cycles can reuse + // their UTF8BytesString instance without re-allocating. + private UTF8BytesString[] priorValues; + private int curSize; + + private UTF8BytesString cacheBlocked = null; + private String[] statsDTag = null; + + /** Accumulated block count for the current cycle. Returned and zeroed by {@link #reset()}. */ + private long blockedCount; + + /** + * Test convenience: limits-enabled mode (blocked sentinel substitution active). Production uses + * the three-argument constructor with the flag from {@code Config}. + */ + @VisibleForTesting + PropertyCardinalityHandler(String name, int cardinalityLimit) { + this(name, cardinalityLimit, true); + } + + PropertyCardinalityHandler(String name, int cardinalityLimit, boolean useBlockedSentinel) { + this.name = name; + if (cardinalityLimit <= 0) { + throw new IllegalArgumentException("cardinalityLimit must be positive: " + cardinalityLimit); + } + if (cardinalityLimit > MAX_CARDINALITY_LIMIT) { + throw new IllegalArgumentException( + "cardinalityLimit must be at most " + MAX_CARDINALITY_LIMIT + ": " + cardinalityLimit); + } + this.cardinalityLimit = cardinalityLimit; + this.useBlockedSentinel = useBlockedSentinel; + // Capacity = next power of two >= 2 * cardinalityLimit. Linear-probing load factor stays + // <= 0.5 even when the budget is full, which keeps probe chains short. + final int capacity = Integer.highestOneBit(cardinalityLimit * 2 - 1) << 1; + this.capacityMask = capacity - 1; + this.curValues = new UTF8BytesString[capacity]; + this.priorValues = new UTF8BytesString[capacity]; + } + + /** + * Canonicalizes {@code value} through the cardinality budget and per-cycle reuse cache. Null + * inputs map to {@link UTF8BytesString#EMPTY} -- {@link AggregateEntry} doesn't need to + * pre-check. + * + *

    The value hash is computed once and used as the initial probe slot in both tables. {@code h + * ^ (h >>> 16)} folds high hash bits into the low bits to reduce collisions for inputs that share + * a common low-bit pattern. {@link UTF8BytesString#hashCode} is content-stable with the + * underlying String, so a String input and a UTF8BytesString of the same content map to the same + * slot. + */ + UTF8BytesString register(CharSequence value) { + if (value == null) { + return UTF8BytesString.EMPTY; + } + // Initial table slot, used to probe current and prior tables. + int h = value.hashCode(); + int start = (h ^ (h >>> 16)) & this.capacityMask; + + // First, look in the current-cycle table. + // If found, this value already consumed cardinality budget in this cycle. + int slot = start; + UTF8BytesString existing; + while ((existing = this.curValues[slot]) != null + && existing != value + && !existing.toString().contentEquals(value)) { + slot = (slot + 1) & this.capacityMask; + } + if (existing != null) { + return existing; + } + // This value is new for the current cycle. + boolean capExhausted = this.curSize >= this.cardinalityLimit; + // If sentinel mode is enabled and the field is over budget, collapse this + // value to tracer_blocked_value and count it as blocked. + if (capExhausted && this.useBlockedSentinel) { + this.blockedCount++; + return this.tracerBlockedValue(); + } + // Otherwise, try to reuse from the prior cycle if possible to avoid re-allocation. + // Runs whether or not the current budget is exhausted, so persistent values keep their + // UTF8BytesString instance across cycles. + int priorSlot = start; + UTF8BytesString priorMatch; + while ((priorMatch = this.priorValues[priorSlot]) != null + && priorMatch != value + && !priorMatch.toString().contentEquals(value)) { + priorSlot = (priorSlot + 1) & this.capacityMask; + } + UTF8BytesString utf8 = priorMatch != null ? priorMatch : UTF8BytesString.create(value); + // If there is still budget, remember this value in the current-cycle table. + if (!capExhausted) { + this.curValues[slot] = utf8; + this.curSize += 1; + } + // If capExhausted && !useBlockedSentinel, this returns the real value but + // does not cache it in the current cycle. + return utf8; + } + + private UTF8BytesString tracerBlockedValue() { + UTF8BytesString cacheBlocked = this.cacheBlocked; + if (cacheBlocked != null) return cacheBlocked; + + this.cacheBlocked = cacheBlocked = UTF8BytesString.create("tracer_blocked_value"); + return cacheBlocked; + } + + /** + * Resets the per-cycle working set and returns the accumulated block count for this cycle. The + * caller is responsible for reporting the count to health metrics if non-zero. + */ + String[] statsDTag() { + if (statsDTag == null) { + statsDTag = new String[] {"collapsed:" + name}; + } + return statsDTag; + } + + long reset() { + long count = this.blockedCount; + this.blockedCount = 0; + // Flip pointers: the just-completed cycle becomes prior; what was prior (2 cycles ago) is + // recycled into the new (empty) current. + final UTF8BytesString[] tmp = this.priorValues; + this.priorValues = this.curValues; + this.curValues = tmp; + // Null the new current. The values pulled out of prior are still reachable through any + // AggregateEntry rows they ended up populating; this just drops the handler's references. + Arrays.fill(this.curValues, null); + this.curSize = 0; + return count; + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java index 972bd1e86ed..622a4a14cb0 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java @@ -91,7 +91,7 @@ public SerializingMetricWriter( this.buffer = new GrowableBuffer(initialCapacity); this.writer = new MsgPackWriter(buffer); this.sink = sink; - this.gitInfoProvider = new GitInfoProvider(); + this.gitInfoProvider = gitInfoProvider; } @Override @@ -151,11 +151,12 @@ public void startBucket(int metricCount, long start, long duration) { @Override public void add(AggregateEntry entry) { - // Calculate dynamic map size based on optional fields - final boolean hasHttpMethod = entry.getHttpMethod() != null; - final boolean hasHttpEndpoint = entry.getHttpEndpoint() != null; - final boolean hasServiceSource = entry.getServiceSource() != null; - final boolean hasGrpcStatusCode = entry.getGrpcStatusCode() != null; + // Dynamic map size based on optional fields; AggregateEntry encapsulates the EMPTY-as-absent + // sentinel via its hasFoo() predicates so the serializer doesn't depend on the storage choice. + final boolean hasHttpMethod = entry.hasHttpMethod(); + final boolean hasHttpEndpoint = entry.hasHttpEndpoint(); + final boolean hasServiceSource = entry.hasServiceSource(); + final boolean hasGrpcStatusCode = entry.hasGrpcStatusCode(); final int mapSize = 15 + (hasServiceSource ? 1 : 0) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java index 152ac42bb55..8bbc6a29edb 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java @@ -1,5 +1,7 @@ package datadog.trace.common.metrics; +import javax.annotation.Nullable; + /** * Immutable per-span value posted from the producer to the aggregator thread. Carries the raw * inputs the aggregator needs to look up or build an {@link AggregateEntry} and update its @@ -22,21 +24,21 @@ final class SpanSnapshot implements InboxItem { /** * Schema for {@link #peerTagValues}. {@code null} when the span has no peer tags. The schema - * carries the names in parallel-array form; {@code peerTagValues} holds the per-span tag values - * at the same indices. + * carries the names + {@link TagCardinalityHandler}s in parallel array form; {@code + * peerTagValues} holds the per-span tag values at the same indices. */ - final PeerTagSchema peerTagSchema; + @Nullable final PeerTagSchema peerTagSchema; /** * Peer tag values captured from the span, parallel to {@code peerTagSchema.names}. A {@code null} * entry means the span didn't have that peer tag set. {@code null} (the whole array) when {@link * #peerTagSchema} is {@code null}. */ - final String[] peerTagValues; + @Nullable final String[] peerTagValues; - final String httpMethod; - final String httpEndpoint; - final String grpcStatusCode; + @Nullable final String httpMethod; + @Nullable final String httpEndpoint; + @Nullable final String grpcStatusCode; /** Duration in nanoseconds, OR-ed with {@code ERROR_TAG} / {@code TOP_LEVEL_TAG} as needed. */ final long tagAndDuration; @@ -51,11 +53,11 @@ final class SpanSnapshot implements InboxItem { boolean synthetic, boolean traceRoot, String spanKind, - PeerTagSchema peerTagSchema, - String[] peerTagValues, - String httpMethod, - String httpEndpoint, - String grpcStatusCode, + @Nullable PeerTagSchema peerTagSchema, + @Nullable String[] peerTagValues, + @Nullable String httpMethod, + @Nullable String httpEndpoint, + @Nullable String grpcStatusCode, long tagAndDuration) { this.resourceName = resourceName; this.serviceName = serviceName; diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java new file mode 100644 index 00000000000..3c3d151142c --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/TagCardinalityHandler.java @@ -0,0 +1,170 @@ +package datadog.trace.common.metrics; + +import datadog.trace.api.internal.VisibleForTesting; +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import java.util.Arrays; + +/** + * Cardinality-capped UTF8 encoder and cache for one peer-tag name ({@code value} → {@code + * UTF8("tag:value")}). + * + *

    Same per-cycle budget and prior-cycle reuse as {@link PropertyCardinalityHandler}. The + * difference is that the cached output is the pre-encoded {@code "tag:value"} string, so a parallel + * raw-value keys table is needed alongside the UTF8 values table. + */ +final class TagCardinalityHandler { + // Upper bound prevents int overflow in the (cardinalityLimit * 2 - 1) capacity calculation. + // Practical limits are 8..512; this cap is well beyond any realistic configuration. + private static final int MAX_CARDINALITY_LIMIT = 1 << 29; + + private final String tag; + private String[] statsDTag = null; + private final int cardinalityLimit; + private final int capacityMask; + + /** See {@link PropertyCardinalityHandler}'s field of the same name. */ + private final boolean useBlockedSentinel; + + private String[] curKeys; + private UTF8BytesString[] curValues; + private String[] priorKeys; + private UTF8BytesString[] priorValues; + private int curSize; + + private UTF8BytesString cacheBlocked = null; + + /** Accumulated block count for the current cycle. Returned and zeroed by {@link #reset()}. */ + private long blockedCount; + + /** + * Test convenience: limits-enabled mode. Production uses the three-argument constructor with the + * flag from {@code Config}. + */ + @VisibleForTesting + TagCardinalityHandler(String tag, int cardinalityLimit) { + this(tag, cardinalityLimit, true); + } + + TagCardinalityHandler(String tag, int cardinalityLimit, boolean useBlockedSentinel) { + if (cardinalityLimit <= 0) { + throw new IllegalArgumentException("cardinalityLimit must be positive: " + cardinalityLimit); + } + if (cardinalityLimit > MAX_CARDINALITY_LIMIT) { + throw new IllegalArgumentException( + "cardinalityLimit must be at most " + MAX_CARDINALITY_LIMIT + ": " + cardinalityLimit); + } + this.tag = tag; + this.cardinalityLimit = cardinalityLimit; + this.useBlockedSentinel = useBlockedSentinel; + final int capacity = Integer.highestOneBit(cardinalityLimit * 2 - 1) << 1; + this.capacityMask = capacity - 1; + this.curKeys = new String[capacity]; + this.curValues = new UTF8BytesString[capacity]; + this.priorKeys = new String[capacity]; + this.priorValues = new UTF8BytesString[capacity]; + } + + /** + * Returns the UTF8 value to use for {@code value} in the current reporting cycle. Null inputs are + * returned as {@link UTF8BytesString#EMPTY}. + * + *

    The value hash is computed once and used as the initial probe slot in both tables. The + * {@code h ^ (h >>> 16)} calculation folds high hash bits into the low bits, which reduces + * clustering when values share similar low-bit hash patterns. + */ + @SuppressFBWarnings( + value = "ES_COMPARING_PARAMETER_STRING_WITH_EQ", + justification = + "Intentional identity fast-path: the reference check short-circuits the .equals() call" + + " when the stored key and probe value are the same instance.") + UTF8BytesString register(String value) { + if (value == null) { + return UTF8BytesString.EMPTY; + } + // Compute the initial probe slot once. The same start slot is used for the + // current-cycle table and, on miss, for the prior-cycle table. + int h = value.hashCode(); + int start = (h ^ (h >>> 16)) & this.capacityMask; + + // Look for the raw value in the current-cycle table. + int slot = start; + String existing; + while ((existing = this.curKeys[slot]) != null + && existing != value + && !existing.equals(value)) { + slot = (slot + 1) & this.capacityMask; + } + // If found, return the already encoded "tag:value" UTF8 value. + if (existing != null) { + return this.curValues[slot]; + } + // This value is new for the current cycle. + boolean capExhausted = this.curSize >= this.cardinalityLimit; + // If sentinel mode is enabled and the tag has reached its value budget, + // collapse this value to "tag:tracer_blocked_value" and record the block. + if (capExhausted && this.useBlockedSentinel) { + this.blockedCount++; + return this.tracerBlockedValue(); + } + // Try to find the same raw value in the previous-cycle table so the encoded + // UTF8 object can be reused after a reset. + int priorSlot = start; + String priorKey; + while ((priorKey = this.priorKeys[priorSlot]) != null + && priorKey != value + && !priorKey.equals(value)) { + priorSlot = (priorSlot + 1) & this.capacityMask; + } + // Reuse the previous encoded "tag:value" UTF8 value if present; otherwise + // create it from the fixed tag name and the raw value. + UTF8BytesString utf8 = + priorKey != null + ? this.priorValues[priorSlot] + : UTF8BytesString.create(this.tag + ":" + value); + // If still within budget, remember the raw value and its encoded UTF8 + // output in the current-cycle table. + if (!capExhausted) { + this.curKeys[slot] = value; + this.curValues[slot] = utf8; + this.curSize += 1; + } + // If capExhausted && !useBlockedSentinel, return the real "tag:value" value + // without caching it in the current cycle. + return utf8; + } + + private UTF8BytesString tracerBlockedValue() { + UTF8BytesString cacheBlocked = this.cacheBlocked; + if (cacheBlocked != null) return cacheBlocked; + + this.cacheBlocked = cacheBlocked = UTF8BytesString.create(this.tag + ":tracer_blocked_value"); + return cacheBlocked; + } + + String[] statsDTag() { + if (statsDTag == null) { + statsDTag = new String[] {"collapsed:" + tag}; + } + return statsDTag; + } + + /** + * Resets the per-cycle working set and returns the accumulated block count for this cycle. The + * caller is responsible for reporting the count to health metrics if non-zero. + */ + long reset() { + long count = this.blockedCount; + this.blockedCount = 0; + final String[] tmpKeys = this.priorKeys; + final UTF8BytesString[] tmpValues = this.priorValues; + this.priorKeys = this.curKeys; + this.priorValues = this.curValues; + this.curKeys = tmpKeys; + this.curValues = tmpValues; + Arrays.fill(this.curKeys, null); + Arrays.fill(this.curValues, null); + this.curSize = 0; + return count; + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/core/CoreSpan.java b/dd-trace-core/src/main/java/datadog/trace/core/CoreSpan.java index 4d0d8c87f99..dd3b2bc6e06 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/CoreSpan.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/CoreSpan.java @@ -1,6 +1,7 @@ package datadog.trace.core; import datadog.trace.api.DDTraceId; +import datadog.trace.bootstrap.instrumentation.api.Tags; import java.util.Map; public interface CoreSpan> { @@ -76,6 +77,16 @@ public interface CoreSpan> { boolean isKind(SpanKindFilter filter); + /** + * Returns the {@code span.kind} tag value as a String, or {@code null} if not set. Default + * implementation reads the tag map; {@link DDSpan} overrides to use a cached ordinal that + * resolves via a small lookup array, skipping the tag-map lookup on the hot path. + */ + default String getSpanKindString() { + Object v = unsafeGetTag(Tags.SPAN_KIND); + return v == null ? null : v.toString(); + } + CharSequence getType(); /** diff --git a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java index 28f9e39c710..5296c014aa5 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/CoreTracer.java @@ -103,7 +103,6 @@ import datadog.trace.core.propagation.PropagationTags; import datadog.trace.core.propagation.TracingPropagator; import datadog.trace.core.propagation.XRayPropagator; -import datadog.trace.core.propagation.opg.OrgGuard; import datadog.trace.core.scopemanager.ContinuableScopeManager; import datadog.trace.core.servicediscovery.ServiceDiscovery; import datadog.trace.core.servicediscovery.ServiceDiscoveryFactory; @@ -825,21 +824,11 @@ private CoreTracer( sharedCommunicationObjects.whenReady(this.dataStreamsMonitoring::start); - propagationTagsFactory = PropagationTags.factory(config); - // Register context propagators - HttpCodec.Extractor baseExtractor = + HttpCodec.Extractor tracingExtractor = extractor == null ? HttpCodec.createExtractor(config, this::captureTraceConfig) : extractor; - OrgGuard orgGuard = - OrgGuard.create( - config, - featuresDiscovery::getOrgPropagationMarker, - propagationTagsFactory, - this.healthMetrics); - HttpCodec.Extractor tracingExtractor = orgGuard.decorateExtractor(baseExtractor); - HttpCodec.Injector tracingInjector = orgGuard.decorateInjector(injector); TracingPropagator tracingPropagator = - new TracingPropagator(config.isApmTracingEnabled(), tracingInjector, tracingExtractor); + new TracingPropagator(config.isApmTracingEnabled(), injector, tracingExtractor); Propagators.register(TRACING_CONCERN, tracingPropagator); Propagators.register(XRAY_TRACING_CONCERN, new XRayPropagator(config), false); if (config.isDataStreamsEnabled()) { @@ -897,6 +886,7 @@ private CoreTracer( StatusLogger.logStatus(config); + propagationTagsFactory = PropagationTags.factory(config); this.profilingContextIntegration = profilingContextIntegration; this.injectBaggageAsTags = injectBaggageAsTags; this.injectLinksAsTags = injectLinksAsTags; diff --git a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java index 33206b491b8..c9a15901c03 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java @@ -966,6 +966,11 @@ public boolean isKind(SpanKindFilter filter) { return filter.matches(context.getSpanKindOrdinal()); } + @Override + public String getSpanKindString() { + return context.getSpanKindString(); + } + @Override public void copyPropagationAndBaggage(final AgentSpan source) { if (source instanceof DDSpan) { diff --git a/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java b/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java index 33e26267eaa..e506732777f 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java @@ -105,6 +105,15 @@ public void onStatsAggregateDropped() {} */ public void onStatsInboxFull() {} + /** + * Reports a batch of {@code count} tag values collapsed into the {@code blocked_by_tracer} + * sentinel for {@code tag} during the just-completed reporting cycle (per-tag cardinality budget + * exhausted, or per-value length cap exceeded). Called from the aggregator thread once per + * affected tag at cycle reset, so the implementation can do a single counter update rather than + * one per blocked value. + */ + public void onTagCardinalityBlocked(String[] statsDTag, long count) {} + /** * @return Human-readable summary of the current health metrics. */ diff --git a/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java b/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java index 2716d19e967..f9c76ff0766 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java @@ -31,6 +31,7 @@ public class TracerHealthMetrics extends HealthMetrics implements AutoCloseable httpStatus -> new String[] {"status:" + httpStatus}; private static final String[] NO_TAGS = new String[0]; + private static final String[] COLLAPSED_WHOLE_KEY_TAGS = new String[] {"collapsed:whole_key"}; private static final String[] STATUS_OK_TAGS = STATUS_TAGS.apply(200); private final RadixTreeCache statusTagsCache = new RadixTreeCache<>(16, 32, STATUS_TAGS, 200, 400); @@ -372,6 +373,7 @@ public void onClientStatErrorReceived() { @Override public void onStatsAggregateDropped() { statsAggregateDropped.increment(); + statsd.count("datadog.tracer.stats.collapsed_spans", 1, COLLAPSED_WHOLE_KEY_TAGS); } @Override @@ -379,6 +381,11 @@ public void onStatsInboxFull() { statsInboxFull.increment(); } + @Override + public void onTagCardinalityBlocked(String[] statsDTag, long count) { + statsd.count("datadog.tracer.stats.collapsed_spans", count, statsDTag); + } + @Override public void close() { if (null != cancellation) { diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy similarity index 92% rename from dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy rename to dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy index 00bd706b8fb..2f409d7baa5 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ClientStatsAggregatorTest.groovy @@ -18,7 +18,11 @@ import java.util.concurrent.TimeoutException import java.util.function.Supplier import spock.lang.Shared -class ConflatingMetricAggregatorTest extends DDSpecification { +class ClientStatsAggregatorTest extends DDSpecification { + + def setup() { + AggregateEntry.resetCardinalityHandlers() + } static Set empty = new HashSet<>() @@ -35,7 +39,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true WellKnownTags wellKnownTags = new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language") - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = new ClientStatsAggregator( wellKnownTags, empty, features, @@ -65,7 +69,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true WellKnownTags wellKnownTags = new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language") - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = new ClientStatsAggregator( wellKnownTags, [ignoredResourceName].toSet(), features, @@ -103,7 +107,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() @@ -151,7 +155,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() @@ -199,7 +203,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, true) aggregator.start() @@ -259,46 +263,34 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "client" | "GET" | "/external/api" | true } - def "should create bucket for each set of peer tags"() { + def "should create separate buckets for distinct peer tag values"() { + // Peer-tag NAMES are configured per-tracer and stable for the duration of a trace publish; + // peer-tag VALUES vary per-span. Two spans with the same names but different values should + // produce two distinct aggregate buckets. setup: - // Peer-tag schema is reconciled with feature discovery once per reporting cycle (on the - // aggregator thread, in the post-report hook), not per-span on the producer. Drive two - // reporting cycles with different peerTags() configurations to verify the aggregator buckets - // each cycle by the schema that was current at publish time. MetricWriter writer = Mock(MetricWriter) Sink sink = Stub(Sink) DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true - features.peerTags() >>> [["country"], ["country", "georegion"]] - // Bump the discovered state hash so reconcile during report cycle 1 sees a mismatch and - // rebuilds the schema for span 2. Three calls: bootstrap (span1's publish), reconcile-during- - // report-1 (mismatch -> rebuild + 2nd peerTags() call), reconcile-during-report-2 (no change). - features.state() >>> ["state-1", "state-2", "state-2"] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + features.peerTags() >> ["country", "georegion"] + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() - when: "cycle 1 -- peerTags=[country]" - CountDownLatch latch1 = new CountDownLatch(1) + when: + CountDownLatch latch = new CountDownLatch(1) aggregator.publish([ new SimpleSpan("service", "operation", "resource", "type", true, false, false, 0, 100, HTTP_OK) - .setTag(SPAN_KIND, "client").setTag("country", "france").setTag("georegion", "europe") - ]) - aggregator.report() - def cycle1Triggered = latch1.await(2, SECONDS) - - and: "cycle 2 -- reconcile picks up peerTags=[country, georegion]" - CountDownLatch latch2 = new CountDownLatch(1) - aggregator.publish([ + .setTag(SPAN_KIND, "client").setTag("country", "france").setTag("georegion", "europe"), new SimpleSpan("service", "operation", "resource", "type", true, false, false, 0, 100, HTTP_OK) - .setTag(SPAN_KIND, "client").setTag("country", "france").setTag("georegion", "europe") + .setTag(SPAN_KIND, "client").setTag("country", "germany").setTag("georegion", "europe") ]) aggregator.report() - def cycle2Triggered = latch2.await(2, SECONDS) + def latchTriggered = latch.await(2, SECONDS) then: - cycle1Triggered - cycle2Triggered + latchTriggered + 1 * writer.startBucket(2, _, _) 1 * writer.add({ AggregateEntryTestUtils.equals(it, AggregateEntryTestUtils.of( @@ -311,7 +303,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { false, false, "client", - [UTF8BytesString.create("country:france")], + [UTF8BytesString.create("country:france"), UTF8BytesString.create("georegion:europe")], null, null, null @@ -331,7 +323,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { false, false, "client", - [UTF8BytesString.create("country:france"), UTF8BytesString.create("georegion:europe")], + [UTF8BytesString.create("country:germany"), UTF8BytesString.create("georegion:europe")], null, null, null @@ -339,7 +331,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { }) >> { AggregateEntry e -> assert e.getHitCount() == 1 && e.getTopLevelCount() == 0 && e.getDuration() == 100 } - 2 * writer.finishBucket() >> { latch1.countDown(); latch2.countDown() } + 1 * writer.finishBucket() >> { latch.countDown() } cleanup: aggregator.close() @@ -352,7 +344,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> ["peer.hostname", "_dd.base_service"] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() @@ -407,7 +399,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, features, HealthMetrics.NO_OP, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() @@ -461,7 +453,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) long duration = 100 List trace = [ @@ -537,7 +529,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, true) aggregator.start() @@ -672,7 +664,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, true) aggregator.start() @@ -795,7 +787,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, true) aggregator.start() @@ -869,7 +861,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() @@ -945,7 +937,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, maxAggregates, queueSize, reportingInterval, SECONDS, false) long duration = 100 aggregator.start() @@ -1015,7 +1007,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { features.supportsMetrics() >> true features.peerTags() >> [] HealthMetrics healthMetrics = Mock(HealthMetrics) - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, healthMetrics, sink, writer, maxAggregates, queueSize, reportingInterval, SECONDS, false) long duration = 100 aggregator.start() @@ -1049,7 +1041,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { features.supportsMetrics() >> true features.peerTags() >> [] HealthMetrics healthMetrics = Mock(HealthMetrics) - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, healthMetrics, sink, writer, maxAggregates, queueSize, reportingInterval, SECONDS, false) aggregator.start() @@ -1094,7 +1086,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, maxAggregates, queueSize, reportingInterval, SECONDS, false) long duration = 100 aggregator.start() @@ -1198,7 +1190,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, maxAggregates, queueSize, reportingInterval, SECONDS, false) long duration = 100 aggregator.start() @@ -1258,7 +1250,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, maxAggregates, queueSize, 1, SECONDS, false) long duration = 100 aggregator.start() @@ -1309,7 +1301,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, maxAggregates, queueSize, 1, SECONDS, false) long duration = 100 aggregator.start() @@ -1340,7 +1332,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { MetricWriter writer = Mock(MetricWriter) Sink sink = Stub(Sink) DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, maxAggregates, queueSize, 1, SECONDS, false) aggregator.start() @@ -1362,7 +1354,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> false features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, 200, MILLISECONDS, false) final spans = [ new SimpleSpan("service", "operation", "resource", "type", false, true, false, 0, 10, HTTP_OK) @@ -1394,7 +1386,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { Sink sink = Stub(Sink) DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, maxAggregates, queueSize, 1, SECONDS, false) when: @@ -1427,7 +1419,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { Sink sink = Stub(Sink) DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() @@ -1476,7 +1468,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() @@ -1533,7 +1525,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, true) aggregator.start() @@ -1630,7 +1622,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true features.peerTags() >> [] - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, + ClientStatsAggregator aggregator = new ClientStatsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() @@ -1709,14 +1701,14 @@ class ConflatingMetricAggregatorTest extends DDSpecification { aggregator.close() } - def reportAndWaitUntilEmpty(ConflatingMetricsAggregator aggregator) { + def reportAndWaitUntilEmpty(ClientStatsAggregator aggregator) { waitUntilEmpty(aggregator) aggregator.report() waitUntilEmpty(aggregator) } - def waitUntilEmpty(ConflatingMetricsAggregator aggregator) { + def waitUntilEmpty(ClientStatsAggregator aggregator) { int i = 0 while (!aggregator.inbox.isEmpty() && i++ < 100) { Thread.sleep(10) diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/FootprintForkedTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/FootprintForkedTest.groovy index eceedeb1935..86a91c23b3f 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/FootprintForkedTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/FootprintForkedTest.groovy @@ -37,7 +37,7 @@ class FootprintForkedTest extends DDSpecification { it.supportsMetrics() >> true it.peerTags() >> [] } - ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = new ClientStatsAggregator( new WellKnownTags("runtimeid","hostname", "env", "service", "version","language"), [].toSet() as Set, features, diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/MetricsAggregatorFactoryTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/MetricsAggregatorFactoryTest.groovy index 07f246bf9a9..dc9eb86fde3 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/MetricsAggregatorFactoryTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/MetricsAggregatorFactoryTest.groovy @@ -28,6 +28,6 @@ class MetricsAggregatorFactoryTest extends DDSpecification { expect: def aggregator = MetricsAggregatorFactory.createMetricsAggregator(config, sco, HealthMetrics.NO_OP, ) - assert aggregator instanceof ConflatingMetricsAggregator + assert aggregator instanceof ClientStatsAggregator } } diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SerializingMetricWriterTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SerializingMetricWriterTest.groovy index cc0880bc30a..080a77238e4 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SerializingMetricWriterTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SerializingMetricWriterTest.groovy @@ -181,7 +181,7 @@ class SerializingMetricWriterTest extends DDSpecification { def content = [e] - ValidatingSink sink = new ValidatingSink(wellKnownTags, startTime, duration, content) + ValidatingSink sink = new ValidatingSink(wellKnownTags, startTime, duration, content, shaCommit) SerializingMetricWriter writer = new SerializingMetricWriter(wellKnownTags, sink, 128, gitInfoProvider) when: @@ -231,13 +231,15 @@ class SerializingMetricWriterTest extends DDSpecification { private final long duration private boolean validated = false private List content + private final String expectedGitCommitSha ValidatingSink(WellKnownTags wellKnownTags, long startTimeNanos, long duration, - List content) { + List content, String expectedGitCommitSha = null) { this.wellKnownTags = wellKnownTags this.startTimeNanos = startTimeNanos this.duration = duration this.content = content + this.expectedGitCommitSha = expectedGitCommitSha } @Override @@ -248,7 +250,7 @@ class SerializingMetricWriterTest extends DDSpecification { void accept(int messageCount, ByteBuffer buffer) { MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(buffer) int mapSize = unpacker.unpackMapHeader() - String gitCommitSha = GitInfoProvider.INSTANCE.getGitInfo()?.getCommit()?.getSha() + String gitCommitSha = expectedGitCommitSha assert mapSize == (7 + (Config.get().isExperimentalPropagateProcessTagsEnabled() ? 1 : 0) + (gitCommitSha != null ? 1 : 0)) assert unpacker.unpackString() == "RuntimeID" @@ -283,12 +285,13 @@ class SerializingMetricWriterTest extends DDSpecification { int statCount = unpacker.unpackArrayHeader() assert statCount == content.size() for (AggregateEntry entry : content) { + // counters now live on AggregateEntry int metricMapSize = unpacker.unpackMapHeader() // Calculate expected map size based on optional fields - boolean hasHttpMethod = entry.getHttpMethod() != null - boolean hasHttpEndpoint = entry.getHttpEndpoint() != null - boolean hasServiceSource = entry.getServiceSource() != null - boolean hasGrpcStatusCode = entry.getGrpcStatusCode() != null + boolean hasHttpMethod = entry.hasHttpMethod() + boolean hasHttpEndpoint = entry.hasHttpEndpoint() + boolean hasServiceSource = entry.hasServiceSource() + boolean hasGrpcStatusCode = entry.hasGrpcStatusCode() int expectedMapSize = 15 + (hasServiceSource ? 1 : 0) + (hasHttpMethod ? 1 : 0) + (hasHttpEndpoint ? 1 : 0) + (hasGrpcStatusCode ? 1 : 0) assert metricMapSize == expectedMapSize int elementCount = 0 diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java index 7fd767533c7..dd70083d9d4 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java @@ -1,10 +1,11 @@ package datadog.trace.common.metrics; +import static datadog.trace.bootstrap.instrumentation.api.UTF8BytesString.EMPTY; import static datadog.trace.common.metrics.AggregateEntry.ERROR_TAG; import static datadog.trace.common.metrics.AggregateEntry.TOP_LEVEL_TAG; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertSame; import static org.junit.jupiter.api.Assertions.assertTrue; import datadog.metrics.agent.AgentMeter; @@ -13,10 +14,16 @@ import datadog.metrics.impl.MonitoringImpl; import java.util.concurrent.TimeUnit; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; class AggregateEntryTest { + @BeforeEach + void resetCardinalityHandlers() { + AggregateEntry.resetCardinalityHandlers(); + } + @BeforeAll static void initAgentMeter() { // recordOneDuration -> Histogram.accept needs AgentMeter to be initialized. @@ -84,82 +91,46 @@ void okAndErrorLatenciesTrackedSeparately() { } @Test - void testUtilsEqualsIsConsistentWithHashCodeAcrossDifferentSchemaLayouts() { - // Contract test for AggregateEntryTestUtils (the test-side equality helper used by Spock - // mock matchers). Production AggregateEntry has no equals override. - // - // Two entries with identical encoded peerTags but different raw layouts must not be equal, - // because hashOf folds in the raw arrays. Equality on the encoded list would let them - // collapse while their hashCodes differ -- violating the contract. - // - // A: schema ["a","b"], values [null,"x"] -> encoded ["b:x"] - // B: schema ["b","c"], values ["x",null] -> encoded ["b:x"] - AggregateEntry a = - AggregateEntryTestUtils.forSnapshot( - snapshotWithPeerTags(new String[] {"a", "b"}, new String[] {null, "x"})); - AggregateEntry b = - AggregateEntryTestUtils.forSnapshot( - snapshotWithPeerTags(new String[] {"b", "c"}, new String[] {"x", null})); - - // Sanity: same encoded peer tags, despite different raw layout. - assertEquals(a.getPeerTags(), b.getPeerTags()); - - // Different raw layouts -> entries must not be equal via the test helper. - assertFalse(AggregateEntryTestUtils.equals(a, b)); - // And different hashCodes (matching the inequality). - assertNotEquals(AggregateEntryTestUtils.hashCode(a), AggregateEntryTestUtils.hashCode(b)); + void absentOptionalFieldsResolveToEmptySentinel() { + // serviceSource / httpMethod / httpEndpoint / grpcStatusCode = null on input -> EMPTY on the + // entry. EMPTY is the universal "absent" sentinel; SerializingMetricWriter and equality use + // identity comparison against it. + AggregateEntry entry = newEntry(); + assertSame(EMPTY, entry.getServiceSource()); + assertSame(EMPTY, entry.getHttpMethod()); + assertSame(EMPTY, entry.getHttpEndpoint()); + assertSame(EMPTY, entry.getGrpcStatusCode()); } @Test - void testUtilsEqualEntriesHaveEqualHashCodes() { - AggregateEntry a = - AggregateEntryTestUtils.forSnapshot( - snapshotWithPeerTags(new String[] {"a", "b"}, new String[] {null, "x"})); - AggregateEntry b = - AggregateEntryTestUtils.forSnapshot( - snapshotWithPeerTags(new String[] {"a", "b"}, new String[] {null, "x"})); - - assertTrue(AggregateEntryTestUtils.equals(a, b)); - assertEquals(AggregateEntryTestUtils.hashCode(a), AggregateEntryTestUtils.hashCode(b)); - } - - private static SpanSnapshot snapshotWithPeerTags(String[] names, String[] values) { - return new SpanSnapshot( - "resource", - "svc", - "op", - null, - "type", - (short) 200, - false, - true, - "client", - PeerTagSchema.testSchema(names), - values, - null, - null, - null, - 0L); - } - - private static AggregateEntry newEntry() { - SpanSnapshot snapshot = - new SpanSnapshot( + void presentOptionalFieldsCarryTheirValue() { + AggregateEntry entry = + AggregateEntryTestUtils.of( "resource", "svc", "op", - null, + "src", "type", - (short) 200, + 200, false, true, "client", null, - null, - null, - null, - null, - 0L); - return AggregateEntryTestUtils.forSnapshot(snapshot); + "GET", + "/api/v1/foo", + "0"); + assertNotSame(EMPTY, entry.getServiceSource()); + assertNotSame(EMPTY, entry.getHttpMethod()); + assertNotSame(EMPTY, entry.getHttpEndpoint()); + assertNotSame(EMPTY, entry.getGrpcStatusCode()); + assertEquals("src", entry.getServiceSource().toString()); + assertEquals("GET", entry.getHttpMethod().toString()); + assertEquals("/api/v1/foo", entry.getHttpEndpoint().toString()); + assertEquals("0", entry.getGrpcStatusCode().toString()); + } + + private static AggregateEntry newEntry() { + return AggregateEntryTestUtils.of( + "resource", "svc", "op", null, "type", 200, false, true, "client", null, null, null, null); } } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java index ed6fd5a3a7e..a8ced924c3e 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTestUtils.java @@ -1,7 +1,7 @@ package datadog.trace.common.metrics; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; -import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Objects; import javax.annotation.Nullable; @@ -10,34 +10,26 @@ * Test-side helpers for {@link AggregateEntry}: a positional-args fixture factory plus a field-wise * equality contract for use with Spock mock argument matchers and JUnit assertions. Lives in {@code * src/test} so the production class stays free of test-only API; same {@code - * datadog.trace.common.metrics} package so this helper can reach package-private fields and - * constructors. + * datadog.trace.common.metrics} package so this helper can reach package-private members. * *

    Production {@code AggregateEntry} intentionally has no {@code equals}/{@code hashCode} - * override -- {@link AggregateTable} bucketing goes through {@link AggregateEntry#matches} keyed on - * {@link AggregateEntry#keyHash}, and no production code path invokes {@link Object#equals}. + * override -- {@link AggregateTable} bucketing goes through the {@code Canonical} scratch buffer + * keyed on {@link AggregateEntry#keyHash}, and no production code path invokes {@link + * Object#equals}. * - *

    The equality helper compares the raw {@code peerTagNames}/{@code peerTagValues} arrays (not - * the encoded {@code peerTags} list) so it stays consistent with {@link AggregateEntry#hashOf}, - * which folds in raw arrays via {@link PeerTagSchema#hashCode()} and {@link - * Arrays#hashCode(Object[])}. Comparing the encoded list would let two entries with different raw - * layouts (e.g. tag {@code "b"} at index 1 in schema A vs index 0 in schema B, with matching - * values) collapse to the same encoded form -- a real bug surfaced during PR #11382 review. + *

    Peer tags live as a single pre-encoded {@code List} on the entry + * (canonicalization through {@link PeerTagSchema#register} already collapsed identical values), so + * equality compares the list directly. The hash side (computed in {@link AggregateEntry#hashOf}) + * folds in the encoded list, so the contract stays consistent. */ public final class AggregateEntryTestUtils { private AggregateEntryTestUtils() {} /** - * Builds an {@link AggregateEntry} from the same positional shape the prior {@code new - * MetricKey(...)} took. Accepts a pre-encoded {@code List} of {@code - * "name:value"} peer tags and recovers the parallel-array {@code (names, values)} form by - * splitting on the {@code ':'} delimiter. - * - *

    Test-only. The split is at the first {@code ':'}, so peer-tag values - * containing a colon (URLs, IPv6 addresses, {@code service:env} patterns) will be silently - * misparsed and the recovered (name, value) pair will be wrong. Keep test data colon-free in - * peer-tag values, or wire a production-style snapshot through {@link #forSnapshot(SpanSnapshot)} - * directly instead. + * Builds an {@link AggregateEntry} from positional args. Bypasses the cardinality handlers so + * tests can create expected values without mutating shared handler state. Content-equal entries + * from {@link AggregateEntry.Canonical#createEntry} still compare equal via {@link + * #equals(AggregateEntry, AggregateEntry)}. */ public static AggregateEntry of( CharSequence resource, @@ -53,49 +45,48 @@ public static AggregateEntry of( @Nullable CharSequence httpMethod, @Nullable CharSequence httpEndpoint, @Nullable CharSequence grpcStatusCode) { - PeerTagSchema schema = null; - String[] values = null; - if (peerTags != null && !peerTags.isEmpty()) { - String[] names = new String[peerTags.size()]; - values = new String[peerTags.size()]; - int i = 0; - for (UTF8BytesString t : peerTags) { - String s = t.toString(); - int colon = s.indexOf(':'); - names[i] = colon < 0 ? s : s.substring(0, colon); - values[i] = colon < 0 ? "" : s.substring(colon + 1); - i++; - } - schema = PeerTagSchema.testSchema(names); - } - SpanSnapshot syntheticSnapshot = - new SpanSnapshot( - resource, - service == null ? null : service.toString(), - operationName, - serviceSource, - type, + UTF8BytesString resourceUtf = AggregateEntry.createUtf8(resource); + UTF8BytesString serviceUtf = AggregateEntry.createUtf8(service); + UTF8BytesString operationNameUtf = AggregateEntry.createUtf8(operationName); + UTF8BytesString serviceSourceUtf = AggregateEntry.createUtf8(serviceSource); + UTF8BytesString typeUtf = AggregateEntry.createUtf8(type); + UTF8BytesString spanKindUtf = AggregateEntry.createUtf8(spanKind); + UTF8BytesString httpMethodUtf = AggregateEntry.createUtf8(httpMethod); + UTF8BytesString httpEndpointUtf = AggregateEntry.createUtf8(httpEndpoint); + UTF8BytesString grpcUtf = AggregateEntry.createUtf8(grpcStatusCode); + List peerTagsList = peerTags == null ? Collections.emptyList() : peerTags; + UTF8BytesString[] peerTagsArr = peerTagsList.toArray(new UTF8BytesString[0]); + long keyHash = + AggregateEntry.hashOf( + resourceUtf, + serviceUtf, + operationNameUtf, + serviceSourceUtf, + typeUtf, + spanKindUtf, + httpMethodUtf, + httpEndpointUtf, + grpcUtf, (short) httpStatusCode, synthetic, traceRoot, - spanKind == null ? null : spanKind.toString(), - schema, - values, - httpMethod == null ? null : httpMethod.toString(), - httpEndpoint == null ? null : httpEndpoint.toString(), - grpcStatusCode == null ? null : grpcStatusCode.toString(), - 0L); - return forSnapshot(syntheticSnapshot); - } - - /** - * Builds an {@link AggregateEntry} from {@code s} by computing its lookup hash via {@link - * AggregateEntry#hashOf(SpanSnapshot)} and calling the package-private constructor directly. - * Production callers route through {@link AggregateTable#findOrInsert} which already has the - * {@code keyHash} on hand; tests rarely do, so this helper hides the second argument. - */ - public static AggregateEntry forSnapshot(SpanSnapshot s) { - return new AggregateEntry(s, AggregateEntry.hashOf(s)); + peerTagsArr, + peerTagsArr.length); + return new AggregateEntry( + keyHash, + resourceUtf, + serviceUtf, + operationNameUtf, + serviceSourceUtf, + typeUtf, + spanKindUtf, + httpMethodUtf, + httpEndpointUtf, + grpcUtf, + (short) httpStatusCode, + synthetic, + traceRoot, + peerTagsList); } /** @@ -114,8 +105,7 @@ public static boolean equals(AggregateEntry a, AggregateEntry b) { && Objects.equals(a.getServiceSource(), b.getServiceSource()) && Objects.equals(a.getType(), b.getType()) && Objects.equals(a.getSpanKind(), b.getSpanKind()) - && Arrays.equals(a.peerTagNames, b.peerTagNames) - && Arrays.equals(a.peerTagValues, b.peerTagValues) + && a.getPeerTags().equals(b.getPeerTags()) && Objects.equals(a.getHttpMethod(), b.getHttpMethod()) && Objects.equals(a.getHttpEndpoint(), b.getHttpEndpoint()) && Objects.equals(a.getGrpcStatusCode(), b.getGrpcStatusCode()); @@ -123,8 +113,8 @@ public static boolean equals(AggregateEntry a, AggregateEntry b) { /** * Stable hash matching {@link #equals(AggregateEntry, AggregateEntry)} -- derived from {@link - * AggregateEntry#keyHash}, which {@link AggregateEntry#hashOf} computes from the same raw fields - * the helper's {@code equals} compares. + * AggregateEntry#keyHash}, which {@link AggregateEntry#hashOf} computes from the same fields the + * helper's {@code equals} compares. */ public static int hashCode(AggregateEntry e) { return e == null ? 0 : (int) e.keyHash; diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java index 618ead2ab43..05acd57985d 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java @@ -19,6 +19,7 @@ import java.util.Map; import java.util.concurrent.TimeUnit; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; class AggregateTableTest { @@ -31,6 +32,15 @@ static void initAgentMeter() { monitoring.newTimer("test.init"); } + @BeforeEach + void resetCardinalityHandlers() { + // AggregateEntry's property handlers are static and accumulate state across tests. Some tests + // in this class (e.g. backToBackEvictionsAllSucceed) drive 40 distinct services, which exceeds + // MetricCardinalityLimits.SERVICE (32) and leaves later tests seeing a shared "blocked" + // canonical for "a"/"b"/"c"-style services -- collapsing distinct snapshots into one entry. + AggregateEntry.resetCardinalityHandlers(); + } + @Test void insertOnMissReturnsNewAggregate() { AggregateTable table = new AggregateTable(8); @@ -337,7 +347,7 @@ SnapshotBuilder peerTags(String... namesAndValues) { names[i] = namesAndValues[2 * i]; values[i] = namesAndValues[2 * i + 1]; } - this.peerTagSchema = PeerTagSchema.testSchema(names); + this.peerTagSchema = new PeerTagSchema(names, PeerTagSchema.NO_STATE); this.peerTagValues = values; return this; } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java new file mode 100644 index 00000000000..5a882aba11b --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/CardinalityHandlerTest.java @@ -0,0 +1,260 @@ +package datadog.trace.common.metrics; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertSame; + +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import org.junit.jupiter.api.Test; + +class CardinalityHandlerTest { + + @Test + void propertyReturnsSameInstanceForRepeatedValueUntilLimit() { + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 3); + UTF8BytesString a1 = h.register("a"); + UTF8BytesString a2 = h.register("a"); + assertSame(a1, a2); + assertEquals("a", a1.toString()); + } + + @Test + void propertyOverLimitReturnsBlockedSentinel() { + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 2); + UTF8BytesString a = h.register("a"); + UTF8BytesString b = h.register("b"); + UTF8BytesString blocked1 = h.register("c"); + UTF8BytesString blocked2 = h.register("d"); + + assertEquals("tracer_blocked_value", blocked1.toString()); + assertSame(blocked1, blocked2); // same sentinel for all overflow values + assertNotSame(blocked1, a); + assertNotSame(blocked1, b); + } + + @Test + void propertyResetRefreshesBudget() { + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 2); + h.register("a"); + h.register("b"); + UTF8BytesString blocked = h.register("c"); + assertEquals("tracer_blocked_value", blocked.toString()); + + h.reset(); + + // After reset, three distinct values fit again. Prior-cycle instances are reused + // (see propertyPriorCycleInstancesAreReusedAcrossReset for the dedicated check); here + // we just confirm that the budget refreshed so values previously blocked now have + // a slot. + UTF8BytesString afterReset = h.register("a"); + assertEquals("a", afterReset.toString()); + UTF8BytesString c = h.register("c"); + assertEquals("c", c.toString()); + UTF8BytesString blockedAgain = h.register("d"); + UTF8BytesString blockedYetAgain = h.register("e"); + assertEquals("tracer_blocked_value", blockedAgain.toString()); + assertSame(blockedAgain, blockedYetAgain); + } + + @Test + void propertyPriorCycleInstancesAreReusedAcrossReset() { + // Dual role: the handler is also a UTF8 cache. Values held in the prior cycle are + // reused on the first registration in the new cycle, so aggregate entries that hold a + // reference to a UTF8BytesString still match on identity after the per-cycle reset. + // This is the cache-survives-reset property the canonical-key lookup depends on. + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 4); + UTF8BytesString aBefore = h.register("a"); + UTF8BytesString bBefore = h.register("b"); + + h.reset(); + + assertSame(aBefore, h.register("a")); + assertSame(bBefore, h.register("b")); + // Same-cycle subsequent registration continues to return the reused instance. + assertSame(aBefore, h.register("a")); + } + + @Test + void propertyPriorCycleReuseSurvivesOneResetButNotTwo() { + // Reuse window is one cycle deep -- the handler swaps current/prior on reset, so a + // value last seen two cycles ago is no longer cached and will be re-allocated. + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 4); + UTF8BytesString first = h.register("a"); + + h.reset(); + h.reset(); + + UTF8BytesString afterTwoResets = h.register("a"); + assertNotSame(first, afterTwoResets); + assertEquals("a", afterTwoResets.toString()); + } + + @Test + void tagPrefixesValuesAndReusesUnderLimit() { + TagCardinalityHandler h = new TagCardinalityHandler("peer.hostname", 4); + UTF8BytesString first = h.register("host-a"); + UTF8BytesString second = h.register("host-a"); + UTF8BytesString other = h.register("host-b"); + + assertSame(first, second); + assertNotSame(first, other); + assertEquals("peer.hostname:host-a", first.toString()); + assertEquals("peer.hostname:host-b", other.toString()); + } + + @Test + void tagOverLimitReturnsTaggedSentinel() { + TagCardinalityHandler h = new TagCardinalityHandler("peer.service", 1); + h.register("svc-1"); + UTF8BytesString blocked = h.register("svc-2"); + assertEquals("peer.service:tracer_blocked_value", blocked.toString()); + } + + @Test + void tagResetRefreshesBudgetAndSentinelStaysStable() { + TagCardinalityHandler h = new TagCardinalityHandler("x", 1); + h.register("v1"); + UTF8BytesString blockedBefore = h.register("v2"); + h.reset(); + h.register("v1"); + UTF8BytesString blockedAfter = h.register("v2"); + // Both are the same sentinel instance (cacheBlocked is not cleared on reset). + assertSame(blockedBefore, blockedAfter); + } + + @Test + void tagPriorCycleInstancesAreReusedAcrossReset() { + // Mirrors propertyPriorCycleInstancesAreReusedAcrossReset: the pre-built "tag:value" + // UTF8BytesString from the prior cycle is reused on the first registration in the new + // cycle -- no re-concatenation, no re-encoding. + TagCardinalityHandler h = new TagCardinalityHandler("peer.hostname", 4); + UTF8BytesString hostABefore = h.register("host-a"); + UTF8BytesString hostBBefore = h.register("host-b"); + + h.reset(); + + assertSame(hostABefore, h.register("host-a")); + assertSame(hostBBefore, h.register("host-b")); + } + + @Test + void propertyRegisterOfNullReturnsEmpty() { + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 4); + // Null input short-circuits to UTF8BytesString.EMPTY -- the universal "absent" sentinel that + // AggregateEntry's optional UTF8 fields use in place of null. + assertSame(UTF8BytesString.EMPTY, h.register(null)); + } + + @Test + void propertyRegisterOfNullDoesNotConsumeBudget() { + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 2); + h.register(null); + h.register(null); + h.register(null); + // Three null registrations didn't consume the budget; two real values still fit. + assertEquals("a", h.register("a").toString()); + assertEquals("b", h.register("b").toString()); + // Third real value spills to the blocked sentinel (limit = 2). + assertEquals("tracer_blocked_value", h.register("c").toString()); + } + + @Test + void tagRegisterOfNullReturnsEmpty() { + TagCardinalityHandler h = new TagCardinalityHandler("peer.hostname", 4); + // Null returns EMPTY (no "tag:" prefix applied -- the sentinel is the same EMPTY singleton + // every handler returns for null input). + assertSame(UTF8BytesString.EMPTY, h.register(null)); + } + + @Test + void tagRegisterOfNullDoesNotConsumeBudget() { + TagCardinalityHandler h = new TagCardinalityHandler("peer.hostname", 2); + h.register(null); + h.register(null); + h.register(null); + // Three null registrations didn't consume the budget; two real values still fit. + assertEquals("peer.hostname:a", h.register("a").toString()); + assertEquals("peer.hostname:b", h.register("b").toString()); + // Third real value spills to the blocked sentinel (limit = 2). + assertEquals("peer.hostname:tracer_blocked_value", h.register("c").toString()); + } + + @Test + void propertyResetReturnsBlockedCount() { + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 1); + h.register("a"); // within limit + h.register("b"); // blocked + h.register("c"); // blocked + assertEquals(2, h.reset()); + assertEquals(0, h.reset()); // no blocks in the empty new cycle + } + + @Test + void tagResetReturnsBlockedCount() { + TagCardinalityHandler h = new TagCardinalityHandler("peer.hostname", 1); + h.register("a"); // within limit + h.register("b"); // blocked + h.register("c"); // blocked + assertEquals(2, h.reset()); + assertEquals(0, h.reset()); // no blocks in the empty new cycle + } + + // ---- limits-disabled mode (Config flag off): cache size still capped, but over-cap values + // get freshly-allocated UTF8 rather than the blocked sentinel. + + @Test + void propertyOverLimitWithSentinelDisabledReturnsFreshUtf8() { + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 2, false); + UTF8BytesString a = h.register("a"); + UTF8BytesString b = h.register("b"); + UTF8BytesString c = h.register("c"); + UTF8BytesString d = h.register("d"); + + // Real values (not the "tracer_blocked_value" sentinel) so the wire format carries them. + assertEquals("c", c.toString()); + assertEquals("d", d.toString()); + // The first two stay cached and identity-stable. + assertSame(a, h.register("a")); + assertSame(b, h.register("b")); + // Over-cap values are NOT cached -- a second call allocates a fresh instance. + assertNotSame(c, h.register("c")); + assertEquals("c", h.register("c").toString()); + } + + @Test + void propertyOverLimitWithSentinelDisabledReusesPriorCycleInstances() { + // Prior-cycle reuse runs in disabled mode too: a value that was seen last cycle but is now + // over-budget still gets its prior-cycle UTF8BytesString back instead of an allocation. + PropertyCardinalityHandler h = new PropertyCardinalityHandler("test", 2, false); + UTF8BytesString cBeforeReset = h.register("c"); + + h.reset(); + + // Fill the budget with two different values so "c" lands over-cap. + h.register("x"); + h.register("y"); + UTF8BytesString cAfterReset = h.register("c"); + assertSame(cBeforeReset, cAfterReset); + } + + @Test + void tagOverLimitWithSentinelDisabledReturnsFreshUtf8() { + TagCardinalityHandler h = new TagCardinalityHandler("peer.hostname", 1, false); + h.register("host-a"); + UTF8BytesString hostB = h.register("host-b"); + UTF8BytesString hostC = h.register("host-c"); + + assertEquals("peer.hostname:host-b", hostB.toString()); + assertEquals("peer.hostname:host-c", hostC.toString()); + } + + @Test + void tagOverLimitWithSentinelDisabledNeverSubstitutesBlockedSentinel() { + // The sentinel should never materialize in disabled mode -- over-cap values carry their real + // "tag:value" content rather than the blocked sentinel. + TagCardinalityHandler h = new TagCardinalityHandler("peer.service", 1, false); + h.register("svc-1"); + UTF8BytesString overCap = h.register("svc-2"); + assertEquals("peer.service:svc-2", overCap.toString()); + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ClientStatsAggregatorBootstrapTest.java similarity index 95% rename from dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java rename to dd-trace-core/src/test/java/datadog/trace/common/metrics/ClientStatsAggregatorBootstrapTest.java index 59681d4724e..758f5db9910 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ClientStatsAggregatorBootstrapTest.java @@ -4,7 +4,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; @@ -12,7 +11,6 @@ import static org.mockito.Mockito.when; import datadog.communication.ddagent.DDAgentFeaturesDiscovery; -import datadog.trace.bootstrap.instrumentation.api.Tags; import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.core.CoreSpan; import datadog.trace.core.SpanKindFilter; @@ -26,8 +24,7 @@ import org.mockito.ArgumentCaptor; /** - * Coverage for the {@code ConflatingMetricsAggregator} peer-tag schema bootstrap and reconcile - * paths. + * Coverage for the {@code ClientStatsAggregator} peer-tag schema bootstrap and reconcile paths. * *

      *
    • {@link #bootstrapHappensOnceOnFirstPublish()} -- verifies the synchronized producer-side @@ -43,7 +40,7 @@ * publishes see the new tags. *
    */ -class ConflatingMetricsAggregatorBootstrapTest { +class ClientStatsAggregatorBootstrapTest { @Test void bootstrapHappensOnceOnFirstPublish() { @@ -57,8 +54,8 @@ void bootstrapHappensOnceOnFirstPublish() { when(features.peerTags()).thenReturn(Collections.singleton("peer.hostname")); when(features.state()).thenReturn("state-1"); - ConflatingMetricsAggregator aggregator = - new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = + new ClientStatsAggregator( Collections.emptySet(), features, healthMetrics, @@ -94,8 +91,8 @@ void reconcileSkipsDeepCompareWhenStateMatches() throws Exception { when(features.peerTags()).thenReturn(Collections.singleton("peer.hostname")); when(features.state()).thenReturn("state-1"); - ConflatingMetricsAggregator aggregator = - new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = + new ClientStatsAggregator( Collections.emptySet(), features, healthMetrics, @@ -163,8 +160,8 @@ void reconcileSurvivesStateChangeWhenTagsUnchanged() throws Exception { // State hash changes every reconcile -- forces reconcile into the slow path each time. when(features.state()).thenReturn("state-1", "state-2", "state-3"); - ConflatingMetricsAggregator aggregator = - new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = + new ClientStatsAggregator( Collections.emptySet(), features, healthMetrics, @@ -236,8 +233,8 @@ void reconcileSwapsSchemaWhenTagSetChanges() throws Exception { // (mismatch -> slow path), stable at "state-2" for cycle 2's reconcile (match -> fast path). when(features.state()).thenReturn("state-1", "state-2", "state-2"); - ConflatingMetricsAggregator aggregator = - new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = + new ClientStatsAggregator( Collections.emptySet(), features, healthMetrics, @@ -324,7 +321,7 @@ private static CoreSpan peerAggregationSpan() { when(span.getHttpStatusCode()).thenReturn((short) 200); when(span.getParentId()).thenReturn(0L); when(span.getOrigin()).thenReturn(null); - when(span.unsafeGetTag(eq(Tags.SPAN_KIND), any(CharSequence.class))).thenReturn("client"); + when(span.getSpanKindString()).thenReturn("client"); // peer.hostname tag is set so capturePeerTagValues fires for the bootstrapped schema. when(span.unsafeGetTag("peer.hostname")).thenReturn("localhost"); return span; diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDisableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ClientStatsAggregatorDisableTest.java similarity index 95% rename from dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDisableTest.java rename to dd-trace-core/src/test/java/datadog/trace/common/metrics/ClientStatsAggregatorDisableTest.java index d072371d25d..3239cca4e90 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDisableTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ClientStatsAggregatorDisableTest.java @@ -6,7 +6,6 @@ import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.anyLong; -import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.after; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.reset; @@ -15,7 +14,6 @@ import static org.mockito.Mockito.when; import datadog.communication.ddagent.DDAgentFeaturesDiscovery; -import datadog.trace.bootstrap.instrumentation.api.Tags; import datadog.trace.core.CoreSpan; import datadog.trace.core.SpanKindFilter; import datadog.trace.core.monitor.HealthMetrics; @@ -45,7 +43,7 @@ * (the pre-fix path) only via races -- not deterministically -- so the assertions here are about * the observable end-to-end shape rather than thread identity. */ -class ConflatingMetricsAggregatorDisableTest { +class ClientStatsAggregatorDisableTest { @Test void downgradeRoutesClearThroughInboxBeforeNextReport() throws Exception { @@ -57,8 +55,8 @@ void downgradeRoutesClearThroughInboxBeforeNextReport() throws Exception { when(features.peerTags()).thenReturn(Collections.emptySet()); when(features.state()).thenReturn("state-1"); - ConflatingMetricsAggregator aggregator = - new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = + new ClientStatsAggregator( Collections.emptySet(), features, healthMetrics, @@ -147,8 +145,8 @@ void clearDoesNotTrampleQueuedStopSignal() throws Exception { when(features.peerTags()).thenReturn(Collections.emptySet()); when(features.state()).thenReturn("state-1"); - ConflatingMetricsAggregator aggregator = - new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = + new ClientStatsAggregator( Collections.emptySet(), features, healthMetrics, @@ -203,7 +201,7 @@ private static CoreSpan metricsEligibleSpan() { when(span.getHttpStatusCode()).thenReturn((short) 200); when(span.getParentId()).thenReturn(0L); when(span.getOrigin()).thenReturn(null); - when(span.unsafeGetTag(eq(Tags.SPAN_KIND), any(CharSequence.class))).thenReturn("client"); + when(span.getSpanKindString()).thenReturn("client"); return span; } @@ -228,7 +226,7 @@ private static CoreSpan markerSpan() { when(span.getHttpStatusCode()).thenReturn((short) 200); when(span.getParentId()).thenReturn(0L); when(span.getOrigin()).thenReturn(null); - when(span.unsafeGetTag(eq(Tags.SPAN_KIND), any(CharSequence.class))).thenReturn("client"); + when(span.getSpanKindString()).thenReturn("client"); return span; } } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorInboxFullTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ClientStatsAggregatorInboxFullTest.java similarity index 87% rename from dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorInboxFullTest.java rename to dd-trace-core/src/test/java/datadog/trace/common/metrics/ClientStatsAggregatorInboxFullTest.java index f4e4c2da253..3683bef7f42 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorInboxFullTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ClientStatsAggregatorInboxFullTest.java @@ -2,14 +2,12 @@ import static java.util.concurrent.TimeUnit.SECONDS; import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import datadog.communication.ddagent.DDAgentFeaturesDiscovery; -import datadog.trace.bootstrap.instrumentation.api.Tags; import datadog.trace.core.CoreSpan; import datadog.trace.core.SpanKindFilter; import datadog.trace.core.monitor.HealthMetrics; @@ -17,12 +15,12 @@ import org.junit.jupiter.api.Test; /** - * Coverage for the inbox-full fast-path in {@code ConflatingMetricsAggregator.publish}: when the + * Coverage for the inbox-full fast-path in {@code ClientStatsAggregator.publish}: when the * producer-side inbox is at capacity, the next {@code publish} call short-circuits before any tag * extraction or {@code SpanSnapshot} allocation and reports {@code onStatsInboxFull()} to health * metrics. */ -class ConflatingMetricsAggregatorInboxFullTest { +class ClientStatsAggregatorInboxFullTest { @Test void publishFiresOnStatsInboxFullOnceInboxIsAtCapacity() { @@ -38,8 +36,8 @@ void publishFiresOnStatsInboxFullOnceInboxIsAtCapacity() { // never drains -- snapshots accumulate in the inbox until capacity, then the next publish hits // the size-vs-capacity fast path. int queueSize = 8; - ConflatingMetricsAggregator aggregator = - new ConflatingMetricsAggregator( + ClientStatsAggregator aggregator = + new ClientStatsAggregator( Collections.emptySet(), features, healthMetrics, @@ -78,7 +76,7 @@ private static CoreSpan metricsEligibleSpan() { when(span.getHttpStatusCode()).thenReturn((short) 200); when(span.getParentId()).thenReturn(0L); when(span.getOrigin()).thenReturn(null); - when(span.unsafeGetTag(eq(Tags.SPAN_KIND), any(CharSequence.class))).thenReturn("client"); + when(span.getSpanKindString()).thenReturn("client"); return span; } } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java index 7d818a2686b..dd5c58c3650 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java @@ -3,9 +3,9 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; +import datadog.trace.core.monitor.HealthMetrics; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; @@ -13,20 +13,15 @@ import java.util.Set; import org.junit.jupiter.api.Test; -/** - * Unit tests for {@link PeerTagSchema}. Covers the {@link PeerTagSchema#hasSameTagsAs(Set)} - * predicate that drives the aggregator's reconcile fast/slow path split, the factory shapes, and - * the {@link PeerTagSchema#INTERNAL} singleton. - */ class PeerTagSchemaTest { @Test void ofBuildsSchemaFromSetWithState() { Set tags = new LinkedHashSet<>(Arrays.asList("peer.hostname", "peer.service")); - PeerTagSchema schema = PeerTagSchema.of(tags, "abc123"); + PeerTagSchema schema = PeerTagSchema.of(tags, "state-1234"); assertArrayEquals(new String[] {"peer.hostname", "peer.service"}, schema.names); - assertEquals("abc123", schema.state); + assertEquals("state-1234", schema.state); assertEquals(2, schema.size()); } @@ -36,7 +31,6 @@ void ofHandlesEmptySet() { assertEquals(0, schema.size()); assertEquals(0, schema.names.length); - assertNull(schema.state); } @Test @@ -90,4 +84,35 @@ void hasSameTagsAsHandlesEmpty() { assertTrue(empty.hasSameTagsAs(Collections.emptySet())); assertFalse(empty.hasSameTagsAs(Collections.singleton("peer.hostname"))); } + + @Test + void resetHandlersReportsBlockedCountToHealthMetrics() { + // Build a schema then replace its handler with a sentinel-mode instance at a low limit. + // (Production schemas use AggregateEntry.LIMITS_ENABLED which is currently false; this test + // exercises the reportingpath directly so it stays valid before and after the flag flips.) + PeerTagSchema schema = + new PeerTagSchema(new String[] {"peer.hostname"}, PeerTagSchema.NO_STATE); + schema.handlers[0] = new TagCardinalityHandler("peer.hostname", 1, true); + + schema.register(0, "host-a"); // within limit + schema.register(0, "host-b"); // blocked + schema.register(0, "host-c"); // blocked + + long[] recorded = {0}; + HealthMetrics hm = + new HealthMetrics() { + @Override + public void onTagCardinalityBlocked(String[] tag, long count) { + recorded[0] += count; + } + }; + + schema.resetHandlers(hm); + assertEquals(2, recorded[0]); + + // After the reset, no new values were registered so the next reset reports nothing. + recorded[0] = 0; + schema.resetHandlers(hm); + assertEquals(0, recorded[0]); + } } diff --git a/dd-trace-core/src/test/java/datadog/trace/core/monitor/HealthMetricsTest.java b/dd-trace-core/src/test/java/datadog/trace/core/monitor/HealthMetricsTest.java index 2f9ac1ea7da..03f2384f179 100644 --- a/dd-trace-core/src/test/java/datadog/trace/core/monitor/HealthMetricsTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/core/monitor/HealthMetricsTest.java @@ -387,13 +387,16 @@ void testOnLongRunningUpdate() throws InterruptedException { @Test void testOnStatsAggregateDropped() throws InterruptedException { - CountDownLatch latch = new CountDownLatch(1); + // onStatsAggregateDropped emits two statsd calls: an immediate collapsed_spans count and the + // periodic stats.dropped_aggregates report — wait for both before verifying. + CountDownLatch latch = new CountDownLatch(2); try (TracerHealthMetrics healthMetrics = new TracerHealthMetrics(new Latched(statsD, latch), 100, TimeUnit.MILLISECONDS)) { healthMetrics.start(); healthMetrics.onStatsAggregateDropped(); assertTrue(latch.await(5, TimeUnit.SECONDS)); } + verify(statsD).count("datadog.tracer.stats.collapsed_spans", 1L, "collapsed:whole_key"); verify(statsD).count("stats.dropped_aggregates", 1L, "reason:lru_eviction"); verifyNoMoreInteractions(statsD); } diff --git a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy b/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy deleted file mode 100644 index 401aaef4b7e..00000000000 --- a/dd-trace-core/src/traceAgentTest/groovy/datadog/trace/common/metrics/MetricsIntegrationTest.groovy +++ /dev/null @@ -1,75 +0,0 @@ -package datadog.trace.common.metrics - -import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V06_METRICS_ENDPOINT -import static datadog.trace.common.metrics.EventListener.EventType.OK -import static java.util.concurrent.TimeUnit.SECONDS - -import datadog.communication.http.OkHttpUtils -import datadog.metrics.api.Histograms -import datadog.metrics.impl.DDSketchHistograms -import datadog.trace.api.Config -import datadog.trace.api.WellKnownTags -import java.util.concurrent.CopyOnWriteArrayList -import java.util.concurrent.CountDownLatch -import okhttp3.HttpUrl - -class MetricsIntegrationTest extends AbstractTraceAgentTest { - - def setupSpec() { - // Initialize metrics-lib histograms to register the DDSketch implementation - Histograms.register(DDSketchHistograms.FACTORY) - } - - def "send metrics to trace agent should notify with OK event"() { - setup: - def latch = new CountDownLatch(1) - def listener = new BlockingListener(latch) - def agentUrl = Config.get().getAgentUrl() - OkHttpSink sink = new OkHttpSink(OkHttpUtils.buildHttpClient(HttpUrl.parse(agentUrl), 5000L), agentUrl, V06_METRICS_ENDPOINT, true, false, [:]) - sink.register(listener) - - when: - SerializingMetricWriter writer = new SerializingMetricWriter( - new WellKnownTags("runtimeid","hostname", "env", "service", "version","language"), - sink - ) - writer.startBucket(2, System.nanoTime(), SECONDS.toNanos(10)) - // Build entries via the production AggregateEntry.forSnapshot(snap, keyHash) path -- same - // construction as AggregateTable.findOrInsert. Both entries use one peer tag (grault:quux) - // -> schema names=["grault"], values=["quux"]. - PeerTagSchema schema = PeerTagSchema.testSchema(["grault"] as String[]) - SpanSnapshot snap1 = new SpanSnapshot( - "resource1", "service1", "operation1", null, "sql", (short) 0, - false, true, "xyzzy", schema, ["quux"] as String[], null, null, null, 0L) - def entry1 = new AggregateEntry(snap1, AggregateEntry.hashOf(snap1)) - [2, 1, 2, 250, 4].each { entry1.recordOneDuration(it as long) } - writer.add(entry1) - SpanSnapshot snap2 = new SpanSnapshot( - "resource2", "service2", "operation2", null, "web", (short) 200, - false, true, "xyzzy", schema, ["quux"] as String[], null, null, null, 0L) - def entry2 = new AggregateEntry(snap2, AggregateEntry.hashOf(snap2)) - [1, 1, 200, 2, 3, 4, 5, 6, 7, 8].each { entry2.recordOneDuration(it as long) } - writer.add(entry2) - writer.finishBucket() - - then: - latch.await(5, SECONDS) - listener.events.size() == 1 && listener.events[0] == OK - } - - static class BlockingListener implements EventListener { - - List events = new CopyOnWriteArrayList<>() - final CountDownLatch latch - - BlockingListener(CountDownLatch latch) { - this.latch = latch - } - - @Override - void onEvent(EventType eventType, String message) { - events.add(eventType) - latch.countDown() - } - } -} diff --git a/dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java b/dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java new file mode 100644 index 00000000000..ed28085a39a --- /dev/null +++ b/dd-trace-core/src/traceAgentTest/java/datadog/trace/common/metrics/MetricsIntegrationTest.java @@ -0,0 +1,187 @@ +package datadog.trace.common.metrics; + +import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V06_METRICS_ENDPOINT; +import static datadog.trace.junit.utils.config.WithConfigExtension.injectSysConfig; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.communication.http.OkHttpUtils; +import datadog.metrics.agent.AgentMeter; +import datadog.metrics.api.statsd.StatsDClient; +import datadog.metrics.impl.DDSketchHistograms; +import datadog.metrics.impl.MonitoringImpl; +import datadog.trace.api.Config; +import datadog.trace.api.ConfigDefaults; +import datadog.trace.api.WellKnownTags; +import datadog.trace.api.config.TracerConfig; +import datadog.trace.junit.utils.config.WithConfigExtension; +import java.time.Duration; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import okhttp3.HttpUrl; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.startupcheck.MinimumDurationRunningStartupCheckStrategy; + +@ExtendWith(WithConfigExtension.class) +class MetricsIntegrationTest { + + // CI runs an agent container alongside the build (reached via CI_AGENT_HOST); when building + // locally we start one ourselves with testcontainers. + private static final boolean RUNNING_IN_CI = "true".equals(System.getenv("CI")); + private static GenericContainer agentContainer; + + @BeforeAll + static void setupSpec() { + // recordOneDuration -> Histogram.accept needs the metrics meter / histogram factory registered. + MonitoringImpl monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS); + AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY); + + if (!RUNNING_IN_CI) { + Map env = new HashMap<>(); + env.put("DD_APM_ENABLED", "true"); + env.put("DD_BIND_HOST", "0.0.0.0"); + env.put("DD_API_KEY", "invalid_key_but_this_is_fine"); + env.put("DD_HOSTNAME", "doesnotexist"); + env.put("DD_LOGS_STDOUT", "yes"); + agentContainer = + new GenericContainer<>("datadog/agent:7.40.1") + .withEnv(env) + .withExposedPorts(ConfigDefaults.DEFAULT_TRACE_AGENT_PORT) + .withStartupTimeout(Duration.ofSeconds(120)) + // Sleep for a bit so the agent's rate_by_service response is populated -- mirrors the + // race-condition workaround from the original Spock base. + .withStartupCheckStrategy( + new MinimumDurationRunningStartupCheckStrategy(Duration.ofSeconds(10))); + agentContainer.start(); + } + } + + @AfterAll + static void cleanupSpec() { + if (agentContainer != null) { + agentContainer.stop(); + } + } + + @BeforeEach + void setup() { + AggregateEntry.resetCardinalityHandlers(); + injectSysConfig(TracerConfig.AGENT_HOST, agentHost()); + injectSysConfig(TracerConfig.TRACE_AGENT_PORT, agentPort()); + } + + private static String agentHost() { + return agentContainer != null ? agentContainer.getHost() : System.getenv("CI_AGENT_HOST"); + } + + private static String agentPort() { + return agentContainer != null + ? String.valueOf(agentContainer.getMappedPort(ConfigDefaults.DEFAULT_TRACE_AGENT_PORT)) + : String.valueOf(ConfigDefaults.DEFAULT_TRACE_AGENT_PORT); + } + + @Test + void sendMetricsToTraceAgentShouldNotifyWithOkEvent() throws InterruptedException { + // setup + CountDownLatch latch = new CountDownLatch(1); + BlockingListener listener = new BlockingListener(latch); + String agentUrl = Config.get().getAgentUrl(); + OkHttpSink sink = + new OkHttpSink( + OkHttpUtils.buildHttpClient(HttpUrl.parse(agentUrl), 5000L), + agentUrl, + V06_METRICS_ENDPOINT, + true, + false, + Collections.emptyMap()); + sink.register(listener); + + // when + SerializingMetricWriter writer = + new SerializingMetricWriter( + new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language"), + sink); + writer.startBucket(2, System.nanoTime(), SECONDS.toNanos(10)); + // Build entries through the production AggregateTable.findOrInsert path (canonicalizes the + // snapshot and creates/looks up the entry). Both entries use one peer tag (grault:quux) and no + // additional tags -> schema names=["grault"], values=["quux"]. + AggregateTable table = new AggregateTable(8); + PeerTagSchema schema = new PeerTagSchema(new String[] {"grault"}, PeerTagSchema.NO_STATE); + SpanSnapshot snap1 = + new SpanSnapshot( + "resource1", + "service1", + "operation1", + null, + "sql", + (short) 0, + false, + true, + "xyzzy", + schema, + new String[] {"quux"}, + null, + null, + null, + 0L); + AggregateEntry entry1 = table.findOrInsert(snap1); + for (long duration : new long[] {2, 1, 2, 250, 4}) { + entry1.recordOneDuration(duration); + } + writer.add(entry1); + SpanSnapshot snap2 = + new SpanSnapshot( + "resource2", + "service2", + "operation2", + null, + "web", + (short) 200, + false, + true, + "xyzzy", + schema, + new String[] {"quux"}, + null, + null, + null, + 0L); + AggregateEntry entry2 = table.findOrInsert(snap2); + for (long duration : new long[] {1, 1, 200, 2, 3, 4, 5, 6, 7, 8}) { + entry2.recordOneDuration(duration); + } + writer.add(entry2); + writer.finishBucket(); + + // then + assertTrue(latch.await(5, SECONDS)); + assertEquals(1, listener.events.size()); + assertEquals(EventListener.EventType.OK, listener.events.get(0)); + } + + static class BlockingListener implements EventListener { + final List events = new CopyOnWriteArrayList<>(); + final CountDownLatch latch; + + BlockingListener(CountDownLatch latch) { + this.latch = latch; + } + + @Override + public void onEvent(EventListener.EventType eventType, String message) { + events.add(eventType); + latch.countDown(); + } + } +} diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index d1a9ba4aada..b10bda7c683 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -2267,7 +2267,6 @@ && isMetricsOtelEnabled() (long) configProvider.getInteger(TRACER_METRICS_MAX_PENDING, defaultMaxPending) * LEGACY_BATCH_SIZE; tracerMetricsMaxPending = (int) Math.min(requestedMaxPending, MAX_SAFE_ARRAY_SIZE); - reportHostName = configProvider.getBoolean(TRACE_REPORT_HOSTNAME, DEFAULT_TRACE_REPORT_HOSTNAME);