// ==== reconstructed from a mangled git patch; original patch metadata: ====
// diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionWalFileAnalyzer.java
//          b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionWalFileAnalyzer.java
// new file mode 100644, index 0000000000000..420c1672cca46, @@ -0,0 +1,530 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb;

import org.apache.tsfile.file.metadata.enums.CompressionType;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.util.Locale;

/**
 * Inspect a single WAL file and print size breakdowns for its major sections.
 *
 * <p>Example:
 *
 * <pre>
 *   java ... org.apache.iotdb.ConsensusSubscriptionWalFileAnalyzer D:\path\to\_12-25000-1.wal
 * </pre>
 */
public class ConsensusSubscriptionWalFileAnalyzer {

  // Magic strings that identify the WAL file format version. V1 places its magic only at the
  // tail of the file; V2/V3 place it at both head and tail (see detectVersion).
  private static final String V1_MAGIC = "WAL";
  private static final String V2_MAGIC = "V2-WAL";
  private static final String V3_MAGIC = "V3-WAL";

  // Segment header: 1 byte compression type + 4 bytes on-disk payload size.
  private static final int SEGMENT_HEADER_BASE_BYTES = Byte.BYTES + Integer.BYTES;
  // Compressed segments carry one extra int (uncompressed size) in the header.
  private static final int COMPRESSED_SEGMENT_EXTRA_HEADER_BYTES = Integer.BYTES;
  private static final int WAL_FILE_INFO_END_MARKER_BYTES = Byte.BYTES;
  private static final int METADATA_SIZE_FIELD_BYTES = Integer.BYTES;
  // An empty V3 footer serialized without the memTable-count field leaves exactly this many
  // bytes after the (empty) bufferSize array: min/max ts (2 longs) + writer identity (2 shorts)
  // + override count (1 int).
  private static final int V3_EMPTY_METADATA_REMAINING_WITHOUT_MEMTABLE_COUNT =
      Long.BYTES * 2 + Short.BYTES * 2 + Integer.BYTES;

  /**
   * Entry point: analyzes the WAL file named by {@code args[0]} and prints the layout report.
   *
   * @throws IllegalArgumentException if the path does not name an existing regular file
   */
  public static void main(final String[] args) throws Exception {
    if (args.length == 0 || "--help".equals(args[0]) || "-h".equals(args[0])) {
      printUsage();
      return;
    }

    final File walFile = new File(args[0]);
    if (!walFile.isFile()) {
      throw new IllegalArgumentException("WAL file does not exist: " + walFile.getAbsolutePath());
    }

    final WalFileAnalysis analysis = analyze(walFile);
    printAnalysis(analysis);
  }

  private static void printUsage() {
    System.out.println("Usage:");
    // NOTE(review): the argument placeholder was stripped by markup extraction in the original
    // patch text; "<wal-file-path>" is reconstructed here — confirm against the committed file.
    System.out.println(
        "  java ... org.apache.iotdb.ConsensusSubscriptionWalFileAnalyzer <wal-file-path>");
  }

  /**
   * Reads the WAL file once and fills a {@link WalFileAnalysis} with the byte counts of every
   * recognized section (magic, segments, end marker, footer metadata, trailing size field).
   * Structural problems are reported via {@code note}/warning fields rather than exceptions.
   */
  private static WalFileAnalysis analyze(final File walFile) throws IOException {
    try (RandomAccessFile raf = new RandomAccessFile(walFile, "r");
        FileChannel channel = raf.getChannel()) {
      final long totalBytes = channel.size();
      final String version = detectVersion(channel, totalBytes);
      final int headMagicBytes = getHeadMagicBytes(version);
      final int tailMagicBytes = getTailMagicBytes(version);

      final WalFileAnalysis analysis = new WalFileAnalysis(walFile, version, totalBytes);
      analysis.headMagicBytes = Math.min(totalBytes, headMagicBytes);

      if (totalBytes <= headMagicBytes) {
        analysis.note = "header-only WAL file (magic only, no body/footer)";
        return analysis;
      }

      if (!hasTrailingMagic(channel, totalBytes, version)) {
        analysis.note = "missing trailing magic/footer, file may be open or broken";
        return analysis;
      }

      analysis.tailMagicBytes = tailMagicBytes;
      analysis.metadataSizeFieldBytes = METADATA_SIZE_FIELD_BYTES;

      // Footer layout (from the tail inward): [metadata][metadata size int][tail magic].
      final long metadataSizeFieldPos = totalBytes - tailMagicBytes - METADATA_SIZE_FIELD_BYTES;
      if (metadataSizeFieldPos < headMagicBytes) {
        analysis.note = "invalid metadata size position";
        return analysis;
      }

      final int metadataBytes = readInt(channel, metadataSizeFieldPos);
      analysis.metadataBytes = metadataBytes;
      analysis.footerStartOffset = metadataSizeFieldPos - metadataBytes;
      if (analysis.footerStartOffset < headMagicBytes) {
        analysis.note = "invalid footer start offset";
        return analysis;
      }

      final long markerOffset = analysis.footerStartOffset - WAL_FILE_INFO_END_MARKER_BYTES;
      if (markerOffset < headMagicBytes) {
        analysis.note = "invalid end-marker offset";
        return analysis;
      }

      analysis.endMarkerBytes = WAL_FILE_INFO_END_MARKER_BYTES;
      analysis.segmentStartOffset = headMagicBytes;
      analysis.segmentEndOffsetExclusive = markerOffset;
      analysis.segmentRegionBytes = Math.max(0L, markerOffset - headMagicBytes);

      scanSegments(channel, analysis);
      parseFooter(channel, analysis);
      return analysis;
    }
  }

  /**
   * Walks the segment region, accumulating header/payload byte counts per segment. Stops with a
   * warning (instead of throwing) when a segment would run past the region boundary.
   */
  private static void scanSegments(final FileChannel channel, final WalFileAnalysis analysis)
      throws IOException {
    long offset = analysis.segmentStartOffset;
    while (offset < analysis.segmentEndOffsetExclusive) {
      if (analysis.segmentEndOffsetExclusive - offset < SEGMENT_HEADER_BASE_BYTES) {
        analysis.segmentParseWarning =
            "remaining bytes are smaller than a segment header at offset " + offset;
        return;
      }

      final ByteBuffer headerBuffer = ByteBuffer.allocate(SEGMENT_HEADER_BASE_BYTES);
      readFully(channel, headerBuffer, offset);
      headerBuffer.flip();

      final CompressionType compressionType = CompressionType.deserialize(headerBuffer.get());
      final int dataInDiskBytes = headerBuffer.getInt();
      int headerBytes = SEGMENT_HEADER_BASE_BYTES;
      if (compressionType != CompressionType.UNCOMPRESSED) {
        headerBytes += COMPRESSED_SEGMENT_EXTRA_HEADER_BYTES;
      }

      final long nextOffset = offset + headerBytes + dataInDiskBytes;
      if (nextOffset > analysis.segmentEndOffsetExclusive) {
        analysis.segmentParseWarning =
            String.format(
                Locale.ROOT,
                "segment at offset %d exceeds body boundary (%d > %d)",
                offset,
                nextOffset,
                analysis.segmentEndOffsetExclusive);
        return;
      }

      analysis.segmentCount++;
      analysis.segmentHeaderBytes += headerBytes;
      analysis.segmentPayloadBytes += dataInDiskBytes;
      if (compressionType != CompressionType.UNCOMPRESSED) {
        analysis.compressedSegmentCount++;
      }
      offset = nextOffset;
    }

    if (offset != analysis.segmentEndOffsetExclusive) {
      analysis.segmentParseWarning =
          String.format(
              Locale.ROOT,
              "segment parser stopped at %d but expected %d",
              offset,
              analysis.segmentEndOffsetExclusive);
    }
  }

  /**
   * Decodes the footer metadata block and attributes its bytes to the individual fields. All
   * array reads are bounds-checked so a corrupt/truncated footer yields a {@code footerWarning}
   * instead of a {@link java.nio.BufferUnderflowException}.
   */
  private static void parseFooter(final FileChannel channel, final WalFileAnalysis analysis)
      throws IOException {
    if (analysis.metadataBytes <= 0) {
      return;
    }

    final ByteBuffer metadataBuffer = ByteBuffer.allocate(analysis.metadataBytes);
    readFully(channel, metadataBuffer, analysis.footerStartOffset);
    metadataBuffer.flip();

    if (metadataBuffer.remaining() < Long.BYTES + Integer.BYTES) {
      analysis.footerWarning = "metadata buffer is too small";
      return;
    }

    metadataBuffer.getLong(); // firstSearchIndex (value not needed, only its size)
    analysis.firstSearchIndexBytes = Long.BYTES;
    final int entryCount = metadataBuffer.getInt();
    analysis.entryCount = entryCount;
    analysis.entryCountBytes = Integer.BYTES;

    analysis.bufferSizeArrayBytes = (long) entryCount * Integer.BYTES;
    // FIX: guard against a corrupt entryCount before consuming the bufferSize array; the
    // original code would throw BufferUnderflowException here on a truncated footer.
    if (entryCount < 0 || metadataBuffer.remaining() < analysis.bufferSizeArrayBytes) {
      analysis.footerWarning = "metadata is truncated inside the bufferSize array";
      return;
    }
    for (int i = 0; i < entryCount; i++) {
      metadataBuffer.getInt();
    }

    final boolean serializedEmptyV3WithoutMemTableCount =
        V3_MAGIC.equals(analysis.version)
            && entryCount == 0
            && metadataBuffer.remaining() == V3_EMPTY_METADATA_REMAINING_WITHOUT_MEMTABLE_COUNT;

    if (metadataBuffer.hasRemaining() && !serializedEmptyV3WithoutMemTableCount) {
      analysis.memTableCountFieldBytes = Integer.BYTES;
      analysis.memTableCount = metadataBuffer.getInt();
      analysis.memTableIdsBytes = (long) analysis.memTableCount * Long.BYTES;
      // FIX: bounds check before consuming the memTable-id array (same rationale as above).
      if (analysis.memTableCount < 0
          || metadataBuffer.remaining() < analysis.memTableIdsBytes) {
        analysis.footerWarning = "metadata is truncated inside the memTable-id array";
        return;
      }
      for (int i = 0; i < analysis.memTableCount; i++) {
        metadataBuffer.getLong();
      }
    }

    if (V3_MAGIC.equals(analysis.version) && metadataBuffer.hasRemaining()) {
      if (metadataBuffer.remaining() < Long.BYTES * 2) {
        analysis.footerWarning = "V3 metadata is truncated before min/max timestamp range";
        return;
      }

      analysis.minMaxDataTsBytes = Long.BYTES * 2L;
      metadataBuffer.getLong();
      metadataBuffer.getLong();

      final long requiredWriterMetadataBytes =
          (long) entryCount * Long.BYTES * 2 + Short.BYTES * 2 + Integer.BYTES;
      if (metadataBuffer.remaining() < requiredWriterMetadataBytes) {
        analysis.footerWarning = "V3 metadata is truncated before writer progress arrays";
        return;
      }

      analysis.physicalTimesBytes = (long) entryCount * Long.BYTES;
      analysis.localSeqsBytes = (long) entryCount * Long.BYTES;
      for (int i = 0; i < entryCount; i++) {
        metadataBuffer.getLong();
      }
      for (int i = 0; i < entryCount; i++) {
        metadataBuffer.getLong();
      }

      analysis.defaultWriterIdentityBytes = Short.BYTES * 2L;
      metadataBuffer.getShort();
      metadataBuffer.getShort();

      analysis.overrideCountFieldBytes = Integer.BYTES;
      analysis.overrideCount = metadataBuffer.getInt();

      analysis.overrideIndexesBytes = (long) analysis.overrideCount * Integer.BYTES;
      analysis.overrideNodeIdsBytes = (long) analysis.overrideCount * Short.BYTES;
      analysis.overrideWriterEpochsBytes = (long) analysis.overrideCount * Short.BYTES;

      // FIX: the requiredWriterMetadataBytes check above does not cover the override arrays;
      // verify them explicitly before consuming.
      if (analysis.overrideCount < 0
          || metadataBuffer.remaining()
              < analysis.overrideIndexesBytes
                  + analysis.overrideNodeIdsBytes
                  + analysis.overrideWriterEpochsBytes) {
        analysis.footerWarning = "V3 metadata is truncated inside the override arrays";
        return;
      }

      for (int i = 0; i < analysis.overrideCount; i++) {
        metadataBuffer.getInt();
      }
      for (int i = 0; i < analysis.overrideCount; i++) {
        metadataBuffer.getShort();
      }
      for (int i = 0; i < analysis.overrideCount; i++) {
        metadataBuffer.getShort();
      }
    }

    analysis.unknownMetadataBytes = metadataBuffer.remaining();
  }

  /**
   * Detects the format version: V3/V2 by head magic, V1 by tail magic (V1 has no head magic).
   * Returns "UNKNOWN" when no magic matches.
   */
  private static String detectVersion(final FileChannel channel, final long totalBytes)
      throws IOException {
    if (totalBytes >= V3_MAGIC.length()
        && readString(channel, 0, V3_MAGIC.length()).equals(V3_MAGIC)) {
      return V3_MAGIC;
    }
    if (totalBytes >= V2_MAGIC.length()
        && readString(channel, 0, V2_MAGIC.length()).equals(V2_MAGIC)) {
      return V2_MAGIC;
    }
    if (totalBytes >= V1_MAGIC.length()
        && readString(channel, totalBytes - V1_MAGIC.length(), V1_MAGIC.length())
            .equals(V1_MAGIC)) {
      return V1_MAGIC;
    }
    return "UNKNOWN";
  }

  private static int getHeadMagicBytes(final String version) {
    if (V3_MAGIC.equals(version)) {
      return V3_MAGIC.length();
    }
    if (V2_MAGIC.equals(version)) {
      return V2_MAGIC.length();
    }
    // V1 and UNKNOWN carry no head magic.
    return 0;
  }

  private static int getTailMagicBytes(final String version) {
    if (V3_MAGIC.equals(version)) {
      return V3_MAGIC.length();
    }
    if (V2_MAGIC.equals(version)) {
      return V2_MAGIC.length();
    }
    if (V1_MAGIC.equals(version)) {
      return V1_MAGIC.length();
    }
    return 0;
  }

  /** Returns true when the file ends with the version's magic string. */
  private static boolean hasTrailingMagic(
      final FileChannel channel, final long totalBytes, final String version) throws IOException {
    final int tailMagicBytes = getTailMagicBytes(version);
    if (tailMagicBytes <= 0 || totalBytes < tailMagicBytes) {
      return false;
    }
    return readString(channel, totalBytes - tailMagicBytes, tailMagicBytes).equals(version);
  }

  private static String readString(final FileChannel channel, final long offset, final int length)
      throws IOException {
    final ByteBuffer buffer = ByteBuffer.allocate(length);
    readFully(channel, buffer, offset);
    buffer.flip();
    return StandardCharsets.UTF_8.decode(buffer).toString();
  }

  private static int readInt(final FileChannel channel, final long offset) throws IOException {
    final ByteBuffer buffer = ByteBuffer.allocate(Integer.BYTES);
    readFully(channel, buffer, offset);
    buffer.flip();
    return buffer.getInt();
  }

  /** Positional read loop: fills {@code buffer} completely or throws on premature EOF. */
  private static void readFully(
      final FileChannel channel, final ByteBuffer buffer, final long offset) throws IOException {
    long position = offset;
    while (buffer.hasRemaining()) {
      final int bytesRead = channel.read(buffer, position);
      if (bytesRead < 0) {
        throw new IOException("Unexpected EOF while reading at offset " + position);
      }
      position += bytesRead;
    }
  }

  /** Prints the layout report: per-section sizes, segment stats, and the footer breakdown. */
  private static void printAnalysis(final WalFileAnalysis analysis) {
    System.out.println("=== WAL File Layout Analysis ===");
    System.out.println("file: " + analysis.file.getAbsolutePath());
    System.out.println("version: " + analysis.version);
    System.out.println("total: " + formatBytes(analysis.totalBytes));
    if (analysis.note != null) {
      System.out.println("note: " + analysis.note);
    }
    System.out.println();

    printSection("head magic", analysis.headMagicBytes, analysis.totalBytes);
    printSection("segment headers", analysis.segmentHeaderBytes, analysis.totalBytes);
    printSection("segment payload", analysis.segmentPayloadBytes, analysis.totalBytes);
    printSection("wal end marker", analysis.endMarkerBytes, analysis.totalBytes);
    printSection("footer metadata", analysis.metadataBytes, analysis.totalBytes);
    printSection("metadata size field", analysis.metadataSizeFieldBytes, analysis.totalBytes);
    printSection("tail magic", analysis.tailMagicBytes, analysis.totalBytes);
    final long accountedBytes =
        analysis.headMagicBytes
            + analysis.segmentHeaderBytes
            + analysis.segmentPayloadBytes
            + analysis.endMarkerBytes
            + analysis.metadataBytes
            + analysis.metadataSizeFieldBytes
            + analysis.tailMagicBytes;
    if (analysis.totalBytes >= accountedBytes) {
      printSection("unaccounted", analysis.totalBytes - accountedBytes, analysis.totalBytes);
    }

    System.out.println();
    System.out.println(
        String.format(
            Locale.ROOT,
            "segments: total=%d, compressed=%d",
            analysis.segmentCount,
            analysis.compressedSegmentCount));
    if (analysis.segmentParseWarning != null) {
      System.out.println("segment warning: " + analysis.segmentParseWarning);
    }

    if (analysis.metadataBytes <= 0) {
      return;
    }

    System.out.println();
    System.out.println("=== Footer Breakdown ===");
    printSection("v2-compatible base", analysis.getV2BaseMetadataBytes(), analysis.totalBytes);
    if (V3_MAGIC.equals(analysis.version)) {
      printSection("v3 extension total", analysis.getV3ExtensionBytes(), analysis.totalBytes);
      System.out.println(
          String.format(
              Locale.ROOT,
              "v3 extension share of footer: %s",
              formatPercent(analysis.getV3ExtensionBytes(), analysis.metadataBytes)));
      printSection("  min/max data ts", analysis.minMaxDataTsBytes, analysis.totalBytes);
      printSection("  physicalTimes[]", analysis.physicalTimesBytes, analysis.totalBytes);
      printSection("  localSeqs[]", analysis.localSeqsBytes, analysis.totalBytes);
      printSection(
          "  default writer identity + override count",
          analysis.defaultWriterIdentityBytes + analysis.overrideCountFieldBytes,
          analysis.totalBytes);
      printSection("  overrideIndexes[]", analysis.overrideIndexesBytes, analysis.totalBytes);
      printSection("  overrideNodeIds[]", analysis.overrideNodeIdsBytes, analysis.totalBytes);
      printSection(
          "  overrideWriterEpochs[]", analysis.overrideWriterEpochsBytes, analysis.totalBytes);
    }
    if (analysis.unknownMetadataBytes > 0) {
      printSection("unknown metadata tail", analysis.unknownMetadataBytes, analysis.totalBytes);
    }
    System.out.println(
        String.format(
            Locale.ROOT,
            "entries=%d, memTables=%d, overrides=%d",
            analysis.entryCount,
            analysis.memTableCount,
            analysis.overrideCount));
    if (analysis.footerWarning != null) {
      System.out.println("footer warning: " + analysis.footerWarning);
    }
  }

  private static void printSection(final String name, final long bytes, final long totalBytes) {
    System.out.println(
        String.format(
            Locale.ROOT,
            "%-42s %12s %8s",
            name + ":",
            formatBytes(bytes),
            formatPercent(bytes, totalBytes)));
  }

  /** Human-readable size: B / KiB / MiB / GiB with two decimals. */
  private static String formatBytes(final long bytes) {
    final long absBytes = Math.abs(bytes);
    if (absBytes < 1024L) {
      return bytes + " B";
    }
    if (absBytes < 1024L * 1024L) {
      return String.format(Locale.ROOT, "%.2f KiB", bytes / 1024.0d);
    }
    if (absBytes < 1024L * 1024L * 1024L) {
      return String.format(Locale.ROOT, "%.2f MiB", bytes / 1024.0d / 1024.0d);
    }
    return String.format(Locale.ROOT, "%.2f GiB", bytes / 1024.0d / 1024.0d / 1024.0d);
  }

  private static String formatPercent(final long bytes, final long totalBytes) {
    if (totalBytes <= 0) {
      return "N/A";
    }
    return String.format(Locale.ROOT, "%.2f%%", bytes * 100.0d / totalBytes);
  }

  /** Mutable accumulator for one analyzed WAL file; filled by analyze/scanSegments/parseFooter. */
  private static final class WalFileAnalysis {
    private final File file;
    private final String version;
    private final long totalBytes;

    private long headMagicBytes;
    private long segmentHeaderBytes;
    private long segmentPayloadBytes;
    private long endMarkerBytes;
    private int metadataBytes;
    private long metadataSizeFieldBytes;
    private long tailMagicBytes;

    private long footerStartOffset;
    private long segmentStartOffset;
    private long segmentEndOffsetExclusive;
    private long segmentRegionBytes;

    private int segmentCount;
    private int compressedSegmentCount;

    private int entryCount;
    private int memTableCount;
    private int overrideCount;
    private long firstSearchIndexBytes;
    private long entryCountBytes;
    private long bufferSizeArrayBytes;
    private long memTableCountFieldBytes;
    private long memTableIdsBytes;
    private long minMaxDataTsBytes;
    private long physicalTimesBytes;
    private long localSeqsBytes;
    private long defaultWriterIdentityBytes;
    private long overrideCountFieldBytes;
    private long overrideIndexesBytes;
    private long overrideNodeIdsBytes;
    private long overrideWriterEpochsBytes;
    private long unknownMetadataBytes;

    private String note;
    private String segmentParseWarning;
    private String footerWarning;

    private WalFileAnalysis(final File file, final String version, final long totalBytes) {
      this.file = file;
      this.version = version;
      this.totalBytes = totalBytes;
    }

    /** Bytes of footer fields that a V2 reader also understands. */
    private long getV2BaseMetadataBytes() {
      return firstSearchIndexBytes
          + entryCountBytes
          + bufferSizeArrayBytes
          + memTableCountFieldBytes
          + memTableIdsBytes;
    }

    /** Bytes of footer fields that only exist in the V3 format. */
    private long getV3ExtensionBytes() {
      return minMaxDataTsBytes
          + physicalTimesBytes
          + localSeqsBytes
          + defaultWriterIdentityBytes
          + overrideCountFieldBytes
          + overrideIndexesBytes
          + overrideNodeIdsBytes
          + overrideWriterEpochsBytes;
    }
  }
}
// ==== next file in the original patch: ====
// diff --git a/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionSessionExample.java
//          b/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionSessionExample.java
// new file mode 100644, index 0000000000000..c0ebbe37198e8, @@ -0,0 +1,198 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb;

import org.apache.iotdb.isession.ISession;
import org.apache.iotdb.isession.util.Version;
import org.apache.iotdb.rpc.subscription.config.TopicConstant;
import org.apache.iotdb.session.Session;
import org.apache.iotdb.session.subscription.ISubscriptionTreeSession;
import org.apache.iotdb.session.subscription.SubscriptionTreeSessionBuilder;
import org.apache.iotdb.session.subscription.consumer.ISubscriptionTreePullConsumer;
import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumerBuilder;
import org.apache.iotdb.session.subscription.payload.SubscriptionMessage;
import org.apache.iotdb.session.subscription.payload.SubscriptionRecordHandler;

import org.apache.tsfile.read.query.dataset.ResultSet;

import java.util.List;
import java.util.Properties;

/**
 * End-to-end tree-model example for consensus-mode subscription: bootstrap some data, create a
 * consensus topic, subscribe with a pull consumer, write realtime rows, then poll/commit them.
 */
public class ConsensusSubscriptionSessionExample {

  private static final String HOST = "127.0.0.1";
  private static final int PORT = 6667;
  private static final String USER = "root";
  private static final String PASSWORD = "root";

  private static final long POLL_TIMEOUT_MS = 1_000L;
  private static final int MAX_POLL_ROUNDS = 20;
  private static final int EXPECTED_ROWS = 5;

  public static void main(final String[] args) throws Exception {
    // Unique suffix so repeated runs do not collide on database/topic/consumer-group names.
    final long runId = System.currentTimeMillis();
    final String database = "root.db_consensus_example_" + runId;
    final String device = database + ".d0";
    final String topic = "topic_consensus_example_" + runId;
    final String consumerGroup = "cg_consensus_example_" + runId;
    final String consumerId = "consumer_consensus_example_" + runId;

    System.out.println("=== Consensus Subscription Tree Example ===");
    System.out.println("database = " + database);
    System.out.println("topic = " + topic);
    System.out.println("consumerGroup = " + consumerGroup);

    prepareBootstrapData(database, device);
    createConsensusTopic(topic, database + ".**");

    try (final ISubscriptionTreePullConsumer consumer =
        new SubscriptionTreePullConsumerBuilder()
            .host(HOST)
            .port(PORT)
            .username(USER)
            .password(PASSWORD)
            .consumerId(consumerId)
            .consumerGroupId(consumerGroup)
            .autoCommit(false)
            .build()) {
      consumer.open();
      consumer.subscribe(topic);

      writeRealtimeDataAfterSubscribe(device);
      pollAndCommit(consumer, EXPECTED_ROWS);
      consumer.unsubscribe(topic);
    } finally {
      // Drop the topic even when the consumer path fails, so reruns start clean.
      dropTopic(topic);
    }
  }

  /** Creates the database/timeseries and writes one flushed bootstrap row at time 0. */
  private static void prepareBootstrapData(final String database, final String device)
      throws Exception {
    try (final ISession session = openSession()) {
      session.executeNonQueryStatement("CREATE DATABASE " + database);
      session.executeNonQueryStatement(
          "CREATE TIMESERIES "
              + device
              + ".s1 with datatype=INT64, encoding=RLE, compressor=SNAPPY");
      session.executeNonQueryStatement(
          String.format("insert into %s(time, s1) values (%d, %d)", device, 0L, 0L));
      session.executeNonQueryStatement("flush");
    }
  }

  /** Creates a consensus-mode, record-handler-format topic over {@code path}. */
  private static void createConsensusTopic(final String topicName, final String path)
      throws Exception {
    try (final ISubscriptionTreeSession session =
        new SubscriptionTreeSessionBuilder()
            .host(HOST)
            .port(PORT)
            .username(USER)
            .password(PASSWORD)
            .build()) {
      session.open();

      final Properties config = new Properties();
      config.put(TopicConstant.MODE_KEY, TopicConstant.MODE_CONSENSUS_VALUE);
      config.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_RECORD_HANDLER_VALUE);
      config.put(TopicConstant.PATH_KEY, path);
      config.put(TopicConstant.ORDER_MODE_KEY, TopicConstant.ORDER_MODE_PER_WRITER_VALUE);
      session.createTopicIfNotExists(topicName, config);
    }
  }

  /** Writes EXPECTED_ROWS realtime rows (times 1..N) after the subscription is established. */
  private static void writeRealtimeDataAfterSubscribe(final String device) throws Exception {
    try (final ISession session = openSession()) {
      for (int i = 1; i <= EXPECTED_ROWS; i++) {
        session.executeNonQueryStatement(
            String.format("insert into %s(time, s1) values (%d, %d)", device, i, i * 10L));
      }
      session.executeNonQueryStatement("flush");
    }
  }

  /**
   * Polls until {@code expectedRows} rows arrive (plus 3 empty polls to drain), committing each
   * batch synchronously. Throws when the expected row count is not reached within
   * MAX_POLL_ROUNDS rounds.
   */
  private static void pollAndCommit(
      final ISubscriptionTreePullConsumer consumer, final int expectedRows) throws Exception {
    int totalRows = 0;
    int consecutiveEmptyPolls = 0;

    for (int round = 1; round <= MAX_POLL_ROUNDS; round++) {
      // NOTE(review): the element type was stripped by markup extraction in the original patch
      // text; <SubscriptionMessage> is reconstructed from the loop below.
      final List<SubscriptionMessage> messages = consumer.poll(POLL_TIMEOUT_MS);
      if (messages.isEmpty()) {
        consecutiveEmptyPolls++;
        if (totalRows >= expectedRows && consecutiveEmptyPolls >= 3) {
          break;
        }
        continue;
      }

      consecutiveEmptyPolls = 0;

      for (final SubscriptionMessage message : messages) {
        for (final ResultSet resultSet : message.getResultSets()) {
          final SubscriptionRecordHandler.SubscriptionResultSet subscriptionResultSet =
              (SubscriptionRecordHandler.SubscriptionResultSet) resultSet;
          System.out.println("Columns = " + subscriptionResultSet.getColumnNames());
          while (subscriptionResultSet.hasNext()) {
            System.out.println(subscriptionResultSet.nextRecord());
            totalRows++;
          }
        }
      }

      consumer.commitSync(messages);
      System.out.println("poll round " + round + ", totalRows = " + totalRows);
    }

    if (totalRows != expectedRows) {
      throw new IllegalStateException(
          "Expected "
              + expectedRows
              + " realtime rows, but consumed "
              + totalRows
              + ". Please check whether consensus subscription is enabled on the server.");
    }
  }

  /** Best-effort cleanup of the example topic. */
  private static void dropTopic(final String topicName) throws Exception {
    try (final ISubscriptionTreeSession session =
        new SubscriptionTreeSessionBuilder()
            .host(HOST)
            .port(PORT)
            .username(USER)
            .password(PASSWORD)
            .build()) {
      session.open();
      session.dropTopicIfExists(topicName);
    }
  }

  /** Opens a plain tree-model session (no schema auto-fetch). */
  private static ISession openSession() throws Exception {
    final ISession session =
        new Session.Builder()
            .host(HOST)
            .port(PORT)
            .username(USER)
            .password(PASSWORD)
            .version(Version.V_1_0)
            .build();
    session.open(false);
    return session;
  }
}
// ==== next file in the original patch: ====
// diff --git a/example/subscription/src/main/java/org/apache/iotdb/ConsensusTableModelSubscriptionSessionExample.java
//          b/example/subscription/src/main/java/org/apache/iotdb/ConsensusTableModelSubscriptionSessionExample.java
// new file mode 100644, index 0000000000000..a877a4a861eda, @@ -0,0 +1,201 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb;

import org.apache.iotdb.isession.ITableSession;
import org.apache.iotdb.rpc.subscription.config.TopicConstant;
import org.apache.iotdb.session.TableSessionBuilder;
import org.apache.iotdb.session.subscription.ISubscriptionTableSession;
import org.apache.iotdb.session.subscription.SubscriptionTableSessionBuilder;
import org.apache.iotdb.session.subscription.consumer.ISubscriptionTablePullConsumer;
import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumerBuilder;
import org.apache.iotdb.session.subscription.payload.SubscriptionMessage;
import org.apache.iotdb.session.subscription.payload.SubscriptionRecordHandler;

import org.apache.tsfile.read.query.dataset.ResultSet;

import java.util.Collections;
import java.util.List;
import java.util.Properties;

/**
 * End-to-end table-model example for consensus-mode subscription: bootstrap a table, create a
 * consensus topic over it, subscribe with a table pull consumer, write realtime rows, then
 * poll/commit them.
 */
public class ConsensusTableModelSubscriptionSessionExample {

  private static final String HOST = "127.0.0.1";
  private static final int PORT = 6667;
  private static final String USER = "root";
  private static final String PASSWORD = "root";

  private static final long POLL_TIMEOUT_MS = 1_000L;
  private static final int MAX_POLL_ROUNDS = 20;
  private static final int EXPECTED_ROWS = 5;

  public static void main(final String[] args) throws Exception {
    // Unique suffix so repeated runs do not collide on database/topic/consumer-group names.
    final long runId = System.currentTimeMillis();
    final String database = "db_consensus_example_" + runId;
    final String table = "events";
    final String topic = "topic_consensus_table_example_" + runId;
    final String consumerGroup = "cg_consensus_table_example_" + runId;
    final String consumerId = "consumer_consensus_table_example_" + runId;

    System.out.println("=== Consensus Subscription Table Example ===");
    System.out.println("database = " + database);
    System.out.println("table = " + table);
    System.out.println("topic = " + topic);
    System.out.println("consumerGroup = " + consumerGroup);

    prepareBootstrapData(database, table);
    createConsensusTopic(topic, database, table);

    try (final ISubscriptionTablePullConsumer consumer =
        new SubscriptionTablePullConsumerBuilder()
            .host(HOST)
            .port(PORT)
            .username(USER)
            .password(PASSWORD)
            .consumerId(consumerId)
            .consumerGroupId(consumerGroup)
            .autoCommit(false)
            .build()) {
      consumer.open();
      consumer.subscribe(topic);

      writeRealtimeDataAfterSubscribe(database, table);
      pollAndCommit(consumer, EXPECTED_ROWS);
      consumer.unsubscribe(topic);
    } finally {
      // Drop the topic even when the consumer path fails, so reruns start clean.
      dropTopic(topic);
    }
  }

  /** Creates the database/table and writes one flushed bootstrap row at time 0. */
  private static void prepareBootstrapData(final String database, final String table)
      throws Exception {
    try (final ITableSession session = openTableSession()) {
      session.executeNonQueryStatement("CREATE DATABASE IF NOT EXISTS " + database);
      session.executeNonQueryStatement("USE " + database);
      session.executeNonQueryStatement(
          "CREATE TABLE " + table + "(tag1 STRING TAG, s1 INT64 FIELD)");
      session.executeNonQueryStatement(
          "insert into " + table + "(tag1, s1, time) values ('bootstrap', 0, 0)");
      session.executeNonQueryStatement("flush");
    }
  }

  /** Creates a consensus-mode, record-handler-format topic over {@code database.table}. */
  private static void createConsensusTopic(
      final String topicName, final String database, final String table) throws Exception {
    try (final ISubscriptionTableSession session =
        new SubscriptionTableSessionBuilder()
            .host(HOST)
            .port(PORT)
            .username(USER)
            .password(PASSWORD)
            .build()) {
      final Properties config = new Properties();
      config.put(TopicConstant.MODE_KEY, TopicConstant.MODE_CONSENSUS_VALUE);
      config.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_RECORD_HANDLER_VALUE);
      config.put(TopicConstant.DATABASE_KEY, database);
      config.put(TopicConstant.TABLE_KEY, table);
      config.put(TopicConstant.ORDER_MODE_KEY, TopicConstant.ORDER_MODE_PER_WRITER_VALUE);
      session.createTopicIfNotExists(topicName, config);
    }
  }

  /** Writes EXPECTED_ROWS realtime rows (times 1..N) after the subscription is established. */
  private static void writeRealtimeDataAfterSubscribe(final String database, final String table)
      throws Exception {
    try (final ITableSession session = openTableSession()) {
      session.executeNonQueryStatement("USE " + database);
      for (int i = 1; i <= EXPECTED_ROWS; i++) {
        session.executeNonQueryStatement(
            String.format(
                "insert into %s(tag1, s1, time) values ('device_%d', %d, %d)",
                table, i, i * 10L, i));
      }
      session.executeNonQueryStatement("flush");
    }
  }

  /**
   * Polls until {@code expectedRows} rows arrive (plus 3 empty polls to drain), committing each
   * batch synchronously. Throws when the expected row count is not reached within
   * MAX_POLL_ROUNDS rounds.
   */
  private static void pollAndCommit(
      final ISubscriptionTablePullConsumer consumer, final int expectedRows) throws Exception {
    int totalRows = 0;
    int consecutiveEmptyPolls = 0;

    for (int round = 1; round <= MAX_POLL_ROUNDS; round++) {
      // NOTE(review): the element type was stripped by markup extraction in the original patch
      // text; <SubscriptionMessage> is reconstructed from the loop below.
      final List<SubscriptionMessage> messages = consumer.poll(POLL_TIMEOUT_MS);
      if (messages.isEmpty()) {
        consecutiveEmptyPolls++;
        if (totalRows >= expectedRows && consecutiveEmptyPolls >= 3) {
          break;
        }
        continue;
      }

      consecutiveEmptyPolls = 0;

      for (final SubscriptionMessage message : messages) {
        for (final ResultSet resultSet : message.getResultSets()) {
          final SubscriptionRecordHandler.SubscriptionResultSet subscriptionResultSet =
              (SubscriptionRecordHandler.SubscriptionResultSet) resultSet;
          System.out.println(
              "database = "
                  + subscriptionResultSet.getDatabaseName()
                  + ", table = "
                  + subscriptionResultSet.getTableName());
          System.out.println("Columns = " + subscriptionResultSet.getColumnNames());
          System.out.println("Types = " + subscriptionResultSet.getColumnTypes());
          while (subscriptionResultSet.hasNext()) {
            System.out.println(subscriptionResultSet.nextRecord());
            totalRows++;
          }
        }
      }

      consumer.commitSync(messages);
      System.out.println("poll round " + round + ", totalRows = " + totalRows);
    }

    if (totalRows != expectedRows) {
      throw new IllegalStateException(
          "Expected "
              + expectedRows
              + " realtime rows, but consumed "
              + totalRows
              + ". Please check whether consensus subscription is enabled on the server.");
    }
  }

  /**
   * Best-effort cleanup of the example topic.
   *
   * <p>NOTE(review): unlike the tree-model example, no {@code session.open()} is called here (or
   * in createConsensusTopic) — presumably the table subscription session opens lazily; confirm
   * against the session API.
   */
  private static void dropTopic(final String topicName) throws Exception {
    try (final ISubscriptionTableSession session =
        new SubscriptionTableSessionBuilder()
            .host(HOST)
            .port(PORT)
            .username(USER)
            .password(PASSWORD)
            .build()) {
      session.dropTopicIfExists(topicName);
    }
  }

  /** Opens a plain table-model session against the single example node. */
  private static ITableSession openTableSession() throws Exception {
    return new TableSessionBuilder()
        .nodeUrls(Collections.singletonList(HOST + ":" + PORT))
        .username(USER)
        .password(PASSWORD)
        .build();
  }
}
// ==== next file in the original patch: ====
// diff --git a/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/AbstractSubscriptionConsensusLocalIT.java
//          b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/AbstractSubscriptionConsensusLocalIT.java
// new file mode 100644, index 0000000000000..4342918c2bed8, @@ -0,0 +1,55 @@
+ */ + +package org.apache.iotdb.subscription.it.consensus.local; + +import org.apache.iotdb.it.env.EnvFactory; +import org.apache.iotdb.subscription.it.AbstractSubscriptionIT; + +import org.junit.After; +import org.junit.Before; + +import static org.apache.iotdb.subscription.it.IoTDBSubscriptionITConstant.FORCE_SCALABLE_SINGLE_NODE_MODE; + +public abstract class AbstractSubscriptionConsensusLocalIT extends AbstractSubscriptionIT { + + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + + FORCE_SCALABLE_SINGLE_NODE_MODE.accept(EnvFactory.getEnv()); + EnvFactory.getEnv() + .getConfig() + .getCommonConfig() + .setAutoCreateSchemaEnabled(true) + .setPipeMemoryManagementEnabled(false) + .setIsPipeEnableMemoryCheck(false); + + EnvFactory.getEnv().initClusterEnvironment(); + } + + @Override + @After + public void tearDown() throws Exception { + EnvFactory.getEnv().cleanClusterEnvironment(); + + super.tearDown(); + } +} diff --git a/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/ConsensusSubscriptionITSupport.java b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/ConsensusSubscriptionITSupport.java new file mode 100644 index 0000000000000..254b5ffeb8558 --- /dev/null +++ b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/ConsensusSubscriptionITSupport.java @@ -0,0 +1,670 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.subscription.it.consensus.local; + +import org.apache.iotdb.isession.ISession; +import org.apache.iotdb.it.env.EnvFactory; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.session.subscription.SubscriptionTreeSession; +import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionRecordHandler; + +import org.apache.tsfile.read.common.RowRecord; +import org.apache.tsfile.read.query.dataset.ResultSet; +import org.junit.Assert; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.apache.iotdb.subscription.it.IoTDBSubscriptionITConstant.POLL_TIMEOUT_MS; + +final class ConsensusSubscriptionITSupport { + + private static final AtomicInteger IDENTIFIER = new AtomicInteger(0); + private static final int QUIET_ROUNDS_AFTER_DATA = 3; + private static final int QUIET_ROUNDS_WITHOUT_DATA = 8; + + private ConsensusSubscriptionITSupport() { + throw new IllegalStateException("Utility class"); + } + + static TestIdentifiers 
newIdentifiers(final String prefix) { + final int id = IDENTIFIER.incrementAndGet(); + final String normalized = + prefix.toLowerCase(Locale.ROOT).replaceAll("[^a-z0-9]+", "_").replaceAll("^_+|_+$", ""); + return new TestIdentifiers( + "root.consensus_it_" + normalized + "_" + id, + "topic_consensus_it_" + normalized + "_" + id, + "cg_consensus_it_" + normalized + "_" + id, + "c_consensus_it_" + normalized + "_" + id); + } + + static String bootstrapDatabase(final String database) throws Exception { + createDatabase(database); + final String bootstrapDevice = database + ".bootstrap"; + try (final ISession session = EnvFactory.getEnv().getSessionConnection()) { + session.executeNonQueryStatement( + String.format("insert into %s(time, s1) values (%d, %d)", bootstrapDevice, 0L, 0L)); + session.executeNonQueryStatement("flush"); + } + return rowKey(bootstrapDevice, 0L); + } + + static void createDatabase(final String database) throws Exception { + try (final ISession session = EnvFactory.getEnv().getSessionConnection()) { + session.executeNonQueryStatement("create database " + database); + } + } + + static void createConsensusTopic(final String topicName, final String path) throws Exception { + final String host = EnvFactory.getEnv().getIP(); + final int port = Integer.parseInt(EnvFactory.getEnv().getPort()); + + try (final SubscriptionTreeSession session = new SubscriptionTreeSession(host, port)) { + session.open(); + session.dropTopicIfExists(topicName); + + final Properties config = new Properties(); + config.put(TopicConstant.MODE_KEY, TopicConstant.MODE_CONSENSUS_VALUE); + config.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_RECORD_HANDLER_VALUE); + config.put(TopicConstant.PATH_KEY, path); + config.put(TopicConstant.ORDER_MODE_KEY, TopicConstant.ORDER_MODE_PER_WRITER_VALUE); + session.createTopic(topicName, config); + } + } + + static SubscriptionTreePullConsumer createConsumer( + final String consumerId, final String consumerGroupId) throws Exception { + 
final SubscriptionTreePullConsumer consumer = + new SubscriptionTreePullConsumer.Builder() + .host(EnvFactory.getEnv().getIP()) + .port(Integer.parseInt(EnvFactory.getEnv().getPort())) + .consumerId(consumerId) + .consumerGroupId(consumerGroupId) + .autoCommit(false) + .buildPullConsumer(); + consumer.open(); + return consumer; + } + + static Set insertRows( + final String database, + final List devices, + final long startTimestampInclusive, + final int rowsPerDevice, + final boolean flush) + throws Exception { + final Set rowKeys = new LinkedHashSet<>(); + + try (final ISession session = EnvFactory.getEnv().getSessionConnection()) { + for (int deviceIndex = 0; deviceIndex < devices.size(); deviceIndex++) { + final String devicePath = database + "." + devices.get(deviceIndex); + final long deviceBaseTimestamp = startTimestampInclusive + deviceIndex * 10_000L; + for (int row = 0; row < rowsPerDevice; row++) { + final long timestamp = deviceBaseTimestamp + row; + session.executeNonQueryStatement( + String.format( + "insert into %s(time, s1) values (%d, %d)", + devicePath, timestamp, timestamp * 10)); + rowKeys.add(rowKey(devicePath, timestamp)); + } + } + if (flush) { + session.executeNonQueryStatement("flush"); + } + } + + return rowKeys; + } + + static Set insertRows( + final String database, + final String device, + final long startTimestampInclusive, + final int rowCount, + final boolean flush) + throws Exception { + return insertRows( + database, Collections.singletonList(device), startTimestampInclusive, rowCount, flush); + } + + static ConsumedRecords pollAndCommitUntilAtLeast( + final SubscriptionTreePullConsumer consumer, + final int expectedUniqueRows, + final int maxPollRounds) + throws Exception { + return pollAndCommitUntilAtLeast( + consumer, expectedUniqueRows, maxPollRounds, Duration.ofMillis(POLL_TIMEOUT_MS)); + } + + static ConsumedRecords pollAndCommitUntilAtLeast( + final SubscriptionTreePullConsumer consumer, + final int expectedUniqueRows, + 
final int maxPollRounds, + final Duration pollTimeout) + throws Exception { + final ConsumedRecords consumed = new ConsumedRecords(); + int emptyRounds = 0; + + for (int round = 0; round < maxPollRounds; round++) { + final List messages = consumer.poll(pollTimeout); + if (messages.isEmpty()) { + emptyRounds++; + if (consumed.getUniqueRowCount() >= expectedUniqueRows + && emptyRounds >= QUIET_ROUNDS_AFTER_DATA) { + break; + } + if (consumed.getUniqueRowCount() == 0 + && expectedUniqueRows == 0 + && emptyRounds >= QUIET_ROUNDS_WITHOUT_DATA) { + break; + } + continue; + } + + emptyRounds = 0; + consumed.merge(consumeMessages(messages)); + consumer.commitSync(messages); + } + + return consumed; + } + + static ConsumedRecords drainAndCommitUntilQuiet( + final SubscriptionTreePullConsumer consumer, final int maxPollRounds) throws Exception { + return drainAndCommitUntilQuiet(consumer, maxPollRounds, Duration.ofMillis(POLL_TIMEOUT_MS)); + } + + static ConsumedRecords drainAndCommitUntilQuiet( + final SubscriptionTreePullConsumer consumer, + final int maxPollRounds, + final Duration pollTimeout) + throws Exception { + final ConsumedRecords consumed = new ConsumedRecords(); + int emptyRounds = 0; + boolean sawData = false; + + for (int round = 0; round < maxPollRounds; round++) { + final List messages = consumer.poll(pollTimeout); + if (messages.isEmpty()) { + emptyRounds++; + if ((sawData && emptyRounds >= QUIET_ROUNDS_AFTER_DATA) + || (!sawData && emptyRounds >= QUIET_ROUNDS_WITHOUT_DATA)) { + break; + } + continue; + } + + sawData = true; + emptyRounds = 0; + consumed.merge(consumeMessages(messages)); + consumer.commitSync(messages); + } + + return consumed; + } + + static PolledMessageBatch pollFirstNonEmptyBatchWithoutCommit( + final SubscriptionTreePullConsumer consumer, final int maxPollRounds) throws Exception { + return pollFirstNonEmptyBatchWithoutCommit( + consumer, maxPollRounds, Duration.ofMillis(POLL_TIMEOUT_MS)); + } + + static PolledMessageBatch 
pollFirstNonEmptyBatchWithoutCommit( + final SubscriptionTreePullConsumer consumer, + final int maxPollRounds, + final Duration pollTimeout) + throws Exception { + for (int round = 0; round < maxPollRounds; round++) { + final List messages = consumer.poll(pollTimeout); + if (messages.isEmpty()) { + continue; + } + return new PolledMessageBatch(messages, consumeMessages(messages)); + } + return new PolledMessageBatch(Collections.emptyList(), new ConsumedRecords()); + } + + static CommittedSnapshot pollUntilCommittedRows( + final SubscriptionTreePullConsumer consumer, + final String topicName, + final int minimumRows, + final int maxPollRounds) + throws Exception { + return pollUntilCommittedRows( + consumer, topicName, minimumRows, maxPollRounds, Duration.ofMillis(POLL_TIMEOUT_MS)); + } + + static CommittedSnapshot pollUntilCommittedRows( + final SubscriptionTreePullConsumer consumer, + final String topicName, + final int minimumRows, + final int maxPollRounds, + final Duration pollTimeout) + throws Exception { + final ConsumedRecords committed = new ConsumedRecords(); + int emptyRounds = 0; + + for (int round = 0; round < maxPollRounds; round++) { + final List messages = consumer.poll(pollTimeout); + if (messages.isEmpty()) { + emptyRounds++; + if (committed.getUniqueRowCount() > 0 && emptyRounds >= QUIET_ROUNDS_WITHOUT_DATA) { + break; + } + continue; + } + + emptyRounds = 0; + for (final SubscriptionMessage message : messages) { + final ConsumedRecords batch = consumeMessages(Collections.singletonList(message)); + consumer.commitSync(message); + committed.merge(batch); + if (committed.getUniqueRowCount() >= minimumRows) { + return new CommittedSnapshot( + consumer.committedPositions(topicName), committed.copyRowKeys(), batch.getRowCount()); + } + } + } + + Assert.fail( + "Unable to capture committed checkpoint after " + + minimumRows + + " rows, got " + + committed.getUniqueRowCount()); + return null; + } + + static GroupDrainResult 
drainConsumersWithoutDuplicates( + final List consumers, + final int expectedUniqueRows, + final int maxPollRounds) + throws Exception { + final List perConsumer = new ArrayList<>(consumers.size()); + for (int i = 0; i < consumers.size(); i++) { + perConsumer.add(new ConsumedRecords()); + } + final ConsumedRecords union = new ConsumedRecords(); + int emptyRounds = 0; + + for (int round = 0; round < maxPollRounds; round++) { + boolean sawData = false; + for (int consumerIndex = 0; consumerIndex < consumers.size(); consumerIndex++) { + final SubscriptionTreePullConsumer consumer = consumers.get(consumerIndex); + final List messages = + consumer.poll(Duration.ofMillis(POLL_TIMEOUT_MS)); + if (messages.isEmpty()) { + continue; + } + + sawData = true; + final ConsumedRecords batch = consumeMessages(messages); + perConsumer.get(consumerIndex).merge(batch); + union.merge(batch); + consumer.commitSync(messages); + } + + if (sawData) { + emptyRounds = 0; + continue; + } + + emptyRounds++; + if (union.getUniqueRowCount() >= expectedUniqueRows + && emptyRounds >= QUIET_ROUNDS_AFTER_DATA) { + break; + } + if (union.getUniqueRowCount() == 0 + && expectedUniqueRows == 0 + && emptyRounds >= QUIET_ROUNDS_WITHOUT_DATA) { + break; + } + } + + return new GroupDrainResult(union, perConsumer); + } + + static void assertExactRowKeys( + final Set expectedRowKeys, final ConsumedRecords consumed) { + Assert.assertTrue( + "Unexpected duplicate row keys: " + consumed.getDuplicateRowKeys(), + consumed.getDuplicateRowKeys().isEmpty()); + Assert.assertEquals(expectedRowKeys, consumed.getRowKeys()); + Assert.assertEquals(expectedRowKeys.size(), consumed.getRowCount()); + } + + static void assertContainsExpectedRowKeys( + final Set expectedRowKeys, final ConsumedRecords consumed, final int maxExtraRows) { + Assert.assertTrue( + "Unexpected duplicate row keys: " + consumed.getDuplicateRowKeys(), + consumed.getDuplicateRowKeys().isEmpty()); + Assert.assertTrue( + "Replay should contain all expected 
rows. expected=" + + expectedRowKeys + + ", actual=" + + consumed.getRowKeys(), + consumed.getRowKeys().containsAll(expectedRowKeys)); + Assert.assertTrue( + "Replay should contain at most " + maxExtraRows + " extra rows. actual=" + consumed, + consumed.getUniqueRowCount() <= expectedRowKeys.size() + maxExtraRows); + } + + static void assertNoMoreMessages( + final SubscriptionTreePullConsumer consumer, final int rounds, final Duration pollTimeout) + throws Exception { + for (int i = 0; i < rounds; i++) { + Assert.assertTrue( + "Unexpected extra subscription messages after quiescence", + consumer.poll(pollTimeout).isEmpty()); + } + } + + static Set subtract(final Set minuend, final Set subtrahend) { + final Set difference = new LinkedHashSet<>(minuend); + difference.removeAll(subtrahend); + return difference; + } + + static Set measurementPaths(final String devicePath, final String... measurements) { + final Set result = new LinkedHashSet<>(); + Arrays.stream(measurements).forEach(measurement -> result.add(devicePath + "." 
+ measurement)); + return result; + } + + static void cleanup( + final SubscriptionTreePullConsumer consumer, final String topicName, final String database) { + if (consumer != null) { + try { + consumer.unsubscribe(topicName); + } catch (final Exception ignored) { + // ignored on cleanup + } + try { + consumer.close(); + } catch (final Exception ignored) { + // ignored on cleanup + } + } + + final String host = EnvFactory.getEnv().getIP(); + final int port = Integer.parseInt(EnvFactory.getEnv().getPort()); + try (final SubscriptionTreeSession session = new SubscriptionTreeSession(host, port)) { + session.open(); + session.dropTopicIfExists(topicName); + } catch (final Exception ignored) { + // ignored on cleanup + } + + try (final ISession session = EnvFactory.getEnv().getSessionConnection()) { + session.executeNonQueryStatement("drop database " + database); + } catch (final Exception ignored) { + // ignored on cleanup + } + } + + static void pause(final long millis) { + try { + Thread.sleep(millis); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while waiting for subscription state", e); + } + } + + static String rowKey(final String devicePath, final long timestamp) { + return devicePath + "#" + timestamp; + } + + private static ConsumedRecords consumeMessages(final List messages) + throws Exception { + final ConsumedRecords consumed = new ConsumedRecords(); + for (final SubscriptionMessage message : messages) { + for (final ResultSet resultSet : message.getResultSets()) { + final SubscriptionRecordHandler.SubscriptionResultSet subscriptionResultSet = + (SubscriptionRecordHandler.SubscriptionResultSet) resultSet; + final List columnNames = subscriptionResultSet.getColumnNames(); + final String devicePath = extractDevicePath(columnNames); + if (columnNames.size() > 1) { + consumed.getSeenColumns().addAll(columnNames.subList(1, columnNames.size())); + } + while 
(subscriptionResultSet.hasNext()) { + final RowRecord record = subscriptionResultSet.nextRecord(); + consumed.addRow(devicePath, record.getTimestamp()); + } + } + } + return consumed; + } + + private static String extractDevicePath(final List columnNames) { + if (columnNames.size() <= 1) { + return ""; + } + final String firstMeasurement = columnNames.get(1); + final int lastDot = firstMeasurement.lastIndexOf('.'); + return lastDot > 0 ? firstMeasurement.substring(0, lastDot) : firstMeasurement; + } + + static final class TestIdentifiers { + + private final String database; + private final String topic; + private final String consumerGroupId; + private final String consumerId; + + private TestIdentifiers( + final String database, + final String topic, + final String consumerGroupId, + final String consumerId) { + this.database = database; + this.topic = topic; + this.consumerGroupId = consumerGroupId; + this.consumerId = consumerId; + } + + String getDatabase() { + return database; + } + + String getTopic() { + return topic; + } + + String getConsumerGroupId() { + return consumerGroupId; + } + + String getConsumerId() { + return consumerId; + } + + String consumer(final String suffix) { + return consumerId + "_" + suffix; + } + + String consumerGroup(final String suffix) { + return consumerGroupId + "_" + suffix; + } + } + + static final class ConsumedRecords { + + private final Set rowKeys = new LinkedHashSet<>(); + private final Set duplicateRowKeys = new LinkedHashSet<>(); + private final Set timestamps = new LinkedHashSet<>(); + private final Set seenColumns = new LinkedHashSet<>(); + private final Map rowsPerDevice = new LinkedHashMap<>(); + private int rowCount; + + void addRow(final String devicePath, final long timestamp) { + rowCount++; + timestamps.add(timestamp); + final String rowKey = rowKey(devicePath, timestamp); + if (!rowKeys.add(rowKey)) { + duplicateRowKeys.add(rowKey); + } + rowsPerDevice.merge(devicePath, 1, Integer::sum); + } + + void 
merge(final ConsumedRecords other) { + rowCount += other.rowCount; + timestamps.addAll(other.timestamps); + seenColumns.addAll(other.seenColumns); + other.rowsPerDevice.forEach( + (device, count) -> rowsPerDevice.merge(device, count, Integer::sum)); + for (final String rowKey : other.rowKeys) { + if (!rowKeys.add(rowKey)) { + duplicateRowKeys.add(rowKey); + } + } + duplicateRowKeys.addAll(other.duplicateRowKeys); + } + + int getRowCount() { + return rowCount; + } + + int getUniqueRowCount() { + return rowKeys.size(); + } + + Set getRowKeys() { + return rowKeys; + } + + Set copyRowKeys() { + return new LinkedHashSet<>(rowKeys); + } + + Set getDuplicateRowKeys() { + return duplicateRowKeys; + } + + Set getTimestamps() { + return timestamps; + } + + Set getSeenColumns() { + return seenColumns; + } + + Map getRowsPerDevice() { + return rowsPerDevice; + } + + @Override + public String toString() { + return "ConsumedRecords{" + + "rowCount=" + + rowCount + + ", uniqueRowCount=" + + rowKeys.size() + + ", duplicateRowKeys=" + + duplicateRowKeys + + ", rowKeys=" + + rowKeys + + '}'; + } + } + + static final class CommittedSnapshot { + + private final TopicProgress progress; + private final Set committedRowKeys; + private final int rowsInLastCommittedMessage; + + private CommittedSnapshot( + final TopicProgress progress, + final Set committedRowKeys, + final int rowsInLastCommittedMessage) { + this.progress = progress; + this.committedRowKeys = Collections.unmodifiableSet(new LinkedHashSet<>(committedRowKeys)); + this.rowsInLastCommittedMessage = rowsInLastCommittedMessage; + } + + TopicProgress getProgress() { + return progress; + } + + Set getCommittedRowKeys() { + return committedRowKeys; + } + + int getCommittedRowCount() { + return committedRowKeys.size(); + } + + int getRowsInLastCommittedMessage() { + return rowsInLastCommittedMessage; + } + } + + static final class PolledMessageBatch { + + private final List messages; + private final ConsumedRecords consumedRecords; 
+ + private PolledMessageBatch( + final List messages, final ConsumedRecords consumedRecords) { + this.messages = new ArrayList<>(messages); + this.consumedRecords = consumedRecords; + } + + List getMessages() { + return messages; + } + + ConsumedRecords getConsumedRecords() { + return consumedRecords; + } + } + + static final class GroupDrainResult { + + private final ConsumedRecords union; + private final List perConsumer; + + private GroupDrainResult(final ConsumedRecords union, final List perConsumer) { + this.union = union; + this.perConsumer = perConsumer; + } + + ConsumedRecords getUnion() { + return union; + } + + List getPerConsumer() { + return perConsumer; + } + } +} diff --git a/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/IoTDBConsensusSubscriptionBasicIT.java b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/IoTDBConsensusSubscriptionBasicIT.java new file mode 100644 index 0000000000000..5b2e70ce51580 --- /dev/null +++ b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/IoTDBConsensusSubscriptionBasicIT.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb.subscription.it.consensus.local;

import org.apache.iotdb.it.framework.IoTDBTestRunner;
import org.apache.iotdb.itbase.category.LocalStandaloneIT;
import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer;

import org.junit.Assert;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;

import java.time.Duration;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Basic delivery semantics of consensus-mode subscriptions: realtime-only delivery after
 * subscribing, subscribing before the data region exists, and delivery across flush boundaries.
 */
@RunWith(IoTDBTestRunner.class)
@Category({LocalStandaloneIT.class})
public class IoTDBConsensusSubscriptionBasicIT extends AbstractSubscriptionConsensusLocalIT {

  /**
   * Rows written before the subscription was established (the bootstrap row) must not be
   * delivered; only rows inserted after {@code subscribe} are expected.
   */
  @Test
  public void testRealtimeOnlyAfterSubscribe() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("basic_realtime_only_after_subscribe");
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Insert one row before subscribing; it must never appear in the consumed set.
      final String bootstrapRowKey =
          ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      // Rows inserted after subscribing are the exact expected payload.
      final Set<String> expectedRowKeys =
          ConsensusSubscriptionITSupport.insertRows(
              ids.getDatabase(), Arrays.asList("d0", "d1"), 100L, 8, true);

      final ConsensusSubscriptionITSupport.ConsumedRecords consumed =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedRowKeys.size(), 40);

      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedRowKeys, consumed);
      Assert.assertFalse(consumed.getRowKeys().contains(bootstrapRowKey));
      // After the expected rows arrive, the topic should be quiet.
      ConsensusSubscriptionITSupport.assertNoMoreMessages(consumer, 3, Duration.ofMillis(500));
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer, ids.getTopic(), ids.getDatabase());
    }
  }

  /**
   * Subscribing to a topic before the matched database (and its data region) exists must still
   * deliver all rows inserted once the database is created.
   */
  @Test
  public void testSubscribeBeforeRegionCreation() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("basic_subscribe_before_region");
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Topic and subscription are created while the database does not yet exist.
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      ConsensusSubscriptionITSupport.createDatabase(ids.getDatabase());
      final Set<String> expectedRowKeys =
          ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 1L, 12, true);

      final ConsensusSubscriptionITSupport.ConsumedRecords consumed =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedRowKeys.size(), 50);

      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedRowKeys, consumed);
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer, ids.getTopic(), ids.getDatabase());
    }
  }

  /**
   * Rows inserted in several batches (each batch flushed — see {@code insertRows(..., true)})
   * must all be delivered, across both devices.
   */
  @Test
  public void testRealtimeRowsSurviveFlush() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("basic_rows_survive_flush");
    SubscriptionTreePullConsumer consumer = null;

    try {
      ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      // Three flushed batches spanning two devices.
      final Set<String> expectedRowKeys = new LinkedHashSet<>();
      expectedRowKeys.addAll(
          ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 100L, 6, true));
      expectedRowKeys.addAll(
          ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 200L, 6, true));
      expectedRowKeys.addAll(
          ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d1", 300L, 4, true));

      final ConsensusSubscriptionITSupport.ConsumedRecords consumed =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedRowKeys.size(), 50);

      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedRowKeys, consumed);
      Assert.assertTrue(
          "Expected rows from both devices after flush boundaries, actual=" + consumed,
          consumed
              .getRowsPerDevice()
              .keySet()
              .containsAll(Arrays.asList(ids.getDatabase() + ".d0", ids.getDatabase() + ".d1")));
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer, ids.getTopic(), ids.getDatabase());
    }
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb.subscription.it.consensus.local;

import org.apache.iotdb.it.framework.IoTDBTestRunner;
import org.apache.iotdb.itbase.category.LocalStandaloneIT;
import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer;
import org.apache.iotdb.session.subscription.payload.SubscriptionMessage;

import org.junit.Assert;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;

import java.util.Arrays;
import java.util.Set;

/**
 * Consumer-group semantics of consensus-mode subscriptions: independent delivery across groups,
 * no duplication within a group, and commit behavior after unsubscribe.
 */
@RunWith(IoTDBTestRunner.class)
@Category({LocalStandaloneIT.class})
public class IoTDBConsensusSubscriptionConsumerGroupIT
    extends AbstractSubscriptionConsensusLocalIT {

  /** Two consumers in DIFFERENT groups must each independently receive the full data set. */
  @Test
  public void testDifferentConsumerGroupsReceiveIndependentFullData() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("consumer_group_different_groups");
    SubscriptionTreePullConsumer consumer1 = null;
    SubscriptionTreePullConsumer consumer2 = null;

    try {
      ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      // Distinct consumer groups g1 and g2 each get their own full copy of the stream.
      consumer1 =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.consumer("g1"), ids.consumerGroup("g1"));
      consumer2 =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.consumer("g2"), ids.consumerGroup("g2"));
      consumer1.subscribe(ids.getTopic());
      consumer2.subscribe(ids.getTopic());

      final Set<String> expectedRowKeys =
          ConsensusSubscriptionITSupport.insertRows(
              ids.getDatabase(), Arrays.asList("d0", "d1"), 100L, 10, true);

      final ConsensusSubscriptionITSupport.ConsumedRecords consumed1 =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer1, expectedRowKeys.size(), 50);
      final ConsensusSubscriptionITSupport.ConsumedRecords consumed2 =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer2, expectedRowKeys.size(), 50);

      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedRowKeys, consumed1);
      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedRowKeys, consumed2);
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer1, ids.getTopic(), ids.getDatabase());
      ConsensusSubscriptionITSupport.cleanup(consumer2, ids.getTopic(), ids.getDatabase());
    }
  }

  /**
   * Two consumers in the SAME group must together receive every row exactly once — the union of
   * what they poll covers the data set with no duplicates.
   */
  @Test
  public void testTwoConsumersInSameGroupDoNotDuplicateRows() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("consumer_group_same_group_no_duplicate");
    SubscriptionTreePullConsumer consumer1 = null;
    SubscriptionTreePullConsumer consumer2 = null;

    try {
      ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer1 =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.consumer("a"), ids.getConsumerGroupId());
      consumer2 =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.consumer("b"), ids.getConsumerGroupId());
      consumer1.subscribe(ids.getTopic());
      consumer2.subscribe(ids.getTopic());
      // Give the group a moment to settle before writing data.
      ConsensusSubscriptionITSupport.pause(1_000L);

      final Set<String> expectedRowKeys =
          ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 1_000L, 24, true);

      final ConsensusSubscriptionITSupport.GroupDrainResult result =
          ConsensusSubscriptionITSupport.drainConsumersWithoutDuplicates(
              Arrays.asList(consumer1, consumer2), expectedRowKeys.size(), 60);

      Assert.assertTrue(
          "Expected no duplicate rows across the same consumer group, union="
              + result.getUnion()
              + ", perConsumer="
              + result.getPerConsumer(),
          result.getUnion().getDuplicateRowKeys().isEmpty());
      Assert.assertEquals(expectedRowKeys, result.getUnion().getRowKeys());
      Assert.assertEquals(expectedRowKeys.size(), result.getUnion().getRowCount());
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer1, ids.getTopic(), ids.getDatabase());
      ConsensusSubscriptionITSupport.cleanup(consumer2, ids.getTopic(), ids.getDatabase());
    }
  }

  /** Committing messages that were polled before an unsubscribe must not throw. */
  @Test
  public void testCommitAfterUnsubscribeDoesNotThrow() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("consumer_group_commit_after_unsubscribe");
    SubscriptionTreePullConsumer consumer = null;

    try {
      ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 100L, 16, true);
      // Poll without committing so the messages are still pending at unsubscribe time.
      final ConsensusSubscriptionITSupport.PolledMessageBatch batch =
          ConsensusSubscriptionITSupport.pollFirstNonEmptyBatchWithoutCommit(consumer, 30);

      Assert.assertTrue(
          "Expected some rows to be polled before unsubscribe, batch=" + batch.getConsumedRecords(),
          batch.getConsumedRecords().getRowCount() > 0);

      consumer.unsubscribe(ids.getTopic());

      // Late commits against an unsubscribed topic must be tolerated.
      for (final SubscriptionMessage message : batch.getMessages()) {
        consumer.commitSync(message);
      }
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer, ids.getTopic(), ids.getDatabase());
    }
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb.subscription.it.consensus.local;

import org.apache.iotdb.isession.ISession;
import org.apache.iotdb.it.env.EnvFactory;
import org.apache.iotdb.it.framework.IoTDBTestRunner;
import org.apache.iotdb.itbase.category.LocalStandaloneIT;
import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer;

import org.junit.Assert;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;

import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Data-type coverage for consensus-mode subscriptions: non-aligned series with all primitive
 * types, and aligned series whose rows span multiple time partitions.
 */
@RunWith(IoTDBTestRunner.class)
@Category({LocalStandaloneIT.class})
public class IoTDBConsensusSubscriptionDataIT extends AbstractSubscriptionConsensusLocalIT {

  // One week plus one millisecond: spaces successive rows into distinct time partitions,
  // assuming the default one-week partition interval — TODO confirm against cluster config.
  private static final long ONE_WEEK_PLUS_ONE_MS = 604_800_001L;

  /** Non-aligned rows covering INT32/INT64/FLOAT/DOUBLE/BOOLEAN/TEXT must all be delivered. */
  @Test
  public void testNonAlignedPrimitiveTypes() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("data_non_aligned_primitive_types");
    SubscriptionTreePullConsumer consumer = null;

    try {
      ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      final String device = ids.getDatabase() + ".d_types";
      final Set<String> expectedRowKeys = new LinkedHashSet<>();
      try (final ISession session = EnvFactory.getEnv().getSessionConnection()) {
        for (int i = 0; i < 12; i++) {
          final long timestamp = 100L + i;
          session.executeNonQueryStatement(
              String.format(
                  "insert into %s(time, s_int32, s_int64, s_float, s_double, s_bool, s_text) "
                      + "values (%d, %d, %d, %.1f, %.2f, %s, 'text_%d')",
                  device,
                  timestamp,
                  i,
                  i * 100L,
                  i + 0.5f,
                  i + 0.25d,
                  i % 2 == 0 ? "true" : "false",
                  i));
          expectedRowKeys.add(ConsensusSubscriptionITSupport.rowKey(device, timestamp));
        }
        session.executeNonQueryStatement("flush");
      }

      final ConsensusSubscriptionITSupport.ConsumedRecords consumed =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedRowKeys.size(), 50);

      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedRowKeys, consumed);
      Assert.assertTrue(
          "Expected all primitive type measurements in the consumed columns, actual="
              + consumed.getSeenColumns(),
          consumed
              .getSeenColumns()
              .containsAll(
                  ConsensusSubscriptionITSupport.measurementPaths(
                      device, "s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text")));
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer, ids.getTopic(), ids.getDatabase());
    }
  }

  /**
   * Aligned rows whose timestamps land in different time partitions must all be delivered with
   * every measurement column present.
   */
  @Test
  public void testAlignedCrossPartitionRows() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("data_aligned_cross_partition");
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Create the aligned schema and a bootstrap row BEFORE the topic/subscription exist,
      // so only the post-subscribe rows are expected.
      final String alignedDevice = ids.getDatabase() + ".d_aligned";
      try (final ISession session = EnvFactory.getEnv().getSessionConnection()) {
        session.executeNonQueryStatement("create database " + ids.getDatabase());
        session.executeNonQueryStatement(
            String.format(
                "create aligned timeseries %s"
                    + "(s_int32 INT32, s_int64 INT64, s_float FLOAT, "
                    + "s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)",
                alignedDevice));
        session.executeNonQueryStatement(
            String.format(
                "insert into %s(time, s_int32, s_int64, s_float, s_double, s_bool, s_text) "
                    + "values (0, 0, 0, 0.0, 0.0, false, 'bootstrap')",
                alignedDevice));
        session.executeNonQueryStatement("flush");
      }

      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      final Set<String> expectedRowKeys = new LinkedHashSet<>();
      try (final ISession session = EnvFactory.getEnv().getSessionConnection()) {
        for (int i = 0; i < 6; i++) {
          // Each row is one-week-plus-one-ms apart to cross partition boundaries.
          final long timestamp = 100L + ONE_WEEK_PLUS_ONE_MS * i;
          session.executeNonQueryStatement(
              String.format(
                  "insert into %s(time, s_int32, s_int64, s_float, s_double, s_bool, s_text) "
                      + "values (%d, %d, %d, %.1f, %.2f, %s, 'aligned_%d')",
                  alignedDevice,
                  timestamp,
                  i + 1,
                  (i + 1) * 100L,
                  i + 1.5f,
                  i + 1.25d,
                  i % 2 == 0 ? "true" : "false",
                  i));
          expectedRowKeys.add(ConsensusSubscriptionITSupport.rowKey(alignedDevice, timestamp));
        }
        session.executeNonQueryStatement("flush");
      }

      final ConsensusSubscriptionITSupport.ConsumedRecords consumed =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedRowKeys.size(), 60);

      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedRowKeys, consumed);
      Assert.assertTrue(
          "Expected aligned measurements in consumed columns, actual=" + consumed.getSeenColumns(),
          consumed
              .getSeenColumns()
              .containsAll(
                  ConsensusSubscriptionITSupport.measurementPaths(
                      alignedDevice,
                      "s_int32",
                      "s_int64",
                      "s_float",
                      "s_double",
                      "s_bool",
                      "s_text")));
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer, ids.getTopic(), ids.getDatabase());
    }
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb.subscription.it.consensus.local;

import org.apache.iotdb.it.framework.IoTDBTestRunner;
import org.apache.iotdb.itbase.category.LocalStandaloneIT;
import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress;
import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer;

import org.junit.Assert;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;

import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Recovery semantics: a replacement consumer in the same group can resume from a previously
 * committed checkpoint via {@code seekAfter} and receive exactly the uncommitted tail.
 */
@RunWith(IoTDBTestRunner.class)
@Category({LocalStandaloneIT.class})
public class IoTDBConsensusSubscriptionRecoveryIT extends AbstractSubscriptionConsensusLocalIT {

  @Test
  public void testConsumerRestartResumesFromCommittedCheckpoint() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("recovery_consumer_restart");
    SubscriptionTreePullConsumer consumer1 = null;
    SubscriptionTreePullConsumer consumer2 = null;

    try {
      ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer1 =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.consumer("first"), ids.getConsumerGroupId());
      consumer1.subscribe(ids.getTopic());

      final Set<String> allRowKeys =
          new LinkedHashSet<>(
              ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 1_000L, 64, true));

      // Consume and commit a prefix of the data; capture the committed progress as a checkpoint.
      final ConsensusSubscriptionITSupport.CommittedSnapshot checkpoint =
          ConsensusSubscriptionITSupport.pollUntilCommittedRows(consumer1, ids.getTopic(), 16, 40);
      final TopicProgress checkpointProgress = checkpoint.getProgress();
      Assert.assertNotNull(checkpointProgress);

      Set<String> remainingRowKeys =
          ConsensusSubscriptionITSupport.subtract(allRowKeys, checkpoint.getCommittedRowKeys());

      // Simulate a consumer crash/restart: close the first consumer.
      consumer1.close();
      consumer1 = null;

      // If the first consumer already committed everything, append more rows so a
      // non-empty tail exists for the restarted consumer to replay.
      if (remainingRowKeys.isEmpty()) {
        allRowKeys.addAll(
            ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 2_000L, 16, true));
        remainingRowKeys =
            ConsensusSubscriptionITSupport.subtract(allRowKeys, checkpoint.getCommittedRowKeys());
      }

      Assert.assertFalse(
          "Expected rows to remain after the committed checkpoint", remainingRowKeys.isEmpty());

      consumer2 =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.consumer("restart"), ids.getConsumerGroupId());
      consumer2.subscribe(ids.getTopic());
      consumer2.seekAfter(ids.getTopic(), checkpointProgress);
      ConsensusSubscriptionITSupport.pause(1_000L);

      final ConsensusSubscriptionITSupport.ConsumedRecords replay =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer2, remainingRowKeys.size(), 60);

      // The restarted consumer must see exactly the uncommitted tail — no loss, no replays.
      ConsensusSubscriptionITSupport.assertExactRowKeys(remainingRowKeys, replay);
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer1, ids.getTopic(), ids.getDatabase());
      ConsensusSubscriptionITSupport.cleanup(consumer2, ids.getTopic(), ids.getDatabase());
    }
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb.subscription.it.consensus.local;

import org.apache.iotdb.it.framework.IoTDBTestRunner;
import org.apache.iotdb.itbase.category.LocalStandaloneIT;
import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress;
import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer;
import org.apache.iotdb.session.subscription.payload.SubscriptionMessage;

import org.junit.Assert;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;

import java.time.Duration;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Seek semantics: {@code seekToBeginning} replays the stream, {@code seekAfter} replays exactly
 * the tail past a checkpoint, and a seek fences commits from batches polled before it.
 */
@RunWith(IoTDBTestRunner.class)
@Category({LocalStandaloneIT.class})
public class IoTDBConsensusSubscriptionSeekIT extends AbstractSubscriptionConsensusLocalIT {

  // Per-poll timeout used by every polling helper in this class.
  private static final Duration SEEK_POLL_TIMEOUT = Duration.ofSeconds(1);
  // Total rows inserted before taking a checkpoint in the seekAfter tests.
  private static final int TAIL_ROW_COUNT = 256;
  // Minimum rows that must be committed before the checkpoint is captured.
  private static final int CHECKPOINT_MINIMUM_ROWS = 96;

  /** After fully draining the topic, {@code seekToBeginning} must replay the consumed rows. */
  @Test
  public void testSeekToBeginningReplaysRows() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("seek_to_beginning");
    SubscriptionTreePullConsumer consumer = null;

    try {
      final String bootstrapRowKey =
          ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      final Set<String> expectedRowKeys =
          ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 1_000L, 32, true);

      // First pass: drain and commit everything.
      final ConsensusSubscriptionITSupport.ConsumedRecords initialDrain =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedRowKeys.size(), 40, SEEK_POLL_TIMEOUT);
      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedRowKeys, initialDrain);

      consumer.seekToBeginning(ids.getTopic());
      ConsensusSubscriptionITSupport.pause(1_000L);

      // Second pass: everything must be replayed from the beginning.
      final ConsensusSubscriptionITSupport.ConsumedRecords replay =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedRowKeys.size(), 40, SEEK_POLL_TIMEOUT);

      // Allow at most one extra row in the replay: the pre-subscription bootstrap row.
      ConsensusSubscriptionITSupport.assertContainsExpectedRowKeys(expectedRowKeys, replay, 1);
      if (replay.getUniqueRowCount() == expectedRowKeys.size() + 1) {
        Assert.assertTrue(
            "Only the bootstrap row is allowed as an extra replayed row, replay=" + replay,
            replay.getRowKeys().contains(bootstrapRowKey));
      }
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer, ids.getTopic(), ids.getDatabase());
    }
  }

  /** {@code seekAfter} a committed checkpoint must replay exactly the uncommitted tail. */
  @Test
  public void testSeekAfterCheckpointReplaysExactTail() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("seek_after_checkpoint");
    SubscriptionTreePullConsumer consumer = null;

    try {
      ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      final Set<String> allRowKeys =
          new LinkedHashSet<>(
              ConsensusSubscriptionITSupport.insertRows(
                  ids.getDatabase(), "d0", 1_000L, TAIL_ROW_COUNT, true));

      // Commit a prefix and capture the progress as the checkpoint to seek back to.
      final ConsensusSubscriptionITSupport.CommittedSnapshot checkpoint =
          ConsensusSubscriptionITSupport.pollUntilCommittedRows(
              consumer, ids.getTopic(), CHECKPOINT_MINIMUM_ROWS, 40, SEEK_POLL_TIMEOUT);
      final TopicProgress checkpointProgress = checkpoint.getProgress();
      Assert.assertNotNull(checkpointProgress);

      Set<String> expectedTail =
          ConsensusSubscriptionITSupport.subtract(allRowKeys, checkpoint.getCommittedRowKeys());
      // If everything was already committed, append more rows so a tail exists.
      if (expectedTail.isEmpty()) {
        allRowKeys.addAll(
            ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 2_000L, 64, true));
      }

      // Fully drain and commit the rest, then recompute the tail relative to the checkpoint.
      ConsensusSubscriptionITSupport.drainAndCommitUntilQuiet(consumer, 40, SEEK_POLL_TIMEOUT);
      expectedTail =
          ConsensusSubscriptionITSupport.subtract(allRowKeys, checkpoint.getCommittedRowKeys());
      Assert.assertFalse("Expected a non-empty replay tail", expectedTail.isEmpty());

      consumer.seekAfter(ids.getTopic(), checkpointProgress);
      ConsensusSubscriptionITSupport.pause(1_000L);

      final ConsensusSubscriptionITSupport.ConsumedRecords replay =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedTail.size(), 50, SEEK_POLL_TIMEOUT);

      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedTail, replay);
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer, ids.getTopic(), ids.getDatabase());
    }
  }

  /**
   * Commits for batches polled BEFORE a {@code seekAfter} must be fenced: committing them after
   * the seek must not advance progress past rows the seek is meant to replay.
   */
  @Test
  public void testSeekAfterFencesStaleCommitContexts() throws Exception {
    final ConsensusSubscriptionITSupport.TestIdentifiers ids =
        ConsensusSubscriptionITSupport.newIdentifiers("seek_after_fences_stale_contexts");
    SubscriptionTreePullConsumer consumer = null;

    try {
      ConsensusSubscriptionITSupport.bootstrapDatabase(ids.getDatabase());
      ConsensusSubscriptionITSupport.createConsensusTopic(
          ids.getTopic(), ids.getDatabase() + ".**");

      consumer =
          ConsensusSubscriptionITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      final Set<String> allRowKeys =
          new LinkedHashSet<>(
              ConsensusSubscriptionITSupport.insertRows(
                  ids.getDatabase(), "d0", 1_000L, TAIL_ROW_COUNT, true));

      final ConsensusSubscriptionITSupport.CommittedSnapshot checkpoint =
          ConsensusSubscriptionITSupport.pollUntilCommittedRows(
              consumer, ids.getTopic(), CHECKPOINT_MINIMUM_ROWS, 40, SEEK_POLL_TIMEOUT);
      final TopicProgress checkpointProgress = checkpoint.getProgress();
      Assert.assertNotNull(checkpointProgress);

      Set<String> expectedTail =
          ConsensusSubscriptionITSupport.subtract(allRowKeys, checkpoint.getCommittedRowKeys());

      // Poll a batch WITHOUT committing — its commit contexts become stale after the seek.
      ConsensusSubscriptionITSupport.PolledMessageBatch staleBatch =
          ConsensusSubscriptionITSupport.pollFirstNonEmptyBatchWithoutCommit(
              consumer, 5, SEEK_POLL_TIMEOUT);
      if (staleBatch.getConsumedRecords().getRowCount() == 0) {
        // Nothing left to poll: append more rows and retry so a stale batch exists.
        allRowKeys.addAll(
            ConsensusSubscriptionITSupport.insertRows(ids.getDatabase(), "d0", 2_000L, 64, true));
        expectedTail =
            ConsensusSubscriptionITSupport.subtract(allRowKeys, checkpoint.getCommittedRowKeys());
        staleBatch =
            ConsensusSubscriptionITSupport.pollFirstNonEmptyBatchWithoutCommit(
                consumer, 10, SEEK_POLL_TIMEOUT);
      }

      Assert.assertTrue(
          "Expected a stale batch after checkpoint, batch=" + staleBatch.getConsumedRecords(),
          staleBatch.getConsumedRecords().getRowCount() > 0);

      consumer.seekAfter(ids.getTopic(), checkpointProgress);
      ConsensusSubscriptionITSupport.pause(1_000L);

      // Committing the pre-seek batch must be a no-op with respect to the replay below.
      for (final SubscriptionMessage staleMessage : staleBatch.getMessages()) {
        consumer.commitSync(staleMessage);
      }

      final ConsensusSubscriptionITSupport.ConsumedRecords replay =
          ConsensusSubscriptionITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedTail.size(), 60, SEEK_POLL_TIMEOUT);

      ConsensusSubscriptionITSupport.assertExactRowKeys(expectedTail, replay);
    } finally {
      ConsensusSubscriptionITSupport.cleanup(consumer, ids.getTopic(), ids.getDatabase());
    }
  }
}
+ */ + +package org.apache.iotdb.subscription.it.consensus.local.tablemodel; + +import org.apache.iotdb.isession.ITableSession; +import org.apache.iotdb.it.env.EnvFactory; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.session.subscription.ISubscriptionTableSession; +import org.apache.iotdb.session.subscription.SubscriptionTableSessionBuilder; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumer; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumerBuilder; +import org.apache.iotdb.session.subscription.payload.PollResult; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionRecordHandler; + +import org.apache.tsfile.read.common.RowRecord; +import org.apache.tsfile.read.query.dataset.ResultSet; +import org.junit.Assert; + +import java.time.Duration; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +final class ConsensusSubscriptionTableITSupport { + + static final String DEFAULT_TABLE_SCHEMA = "tag1 STRING TAG, s1 INT64 FIELD"; + + private static final AtomicInteger IDENTIFIER = new AtomicInteger(0); + private static final Duration DEFAULT_POLL_TIMEOUT = Duration.ofSeconds(1); + private static final int QUIET_ROUNDS_AFTER_DATA = 3; + private static final int QUIET_ROUNDS_WITHOUT_DATA = 8; + + private ConsensusSubscriptionTableITSupport() { + throw new IllegalStateException("Utility class"); + } + + static TestIdentifiers newIdentifiers(final String prefix) { + final int id = IDENTIFIER.incrementAndGet(); + final String normalized = + prefix.toLowerCase(Locale.ROOT).replaceAll("[^a-z0-9]+", "_").replaceAll("^_+|_+$", ""); + return new 
TestIdentifiers( + "consensus_tbl_it_" + normalized + "_" + id, + "topic_consensus_tbl_it_" + normalized + "_" + id, + "cg_consensus_tbl_it_" + normalized + "_" + id, + "c_consensus_tbl_it_" + normalized + "_" + id); + } + + static void createDatabase(final String database) throws Exception { + try (final ITableSession session = EnvFactory.getEnv().getTableSessionConnection()) { + session.executeNonQueryStatement("create database " + database); + } + } + + static void createDatabaseAndTable( + final String database, final String tableName, final String schema) throws Exception { + try (final ITableSession session = EnvFactory.getEnv().getTableSessionConnection()) { + session.executeNonQueryStatement("create database " + database); + session.executeNonQueryStatement("use " + database); + session.executeNonQueryStatement(String.format("create table %s (%s)", tableName, schema)); + } + } + + static void createTable(final String database, final String tableName, final String schema) + throws Exception { + try (final ITableSession session = EnvFactory.getEnv().getTableSessionConnection()) { + session.executeNonQueryStatement("use " + database); + session.executeNonQueryStatement(String.format("create table %s (%s)", tableName, schema)); + } + } + + static String bootstrapDatabaseAndTable( + final String database, final String tableName, final String schema) throws Exception { + createDatabaseAndTable(database, tableName, schema); + try (final ITableSession session = EnvFactory.getEnv().getTableSessionConnection()) { + session.executeNonQueryStatement("use " + database); + session.executeNonQueryStatement( + String.format( + "insert into %s(tag1, s1, time) values ('bootstrap', %d, %d)", tableName, 0L, 0L)); + session.executeNonQueryStatement("flush"); + } + return rowKey(database, tableName, 0L); + } + + static void createConsensusTopic( + final String topicName, final String databasePattern, final String tablePattern) + throws Exception { + final String host = 
EnvFactory.getEnv().getIP(); + final int port = Integer.parseInt(EnvFactory.getEnv().getPort()); + + try (final ISubscriptionTableSession session = + new SubscriptionTableSessionBuilder().host(host).port(port).build()) { + session.open(); + session.dropTopicIfExists(topicName); + + final Properties config = new Properties(); + config.put(TopicConstant.MODE_KEY, TopicConstant.MODE_CONSENSUS_VALUE); + config.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE); + config.put(TopicConstant.DATABASE_KEY, databasePattern); + config.put(TopicConstant.TABLE_KEY, tablePattern); + session.createTopic(topicName, config); + } + } + + static SubscriptionTablePullConsumer createConsumer( + final String consumerId, final String consumerGroupId) throws Exception { + final SubscriptionTablePullConsumer consumer = + (SubscriptionTablePullConsumer) + new SubscriptionTablePullConsumerBuilder() + .host(EnvFactory.getEnv().getIP()) + .port(Integer.parseInt(EnvFactory.getEnv().getPort())) + .consumerId(consumerId) + .consumerGroupId(consumerGroupId) + .autoCommit(false) + .build(); + consumer.open(); + return consumer; + } + + static Set insertRows( + final String database, + final String tableName, + final long startTimestampInclusive, + final int rowCount, + final boolean flush) + throws Exception { + return insertRows(database, tableName, startTimestampInclusive, rowCount, 10L, flush); + } + + static Set insertRows( + final String database, + final String tableName, + final long startTimestampInclusive, + final int rowCount, + final long valueMultiplier, + final boolean flush) + throws Exception { + final Set rowKeys = new LinkedHashSet<>(); + + try (final ITableSession session = EnvFactory.getEnv().getTableSessionConnection()) { + session.executeNonQueryStatement("use " + database); + for (int row = 0; row < rowCount; row++) { + final long timestamp = startTimestampInclusive + row; + session.executeNonQueryStatement( + String.format( + "insert into 
%s(tag1, s1, time) values ('%s', %d, %d)", + tableName, tableName + "_tag", timestamp * valueMultiplier, timestamp)); + rowKeys.add(rowKey(database, tableName, timestamp)); + } + if (flush) { + session.executeNonQueryStatement("flush"); + } + } + + return rowKeys; + } + + static ConsumedRecords pollAndCommitUntilAtLeast( + final SubscriptionTablePullConsumer consumer, + final int expectedUniqueRows, + final int maxPollRounds) + throws Exception { + return pollAndCommitUntilAtLeast( + consumer, expectedUniqueRows, maxPollRounds, DEFAULT_POLL_TIMEOUT); + } + + static ConsumedRecords pollAndCommitUntilAtLeast( + final SubscriptionTablePullConsumer consumer, + final int expectedUniqueRows, + final int maxPollRounds, + final Duration pollTimeout) + throws Exception { + final ConsumedRecords consumed = new ConsumedRecords(); + int emptyRounds = 0; + + for (int round = 0; round < maxPollRounds; round++) { + final List messages = consumer.poll(pollTimeout); + if (messages.isEmpty()) { + emptyRounds++; + if (consumed.getUniqueRowCount() >= expectedUniqueRows + && emptyRounds >= QUIET_ROUNDS_AFTER_DATA) { + break; + } + if (consumed.getUniqueRowCount() == 0 + && expectedUniqueRows == 0 + && emptyRounds >= QUIET_ROUNDS_WITHOUT_DATA) { + break; + } + continue; + } + + emptyRounds = 0; + consumed.merge(consumeMessages(messages)); + consumer.commitSync(messages); + } + + return consumed; + } + + static ConsumedRecords pollWithInfoAndCommitUntilAtLeast( + final SubscriptionTablePullConsumer consumer, + final Set topicNames, + final int expectedUniqueRows, + final int maxPollRounds) + throws Exception { + return pollWithInfoAndCommitUntilAtLeast( + consumer, topicNames, expectedUniqueRows, maxPollRounds, DEFAULT_POLL_TIMEOUT); + } + + static ConsumedRecords pollWithInfoAndCommitUntilAtLeast( + final SubscriptionTablePullConsumer consumer, + final Set topicNames, + final int expectedUniqueRows, + final int maxPollRounds, + final Duration pollTimeout) + throws Exception { + final 
ConsumedRecords consumed = new ConsumedRecords(); + int emptyRounds = 0; + + for (int round = 0; round < maxPollRounds; round++) { + final PollResult pollResult = consumer.pollWithInfo(topicNames, pollTimeout.toMillis()); + final List messages = pollResult.getMessages(); + if (messages.isEmpty()) { + emptyRounds++; + if (consumed.getUniqueRowCount() >= expectedUniqueRows + && emptyRounds >= QUIET_ROUNDS_AFTER_DATA) { + break; + } + continue; + } + + emptyRounds = 0; + consumed.merge(consumeMessages(messages)); + consumer.commitSync(messages); + } + + return consumed; + } + + static void assertExactRowKeys( + final Set expectedRowKeys, final ConsumedRecords consumed) { + Assert.assertTrue( + "Unexpected duplicate row keys: " + consumed.getDuplicateRowKeys(), + consumed.getDuplicateRowKeys().isEmpty()); + Assert.assertEquals(expectedRowKeys, consumed.getRowKeys()); + Assert.assertEquals(expectedRowKeys.size(), consumed.getRowCount()); + } + + static void assertNoMoreMessages( + final SubscriptionTablePullConsumer consumer, final int rounds, final Duration pollTimeout) + throws Exception { + for (int i = 0; i < rounds; i++) { + Assert.assertTrue( + "Unexpected extra subscription messages after quiescence", + consumer.poll(pollTimeout).isEmpty()); + } + } + + static void cleanup( + final SubscriptionTablePullConsumer consumer, + final String topicName, + final String... databases) { + cleanup(consumer, Collections.singleton(topicName), databases); + } + + static void cleanup( + final SubscriptionTablePullConsumer consumer, + final Set topicNames, + final String... 
databases) { + if (consumer != null) { + try { + consumer.unsubscribe(topicNames); + } catch (final Exception ignored) { + // ignored on cleanup + } + try { + consumer.close(); + } catch (final Exception ignored) { + // ignored on cleanup + } + } + + final String host = EnvFactory.getEnv().getIP(); + final int port = Integer.parseInt(EnvFactory.getEnv().getPort()); + try (final ISubscriptionTableSession session = + new SubscriptionTableSessionBuilder().host(host).port(port).build()) { + session.open(); + for (final String topicName : topicNames) { + session.dropTopicIfExists(topicName); + } + } catch (final Exception ignored) { + // ignored on cleanup + } + + for (final String database : databases) { + try (final ITableSession session = EnvFactory.getEnv().getTableSessionConnection()) { + session.executeNonQueryStatement("drop database if exists " + database); + } catch (final Exception ignored) { + // ignored on cleanup + } + } + } + + static void pause(final long millis) { + try { + Thread.sleep(millis); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while waiting for subscription state", e); + } + } + + static String rowKey(final String database, final String tableName, final long timestamp) { + return database + "." 
+ tableName + "#" + timestamp; + } + + private static ConsumedRecords consumeMessages(final List messages) + throws Exception { + final ConsumedRecords consumed = new ConsumedRecords(); + for (final SubscriptionMessage message : messages) { + for (final ResultSet resultSet : message.getResultSets()) { + final SubscriptionRecordHandler.SubscriptionResultSet subscriptionResultSet = + (SubscriptionRecordHandler.SubscriptionResultSet) resultSet; + consumed.getSeenColumns().addAll(subscriptionResultSet.getColumnNames()); + final String databaseName = subscriptionResultSet.getDatabaseName(); + final String tableName = subscriptionResultSet.getTableName(); + while (subscriptionResultSet.hasNext()) { + final RowRecord record = subscriptionResultSet.nextRecord(); + consumed.addRow(databaseName, tableName, record.getTimestamp()); + } + } + } + return consumed; + } + + static final class TestIdentifiers { + + private final String database; + private final String topic; + private final String consumerGroupId; + private final String consumerId; + + private TestIdentifiers( + final String database, + final String topic, + final String consumerGroupId, + final String consumerId) { + this.database = database; + this.topic = topic; + this.consumerGroupId = consumerGroupId; + this.consumerId = consumerId; + } + + String getDatabase() { + return database; + } + + String getTopic() { + return topic; + } + + String getConsumerGroupId() { + return consumerGroupId; + } + + String getConsumerId() { + return consumerId; + } + + String database(final String suffix) { + return database + "_" + suffix; + } + + String topic(final String suffix) { + return topic + "_" + suffix; + } + } + + static final class ConsumedRecords { + + private final Set rowKeys = new LinkedHashSet<>(); + private final Set duplicateRowKeys = new LinkedHashSet<>(); + private final Set seenColumns = new LinkedHashSet<>(); + private final Map rowsPerTable = new LinkedHashMap<>(); + private final Map rowsPerDatabase = new 
LinkedHashMap<>(); + private int rowCount; + + void addRow(final String databaseName, final String tableName, final long timestamp) { + rowCount++; + final String rowKey = rowKey(databaseName, tableName, timestamp); + if (!rowKeys.add(rowKey)) { + duplicateRowKeys.add(rowKey); + } + rowsPerTable.merge(tableName, 1, Integer::sum); + rowsPerDatabase.merge(databaseName, 1, Integer::sum); + } + + void merge(final ConsumedRecords other) { + rowCount += other.rowCount; + rowKeys.addAll(other.rowKeys); + duplicateRowKeys.addAll(other.duplicateRowKeys); + seenColumns.addAll(other.seenColumns); + other.rowsPerTable.forEach((table, count) -> rowsPerTable.merge(table, count, Integer::sum)); + other.rowsPerDatabase.forEach( + (database, count) -> rowsPerDatabase.merge(database, count, Integer::sum)); + } + + Set getRowKeys() { + return rowKeys; + } + + Set getDuplicateRowKeys() { + return duplicateRowKeys; + } + + Set getSeenColumns() { + return seenColumns; + } + + Map getRowsPerTable() { + return rowsPerTable; + } + + Map getRowsPerDatabase() { + return rowsPerDatabase; + } + + int getRowCount() { + return rowCount; + } + + int getUniqueRowCount() { + return rowKeys.size(); + } + + @Override + public String toString() { + return "ConsumedRecords{rowCount=" + + rowCount + + ", uniqueRowCount=" + + getUniqueRowCount() + + ", rowsPerTable=" + + rowsPerTable + + ", rowsPerDatabase=" + + rowsPerDatabase + + ", duplicateRowKeys=" + + duplicateRowKeys + + "}"; + } + } +} diff --git a/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionBasicTableIT.java b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionBasicTableIT.java new file mode 100644 index 0000000000000..1a64e70d4fd6f --- /dev/null +++ b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionBasicTableIT.java @@ -0,0 +1,88 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb.subscription.it.consensus.local.tablemodel;

import org.apache.iotdb.it.framework.IoTDBTestRunner;
import org.apache.iotdb.itbase.category.LocalStandaloneIT;
import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumer;
import org.apache.iotdb.subscription.it.consensus.local.AbstractSubscriptionConsensusLocalIT;

import org.junit.Assert;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;

import java.time.Duration;
import java.util.LinkedHashSet;
import java.util.Set;

/**
 * Basic consensus-subscription behavior on the table model: only rows written AFTER subscribing
 * are delivered (the flushed bootstrap row must not be replayed), across several tables matched
 * by a {@code .*} table pattern.
 */
@RunWith(IoTDBTestRunner.class)
@Category({LocalStandaloneIT.class})
public class IoTDBConsensusSubscriptionBasicTableIT extends AbstractSubscriptionConsensusLocalIT {

  @Test
  public void testRealtimeOnlyAfterSubscribe() throws Exception {
    final ConsensusSubscriptionTableITSupport.TestIdentifiers ids =
        ConsensusSubscriptionTableITSupport.newIdentifiers(
            "table_basic_realtime_only_after_subscribe");
    final String database = ids.getDatabase();
    final String table1 = "t1";
    final String table2 = "t2";
    final String table3 = "t3";
    SubscriptionTablePullConsumer consumer = null;

    try {
      // Pre-subscription data: one flushed bootstrap row that must NOT be delivered.
      final String bootstrapRowKey =
          ConsensusSubscriptionTableITSupport.bootstrapDatabaseAndTable(
              database, table1, ConsensusSubscriptionTableITSupport.DEFAULT_TABLE_SCHEMA);
      ConsensusSubscriptionTableITSupport.createTable(
          database, table2, ConsensusSubscriptionTableITSupport.DEFAULT_TABLE_SCHEMA);
      ConsensusSubscriptionTableITSupport.createTable(
          database, table3, ConsensusSubscriptionTableITSupport.DEFAULT_TABLE_SCHEMA);
      ConsensusSubscriptionTableITSupport.createConsensusTopic(ids.getTopic(), database, ".*");

      consumer =
          ConsensusSubscriptionTableITSupport.createConsumer(
              ids.getConsumerId(), ids.getConsumerGroupId());
      consumer.subscribe(ids.getTopic());

      // Post-subscription data: 8 + 5 + 4 rows across the three tables.
      final Set<String> expectedRowKeys = new LinkedHashSet<>();
      expectedRowKeys.addAll(
          ConsensusSubscriptionTableITSupport.insertRows(database, table1, 100L, 8, true));
      expectedRowKeys.addAll(
          ConsensusSubscriptionTableITSupport.insertRows(database, table2, 200L, 5, true));
      expectedRowKeys.addAll(
          ConsensusSubscriptionTableITSupport.insertRows(database, table3, 300L, 4, true));

      final ConsensusSubscriptionTableITSupport.ConsumedRecords consumed =
          ConsensusSubscriptionTableITSupport.pollAndCommitUntilAtLeast(
              consumer, expectedRowKeys.size(), 40);

      ConsensusSubscriptionTableITSupport.assertExactRowKeys(expectedRowKeys, consumed);
      Assert.assertFalse(consumed.getRowKeys().contains(bootstrapRowKey));
      Assert.assertEquals(8, consumed.getRowsPerTable().getOrDefault(table1, 0).intValue());
      Assert.assertEquals(5, consumed.getRowsPerTable().getOrDefault(table2, 0).intValue());
      Assert.assertEquals(4, consumed.getRowsPerTable().getOrDefault(table3, 0).intValue());
      ConsensusSubscriptionTableITSupport.assertNoMoreMessages(consumer, 3, Duration.ofMillis(500));
    } finally {
      ConsensusSubscriptionTableITSupport.cleanup(consumer, ids.getTopic(), database);
    }
  }
}
diff --git 
a/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionDataTableIT.java b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionDataTableIT.java new file mode 100644 index 0000000000000..1be9a1eb6359a --- /dev/null +++ b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionDataTableIT.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.subscription.it.consensus.local.tablemodel; + +import org.apache.iotdb.isession.ITableSession; +import org.apache.iotdb.it.env.EnvFactory; +import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.LocalStandaloneIT; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumer; +import org.apache.iotdb.subscription.it.consensus.local.AbstractSubscriptionConsensusLocalIT; + +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; + +import java.time.Duration; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.Locale; +import java.util.Set; + +@RunWith(IoTDBTestRunner.class) +@Category({LocalStandaloneIT.class}) +public class IoTDBConsensusSubscriptionDataTableIT extends AbstractSubscriptionConsensusLocalIT { + + private static final long TIME_PARTITION_GAP = 604_800_001L; + private static final String TYPED_TABLE_SCHEMA = + "tag1 STRING TAG, " + + "s_int32 INT32 FIELD, " + + "s_int64 INT64 FIELD, " + + "s_float FLOAT FIELD, " + + "s_double DOUBLE FIELD, " + + "s_bool BOOLEAN FIELD, " + + "s_text TEXT FIELD"; + + @Test + public void testTypedRowsAcrossTimePartitions() throws Exception { + final ConsensusSubscriptionTableITSupport.TestIdentifiers ids = + ConsensusSubscriptionTableITSupport.newIdentifiers( + "table_data_typed_rows_across_partitions"); + final String database = ids.getDatabase(); + final String tableName = "t1"; + final String bootstrapRowKey = + ConsensusSubscriptionTableITSupport.rowKey(database, tableName, 0L); + final Set expectedColumns = + new LinkedHashSet<>( + Arrays.asList("tag1", "s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text")); + SubscriptionTablePullConsumer consumer = null; + + try { + ConsensusSubscriptionTableITSupport.createDatabaseAndTable( + database, tableName, TYPED_TABLE_SCHEMA); + try (final ITableSession session 
= EnvFactory.getEnv().getTableSessionConnection()) { + session.executeNonQueryStatement("use " + database); + session.executeNonQueryStatement( + "insert into " + + tableName + + "(tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "values ('bootstrap', 0, 0, 0.0, 0.0, true, 'bootstrap', 0)"); + session.executeNonQueryStatement("flush"); + } + + ConsensusSubscriptionTableITSupport.createConsensusTopic(ids.getTopic(), database, tableName); + consumer = + ConsensusSubscriptionTableITSupport.createConsumer( + ids.getConsumerId(), ids.getConsumerGroupId()); + consumer.subscribe(ids.getTopic()); + + final long[] timestamps = { + 100L, + 101L, + 102L, + 1_000_000_000L, + 1_000_000_000L + TIME_PARTITION_GAP, + 1_000_000_000L + TIME_PARTITION_GAP * 2 + }; + final Set expectedRowKeys = new LinkedHashSet<>(); + + try (final ITableSession session = EnvFactory.getEnv().getTableSessionConnection()) { + session.executeNonQueryStatement("use " + database); + for (int i = 0; i < timestamps.length; i++) { + final long timestamp = timestamps[i]; + session.executeNonQueryStatement( + String.format( + Locale.ROOT, + "insert into %s(tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "values ('device_%d', %d, %d, %.1f, %.2f, %s, 'text_%d', %d)", + tableName, + i, + i + 1, + (i + 1L) * 100L, + (i + 1) * 1.1f, + (i + 1) * 2.22d, + i % 2 == 0, + i, + timestamp)); + expectedRowKeys.add( + ConsensusSubscriptionTableITSupport.rowKey(database, tableName, timestamp)); + } + session.executeNonQueryStatement("flush"); + } + + final ConsensusSubscriptionTableITSupport.ConsumedRecords consumed = + ConsensusSubscriptionTableITSupport.pollAndCommitUntilAtLeast( + consumer, expectedRowKeys.size(), 60); + + ConsensusSubscriptionTableITSupport.assertExactRowKeys(expectedRowKeys, consumed); + Assert.assertFalse(consumed.getRowKeys().contains(bootstrapRowKey)); + Assert.assertEquals( + expectedRowKeys.size(), consumed.getRowsPerTable().getOrDefault(tableName, 
0).intValue()); + Assert.assertTrue( + "Expected typed columns in consumed records, actual=" + consumed.getSeenColumns(), + consumed.getSeenColumns().containsAll(expectedColumns)); + ConsensusSubscriptionTableITSupport.assertNoMoreMessages(consumer, 3, Duration.ofMillis(500)); + } finally { + ConsensusSubscriptionTableITSupport.cleanup(consumer, ids.getTopic(), database); + } + } +} diff --git a/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionFilterTableIT.java b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionFilterTableIT.java new file mode 100644 index 0000000000000..f47c5f9409c3b --- /dev/null +++ b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionFilterTableIT.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.subscription.it.consensus.local.tablemodel; + +import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.LocalStandaloneIT; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumer; +import org.apache.iotdb.subscription.it.consensus.local.AbstractSubscriptionConsensusLocalIT; + +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; + +import java.time.Duration; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.Set; + +@RunWith(IoTDBTestRunner.class) +@Category({LocalStandaloneIT.class}) +public class IoTDBConsensusSubscriptionFilterTableIT extends AbstractSubscriptionConsensusLocalIT { + + @Test + public void testDatabaseAndTableFiltering() throws Exception { + final ConsensusSubscriptionTableITSupport.TestIdentifiers ids = + ConsensusSubscriptionTableITSupport.newIdentifiers("table_filter_database_and_table"); + final String database1 = ids.database("db1"); + final String database2 = ids.database("db2"); + final String table1 = "t1"; + final String table2 = "t2"; + SubscriptionTablePullConsumer consumer = null; + + try { + ConsensusSubscriptionTableITSupport.createDatabaseAndTable( + database1, table1, ConsensusSubscriptionTableITSupport.DEFAULT_TABLE_SCHEMA); + ConsensusSubscriptionTableITSupport.createTable( + database1, table2, ConsensusSubscriptionTableITSupport.DEFAULT_TABLE_SCHEMA); + ConsensusSubscriptionTableITSupport.createDatabaseAndTable( + database2, table1, ConsensusSubscriptionTableITSupport.DEFAULT_TABLE_SCHEMA); + + ConsensusSubscriptionTableITSupport.insertRows(database1, table1, 0L, 1, false); + ConsensusSubscriptionTableITSupport.insertRows(database1, table2, 0L, 1, false); + ConsensusSubscriptionTableITSupport.insertRows(database2, table1, 0L, 1, true); + + 
ConsensusSubscriptionTableITSupport.createConsensusTopic(ids.getTopic(), database1, table1); + + consumer = + ConsensusSubscriptionTableITSupport.createConsumer( + ids.getConsumerId(), ids.getConsumerGroupId()); + consumer.subscribe(ids.getTopic()); + + final Set expectedRowKeys = + ConsensusSubscriptionTableITSupport.insertRows(database1, table1, 100L, 10, false); + ConsensusSubscriptionTableITSupport.insertRows(database1, table2, 100L, 10, false); + ConsensusSubscriptionTableITSupport.insertRows(database2, table1, 100L, 10, true); + + final ConsensusSubscriptionTableITSupport.ConsumedRecords consumed = + ConsensusSubscriptionTableITSupport.pollAndCommitUntilAtLeast( + consumer, expectedRowKeys.size(), 50); + + ConsensusSubscriptionTableITSupport.assertExactRowKeys(expectedRowKeys, consumed); + Assert.assertFalse(consumed.getRowsPerTable().containsKey(table2)); + Assert.assertFalse(consumed.getRowsPerDatabase().containsKey(database2)); + Assert.assertEquals( + expectedRowKeys.size(), + consumed.getRowsPerDatabase().getOrDefault(database1, 0).intValue()); + ConsensusSubscriptionTableITSupport.assertNoMoreMessages(consumer, 3, Duration.ofMillis(500)); + } finally { + ConsensusSubscriptionTableITSupport.cleanup(consumer, ids.getTopic(), database1, database2); + } + } + + @Test + public void testPollWithInfoTopicFilter() throws Exception { + final ConsensusSubscriptionTableITSupport.TestIdentifiers ids = + ConsensusSubscriptionTableITSupport.newIdentifiers("table_filter_poll_with_info"); + final String database = ids.getDatabase(); + final String table1 = "t1"; + final String table2 = "t2"; + final String topic1 = ids.topic("t1"); + final String topic2 = ids.topic("t2"); + final Set subscribedTopics = new LinkedHashSet<>(Arrays.asList(topic1, topic2)); + SubscriptionTablePullConsumer consumer = null; + + try { + ConsensusSubscriptionTableITSupport.createDatabaseAndTable( + database, table1, ConsensusSubscriptionTableITSupport.DEFAULT_TABLE_SCHEMA); + 
ConsensusSubscriptionTableITSupport.createTable( + database, table2, ConsensusSubscriptionTableITSupport.DEFAULT_TABLE_SCHEMA); + ConsensusSubscriptionTableITSupport.insertRows(database, table1, 0L, 1, false); + ConsensusSubscriptionTableITSupport.insertRows(database, table2, 0L, 1, true); + + ConsensusSubscriptionTableITSupport.createConsensusTopic(topic1, database, table1); + ConsensusSubscriptionTableITSupport.createConsensusTopic(topic2, database, table2); + + consumer = + ConsensusSubscriptionTableITSupport.createConsumer( + ids.getConsumerId(), ids.getConsumerGroupId()); + consumer.subscribe(subscribedTopics); + + final Set expectedRowsTopic1 = + ConsensusSubscriptionTableITSupport.insertRows(database, table1, 100L, 12, false); + final Set expectedRowsTopic2 = + ConsensusSubscriptionTableITSupport.insertRows(database, table2, 200L, 8, true); + + final ConsensusSubscriptionTableITSupport.ConsumedRecords consumedTopic1 = + ConsensusSubscriptionTableITSupport.pollWithInfoAndCommitUntilAtLeast( + consumer, Collections.singleton(topic1), expectedRowsTopic1.size(), 40); + final ConsensusSubscriptionTableITSupport.ConsumedRecords consumedTopic2 = + ConsensusSubscriptionTableITSupport.pollWithInfoAndCommitUntilAtLeast( + consumer, Collections.singleton(topic2), expectedRowsTopic2.size(), 40); + + ConsensusSubscriptionTableITSupport.assertExactRowKeys(expectedRowsTopic1, consumedTopic1); + ConsensusSubscriptionTableITSupport.assertExactRowKeys(expectedRowsTopic2, consumedTopic2); + Assert.assertEquals( + expectedRowsTopic1.size(), + consumedTopic1.getRowsPerTable().getOrDefault(table1, 0).intValue()); + Assert.assertEquals( + expectedRowsTopic2.size(), + consumedTopic2.getRowsPerTable().getOrDefault(table2, 0).intValue()); + ConsensusSubscriptionTableITSupport.assertNoMoreMessages(consumer, 3, Duration.ofMillis(500)); + } finally { + ConsensusSubscriptionTableITSupport.cleanup(consumer, subscribedTopics, database); + } + } +} diff --git 
a/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionSubscribeBeforeRegionTableIT.java b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionSubscribeBeforeRegionTableIT.java new file mode 100644 index 0000000000000..300fc47475b30 --- /dev/null +++ b/integration-test/src/test/java/org/apache/iotdb/subscription/it/consensus/local/tablemodel/IoTDBConsensusSubscriptionSubscribeBeforeRegionTableIT.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.subscription.it.consensus.local.tablemodel; + +import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.LocalStandaloneIT; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumer; +import org.apache.iotdb.subscription.it.consensus.local.AbstractSubscriptionConsensusLocalIT; + +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; + +import java.util.Set; + +@RunWith(IoTDBTestRunner.class) +@Category({LocalStandaloneIT.class}) +public class IoTDBConsensusSubscriptionSubscribeBeforeRegionTableIT + extends AbstractSubscriptionConsensusLocalIT { + + @Test + public void testSubscribeBeforeRegionCreation() throws Exception { + final ConsensusSubscriptionTableITSupport.TestIdentifiers ids = + ConsensusSubscriptionTableITSupport.newIdentifiers( + "table_subscribe_before_region_creation"); + final String database = ids.getDatabase(); + final String tableName = "t1"; + SubscriptionTablePullConsumer consumer = null; + + try { + ConsensusSubscriptionTableITSupport.createConsensusTopic(ids.getTopic(), database, ".*"); + + consumer = + ConsensusSubscriptionTableITSupport.createConsumer( + ids.getConsumerId(), ids.getConsumerGroupId()); + consumer.subscribe(ids.getTopic()); + + ConsensusSubscriptionTableITSupport.createDatabaseAndTable( + database, tableName, ConsensusSubscriptionTableITSupport.DEFAULT_TABLE_SCHEMA); + ConsensusSubscriptionTableITSupport.pause(1000); + + final Set expectedRowKeys = + ConsensusSubscriptionTableITSupport.insertRows(database, tableName, 1L, 12, true); + + final ConsensusSubscriptionTableITSupport.ConsumedRecords consumed = + ConsensusSubscriptionTableITSupport.pollAndCommitUntilAtLeast( + consumer, expectedRowKeys.size(), 50); + + ConsensusSubscriptionTableITSupport.assertExactRowKeys(expectedRowKeys, consumed); + } finally { + ConsensusSubscriptionTableITSupport.cleanup(consumer, 
ids.getTopic(), database); + } + } +} diff --git a/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java index f9e750f4012ba..671890f5bdb06 100644 --- a/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java +++ b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java @@ -310,10 +310,11 @@ public enum TSStatusCode { SUBSCRIPTION_CLOSE_ERROR(1906), SUBSCRIPTION_SUBSCRIBE_ERROR(1907), SUBSCRIPTION_UNSUBSCRIBE_ERROR(1908), - SUBSCRIPTION_MISSING_CUSTOMER(1909), + SUBSCRIPTION_MISSING_CONSUMER(1909), SHOW_SUBSCRIPTION_ERROR(1910), SUBSCRIPTION_PIPE_TIMEOUT_ERROR(1911), SUBSCRIPTION_NOT_ENABLED_ERROR(1912), + SUBSCRIPTION_SEEK_ERROR(1913), // Topic CREATE_TOPIC_ERROR(2000), diff --git a/iotdb-client/subscription/pom.xml b/iotdb-client/subscription/pom.xml index c41ef1e3bde89..bd1e71232b307 100644 --- a/iotdb-client/subscription/pom.xml +++ b/iotdb-client/subscription/pom.xml @@ -77,5 +77,10 @@ org.apache.thrift libthrift + + junit + junit + test + diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConfig.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConfig.java index ea588f1276325..fefd8778bb602 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConfig.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConfig.java @@ -35,6 +35,23 @@ public class TopicConfig extends PipeParameters { + private static final Set MODE_VALUE_SET; + private static final Set ORDER_MODE_VALUE_SET; + + static { + final Set modes = new HashSet<>(3); + modes.add(TopicConstant.MODE_SNAPSHOT_VALUE); + modes.add(TopicConstant.MODE_LIVE_VALUE); + modes.add(TopicConstant.MODE_CONSENSUS_VALUE); + MODE_VALUE_SET = Collections.unmodifiableSet(modes); + + final Set 
orderModes = new HashSet<>(3); + orderModes.add(TopicConstant.ORDER_MODE_LEADER_ONLY_VALUE); + orderModes.add(TopicConstant.ORDER_MODE_MULTI_WRITER_VALUE); + orderModes.add(TopicConstant.ORDER_MODE_PER_WRITER_VALUE); + ORDER_MODE_VALUE_SET = Collections.unmodifiableSet(orderModes); + } + public TopicConfig() { super(Collections.emptyMap()); } @@ -97,6 +114,47 @@ public boolean isTableTopic() { attributes.getOrDefault(SQL_DIALECT_KEY, SQL_DIALECT_TREE_VALUE)); } + public String getMode() { + return normalizeMode( + attributes.getOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE)); + } + + public boolean isSnapshotMode() { + return TopicConstant.MODE_SNAPSHOT_VALUE.equalsIgnoreCase(getMode()); + } + + public boolean isLiveMode() { + return TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(getMode()); + } + + public boolean isConsensusMode() { + return TopicConstant.MODE_CONSENSUS_VALUE.equalsIgnoreCase(getMode()); + } + + public static boolean isValidMode(final String mode) { + return MODE_VALUE_SET.contains(normalizeMode(mode)); + } + + public static String normalizeMode(final String mode) { + return mode == null ? TopicConstant.MODE_DEFAULT_VALUE : mode.trim().toLowerCase(); + } + + public String getOrderMode() { + return normalizeOrderMode( + attributes.getOrDefault( + TopicConstant.ORDER_MODE_KEY, TopicConstant.ORDER_MODE_DEFAULT_VALUE)); + } + + public static boolean isValidOrderMode(final String orderMode) { + return ORDER_MODE_VALUE_SET.contains(normalizeOrderMode(orderMode)); + } + + public static String normalizeOrderMode(final String orderMode) { + return orderMode == null + ? 
TopicConstant.ORDER_MODE_DEFAULT_VALUE + : orderMode.trim().toLowerCase(); + } + /////////////////////////////// extractor attributes mapping /////////////////////////////// public Map getAttributeWithSqlDialect() { @@ -146,10 +204,11 @@ public Map getAttributesWithSourceRealtimeMode() { } public Map getAttributesWithSourceMode() { - return TopicConstant.MODE_SNAPSHOT_VALUE.equalsIgnoreCase( - attributes.getOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE)) - ? SNAPSHOT_MODE_CONFIG - : LIVE_MODE_CONFIG; + if (isConsensusMode()) { + throw new IllegalArgumentException( + "Consensus mode topic should not generate pipe source attributes"); + } + return isSnapshotMode() ? SNAPSHOT_MODE_CONFIG : LIVE_MODE_CONFIG; } public Map getAttributesWithSourceLooseRangeOrStrict() { @@ -195,21 +254,33 @@ public Map getAttributesWithProcessorPrefix() { /////////////////////////////// connector attributes mapping /////////////////////////////// + public boolean isRecordFormat() { + return isRecordFormat( + attributes.getOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE)); + } + + private boolean isTsFileFormat() { + return isTsFileFormat( + attributes.getOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE)); + } + public Map getAttributesWithSinkFormat() { // refer to // org.apache.iotdb.db.pipe.agent.task.connection.PipeEventCollector.parseAndCollectEvent(org.apache.iotdb.db.pipe.event.common.tsfile.PipeTsFileInsertionEvent) - return isTsFileFormat( - attributes.getOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE)) - ? SINK_TS_FILE_FORMAT_CONFIG - : SINK_TABLET_FORMAT_CONFIG; + return isTsFileFormat() ? 
SINK_TS_FILE_FORMAT_CONFIG : SINK_TABLET_FORMAT_CONFIG; } - private boolean isTsFileFormat(final String formatValue) { + private static boolean isTsFileFormat(final String formatValue) { return TopicConstant.FORMAT_TS_FILE_VALUE.equalsIgnoreCase(formatValue) || TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(formatValue) || LEGACY_FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(formatValue); } + private static boolean isRecordFormat(final String formatValue) { + return TopicConstant.FORMAT_RECORD_HANDLER_VALUE.equalsIgnoreCase(formatValue) + || TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE.equalsIgnoreCase(formatValue); + } + public Map getAttributesWithSinkPrefix() { final Map attributesWithProcessorPrefix = new HashMap<>(); attributes.forEach( diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConstant.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConstant.java index bb84358648e59..438a3be52f64b 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConstant.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConstant.java @@ -28,8 +28,12 @@ public class TopicConstant { public static final String DATABASE_KEY = "database"; public static final String TABLE_KEY = "table"; + public static final String COLUMN_KEY = "column"; + public static final String RETENTION_BYTES_KEY = "retention.bytes"; + public static final String RETENTION_MS_KEY = "retention.ms"; public static final String DATABASE_DEFAULT_VALUE = ".*"; public static final String TABLE_DEFAULT_VALUE = ".*"; + public static final String COLUMN_DEFAULT_VALUE = ".*"; public static final String START_TIME_KEY = "start-time"; public static final String END_TIME_KEY = "end-time"; @@ -38,8 +42,15 @@ public class TopicConstant { public static final String MODE_KEY = "mode"; public static final String MODE_LIVE_VALUE = 
"live"; public static final String MODE_SNAPSHOT_VALUE = "snapshot"; + public static final String MODE_CONSENSUS_VALUE = "consensus"; public static final String MODE_DEFAULT_VALUE = MODE_LIVE_VALUE; + public static final String ORDER_MODE_KEY = "order-mode"; + public static final String ORDER_MODE_LEADER_ONLY_VALUE = "leader-only"; + public static final String ORDER_MODE_MULTI_WRITER_VALUE = "multi-writer"; + public static final String ORDER_MODE_PER_WRITER_VALUE = "per-writer"; + public static final String ORDER_MODE_DEFAULT_VALUE = ORDER_MODE_LEADER_ONLY_VALUE; + public static final String FORMAT_KEY = "format"; public static final String FORMAT_RECORD_HANDLER_VALUE = "SubscriptionRecordHandler"; public static final String FORMAT_TS_FILE_VALUE = "SubscriptionTsFileHandler"; diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/RegionProgress.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/RegionProgress.java new file mode 100644 index 0000000000000..134f59dfe5dae --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/RegionProgress.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; + +public class RegionProgress { + + private final Map writerPositions; + + public RegionProgress(final Map writerPositions) { + this.writerPositions = + writerPositions == null + ? Collections.emptyMap() + : Collections.unmodifiableMap(new LinkedHashMap<>(writerPositions)); + } + + public Map getWriterPositions() { + return writerPositions; + } + + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(writerPositions.size(), stream); + for (final Map.Entry entry : writerPositions.entrySet()) { + entry.getKey().serialize(stream); + entry.getValue().serialize(stream); + } + } + + public static RegionProgress deserialize(final ByteBuffer buffer) { + final int size = ReadWriteIOUtils.readInt(buffer); + final Map writerPositions = new LinkedHashMap<>(size); + for (int i = 0; i < size; i++) { + writerPositions.put(WriterId.deserialize(buffer), WriterProgress.deserialize(buffer)); + } + return new RegionProgress(writerPositions); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof RegionProgress)) { + return false; + } + final RegionProgress that = (RegionProgress) obj; + return Objects.equals(writerPositions, that.writerPositions); + } + + @Override + public int hashCode() { + return Objects.hash(writerPositions); + } + + @Override + public String toString() { + return "RegionProgress{" + "writerPositions=" + writerPositions + '}'; + } +} diff --git 
a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContext.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContext.java index e2bf809d32c20..2246d2f4154c1 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContext.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContext.java @@ -30,6 +30,8 @@ public class SubscriptionCommitContext implements Comparable { + private static final byte SERIALIZATION_VERSION = 2; + private final int dataNodeId; private final int rebootTimes; @@ -40,6 +42,16 @@ public class SubscriptionCommitContext implements Comparable + Objects.nonNull(context.getWriterId()) ? context.getWriterId().getNodeId() : -1) + .thenComparingLong( + context -> + Objects.nonNull(context.getWriterId()) + ? context.getWriterId().getWriterEpoch() + : -1L) + .thenComparingLong(SubscriptionCommitContext::getPhysicalTime) + .thenComparingLong(SubscriptionCommitContext::getLocalSeq) .compare(this, that); } } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequest.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequest.java index 3337887b185f5..d8c800f247b2d 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequest.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequest.java @@ -27,10 +27,13 @@ import java.io.DataOutputStream; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; public class SubscriptionPollRequest { - private static final Logger LOGGER = 
LoggerFactory.getLogger(SubscriptionPollResponse.class); + private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionPollRequest.class); private final transient short requestType; @@ -41,15 +44,31 @@ public class SubscriptionPollRequest { /** The maximum size, in bytes, for the response payload. */ private final transient long maxBytes; + /** + * Per-topic writer-based progress used by the new consensus subscription model. This preserves + * topic boundaries while allowing the consumer to provide a recovery hint on reconnect. + */ + private final transient Map progressByTopic; + public SubscriptionPollRequest( final short requestType, final SubscriptionPollPayload payload, final long timeoutMs, final long maxBytes) { + this(requestType, payload, timeoutMs, maxBytes, Collections.emptyMap()); + } + + public SubscriptionPollRequest( + final short requestType, + final SubscriptionPollPayload payload, + final long timeoutMs, + final long maxBytes, + final Map progressByTopic) { this.requestType = requestType; this.payload = payload; this.timeoutMs = timeoutMs; this.maxBytes = maxBytes; + this.progressByTopic = progressByTopic != null ? 
progressByTopic : Collections.emptyMap(); } public short getRequestType() { @@ -68,6 +87,10 @@ public long getMaxBytes() { return maxBytes; } + public Map getProgressByTopic() { + return progressByTopic; + } + //////////////////////////// serialization //////////////////////////// public static ByteBuffer serialize(final SubscriptionPollRequest request) throws IOException { @@ -83,6 +106,11 @@ private void serialize(final DataOutputStream stream) throws IOException { payload.serialize(stream); ReadWriteIOUtils.write(timeoutMs, stream); ReadWriteIOUtils.write(maxBytes, stream); + ReadWriteIOUtils.write(progressByTopic.size(), stream); + for (final Map.Entry entry : progressByTopic.entrySet()) { + ReadWriteIOUtils.write(entry.getKey(), stream); + entry.getValue().serialize(stream); + } } public static SubscriptionPollRequest deserialize(final ByteBuffer buffer) { @@ -109,7 +137,20 @@ public static SubscriptionPollRequest deserialize(final ByteBuffer buffer) { final long timeoutMs = ReadWriteIOUtils.readLong(buffer); final long maxBytes = ReadWriteIOUtils.readLong(buffer); - return new SubscriptionPollRequest(requestType, payload, timeoutMs, maxBytes); + + Map progressByTopic = Collections.emptyMap(); + if (buffer.hasRemaining()) { + final int mapSize = ReadWriteIOUtils.readInt(buffer); + if (mapSize > 0) { + progressByTopic = new HashMap<>(mapSize); + for (int i = 0; i < mapSize; i++) { + progressByTopic.put( + ReadWriteIOUtils.readString(buffer), TopicProgress.deserialize(buffer)); + } + } + } + + return new SubscriptionPollRequest(requestType, payload, timeoutMs, maxBytes, progressByTopic); } /////////////////////////////// object /////////////////////////////// @@ -117,13 +158,15 @@ public static SubscriptionPollRequest deserialize(final ByteBuffer buffer) { @Override public String toString() { return "SubscriptionPollRequest{requestType=" - + SubscriptionPollRequestType.valueOf(requestType).toString() + + SubscriptionPollRequestType.valueOf(requestType) + ", 
payload=" + payload + ", timeoutMs=" + timeoutMs + ", maxBytes=" + maxBytes + + ", progressByTopic.size=" + + progressByTopic.size() + "}"; } } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponse.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponse.java index 06baa30acee9f..df1bb91a9f3e9 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponse.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponse.java @@ -100,6 +100,9 @@ public static SubscriptionPollResponse deserialize(final ByteBuffer buffer) { case TERMINATION: payload = new TerminationPayload().deserialize(buffer); break; + case WATERMARK: + payload = new WatermarkPayload().deserialize(buffer); + break; default: LOGGER.warn("unexpected response type: {}, payload will be null", responseType); break; @@ -121,9 +124,10 @@ public String toString() { protected Map coreReportMessage() { final Map result = new HashMap<>(); - result.put("responseType", SubscriptionPollResponseType.valueOf(responseType).toString()); - result.put("payload", payload.toString()); - result.put("commitContext", commitContext.toString()); + final SubscriptionPollResponseType type = SubscriptionPollResponseType.valueOf(responseType); + result.put("responseType", type != null ? type.toString() : "UNKNOWN(" + responseType + ")"); + result.put("payload", payload != null ? payload.toString() : "null"); + result.put("commitContext", commitContext != null ? 
commitContext.toString() : "null"); return result; } } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponseType.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponseType.java index b27791b36c538..4ca6cb09dd67c 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponseType.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponseType.java @@ -33,6 +33,13 @@ public enum SubscriptionPollResponseType { FILE_SEAL((short) 4), TERMINATION((short) 5), + + /** + * Periodic timestamp-progress signal from the server-side {@code ConsensusPrefetchingQueue}. + * Carries the maximum data timestamp observed so far for a region, enabling client-side watermark + * computation even when a region is idle (no new data). + */ + WATERMARK((short) 7), ; private final short type; diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/TopicProgress.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/TopicProgress.java new file mode 100644 index 0000000000000..35dfd2e0ca33d --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/TopicProgress.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.PublicBAOS; +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; + +public class TopicProgress { + + private final Map regionProgress; + + public TopicProgress(final Map regionProgress) { + this.regionProgress = + regionProgress == null + ? Collections.emptyMap() + : Collections.unmodifiableMap(new LinkedHashMap<>(regionProgress)); + } + + public Map getRegionProgress() { + return regionProgress; + } + + public static ByteBuffer serialize(final TopicProgress progress) throws IOException { + try (final PublicBAOS byteArrayOutputStream = new PublicBAOS(); + final DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) { + progress.serialize(outputStream); + return ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size()); + } + } + + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(regionProgress.size(), stream); + for (final Map.Entry entry : regionProgress.entrySet()) { + ReadWriteIOUtils.write(entry.getKey(), stream); + entry.getValue().serialize(stream); + } + } + + public static TopicProgress deserialize(final ByteBuffer buffer) { + final int size = ReadWriteIOUtils.readInt(buffer); + final Map regionProgress = new LinkedHashMap<>(size); + for (int i = 0; 
i < size; i++) { + regionProgress.put(ReadWriteIOUtils.readString(buffer), RegionProgress.deserialize(buffer)); + } + return new TopicProgress(regionProgress); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof TopicProgress)) { + return false; + } + final TopicProgress that = (TopicProgress) obj; + return Objects.equals(regionProgress, that.regionProgress); + } + + @Override + public int hashCode() { + return Objects.hash(regionProgress); + } + + @Override + public String toString() { + return "TopicProgress{" + "regionProgress=" + regionProgress + '}'; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WatermarkPayload.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WatermarkPayload.java new file mode 100644 index 0000000000000..32dab88967497 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WatermarkPayload.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Payload for {@link SubscriptionPollResponseType#WATERMARK}. + * + *
<p>
Periodically injected by the server-side {@code ConsensusPrefetchingQueue} to report timestamp + * progress for a region. Carries the maximum data timestamp observed so far, enabling client-side + * {@code WatermarkProcessor} to advance its watermark even when a region is idle (no new data). + * + *
<p>
The {@code dataNodeId} identifies which DataNode emitted this watermark, allowing the client + * to track per-node progress across leader transitions. + */ +public class WatermarkPayload implements SubscriptionPollPayload { + + /** Maximum data timestamp observed across all InsertNodes in this region's queue. */ + private transient long watermarkTimestamp; + + /** The DataNode ID that emitted this watermark. */ + private transient int dataNodeId; + + public WatermarkPayload() {} + + public WatermarkPayload(final long watermarkTimestamp, final int dataNodeId) { + this.watermarkTimestamp = watermarkTimestamp; + this.dataNodeId = dataNodeId; + } + + public long getWatermarkTimestamp() { + return watermarkTimestamp; + } + + public int getDataNodeId() { + return dataNodeId; + } + + @Override + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(watermarkTimestamp, stream); + ReadWriteIOUtils.write(dataNodeId, stream); + } + + @Override + public SubscriptionPollPayload deserialize(final ByteBuffer buffer) { + watermarkTimestamp = ReadWriteIOUtils.readLong(buffer); + dataNodeId = ReadWriteIOUtils.readInt(buffer); + return this; + } + + @Override + public String toString() { + return "WatermarkPayload{watermarkTimestamp=" + + watermarkTimestamp + + ", dataNodeId=" + + dataNodeId + + '}'; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterId.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterId.java new file mode 100644 index 0000000000000..ce21e07fe008d --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterId.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; + +public class WriterId { + + private final String regionId; + private final int nodeId; + private final long writerEpoch; + + public WriterId(final String regionId, final int nodeId, final long writerEpoch) { + this.regionId = regionId; + this.nodeId = nodeId; + this.writerEpoch = writerEpoch; + } + + public String getRegionId() { + return regionId; + } + + public int getNodeId() { + return nodeId; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(regionId, stream); + ReadWriteIOUtils.write(nodeId, stream); + ReadWriteIOUtils.write(writerEpoch, stream); + } + + public static WriterId deserialize(final ByteBuffer buffer) { + return new WriterId( + ReadWriteIOUtils.readString(buffer), + ReadWriteIOUtils.readInt(buffer), + ReadWriteIOUtils.readLong(buffer)); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof WriterId)) { + return false; + } + final WriterId that = (WriterId) obj; + return nodeId == that.nodeId + && writerEpoch == that.writerEpoch + && 
Objects.equals(regionId, that.regionId); + } + + @Override + public int hashCode() { + return Objects.hash(regionId, nodeId, writerEpoch); + } + + @Override + public String toString() { + return "WriterId{" + + "regionId='" + + regionId + + '\'' + + ", nodeId=" + + nodeId + + ", writerEpoch=" + + writerEpoch + + '}'; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterProgress.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterProgress.java new file mode 100644 index 0000000000000..f38ea770e8ff6 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterProgress.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; + +public class WriterProgress { + + private final long physicalTime; + private final long localSeq; + + public WriterProgress(final long physicalTime, final long localSeq) { + this.physicalTime = physicalTime; + this.localSeq = localSeq; + } + + public long getPhysicalTime() { + return physicalTime; + } + + public long getLocalSeq() { + return localSeq; + } + + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(physicalTime, stream); + ReadWriteIOUtils.write(localSeq, stream); + } + + public static WriterProgress deserialize(final ByteBuffer buffer) { + return new WriterProgress(ReadWriteIOUtils.readLong(buffer), ReadWriteIOUtils.readLong(buffer)); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof WriterProgress)) { + return false; + } + final WriterProgress that = (WriterProgress) obj; + return physicalTime == that.physicalTime && localSeq == that.localSeq; + } + + @Override + public int hashCode() { + return Objects.hash(physicalTime, localSeq); + } + + @Override + public String toString() { + return "WriterProgress{" + "physicalTime=" + physicalTime + ", localSeq=" + localSeq + '}'; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeRequestType.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeRequestType.java index d649aa567ade4..9fcc1d86b0c75 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeRequestType.java +++ 
b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeRequestType.java @@ -31,6 +31,7 @@ public enum PipeSubscribeRequestType { CLOSE((short) 4), SUBSCRIBE((short) 5), UNSUBSCRIBE((short) 6), + SEEK((short) 7), ; private final short type; diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReq.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReq.java new file mode 100644 index 0000000000000..895417537d5d1 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReq.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/

package org.apache.iotdb.rpc.subscription.payload.request;

import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress;
import org.apache.iotdb.service.rpc.thrift.TPipeSubscribeReq;

import org.apache.tsfile.utils.PublicBAOS;
import org.apache.tsfile.utils.ReadWriteIOUtils;

import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.Objects;

/**
 * Request payload for the subscription SEEK RPC. The typed fields below are a parsed view of the
 * opaque thrift {@code body}; the wire format is: topicName, seekType, then either a timestamp
 * (for {@link #SEEK_TO_TIMESTAMP}) or a serialized {@link TopicProgress} (for the two
 * topic-progress seek types). Field order in {@code toTPipeSubscribeReq} and
 * {@code fromTPipeSubscribeReq} must stay in sync.
 */
public class PipeSubscribeSeekReq extends TPipeSubscribeReq {

  /** Seek type constants. */
  public static final short SEEK_TO_BEGINNING = 1;

  public static final short SEEK_TO_END = 2;
  public static final short SEEK_TO_TIMESTAMP = 3;
  // NOTE(review): values 4 and 5 are skipped — presumably reserved; confirm against the
  // server-side seek handler before reusing them.
  public static final short SEEK_TO_TOPIC_PROGRESS = 6;
  public static final short SEEK_AFTER_TOPIC_PROGRESS = 7;

  // Parsed view of the thrift body; transient because the thrift struct serializes `body` itself.
  private transient String topicName;
  private transient short seekType;
  private transient long timestamp; // only meaningful when seekType == SEEK_TO_TIMESTAMP
  private transient TopicProgress topicProgress = new TopicProgress(Collections.emptyMap());

  public String getTopicName() {
    return topicName;
  }

  public short getSeekType() {
    return seekType;
  }

  public long getTimestamp() {
    return timestamp;
  }

  public TopicProgress getTopicProgress() {
    return topicProgress;
  }

  /////////////////////////////// Thrift ///////////////////////////////

  /**
   * Serialize the incoming parameters into {@code PipeSubscribeSeekReq}, called by the subscription
   * client.
   */
  public static PipeSubscribeSeekReq toTPipeSubscribeReq(
      final String topicName, final short seekType, final long timestamp) throws IOException {
    return toTPipeSubscribeReq(topicName, seekType, timestamp, null);
  }

  /** Convenience factory for {@link #SEEK_TO_TOPIC_PROGRESS}; the timestamp field is unused (0). */
  public static PipeSubscribeSeekReq toTPipeSubscribeReq(
      final String topicName, final TopicProgress topicProgress) throws IOException {
    return toTPipeSubscribeReq(topicName, SEEK_TO_TOPIC_PROGRESS, 0, topicProgress);
  }

  /** Convenience factory for {@link #SEEK_AFTER_TOPIC_PROGRESS}; the timestamp field is unused (0). */
  public static PipeSubscribeSeekReq toTPipeSubscribeSeekAfterReq(
      final String topicName, final TopicProgress topicProgress) throws IOException {
    return toTPipeSubscribeReq(topicName, SEEK_AFTER_TOPIC_PROGRESS, 0, topicProgress);
  }

  /**
   * General factory: fills the typed fields and encodes them into the thrift {@code body}. A null
   * {@code topicProgress} is normalized to an empty progress so the field is never null.
   *
   * @throws IOException if encoding the body fails
   */
  public static PipeSubscribeSeekReq toTPipeSubscribeReq(
      final String topicName,
      final short seekType,
      final long timestamp,
      final TopicProgress topicProgress)
      throws IOException {
    final PipeSubscribeSeekReq req = new PipeSubscribeSeekReq();

    req.topicName = topicName;
    req.seekType = seekType;
    req.timestamp = timestamp;
    req.topicProgress =
        Objects.nonNull(topicProgress) ? topicProgress : new TopicProgress(Collections.emptyMap());

    req.version = PipeSubscribeRequestVersion.VERSION_1.getVersion();
    req.type = PipeSubscribeRequestType.SEEK.getType();
    try (final PublicBAOS byteArrayOutputStream = new PublicBAOS();
        final DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) {
      ReadWriteIOUtils.write(topicName, outputStream);
      ReadWriteIOUtils.write(seekType, outputStream);
      // The trailing section of the body depends on the seek type; other types carry no extras.
      if (seekType == SEEK_TO_TIMESTAMP) {
        ReadWriteIOUtils.write(timestamp, outputStream);
      } else if (seekType == SEEK_TO_TOPIC_PROGRESS || seekType == SEEK_AFTER_TOPIC_PROGRESS) {
        req.topicProgress.serialize(outputStream);
      }
      req.body = ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size());
    }

    return req;
  }

  /**
   * Deserialize {@code TPipeSubscribeReq} to obtain parameters, called by the subscription server.
   * If the body is null or empty, the typed fields keep their defaults (null topic name, seek type
   * 0, empty topic progress).
   */
  public static PipeSubscribeSeekReq fromTPipeSubscribeReq(final TPipeSubscribeReq seekReq) {
    final PipeSubscribeSeekReq req = new PipeSubscribeSeekReq();

    if (Objects.nonNull(seekReq.body) && seekReq.body.hasRemaining()) {
      req.topicName = ReadWriteIOUtils.readString(seekReq.body);
      req.seekType = ReadWriteIOUtils.readShort(seekReq.body);
      if (req.seekType == SEEK_TO_TIMESTAMP) {
        req.timestamp = ReadWriteIOUtils.readLong(seekReq.body);
      } else if (req.seekType == SEEK_TO_TOPIC_PROGRESS
          || req.seekType == SEEK_AFTER_TOPIC_PROGRESS) {
        req.topicProgress = TopicProgress.deserialize(seekReq.body);
      }
    }

    req.version = seekReq.version;
    req.type = seekReq.type;

    return req;
  }

  /////////////////////////////// Object ///////////////////////////////

  @Override
  public boolean equals(final Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    final PipeSubscribeSeekReq that = (PipeSubscribeSeekReq) obj;
    return Objects.equals(this.topicName, that.topicName)
        && this.seekType == that.seekType
        && this.timestamp == that.timestamp
        && Objects.equals(this.topicProgress, that.topicProgress)
        && this.version == that.version
        && this.type == that.type
        && Objects.equals(this.body, that.body);
  }

  @Override
  public int hashCode() {
    return Objects.hash(topicName, seekType, timestamp, topicProgress, version, type, body);
  }
}
diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/response/PipeSubscribeSeekResp.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/response/PipeSubscribeSeekResp.java
new file mode 100644
index 0000000000000..c6ea90d5bb069
--- /dev/null
+++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/response/PipeSubscribeSeekResp.java
@@ -0,0 +1,80 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.response; + +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.service.rpc.thrift.TPipeSubscribeResp; + +import java.util.Objects; + +public class PipeSubscribeSeekResp extends TPipeSubscribeResp { + + /////////////////////////////// Thrift /////////////////////////////// + + /** + * Serialize the incoming parameters into {@code PipeSubscribeSeekResp}, called by the + * subscription server. + */ + public static PipeSubscribeSeekResp toTPipeSubscribeResp(final TSStatus status) { + final PipeSubscribeSeekResp resp = new PipeSubscribeSeekResp(); + + resp.status = status; + resp.version = PipeSubscribeResponseVersion.VERSION_1.getVersion(); + resp.type = PipeSubscribeResponseType.ACK.getType(); + + return resp; + } + + /** + * Deserialize {@code TPipeSubscribeResp} to obtain parameters, called by the subscription client. 
+ */ + public static PipeSubscribeSeekResp fromTPipeSubscribeResp(final TPipeSubscribeResp seekResp) { + final PipeSubscribeSeekResp resp = new PipeSubscribeSeekResp(); + + resp.status = seekResp.status; + resp.version = seekResp.version; + resp.type = seekResp.type; + resp.body = seekResp.body; + + return resp; + } + + /////////////////////////////// Object /////////////////////////////// + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + final PipeSubscribeSeekResp that = (PipeSubscribeSeekResp) obj; + return Objects.equals(this.status, that.status) + && this.version == that.version + && this.type == that.type + && Objects.equals(this.body, that.body); + } + + @Override + public int hashCode() { + return Objects.hash(status, version, type, body); + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTablePullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTablePullConsumer.java index 0168a1ba3846d..abc5e2de2ff92 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTablePullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTablePullConsumer.java @@ -20,6 +20,7 @@ package org.apache.iotdb.session.subscription.consumer; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import java.time.Duration; @@ -179,6 +180,19 @@ List poll(final Set topicNames, final Duration time void commitAsync( final Iterable messages, final AsyncCommitCallback callback); + void seekToBeginning(final String topicName) throws SubscriptionException; + 
+ void seekToEnd(final String topicName) throws SubscriptionException; + + TopicProgress positions(final String topicName) throws SubscriptionException; + + TopicProgress committedPositions(final String topicName) throws SubscriptionException; + + void seek(final String topicName, final TopicProgress topicProgress) throws SubscriptionException; + + void seekAfter(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException; + /** * Retrieves the unique identifier of this consumer. If no consumer ID was provided at the time of * consumer construction, a random globally unique ID is automatically assigned after the consumer diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTreePullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTreePullConsumer.java index 803b7c51224a4..fc9d55bfe218a 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTreePullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTreePullConsumer.java @@ -20,6 +20,7 @@ package org.apache.iotdb.session.subscription.consumer; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import java.time.Duration; @@ -179,6 +180,19 @@ List poll(final Set topicNames, final Duration time void commitAsync( final Iterable messages, final AsyncCommitCallback callback); + void seekToBeginning(final String topicName) throws SubscriptionException; + + void seekToEnd(final String topicName) throws SubscriptionException; + + TopicProgress positions(final String topicName) throws SubscriptionException; + + TopicProgress committedPositions(final String topicName) throws 
SubscriptionException; + + void seek(final String topicName, final TopicProgress topicProgress) throws SubscriptionException; + + void seekAfter(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException; + /** * Retrieves the unique identifier of this consumer. If no consumer ID was provided at the time of * consumer construction, a random globally unique ID is automatically assigned after the consumer diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java index e9fbb1672e563..506b678231340 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java @@ -34,11 +34,17 @@ import org.apache.iotdb.rpc.subscription.payload.poll.FileInitPayload; import org.apache.iotdb.rpc.subscription.payload.poll.FilePiecePayload; import org.apache.iotdb.rpc.subscription.payload.poll.FileSealPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollPayload; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponse; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType; import org.apache.iotdb.rpc.subscription.payload.poll.TabletsPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WatermarkPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; +import 
org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.apache.iotdb.session.subscription.consumer.AsyncCommitCallback; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; @@ -77,6 +83,7 @@ import java.util.concurrent.Callable; import java.util.concurrent.CancellationException; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.ScheduledFuture; @@ -88,6 +95,7 @@ import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType.FILE_INIT; import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType.TABLETS; import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType.TERMINATION; +import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType.WATERMARK; import static org.apache.iotdb.session.subscription.util.SetPartitioner.partition; abstract class AbstractSubscriptionConsumer implements AutoCloseable { @@ -121,6 +129,26 @@ abstract class AbstractSubscriptionConsumer implements AutoCloseable { private final int connectionTimeoutInMs; private final int maxPollParallelism; + /** + * The latest watermark timestamp received from the server. Updated when WATERMARK events are + * processed and stripped. Consumer users can query this to check timestamp progress. + */ + protected volatile long latestWatermarkTimestamp = Long.MIN_VALUE; + + /** Per-topic current positions used as the consumer-guided positioning hint in poll requests. */ + private final Map currentPositionsByTopic = new ConcurrentHashMap<>(); + + /** Per-topic committed positions used as durable recovery points for explicit seek/checkpoint. 
*/ + private final Map committedPositionsByTopic = new ConcurrentHashMap<>(); + + /** + * Ack contexts for consensus messages that were already processed locally but could not be + * committed because the original provider became unavailable. They are flushed after the same + * topic+region is observed again from a live provider. + */ + private final Map> pendingRedirectAcksByTopicRegion = + new ConcurrentHashMap<>(); + @SuppressWarnings("java:S3077") protected volatile Map subscribedTopics = new HashMap<>(); @@ -376,6 +404,106 @@ private void unsubscribe(Set topicNames, final boolean needParse) providers.acquireReadLock(); try { unsubscribeWithRedirection(topicNames); + topicNames.forEach(this::clearPendingRedirectAcks); + } finally { + providers.releaseReadLock(); + } + } + + /////////////////////////////// seek /////////////////////////////// + + /** + * Seeks to the earliest available WAL position. Actual position depends on WAL retention — old + * segments may have been reclaimed. + */ + public void seekToBeginning(final String topicName) throws SubscriptionException { + checkIfOpened(); + seekInternal(topicName, PipeSubscribeSeekReq.SEEK_TO_BEGINNING, 0); + clearCurrentPositions(topicName); + clearCommittedPositions(topicName); + clearPendingRedirectAcks(topicName); + } + + /** Seeks to the current WAL tail. Only newly written data will be consumed after this. */ + public void seekToEnd(final String topicName) throws SubscriptionException { + checkIfOpened(); + seekInternal(topicName, PipeSubscribeSeekReq.SEEK_TO_END, 0); + clearCurrentPositions(topicName); + clearCommittedPositions(topicName); + clearPendingRedirectAcks(topicName); + } + + /** + * Returns the latest observed per-region positions for the given topic. This is the consumer's + * current fetch position hint and is sent back to the server on subsequent poll requests. 
+ */ + public TopicProgress positions(final String topicName) throws SubscriptionException { + checkIfOpened(); + final TopicProgress progress = currentPositionsByTopic.get(topicName); + return Objects.nonNull(progress) + ? new TopicProgress(progress.getRegionProgress()) + : new TopicProgress(Collections.emptyMap()); + } + + /** + * Returns the latest committed per-region positions for the given topic. This is the recoverable + * checkpoint position that should be persisted by callers. + */ + public TopicProgress committedPositions(final String topicName) throws SubscriptionException { + checkIfOpened(); + final TopicProgress progress = committedPositionsByTopic.get(topicName); + return Objects.nonNull(progress) + ? new TopicProgress(progress.getRegionProgress()) + : new TopicProgress(Collections.emptyMap()); + } + + public void seek(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + checkIfOpened(); + final TopicProgress safeProgress = + Objects.nonNull(topicProgress) ? topicProgress : new TopicProgress(Collections.emptyMap()); + seekInternalTopicProgress(topicName, safeProgress); + overlayCurrentPositions(topicName, safeProgress); + overlayCommittedPositions(topicName, safeProgress); + clearPendingRedirectAcks(topicName); + } + + public void seekAfter(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + checkIfOpened(); + final TopicProgress safeProgress = + Objects.nonNull(topicProgress) ? 
topicProgress : new TopicProgress(Collections.emptyMap()); + seekAfterInternalTopicProgress(topicName, safeProgress); + overlayCurrentPositions(topicName, safeProgress); + overlayCommittedPositions(topicName, safeProgress); + clearPendingRedirectAcks(topicName); + } + + private void seekInternal(final String topicName, final short seekType, final long timestamp) + throws SubscriptionException { + providers.acquireReadLock(); + try { + seekWithRedirection(topicName, seekType, timestamp); + } finally { + providers.releaseReadLock(); + } + } + + private void seekInternalTopicProgress(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + providers.acquireReadLock(); + try { + seekWithRedirectionTopicProgress(topicName, topicProgress); + } finally { + providers.releaseReadLock(); + } + } + + private void seekAfterInternalTopicProgress( + final String topicName, final TopicProgress topicProgress) throws SubscriptionException { + providers.acquireReadLock(); + try { + seekAfterWithRedirectionTopicProgress(topicName, topicProgress); } finally { providers.releaseReadLock(); } @@ -522,9 +650,44 @@ private Path getFilePath( unsubscribe(Collections.singleton(topicNameToUnsubscribe), false); return Optional.empty(); }); + put( + WATERMARK, + (resp, timer) -> { + final SubscriptionCommitContext commitContext = resp.getCommitContext(); + final WatermarkPayload payload = (WatermarkPayload) resp.getPayload(); + return Optional.of( + new SubscriptionMessage( + commitContext, payload.getWatermarkTimestamp())); + }); } }); + /** + * Returns the set of DataNode IDs for providers that are currently available. Used by subclasses + * to detect unavailable DataNodes and notify the progress ordering processor. 
+ */ + protected Set getAvailableDataNodeIds() { + providers.acquireReadLock(); + try { + final Set ids = new HashSet<>(); + for (final AbstractSubscriptionProvider provider : providers.getAllAvailableProviders()) { + ids.add(provider.getDataNodeId()); + } + return ids; + } finally { + providers.releaseReadLock(); + } + } + + /** + * Returns the latest watermark timestamp received from the server. This tracks the maximum data + * timestamp observed across all polled regions. Returns {@code Long.MIN_VALUE} if no watermark + * has been received yet. + */ + public long getLatestWatermarkTimestamp() { + return latestWatermarkTimestamp; + } + protected List multiplePoll( /* @NotNull */ final Set topicNames, final long timeoutMs) { if (topicNames.isEmpty()) { @@ -685,6 +848,8 @@ private List singlePoll( // add all current messages to result messages messages.addAll(currentMessages); + advanceCurrentPositions(currentMessages); + flushPendingRedirectAcks(currentMessages); // TODO: maybe we can poll a few more times if (!messages.isEmpty()) { @@ -1079,7 +1244,7 @@ private List pollInternal( } // ignore SubscriptionConnectionException to improve poll auto retry try { - return provider.poll(topicNames, timeoutMs); + return provider.poll(topicNames, timeoutMs, buildCurrentProgressByTopic(topicNames)); } catch (final SubscriptionConnectionException ignored) { return Collections.emptyList(); } @@ -1174,7 +1339,80 @@ private void commit(final Iterable commitContexts, fi for (final Entry> entry : dataNodeIdToSubscriptionCommitContexts.entrySet()) { commitInternal(entry.getKey(), entry.getValue(), nack); + if (!nack) { + advanceCommittedPositions(entry.getValue()); + } + } + } + + protected Set ackWithPartialProgress( + final Iterable messages) throws SubscriptionException { + final List bufferedMessages = new ArrayList<>(); + final List commitContexts = new ArrayList<>(); + for (final SubscriptionMessage message : messages) { + bufferedMessages.add(message); + 
commitContexts.add(message.getCommitContext()); + } + + final Set removableCommitContexts = + ackCommitContextsWithPartialProgress(commitContexts); + final Set removableMessages = new HashSet<>(); + for (final SubscriptionMessage message : bufferedMessages) { + if (removableCommitContexts.contains(message.getCommitContext())) { + removableMessages.add(message); + } } + return removableMessages; + } + + protected Set ackCommitContextsWithPartialProgress( + final Iterable commitContexts) throws SubscriptionException { + final Map> dataNodeIdToCommitContexts = + new HashMap<>(); + for (final SubscriptionCommitContext commitContext : commitContexts) { + dataNodeIdToCommitContexts + .computeIfAbsent(commitContext.getDataNodeId(), ignored -> new ArrayList<>()) + .add(commitContext); + } + + final Set removableCommitContexts = new HashSet<>(); + for (final Entry> entry : + dataNodeIdToCommitContexts.entrySet()) { + final List groupedCommitContexts = entry.getValue(); + try { + commitInternal(entry.getKey(), groupedCommitContexts, false); + advanceCommittedPositions(groupedCommitContexts); + removableCommitContexts.addAll(groupedCommitContexts); + } catch (final SubscriptionConnectionException e) { + int stagedCount = 0; + int retainedCount = 0; + for (final SubscriptionCommitContext commitContext : groupedCommitContexts) { + if (isConsensusCommitContext(commitContext)) { + stagePendingRedirectAck(commitContext); + removableCommitContexts.add(commitContext); + stagedCount++; + } else { + retainedCount++; + } + } + if (stagedCount > 0) { + LOGGER.warn( + "{} staged {} consensus ack(s) for redirect after provider {} became unavailable", + this, + stagedCount, + entry.getKey()); + } + if (retainedCount > 0) { + LOGGER.warn( + "{} keep {} non-consensus ack(s) pending after provider {} commit failure", + this, + retainedCount, + entry.getKey(), + e); + } + } + } + return removableCommitContexts; } protected void nack(final Iterable messages) throws SubscriptionException { @@ 
-1390,6 +1628,365 @@ private void unsubscribeWithRedirection(final Set topicNames) throw new SubscriptionRuntimeCriticalException(errorMessage); } + /** + * Sends seek request to ALL available providers. Unlike subscribe/unsubscribe, seek is only + * considered successful if every available provider acknowledges it because data regions for the + * topic may be distributed across different nodes. + */ + private void seekWithRedirection( + final String topicName, final short seekType, final long timestamp) + throws SubscriptionException { + final List providers = this.providers.getAllAvailableProviders(); + if (providers.isEmpty()) { + throw new SubscriptionConnectionException( + String.format( + "Cluster has no available subscription providers when %s seek topic %s", + this, topicName)); + } + final List failedProviders = new ArrayList<>(); + Throwable firstFailure = null; + for (final AbstractSubscriptionProvider provider : providers) { + try { + provider.seek(topicName, seekType, timestamp); + } catch (final Exception e) { + failedProviders.add(provider); + if (Objects.isNull(firstFailure)) { + firstFailure = e; + } + LOGGER.warn( + "{} failed to seek topic {} from subscription provider {}; seek requires every provider to succeed, so the client will continue notifying the remaining providers before failing this seek.", + this, + topicName, + provider, + e); + } + } + if (!failedProviders.isEmpty()) { + final String errorMessage = + String.format( + "%s failed to seek topic %s on subscription providers %s; seek requires every available provider to succeed", + this, topicName, failedProviders); + LOGGER.warn(errorMessage); + throw new SubscriptionRuntimeCriticalException(errorMessage, firstFailure); + } + } + + /** Same all-provider success requirement as {@link #seekWithRedirection(String, short, long)}. 
*/ + private void seekWithRedirectionTopicProgress( + final String topicName, final TopicProgress topicProgress) throws SubscriptionException { + final List providers = this.providers.getAllAvailableProviders(); + if (providers.isEmpty()) { + throw new SubscriptionConnectionException( + String.format( + "Cluster has no available subscription providers when %s seek topic %s", + this, topicName)); + } + final List failedProviders = new ArrayList<>(); + Throwable firstFailure = null; + for (final AbstractSubscriptionProvider provider : providers) { + try { + provider.seekToTopicProgress(topicName, topicProgress); + } catch (final Exception e) { + failedProviders.add(provider); + if (Objects.isNull(firstFailure)) { + firstFailure = e; + } + LOGGER.warn( + "{} failed to seek topic {} to topicProgress(regionCount={}) from provider {}; seek requires every provider to succeed, so the client will continue notifying the remaining providers before failing this seek.", + this, + topicName, + topicProgress.getRegionProgress().size(), + provider, + e); + } + } + if (!failedProviders.isEmpty()) { + final String errorMessage = + String.format( + "%s failed to seek topic %s to topicProgress(regionCount=%d) on subscription providers %s; seek requires every available provider to succeed", + this, topicName, topicProgress.getRegionProgress().size(), failedProviders); + LOGGER.warn(errorMessage); + throw new SubscriptionRuntimeCriticalException(errorMessage, firstFailure); + } + } + + /** Same all-provider success requirement as {@link #seekWithRedirection(String, short, long)}. 
*/ + private void seekAfterWithRedirectionTopicProgress( + final String topicName, final TopicProgress topicProgress) throws SubscriptionException { + final List providers = this.providers.getAllAvailableProviders(); + if (providers.isEmpty()) { + throw new SubscriptionConnectionException( + String.format( + "Cluster has no available subscription providers when %s seekAfter topic %s", + this, topicName)); + } + final List failedProviders = new ArrayList<>(); + Throwable firstFailure = null; + for (final AbstractSubscriptionProvider provider : providers) { + try { + provider.seekAfterTopicProgress(topicName, topicProgress); + } catch (final Exception e) { + failedProviders.add(provider); + if (Objects.isNull(firstFailure)) { + firstFailure = e; + } + LOGGER.warn( + "{} failed to seekAfter topic {} to topicProgress(regionCount={}) from provider {}; seek requires every provider to succeed, so the client will continue notifying the remaining providers before failing this seekAfter.", + this, + topicName, + topicProgress.getRegionProgress().size(), + provider, + e); + } + } + if (!failedProviders.isEmpty()) { + final String errorMessage = + String.format( + "%s failed to seekAfter topic %s to topicProgress(regionCount=%d) on subscription providers %s; seek requires every available provider to succeed", + this, topicName, topicProgress.getRegionProgress().size(), failedProviders); + LOGGER.warn(errorMessage); + throw new SubscriptionRuntimeCriticalException(errorMessage, firstFailure); + } + } + + private Map buildCurrentProgressByTopic(final Set topicNames) { + final Map result = new HashMap<>(); + for (final String topicName : topicNames) { + final TopicProgress topicProgress = currentPositionsByTopic.get(topicName); + if (Objects.isNull(topicProgress) || topicProgress.getRegionProgress().isEmpty()) { + continue; + } + result.put(topicName, new TopicProgress(topicProgress.getRegionProgress())); + } + return result; + } + + private void advanceCurrentPositions(final 
List messages) { + for (final SubscriptionMessage message : messages) { + final SubscriptionCommitContext commitContext = message.getCommitContext(); + if (Objects.isNull(commitContext) || Objects.isNull(commitContext.getTopicName())) { + continue; + } + mergeTopicProgress( + currentPositionsByTopic, + commitContext.getTopicName(), + extractWriterId(commitContext), + extractWriterProgress(commitContext)); + } + } + + private void advanceCommittedPositions( + final List subscriptionCommitContexts) { + for (final SubscriptionCommitContext commitContext : subscriptionCommitContexts) { + if (Objects.isNull(commitContext) || Objects.isNull(commitContext.getTopicName())) { + continue; + } + mergeTopicProgress( + committedPositionsByTopic, + commitContext.getTopicName(), + extractWriterId(commitContext), + extractWriterProgress(commitContext)); + } + } + + private boolean isConsensusCommitContext(final SubscriptionCommitContext commitContext) { + return Objects.nonNull(commitContext) + && Objects.nonNull(commitContext.getWriterId()) + && Objects.nonNull(commitContext.getWriterProgress()) + && Objects.nonNull(commitContext.getRegionId()) + && !commitContext.getRegionId().isEmpty(); + } + + private String buildTopicRegionKey(final SubscriptionCommitContext commitContext) { + return commitContext.getTopicName() + '\u0001' + commitContext.getRegionId(); + } + + private void stagePendingRedirectAck(final SubscriptionCommitContext commitContext) { + pendingRedirectAcksByTopicRegion + .computeIfAbsent( + buildTopicRegionKey(commitContext), ignored -> ConcurrentHashMap.newKeySet()) + .add(commitContext); + } + + private void flushPendingRedirectAcks(final List currentMessages) { + final Map redirectTargetByTopicRegion = new HashMap<>(); + for (final SubscriptionMessage message : currentMessages) { + final SubscriptionCommitContext commitContext = message.getCommitContext(); + if (!isConsensusCommitContext(commitContext)) { + continue; + } + redirectTargetByTopicRegion.put( + 
buildTopicRegionKey(commitContext), commitContext.getDataNodeId()); + } + + for (final Entry entry : redirectTargetByTopicRegion.entrySet()) { + final Set pendingContexts = + pendingRedirectAcksByTopicRegion.get(entry.getKey()); + if (Objects.isNull(pendingContexts) || pendingContexts.isEmpty()) { + continue; + } + + final List contextsToRedirect = new ArrayList<>(pendingContexts); + try { + commitInternal(entry.getValue(), contextsToRedirect, false); + advanceCommittedPositions(contextsToRedirect); + contextsToRedirect.forEach(pendingContexts::remove); + if (pendingContexts.isEmpty()) { + pendingRedirectAcksByTopicRegion.remove(entry.getKey(), pendingContexts); + } + } catch (final SubscriptionException e) { + LOGGER.warn( + "{} failed to redirect {} pending consensus ack(s) for {} via provider {}", + this, + contextsToRedirect.size(), + entry.getKey(), + entry.getValue(), + e); + } + } + } + + private boolean isNewerWriterProgress( + final long newPhysicalTime, + final long newLocalSeq, + final long oldPhysicalTime, + final long oldLocalSeq) { + return newPhysicalTime > oldPhysicalTime + || (newPhysicalTime == oldPhysicalTime && newLocalSeq > oldLocalSeq); + } + + private void clearCurrentPositions(final String topicName) { + currentPositionsByTopic.remove(topicName); + } + + private void clearCommittedPositions(final String topicName) { + committedPositionsByTopic.remove(topicName); + } + + private void clearPendingRedirectAcks(final String topicName) { + final String prefix = topicName + '\u0001'; + pendingRedirectAcksByTopicRegion.keySet().removeIf(key -> key.startsWith(prefix)); + } + + private void setCurrentPositions(final String topicName, final TopicProgress topicProgress) { + if (Objects.isNull(topicProgress) || topicProgress.getRegionProgress().isEmpty()) { + currentPositionsByTopic.remove(topicName); + return; + } + currentPositionsByTopic.put(topicName, new TopicProgress(topicProgress.getRegionProgress())); + } + + private void 
setCommittedPositions(final String topicName, final TopicProgress topicProgress) { + if (Objects.isNull(topicProgress) || topicProgress.getRegionProgress().isEmpty()) { + committedPositionsByTopic.remove(topicName); + return; + } + committedPositionsByTopic.put(topicName, new TopicProgress(topicProgress.getRegionProgress())); + } + + private void overlayCurrentPositions(final String topicName, final TopicProgress topicProgress) { + overlayTopicProgress(currentPositionsByTopic, topicName, topicProgress); + } + + private void overlayCommittedPositions( + final String topicName, final TopicProgress topicProgress) { + overlayTopicProgress(committedPositionsByTopic, topicName, topicProgress); + } + + private void overlayTopicProgress( + final Map progressByTopic, + final String topicName, + final TopicProgress topicProgress) { + if (Objects.isNull(topicName) + || topicName.isEmpty() + || Objects.isNull(topicProgress) + || topicProgress.getRegionProgress().isEmpty()) { + return; + } + progressByTopic.compute( + topicName, + (ignored, oldTopicProgress) -> { + final Map mergedRegionProgress = + Objects.nonNull(oldTopicProgress) + ? new HashMap<>(oldTopicProgress.getRegionProgress()) + : new HashMap<>(); + topicProgress + .getRegionProgress() + .forEach( + (regionId, regionProgress) -> { + if (Objects.isNull(regionId) + || regionId.isEmpty() + || Objects.isNull(regionProgress) + || regionProgress.getWriterPositions().isEmpty()) { + return; + } + mergedRegionProgress.put( + regionId, + new RegionProgress(new HashMap<>(regionProgress.getWriterPositions()))); + }); + return mergedRegionProgress.isEmpty() ? 
null : new TopicProgress(mergedRegionProgress); + }); + } + + private WriterId extractWriterId(final SubscriptionCommitContext commitContext) { + if (Objects.nonNull(commitContext.getWriterId())) { + return commitContext.getWriterId(); + } + if (Objects.isNull(commitContext.getRegionId()) || commitContext.getRegionId().isEmpty()) { + return null; + } + return new WriterId(commitContext.getRegionId(), commitContext.getDataNodeId(), 0L); + } + + private WriterProgress extractWriterProgress(final SubscriptionCommitContext commitContext) { + if (Objects.nonNull(commitContext.getWriterProgress())) { + return commitContext.getWriterProgress(); + } + if (commitContext.getLocalSeq() < 0) { + return null; + } + return new WriterProgress(commitContext.getPhysicalTime(), commitContext.getLocalSeq()); + } + + private void mergeTopicProgress( + final Map progressByTopic, + final String topicName, + final WriterId writerId, + final WriterProgress writerProgress) { + if (Objects.isNull(writerId) + || Objects.isNull(writerProgress) + || Objects.isNull(topicName) + || topicName.isEmpty()) { + return; + } + progressByTopic.compute( + topicName, + (key, oldTopicProgress) -> { + final Map regionProgressById = + Objects.nonNull(oldTopicProgress) + ? new HashMap<>(oldTopicProgress.getRegionProgress()) + : new HashMap<>(); + final RegionProgress oldRegionProgress = regionProgressById.get(writerId.getRegionId()); + final Map writerPositions = + Objects.nonNull(oldRegionProgress) + ? new HashMap<>(oldRegionProgress.getWriterPositions()) + : new HashMap<>(); + writerPositions.merge( + writerId, + writerProgress, + (oldVal, newVal) -> + isNewerWriterProgress( + newVal.getPhysicalTime(), + newVal.getLocalSeq(), + oldVal.getPhysicalTime(), + oldVal.getLocalSeq()) + ? 
newVal + : oldVal); + regionProgressById.put(writerId.getRegionId(), new RegionProgress(writerPositions)); + return new TopicProgress(regionProgressById); + }); + } + Map fetchAllEndPointsWithRedirection() throws SubscriptionException { final List providers = this.providers.getAllAvailableProviders(); if (providers.isEmpty()) { diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java index 7f3582d195d6a..cfa0390a48300 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java @@ -37,11 +37,13 @@ import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollRequest; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollRequestType; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponse; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeCloseReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeCommitReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeHandshakeReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeHeartbeatReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribePollReq; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSubscribeReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeUnsubscribeReq; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeHandshakeResp; @@ -59,6 +61,7 @@ 
import org.slf4j.LoggerFactory; import java.io.IOException; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -332,14 +335,107 @@ Map unsubscribe(final Set topicNames) throws Subscr return unsubscribeResp.getTopics(); } + void seek(final String topicName, final short seekType, final long timestamp) + throws SubscriptionException { + final PipeSubscribeSeekReq req; + try { + req = PipeSubscribeSeekReq.toTPipeSubscribeReq(topicName, seekType, timestamp); + } catch (final IOException e) { + LOGGER.warn( + "IOException occurred when SubscriptionProvider {} serialize seek request for topic {}", + this, + topicName, + e); + throw new SubscriptionRuntimeNonCriticalException(e.getMessage(), e); + } + final TPipeSubscribeResp resp; + try { + resp = getSessionConnection().pipeSubscribe(req); + } catch (final TException | IoTDBConnectionException e) { + LOGGER.warn( + "TException/IoTDBConnectionException occurred when SubscriptionProvider {} seek with request for topic {}, set SubscriptionProvider unavailable", + this, + topicName, + e); + setUnavailable(); + throw new SubscriptionConnectionException(e.getMessage(), e); + } + verifyPipeSubscribeSuccess(resp.status); + } + + void seekToTopicProgress(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + final PipeSubscribeSeekReq req; + try { + req = PipeSubscribeSeekReq.toTPipeSubscribeReq(topicName, topicProgress); + } catch (final IOException e) { + LOGGER.warn( + "IOException occurred when SubscriptionProvider {} serialize seek(topicProgress) for topic {}", + this, + topicName, + e); + throw new SubscriptionRuntimeNonCriticalException(e.getMessage(), e); + } + final TPipeSubscribeResp resp; + try { + resp = getSessionConnection().pipeSubscribe(req); + } catch (final TException | IoTDBConnectionException e) { + LOGGER.warn( + "TException/IoTDBConnectionException occurred when SubscriptionProvider {} seek(topicProgress) for topic 
{}, set SubscriptionProvider unavailable", + this, + topicName, + e); + setUnavailable(); + throw new SubscriptionConnectionException(e.getMessage(), e); + } + verifyPipeSubscribeSuccess(resp.status); + } + + void seekAfterTopicProgress(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + final PipeSubscribeSeekReq req; + try { + req = PipeSubscribeSeekReq.toTPipeSubscribeSeekAfterReq(topicName, topicProgress); + } catch (final IOException e) { + LOGGER.warn( + "IOException occurred when SubscriptionProvider {} serialize seekAfter(topicProgress) for topic {}", + this, + topicName, + e); + throw new SubscriptionRuntimeNonCriticalException(e.getMessage(), e); + } + final TPipeSubscribeResp resp; + try { + resp = getSessionConnection().pipeSubscribe(req); + } catch (final TException | IoTDBConnectionException e) { + LOGGER.warn( + "TException/IoTDBConnectionException occurred when SubscriptionProvider {} seekAfter(topicProgress) for topic {}, set SubscriptionProvider unavailable", + this, + topicName, + e); + setUnavailable(); + throw new SubscriptionConnectionException(e.getMessage(), e); + } + verifyPipeSubscribeSuccess(resp.status); + } + List poll(final Set topicNames, final long timeoutMs) throws SubscriptionException { + return poll(topicNames, timeoutMs, Collections.emptyMap()); + } + + List poll( + final Set topicNames, + final long timeoutMs, + final Map progressByTopic) + throws SubscriptionException { return poll( new SubscriptionPollRequest( SubscriptionPollRequestType.POLL.getType(), new PollPayload(topicNames), timeoutMs, - session.getThriftMaxFrameSize())); + session.getThriftMaxFrameSize(), + progressByTopic)); } List pollFile( @@ -447,7 +543,7 @@ private static void verifyPipeSubscribeSuccess(final TSStatus status) String.format(SUBSCRIPTION_PIPE_TIMEOUT_FORMATTER, status.code, status.message)); case 1900: // SUBSCRIPTION_VERSION_ERROR case 1901: // SUBSCRIPTION_TYPE_ERROR - case 1909: // 
SUBSCRIPTION_MISSING_CUSTOMER + case 1909: // SUBSCRIPTION_MISSING_CONSUMER case 1912: // SUBSCRIPTION_NOT_ENABLED_ERROR default: { diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPullConsumer.java index 991857bc685ee..2607baebe2962 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPullConsumer.java @@ -22,8 +22,11 @@ import org.apache.iotdb.rpc.subscription.config.ConsumerConstant; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.session.subscription.consumer.AsyncCommitCallback; +import org.apache.iotdb.session.subscription.payload.PollResult; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; import org.apache.iotdb.session.subscription.util.CollectionUtils; import org.apache.iotdb.session.subscription.util.IdentifierUtils; @@ -31,6 +34,7 @@ import org.slf4j.LoggerFactory; import java.time.Duration; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -65,6 +69,8 @@ public abstract class AbstractSubscriptionPullConsumer extends AbstractSubscript private final boolean autoCommit; private final long autoCommitIntervalMs; + private final List processors = new ArrayList<>(); + private SortedMap> uncommittedCommitContexts; private final AtomicBoolean isClosed = new AtomicBoolean(true); @@ -135,6 +141,24 @@ 
public synchronized void close() { return; } + // flush all processors and commit any remaining buffered messages + if (!processors.isEmpty()) { + final List flushed = new ArrayList<>(); + for (final SubscriptionMessageProcessor processor : processors) { + final List out = processor.flush(); + if (out != null) { + flushed.addAll(out); + } + } + if (!flushed.isEmpty() && autoCommit) { + try { + commitSync(flushed); + } catch (final SubscriptionException e) { + LOGGER.warn("Failed to commit flushed processor messages on close", e); + } + } + } + if (autoCommit) { // commit all uncommitted messages commitAllUncommittedMessages(); @@ -186,7 +210,7 @@ protected List poll(final Set topicNames, final lon } final List messages = multiplePoll(parsedTopicNames, timeoutMs); - if (messages.isEmpty()) { + if (messages.isEmpty() && processors.isEmpty()) { LOGGER.info( "SubscriptionPullConsumer {} poll empty message from topics {} after {} millisecond(s)", this, @@ -195,6 +219,35 @@ protected List poll(final Set topicNames, final lon return messages; } + // Apply processor chain if configured + List processed = messages; + if (!processors.isEmpty()) { + for (final SubscriptionMessageProcessor processor : processors) { + processed = processor.process(processed); + } + } + + // Update watermark timestamp before stripping watermark events + for (final SubscriptionMessage m : processed) { + if (m.getMessageType() == SubscriptionMessageType.WATERMARK.getType()) { + final long ts = m.getWatermarkTimestamp(); + if (ts > latestWatermarkTimestamp) { + latestWatermarkTimestamp = ts; + } + } + } + + // Strip system messages — they are only for processors, not for users + processed.removeIf( + m -> { + final short type = m.getMessageType(); + return type == SubscriptionMessageType.WATERMARK.getType(); + }); + + if (processed.isEmpty()) { + return processed; + } + // add to uncommitted messages if (autoCommit) { final long currentTimestamp = System.currentTimeMillis(); @@ -205,12 +258,56 @@ 
protected List poll(final Set topicNames, final lon uncommittedCommitContexts .computeIfAbsent(index, o -> new ConcurrentSkipListSet<>()) .addAll( - messages.stream() + processed.stream() .map(SubscriptionMessage::getCommitContext) .collect(Collectors.toList())); } - return messages; + return processed; + } + + /////////////////////////////// processor /////////////////////////////// + + /** + * Adds a message processor to the pipeline. Processors are applied in order on each poll() call. + * + * @param processor the processor to add + */ + protected AbstractSubscriptionPullConsumer addProcessor( + final SubscriptionMessageProcessor processor) { + processors.add(processor); + return this; + } + + /** + * Polls with processor metadata. Returns a {@link PollResult} containing the messages, the total + * number of buffered messages across all processors, and the current watermark. + */ + protected PollResult pollWithInfo(final long timeoutMs) throws SubscriptionException { + final List messages = poll(timeoutMs); + int totalBuffered = 0; + long watermark = -1; + for (final SubscriptionMessageProcessor processor : processors) { + totalBuffered += processor.getBufferedCount(); + if (processor instanceof WatermarkProcessor) { + watermark = ((WatermarkProcessor) processor).getWatermark(); + } + } + return new PollResult(messages, totalBuffered, watermark); + } + + protected PollResult pollWithInfo(final Set topicNames, final long timeoutMs) + throws SubscriptionException { + final List messages = poll(topicNames, timeoutMs); + int totalBuffered = 0; + long watermark = -1; + for (final SubscriptionMessageProcessor processor : processors) { + totalBuffered += processor.getBufferedCount(); + if (processor instanceof WatermarkProcessor) { + watermark = ((WatermarkProcessor) processor).getWatermark(); + } + } + return new PollResult(messages, totalBuffered, watermark); } /////////////////////////////// commit /////////////////////////////// @@ -242,6 +339,46 @@ protected void 
commitAsync( super.commitAsync(messages, callback); } + /////////////////////////////// seek /////////////////////////////// + + /** + * Clears uncommitted auto-commit messages after seek to prevent stale acks from committing events + * that belonged to the pre-seek position. + */ + @Override + public void seekToBeginning(final String topicName) throws SubscriptionException { + super.seekToBeginning(topicName); + if (autoCommit) { + uncommittedCommitContexts.clear(); + } + } + + @Override + public void seekToEnd(final String topicName) throws SubscriptionException { + super.seekToEnd(topicName); + if (autoCommit) { + uncommittedCommitContexts.clear(); + } + } + + @Override + public void seek(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + super.seek(topicName, topicProgress); + if (autoCommit) { + uncommittedCommitContexts.clear(); + } + } + + @Override + public void seekAfter(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + super.seekAfter(topicName, topicProgress); + if (autoCommit) { + uncommittedCommitContexts.clear(); + } + } + /////////////////////////////// auto commit /////////////////////////////// private void submitAutoCommitWorker() { @@ -278,8 +415,19 @@ public void run() { for (final Map.Entry> entry : uncommittedCommitContexts.headMap(index).entrySet()) { try { - ackCommitContexts(entry.getValue()); - uncommittedCommitContexts.remove(entry.getKey()); + final Set removableCommitContexts = + ackCommitContextsWithPartialProgress(entry.getValue()); + if (removableCommitContexts.isEmpty()) { + continue; + } + if (removableCommitContexts.size() == entry.getValue().size()) { + uncommittedCommitContexts.remove(entry.getKey()); + continue; + } + entry.getValue().removeAll(removableCommitContexts); + if (entry.getValue().isEmpty()) { + uncommittedCommitContexts.remove(entry.getKey()); + } } catch (final Exception e) { LOGGER.warn("something unexpected happened when auto 
commit messages...", e); } @@ -291,8 +439,19 @@ private void commitAllUncommittedMessages() { for (final Map.Entry> entry : uncommittedCommitContexts.entrySet()) { try { - ackCommitContexts(entry.getValue()); - uncommittedCommitContexts.remove(entry.getKey()); + final Set removableCommitContexts = + ackCommitContextsWithPartialProgress(entry.getValue()); + if (removableCommitContexts.isEmpty()) { + continue; + } + if (removableCommitContexts.size() == entry.getValue().size()) { + uncommittedCommitContexts.remove(entry.getKey()); + continue; + } + entry.getValue().removeAll(removableCommitContexts); + if (entry.getValue().isEmpty()) { + uncommittedCommitContexts.remove(entry.getKey()); + } } catch (final Exception e) { LOGGER.warn("something unexpected happened when commit messages during close", e); } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPushConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPushConsumer.java index 3ff93db218b27..1ac9f08696ddb 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPushConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPushConsumer.java @@ -26,6 +26,7 @@ import org.apache.iotdb.session.subscription.consumer.ConsumeResult; import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePushConsumer; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; import org.apache.iotdb.session.subscription.util.CollectionUtils; import org.slf4j.Logger; @@ -180,6 +181,21 @@ public void run() { try { final List messages = multiplePoll(subscribedTopics.keySet(), autoPollTimeoutMs); + // Update watermark timestamp before 
stripping watermark events + for (final SubscriptionMessage m : messages) { + if (m.getMessageType() == SubscriptionMessageType.WATERMARK.getType()) { + final long ts = m.getWatermarkTimestamp(); + if (ts > latestWatermarkTimestamp) { + latestWatermarkTimestamp = ts; + } + } + } + // Strip system messages — push consumer does not use processors + messages.removeIf( + m -> { + final short type = m.getMessageType(); + return type == SubscriptionMessageType.WATERMARK.getType(); + }); if (messages.isEmpty()) { LOGGER.info( "SubscriptionPushConsumer {} poll empty message from topics {} after {} millisecond(s)", diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/ColumnAlignProcessor.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/ColumnAlignProcessor.java new file mode 100644 index 0000000000000..13910a86c9abe --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/ColumnAlignProcessor.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; + +import org.apache.tsfile.utils.BitMap; +import org.apache.tsfile.write.record.Tablet; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * A non-buffering processor that forward-fills null columns in each Tablet using the last known + * value for the same device/table. This is useful for CDC scenarios where a write only updates a + * subset of columns, leaving others null; the processor fills them with the most recent value. + * + *

State is maintained per device (identified by {@code Tablet.getDeviceId()} for tree-model or + * {@code Tablet.getTableName()} for table-model). + */ +public class ColumnAlignProcessor implements SubscriptionMessageProcessor { + + // deviceKey -> (columnIndex -> lastValue) + private final Map> lastValues = new HashMap<>(); + + @Override + public List process(final List messages) { + for (final SubscriptionMessage message : messages) { + if (message.getMessageType() != SubscriptionMessageType.RECORD_HANDLER.getType()) { + continue; + } + final Iterator tablets = message.getRecordTabletIterator(); + while (tablets.hasNext()) { + fillTablet(tablets.next()); + } + } + return messages; + } + + @Override + public List flush() { + return Collections.emptyList(); + } + + private void fillTablet(final Tablet tablet) { + final String deviceKey = getDeviceKey(tablet); + final Map cache = lastValues.computeIfAbsent(deviceKey, k -> new HashMap<>()); + + final Object[] values = tablet.getValues(); + final BitMap[] bitMaps = tablet.getBitMaps(); + final int rowSize = tablet.getRowSize(); + final int columnCount = values.length; + + for (int row = 0; row < rowSize; row++) { + for (int col = 0; col < columnCount; col++) { + final boolean isNull = + bitMaps != null && bitMaps[col] != null && bitMaps[col].isMarked(row); + if (isNull) { + // try forward-fill from cache + final Object cached = cache.get(col); + if (cached != null) { + setValueAt(values[col], row, cached); + bitMaps[col].unmark(row); + } + } else { + // update cache with this non-null value + cache.put(col, getValueAt(values[col], row)); + } + } + } + } + + private static String getDeviceKey(final Tablet tablet) { + // tree model uses deviceId; table model uses tableName + final String deviceId = tablet.getDeviceId(); + return deviceId != null ? 
deviceId : tablet.getTableName(); + } + + private static Object getValueAt(final Object columnArray, final int row) { + if (columnArray instanceof long[]) { + return ((long[]) columnArray)[row]; + } else if (columnArray instanceof int[]) { + return ((int[]) columnArray)[row]; + } else if (columnArray instanceof double[]) { + return ((double[]) columnArray)[row]; + } else if (columnArray instanceof float[]) { + return ((float[]) columnArray)[row]; + } else if (columnArray instanceof boolean[]) { + return ((boolean[]) columnArray)[row]; + } else if (columnArray instanceof Object[]) { + return ((Object[]) columnArray)[row]; + } + return null; + } + + private static void setValueAt(final Object columnArray, final int row, final Object value) { + if (columnArray instanceof long[]) { + ((long[]) columnArray)[row] = (Long) value; + } else if (columnArray instanceof int[]) { + ((int[]) columnArray)[row] = (Integer) value; + } else if (columnArray instanceof double[]) { + ((double[]) columnArray)[row] = (Double) value; + } else if (columnArray instanceof float[]) { + ((float[]) columnArray)[row] = (Float) value; + } else if (columnArray instanceof boolean[]) { + ((boolean[]) columnArray)[row] = (Boolean) value; + } else if (columnArray instanceof Object[]) { + ((Object[]) columnArray)[row] = value; + } + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/SubscriptionMessageProcessor.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/SubscriptionMessageProcessor.java new file mode 100644 index 0000000000000..ceee674cd6901 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/SubscriptionMessageProcessor.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; + +import java.util.List; + +/** + * A processor that transforms, filters, or enriches subscription messages in the pull consumer + * pipeline. Processors are chained and invoked on each poll() call. + * + *

Processors may buffer messages internally (e.g., for watermark-based ordering) and return them + * in later process() calls. Buffered messages should be released via {@link #flush()} when the + * consumer closes. + */ +public interface SubscriptionMessageProcessor { + + /** + * Process a batch of messages. May return fewer, more, or different messages than the input. + * + * @param messages the messages from the previous stage (or raw poll) + * @return messages to pass to the next stage (or to the user) + */ + List process(List messages); + + /** + * Flush all internally buffered messages. Called when the consumer is closing. + * + * @return any remaining buffered messages + */ + List flush(); + + /** Returns the number of messages currently buffered by this processor. */ + default int getBufferedCount() { + return 0; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessor.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessor.java new file mode 100644 index 0000000000000..8c17896ce5de5 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessor.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; + +import org.apache.tsfile.write.record.Tablet; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.PriorityQueue; + +/** + * A buffering processor that reorders messages based on watermark semantics. Messages are buffered + * internally and emitted only when the watermark advances past their maximum timestamp. + * + *

Watermark = (minimum of latest timestamp per active source) - maxOutOfOrdernessMs + * + *

A source is considered "stale" if its latest timestamp has not increased for {@code + * staleSourceTimeoutMs}. Stale sources are excluded from the watermark calculation, preventing a + * single slow or idle source from anchoring the global watermark indefinitely. + * + *

Server-side WATERMARK events (carrying per-region timestamp progress) serve as heartbeats, + * confirming source liveness. They advance the per-source timestamp only when their timestamp is + * higher than the previously observed value. + * + *

A timeout mechanism ensures that buffered messages are eventually flushed even if no new data + * arrives, preventing unbounded buffering. + * + *

Note: This processor is primarily intended as a reference implementation. For + * production use with large-scale out-of-order data, consider using a downstream stream processing + * framework (Flink, Spark) for watermark handling. + */ +public class WatermarkProcessor implements SubscriptionMessageProcessor { + + private static final long DEFAULT_STALE_SOURCE_TIMEOUT_MS = 30_000L; + private static final long DEFAULT_MAX_BUFFER_BYTES = 64L * 1024 * 1024; // 64 MB + + private final long maxOutOfOrdernessMs; + private final long timeoutMs; + private final long staleSourceTimeoutMs; + private final long maxBufferBytes; + + // Buffer ordered by message max timestamp + private final PriorityQueue buffer = + new PriorityQueue<>((a, b) -> Long.compare(a.maxTimestamp, b.maxTimestamp)); + + // Track latest timestamp per source (deviceId/tableName) + private final java.util.Map latestPerSource = new java.util.HashMap<>(); + // Track wall-clock time when each source's timestamp last increased + private final java.util.Map lastAdvancedTimeMs = new java.util.HashMap<>(); + private long lastEmitTimeMs = System.currentTimeMillis(); + private long bufferedBytes = 0; + + // Current watermark value + private long watermark = Long.MIN_VALUE; + + /** + * Creates a WatermarkProcessor with default stale source timeout (30 seconds). + * + * @param maxOutOfOrdernessMs maximum expected out-of-orderness in milliseconds + * @param timeoutMs if no data arrives within this duration, force-flush all buffered messages + */ + public WatermarkProcessor(final long maxOutOfOrdernessMs, final long timeoutMs) { + this(maxOutOfOrdernessMs, timeoutMs, DEFAULT_STALE_SOURCE_TIMEOUT_MS, DEFAULT_MAX_BUFFER_BYTES); + } + + /** + * Creates a WatermarkProcessor. 
+ * + * @param maxOutOfOrdernessMs maximum expected out-of-orderness in milliseconds + * @param timeoutMs if no data arrives within this duration, force-flush all buffered messages + * @param staleSourceTimeoutMs if a source's timestamp has not increased for this duration, it is + * excluded from watermark calculation. Use {@link Long#MAX_VALUE} to disable. + * @param maxBufferBytes maximum total estimated bytes of buffered messages. When exceeded, all + * buffered messages are force-flushed regardless of watermark. Defaults to 64 MB. + */ + public WatermarkProcessor( + final long maxOutOfOrdernessMs, + final long timeoutMs, + final long staleSourceTimeoutMs, + final long maxBufferBytes) { + this.maxOutOfOrdernessMs = maxOutOfOrdernessMs; + this.timeoutMs = timeoutMs; + this.staleSourceTimeoutMs = staleSourceTimeoutMs; + this.maxBufferBytes = maxBufferBytes; + } + + @Override + public List process(final List messages) { + final long now = System.currentTimeMillis(); + + // Buffer incoming messages and update per-source timestamps + for (final SubscriptionMessage message : messages) { + // WATERMARK events carry server-side timestamp progress per region. + // They serve as heartbeats and advance per-source tracking only when the timestamp + // actually increases. 
+ if (message.getMessageType() == SubscriptionMessageType.WATERMARK.getType()) { + final String regionKey = + "region-" + + message.getCommitContext().getDataNodeId() + + "-" + + message.getCommitContext().getRegionId(); + advanceSourceTimestamp(regionKey, message.getWatermarkTimestamp(), now); + continue; // Do not buffer system events + } + + final long maxTs = extractMaxTimestamp(message); + final long estimatedSize = message.estimateSize(); + buffer.add(new TimestampedMessage(message, maxTs, estimatedSize)); + bufferedBytes += estimatedSize; + updateSourceTimestamp(message, maxTs, now); + } + + // Compute watermark = min(latest per active source) - maxOutOfOrderness + // Sources whose timestamp has not increased for staleSourceTimeoutMs are excluded. + if (!latestPerSource.isEmpty()) { + long minLatest = Long.MAX_VALUE; + for (final java.util.Map.Entry entry : latestPerSource.entrySet()) { + final Long lastAdv = lastAdvancedTimeMs.get(entry.getKey()); + if (lastAdv != null && (now - lastAdv) <= staleSourceTimeoutMs) { + minLatest = Math.min(minLatest, entry.getValue()); + } + } + if (minLatest != Long.MAX_VALUE) { + watermark = minLatest - maxOutOfOrdernessMs; + } + // If all sources are stale, watermark stays unchanged and timeout will handle it. + } + + // Emit messages whose maxTimestamp <= watermark + final List emitted = emit(watermark); + + // Buffer overflow: force-flush all if buffer exceeds byte limit + if (bufferedBytes > maxBufferBytes) { + return forceFlushAll(); + } + + // Timeout: if nothing was emitted and timeout exceeded, force-flush all + if (emitted.isEmpty() && (now - lastEmitTimeMs) >= timeoutMs && !buffer.isEmpty()) { + return forceFlushAll(); + } + + if (!emitted.isEmpty()) { + lastEmitTimeMs = now; + } + return emitted; + } + + @Override + public List flush() { + return forceFlushAll(); + } + + @Override + public int getBufferedCount() { + return buffer.size(); + } + + /** Returns the current watermark value. 
*/ + public long getWatermark() { + return watermark; + } + + private List emit(final long watermarkValue) { + final List result = new ArrayList<>(); + while (!buffer.isEmpty() && buffer.peek().maxTimestamp <= watermarkValue) { + final TimestampedMessage tm = buffer.poll(); + bufferedBytes -= tm.estimatedSize; + result.add(tm.message); + } + return result; + } + + private List forceFlushAll() { + final List result = new ArrayList<>(buffer.size()); + while (!buffer.isEmpty()) { + result.add(buffer.poll().message); + } + bufferedBytes = 0; + lastEmitTimeMs = System.currentTimeMillis(); + return result; + } + + private static long extractMaxTimestamp(final SubscriptionMessage message) { + long maxTs = Long.MIN_VALUE; + if (message.getMessageType() == SubscriptionMessageType.RECORD_HANDLER.getType()) { + final Iterator it = message.getRecordTabletIterator(); + while (it.hasNext()) { + final Tablet tablet = it.next(); + final long[] timestamps = tablet.getTimestamps(); + final int rowSize = tablet.getRowSize(); + for (int i = 0; i < rowSize; i++) { + maxTs = Math.max(maxTs, timestamps[i]); + } + } + } + // For non-tablet messages or empty messages, use current wall clock + if (maxTs == Long.MIN_VALUE) { + maxTs = System.currentTimeMillis(); + } + return maxTs; + } + + private void updateSourceTimestamp( + final SubscriptionMessage message, final long maxTs, final long nowMs) { + // Use region-based key so data events and WATERMARK events share the same key namespace. + final String regionId = message.getCommitContext().getRegionId(); + final int dataNodeId = message.getCommitContext().getDataNodeId(); + final String key = "region-" + dataNodeId + "-" + regionId; + advanceSourceTimestamp(key, maxTs, nowMs); + } + + /** + * Updates the per-source timestamp tracking. Only records a new "last advanced" wall-clock time + * when the timestamp actually increases, so that stale sources (whose timestamps don't advance) + * are eventually excluded from watermark calculation. 
+ */ + private void advanceSourceTimestamp(final String key, final long newTs, final long nowMs) { + final Long oldTs = latestPerSource.get(key); + if (oldTs == null || newTs > oldTs) { + latestPerSource.put(key, newTs); + lastAdvancedTimeMs.put(key, nowMs); + } + } + + private static final class TimestampedMessage { + final SubscriptionMessage message; + final long maxTimestamp; + final long estimatedSize; + + TimestampedMessage( + final SubscriptionMessage message, final long maxTimestamp, final long estimatedSize) { + this.message = message; + this.maxTimestamp = maxTimestamp; + this.estimatedSize = estimatedSize; + } + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/table/SubscriptionTablePullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/table/SubscriptionTablePullConsumer.java index 83dd39aebbf7d..e3fb90cda470a 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/table/SubscriptionTablePullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/table/SubscriptionTablePullConsumer.java @@ -25,6 +25,8 @@ import org.apache.iotdb.session.subscription.consumer.ISubscriptionTablePullConsumer; import org.apache.iotdb.session.subscription.consumer.base.AbstractSubscriptionProvider; import org.apache.iotdb.session.subscription.consumer.base.AbstractSubscriptionPullConsumer; +import org.apache.iotdb.session.subscription.consumer.base.SubscriptionMessageProcessor; +import org.apache.iotdb.session.subscription.payload.PollResult; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import java.time.Duration; @@ -173,4 +175,24 @@ public String getConsumerGroupId() { public boolean allTopicMessagesHaveBeenConsumed() { return super.allTopicMessagesHaveBeenConsumed(); } + + /////////////////////////////// processor /////////////////////////////// 
+ + public SubscriptionTablePullConsumer addProcessor(final SubscriptionMessageProcessor processor) { + super.addProcessor(processor); + return this; + } + + public PollResult pollWithInfo(final long timeoutMs) throws SubscriptionException { + return super.pollWithInfo(timeoutMs); + } + + public PollResult pollWithInfo(final Duration timeout) throws SubscriptionException { + return super.pollWithInfo(timeout.toMillis()); + } + + public PollResult pollWithInfo(final Set topicNames, final long timeoutMs) + throws SubscriptionException { + return super.pollWithInfo(topicNames, timeoutMs); + } } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/tree/SubscriptionTreePullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/tree/SubscriptionTreePullConsumer.java index 23050893f660d..c4daab68839aa 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/tree/SubscriptionTreePullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/tree/SubscriptionTreePullConsumer.java @@ -27,6 +27,8 @@ import org.apache.iotdb.session.subscription.consumer.ISubscriptionTreePullConsumer; import org.apache.iotdb.session.subscription.consumer.base.AbstractSubscriptionProvider; import org.apache.iotdb.session.subscription.consumer.base.AbstractSubscriptionPullConsumer; +import org.apache.iotdb.session.subscription.consumer.base.SubscriptionMessageProcessor; +import org.apache.iotdb.session.subscription.payload.PollResult; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import org.apache.iotdb.session.subscription.util.IdentifierUtils; @@ -220,6 +222,26 @@ public boolean allTopicMessagesHaveBeenConsumed() { return super.allTopicMessagesHaveBeenConsumed(); } + /////////////////////////////// processor /////////////////////////////// + + public SubscriptionTreePullConsumer 
addProcessor(final SubscriptionMessageProcessor processor) { + super.addProcessor(processor); + return this; + } + + public PollResult pollWithInfo(final long timeoutMs) throws SubscriptionException { + return super.pollWithInfo(timeoutMs); + } + + public PollResult pollWithInfo(final Duration timeout) throws SubscriptionException { + return super.pollWithInfo(timeout.toMillis()); + } + + public PollResult pollWithInfo(final Set topicNames, final long timeoutMs) + throws SubscriptionException { + return super.pollWithInfo(topicNames, timeoutMs); + } + /////////////////////////////// builder /////////////////////////////// @Deprecated // keep for forward compatibility diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/PollResult.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/PollResult.java new file mode 100644 index 0000000000000..be56548116e11 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/PollResult.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.session.subscription.payload; + +import java.util.Collections; +import java.util.List; + +/** Result of a poll operation that includes processor metadata alongside the messages. */ +public class PollResult { + + private final List messages; + private final int bufferedCount; + private final long watermark; + + public PollResult( + final List messages, final int bufferedCount, final long watermark) { + this.messages = messages != null ? messages : Collections.emptyList(); + this.bufferedCount = bufferedCount; + this.watermark = watermark; + } + + /** Returns the processed messages ready for consumption. */ + public List getMessages() { + return messages; + } + + /** Returns the total number of messages currently buffered across all processors. */ + public int getBufferedCount() { + return bufferedCount; + } + + /** + * Returns the current watermark timestamp (-1 if no watermark processor is configured). Messages + * with timestamps at or before this value have all been emitted. + */ + public long getWatermark() { + return watermark; + } + + @Override + public String toString() { + return "PollResult{messages=" + + messages.size() + + ", bufferedCount=" + + bufferedCount + + ", watermark=" + + watermark + + "}"; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessage.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessage.java index b4ea6f0166f1b..a2a7c9df51c8e 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessage.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessage.java @@ -40,6 +40,9 @@ public class SubscriptionMessage implements Comparable { private final SubscriptionMessageHandler handler; + /** Watermark timestamp, valid only when messageType == WATERMARK. 
*/ + private final long watermarkTimestamp; + private volatile boolean userDataRemoved = false; public SubscriptionMessage( @@ -47,6 +50,7 @@ public SubscriptionMessage( this.commitContext = commitContext; this.messageType = SubscriptionMessageType.RECORD_HANDLER.getType(); this.handler = new SubscriptionRecordHandler(tablets); + this.watermarkTimestamp = Long.MIN_VALUE; } public SubscriptionMessage( @@ -56,6 +60,16 @@ public SubscriptionMessage( this.commitContext = commitContext; this.messageType = SubscriptionMessageType.TS_FILE.getType(); this.handler = new SubscriptionTsFileHandler(absolutePath, databaseName); + this.watermarkTimestamp = Long.MIN_VALUE; + } + + /** Watermark message carrying server-side timestamp progress for a region. */ + public SubscriptionMessage( + final SubscriptionCommitContext commitContext, final long watermarkTimestamp) { + this.commitContext = commitContext; + this.messageType = SubscriptionMessageType.WATERMARK.getType(); + this.handler = null; + this.watermarkTimestamp = watermarkTimestamp; } public SubscriptionCommitContext getCommitContext() { @@ -66,12 +80,42 @@ public short getMessageType() { return messageType; } + /** + * Returns the watermark timestamp carried by this message. Only valid when {@code + * getMessageType() == SubscriptionMessageType.WATERMARK.getType()}. + * + * @return the watermark timestamp, or {@code Long.MIN_VALUE} if not a watermark message + */ + public long getWatermarkTimestamp() { + return watermarkTimestamp; + } + + /** + * Estimates the heap memory occupied by this message in bytes. For tablet-based messages, this + * delegates to {@link Tablet#ramBytesUsed()} for accurate per-column estimation. 
+ * + * @return estimated byte size + */ + public long estimateSize() { + // Object header + references + primitives (rough constant) + long size = 64; + if (handler instanceof SubscriptionRecordHandler) { + final Iterator it = getRecordTabletIterator(); + while (it.hasNext()) { + size += it.next().ramBytesUsed(); + } + } + return size; + } + public void removeUserData() { if (userDataRemoved) { return; } - handler.removeUserData(); + if (Objects.nonNull(handler)) { + handler.removeUserData(); + } if (handler instanceof SubscriptionRecordHandler) { userDataRemoved = true; } @@ -89,13 +133,14 @@ public boolean equals(final Object obj) { } final SubscriptionMessage that = (SubscriptionMessage) obj; return Objects.equals(this.commitContext, that.commitContext) + && this.watermarkTimestamp == that.watermarkTimestamp && Objects.equals(this.messageType, that.messageType) && Objects.equals(this.handler, that.handler); } @Override public int hashCode() { - return Objects.hash(commitContext, messageType, handler); + return Objects.hash(commitContext, messageType, handler, watermarkTimestamp); } @Override @@ -109,6 +154,8 @@ public String toString() { + commitContext + ", messageType=" + SubscriptionMessageType.valueOf(messageType).toString() + + ", watermarkTimestamp=" + + watermarkTimestamp + "}"; } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessageType.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessageType.java index 34189c2fa9b42..0732c0590c181 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessageType.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessageType.java @@ -26,6 +26,7 @@ public enum SubscriptionMessageType { RECORD_HANDLER((short) 0), TS_FILE((short) 1), + WATERMARK((short) 3), ; private final short type; 
diff --git a/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContextTest.java b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContextTest.java new file mode 100644 index 0000000000000..4c70d25bfd68d --- /dev/null +++ b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContextTest.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.PublicBAOS; +import org.apache.tsfile.utils.ReadWriteIOUtils; +import org.junit.Test; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class SubscriptionCommitContextTest { + + @Test + public void testDeserializeCurrentCommitIdContext() throws IOException { + final SubscriptionCommitContext original = + new SubscriptionCommitContext(1, 2, "topic", "group", 3L); + final ByteBuffer buffer = SubscriptionCommitContext.serialize(original); + + final SubscriptionCommitContext context = SubscriptionCommitContext.deserialize(buffer); + + assertEquals(1, context.getDataNodeId()); + assertEquals(2, context.getRebootTimes()); + assertEquals("topic", context.getTopicName()); + assertEquals("group", context.getConsumerGroupId()); + assertEquals(3L, context.getCommitId()); + assertEquals(0L, context.getSeekGeneration()); + assertEquals("", context.getRegionId()); + assertEquals(0L, context.getPhysicalTime()); + assertFalse(context.hasWriterProgress()); + assertTrue(context.isCommittable()); + } + + @Test + public void testDeserializeCurrentPhysicalTimeContext() throws IOException { + final SubscriptionCommitContext original = + new SubscriptionCommitContext(1, 2, "topic", "group", 3L, 4L, "region", 5L); + + final ByteBuffer buffer = SubscriptionCommitContext.serialize(original); + final SubscriptionCommitContext parsed = SubscriptionCommitContext.deserialize(buffer); + + assertEquals(original, parsed); + assertFalse(parsed.hasWriterProgress()); + assertTrue(parsed.isCommittable()); + } + + @Test + public void testDeserializeV2() throws IOException { + final WriterId writerId = new WriterId("region", 7, 8L); + final WriterProgress writerProgress = new WriterProgress(9L, 10L); + final 
SubscriptionCommitContext original = + new SubscriptionCommitContext(1, 2, "topic", "group", 3L, writerId, writerProgress); + + final ByteBuffer buffer = SubscriptionCommitContext.serialize(original); + final SubscriptionCommitContext parsed = SubscriptionCommitContext.deserialize(buffer); + + assertEquals(original, parsed); + assertEquals(writerId, parsed.getWriterId()); + assertEquals(writerProgress, parsed.getWriterProgress()); + assertEquals("region", parsed.getRegionId()); + assertEquals(9L, parsed.getPhysicalTime()); + assertEquals(10L, parsed.getLocalSeq()); + assertTrue(parsed.hasWriterProgress()); + assertTrue(parsed.isCommittable()); + } + + @Test(expected = IllegalArgumentException.class) + public void testDeserializeUnsupportedVersion() throws IOException { + final ByteBuffer buffer = buildCurrentBufferWithVersion((byte) 1, 1, 2, "topic", "group", 3L); + SubscriptionCommitContext.deserialize(buffer); + } + + private static ByteBuffer buildCurrentBuffer( + final int dataNodeId, + final int rebootTimes, + final String topicName, + final String consumerGroupId, + final long commitId) + throws IOException { + return buildCurrentBufferWithVersion( + (byte) 2, dataNodeId, rebootTimes, topicName, consumerGroupId, commitId); + } + + private static ByteBuffer buildCurrentBufferWithVersion( + final byte version, + final int dataNodeId, + final int rebootTimes, + final String topicName, + final String consumerGroupId, + final long commitId) + throws IOException { + try (final PublicBAOS byteArrayOutputStream = new PublicBAOS(); + final DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) { + ReadWriteIOUtils.write(version, outputStream); + ReadWriteIOUtils.write(dataNodeId, outputStream); + ReadWriteIOUtils.write(rebootTimes, outputStream); + ReadWriteIOUtils.write(topicName, outputStream); + ReadWriteIOUtils.write(consumerGroupId, outputStream); + ReadWriteIOUtils.write(commitId, outputStream); + ReadWriteIOUtils.write(0L, outputStream); + 
ReadWriteIOUtils.write("", outputStream); + ReadWriteIOUtils.write(0L, outputStream); + ReadWriteIOUtils.write((byte) 0, outputStream); + return ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size()); + } + } +} diff --git a/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequestTest.java b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequestTest.java new file mode 100644 index 0000000000000..ecfea3d160bc4 --- /dev/null +++ b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequestTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.junit.Test; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; + +public class SubscriptionPollRequestTest { + + @Test + public void testRoundTripWithProgressByTopic() throws IOException { + final Map writerPositions = new LinkedHashMap<>(); + writerPositions.put(new WriterId("1_100", 7, 2L), new WriterProgress(1001L, 11L)); + writerPositions.put(new WriterId("1_100", 8, 1L), new WriterProgress(999L, 9L)); + + final TopicProgress topicProgress = + new TopicProgress(Collections.singletonMap("1_100", new RegionProgress(writerPositions))); + final Map progressByTopic = new LinkedHashMap<>(); + progressByTopic.put("topicA", topicProgress); + + final SubscriptionPollRequest original = + new SubscriptionPollRequest( + SubscriptionPollRequestType.POLL.getType(), + new PollPayload(Collections.singleton("topicA")), + 1234L, + 4096L, + progressByTopic); + + final ByteBuffer serialized = SubscriptionPollRequest.serialize(original); + final SubscriptionPollRequest parsed = SubscriptionPollRequest.deserialize(serialized); + + assertEquals(original.getRequestType(), parsed.getRequestType()); + assertEquals(original.getTimeoutMs(), parsed.getTimeoutMs()); + assertEquals(original.getMaxBytes(), parsed.getMaxBytes()); + assertEquals(original.getPayload(), parsed.getPayload()); + assertEquals(progressByTopic, parsed.getProgressByTopic()); + } +} diff --git a/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReqTest.java b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReqTest.java new file mode 100644 index 0000000000000..c2afb43110289 --- /dev/null +++ 
b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReqTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.request; + +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.Test; + +import java.io.IOException; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; + +public class PipeSubscribeSeekReqTest { + + @Test + public void testTopicProgressSeekRoundTrip() throws IOException { + final Map writerPositions = new LinkedHashMap<>(); + writerPositions.put(new WriterId("1_100", 1, 2L), new WriterProgress(1000L, 10L)); + final TopicProgress original = + new TopicProgress(Collections.singletonMap("1_100", new RegionProgress(writerPositions))); + + final PipeSubscribeSeekReq req = + PipeSubscribeSeekReq.toTPipeSubscribeSeekAfterReq("topicA", original); + final 
PipeSubscribeSeekReq parsed = PipeSubscribeSeekReq.fromTPipeSubscribeReq(req); + + assertEquals(PipeSubscribeSeekReq.SEEK_AFTER_TOPIC_PROGRESS, parsed.getSeekType()); + assertEquals("topicA", parsed.getTopicName()); + assertEquals(original, parsed.getTopicProgress()); + } +} diff --git a/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessorTest.java b/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessorTest.java new file mode 100644 index 0000000000000..613090650bd1a --- /dev/null +++ b/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessorTest.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class WatermarkProcessorTest { + + private static final String TOPIC = "topic1"; + private static final String GROUP = "group1"; + private static final String REGION_R1 = "R1"; + private static final String REGION_R2 = "R2"; + + private static SubscriptionMessage dataMsg(final String regionId, final int dataNodeId) { + final SubscriptionCommitContext ctx = + new SubscriptionCommitContext(dataNodeId, 0, TOPIC, GROUP, 0L, 0L, regionId, 0L); + return new SubscriptionMessage(ctx, Collections.emptyMap()); + } + + private static SubscriptionMessage watermarkMsg( + final String regionId, final int dataNodeId, final long watermarkTs) { + final SubscriptionCommitContext ctx = + new SubscriptionCommitContext(dataNodeId, 0, TOPIC, GROUP, 0L, 0L, regionId, 0L); + return new SubscriptionMessage(ctx, watermarkTs); + } + + @Test + public void testSingleRegionRelease() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + final List result = + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 1000))); + + Assert.assertTrue(result.isEmpty()); + Assert.assertEquals(995, proc.getWatermark()); + } + + @Test + public void testTwoRegionsMinWatermark() { + final WatermarkProcessor proc = new WatermarkProcessor(10, 60_000); + + proc.process(Arrays.asList(watermarkMsg(REGION_R1, 1, 2000), watermarkMsg(REGION_R2, 1, 500))); + + Assert.assertEquals(490, proc.getWatermark()); + } + + @Test + public void testWatermarkAdvancesIdleRegion() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + proc.process(Arrays.asList(watermarkMsg(REGION_R1, 1, 2000), 
watermarkMsg(REGION_R2, 1, 500))); + Assert.assertEquals(495, proc.getWatermark()); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R2, 1, 1500))); + Assert.assertEquals(1495, proc.getWatermark()); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R2, 1, 3000))); + Assert.assertEquals(1995, proc.getWatermark()); + } + + @Test + public void testWatermarkEventsNotBuffered() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 1000))); + + Assert.assertEquals(0, proc.getBufferedCount()); + } + + @Test + public void testFlushReleasesAll() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + proc.process(Arrays.asList(dataMsg(REGION_R1, 1), dataMsg(REGION_R1, 1))); + + proc.flush(); + Assert.assertEquals(0, proc.getBufferedCount()); + } + + @Test + public void testWatermarkNoRegression() { + final WatermarkProcessor proc = new WatermarkProcessor(10, 60_000); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 2000))); + Assert.assertEquals(1990, proc.getWatermark()); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 1500))); + Assert.assertEquals(1990, proc.getWatermark()); + } + + @Test + public void testMultipleWatermarksInSingleBatch() { + final WatermarkProcessor proc = new WatermarkProcessor(0, 60_000); + + proc.process( + Arrays.asList( + watermarkMsg(REGION_R1, 1, 100), + watermarkMsg(REGION_R2, 1, 200), + watermarkMsg(REGION_R1, 1, 300))); + + Assert.assertEquals(200, proc.getWatermark()); + } + + @Test + public void testEmptyInput() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + final List result = proc.process(Collections.emptyList()); + Assert.assertTrue(result.isEmpty()); + Assert.assertEquals(Long.MIN_VALUE, proc.getWatermark()); + } + + @Test + public void testThreeRegionsSlowestDeterminesWatermark() { + final WatermarkProcessor proc = new 
WatermarkProcessor(10, 60_000); + + proc.process( + Arrays.asList( + watermarkMsg(REGION_R1, 1, 5000), + watermarkMsg(REGION_R2, 1, 3000), + watermarkMsg("R3", 2, 4000))); + + Assert.assertEquals(2990, proc.getWatermark()); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R2, 1, 6000))); + Assert.assertEquals(3990, proc.getWatermark()); + } + + @Test + public void testZeroOutOfOrderness() { + final WatermarkProcessor proc = new WatermarkProcessor(0, 60_000); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 1000))); + Assert.assertEquals(1000, proc.getWatermark()); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java index e5753bf1bd184..7f20f8cbfd03a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java @@ -79,6 +79,8 @@ public enum CnToDnAsyncRequestType { TOPIC_PUSH_MULTI_META, CONSUMER_GROUP_PUSH_ALL_META, CONSUMER_GROUP_PUSH_SINGLE_META, + PULL_COMMIT_PROGRESS, + SUBSCRIPTION_PUSH_RUNTIME, // TEMPLATE UPDATE_TEMPLATE, diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java index cd69f8b2c846d..4faea49d2fb7f 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java @@ -47,6 +47,7 @@ import 
org.apache.iotdb.confignode.client.async.handlers.rpc.TreeDeviceViewFieldDetectionHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.CheckSchemaRegionUsingTemplateRPCHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.ConsumerGroupPushMetaRPCHandler; +import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.PullCommitProgressRPCHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.TopicPushMetaRPCHandler; import org.apache.iotdb.mpp.rpc.thrift.TActiveTriggerInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TAlterEncodingCompressorReq; @@ -83,6 +84,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TKillQueryInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TNotifyRegionMigrationReq; import org.apache.iotdb.mpp.rpc.thrift.TPipeHeartbeatReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressReq; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushMultiPipeMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushMultiTopicMetaReq; @@ -90,6 +92,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TPushSingleConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSinglePipeMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSingleTopicMetaReq; +import org.apache.iotdb.mpp.rpc.thrift.TPushSubscriptionRuntimeReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TRegionLeaderChangeReq; import org.apache.iotdb.mpp.rpc.thrift.TRegionRouteReq; @@ -224,6 +227,16 @@ protected void initActionMapBuilder() { (req, client, handler) -> client.pushSingleConsumerGroupMeta( (TPushSingleConsumerGroupMetaReq) req, (ConsumerGroupPushMetaRPCHandler) handler)); + actionMapBuilder.put( + CnToDnAsyncRequestType.PULL_COMMIT_PROGRESS, + (req, client, handler) -> + client.pullCommitProgress( + (TPullCommitProgressReq) req, (PullCommitProgressRPCHandler) handler)); + 
actionMapBuilder.put( + CnToDnAsyncRequestType.SUBSCRIPTION_PUSH_RUNTIME, + (req, client, handler) -> + client.pushSubscriptionRuntime( + (TPushSubscriptionRuntimeReq) req, (DataNodeTSStatusRPCHandler) handler)); actionMapBuilder.put( CnToDnAsyncRequestType.PIPE_HEARTBEAT, (req, client, handler) -> diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java index b2e2ec3232781..084998aa04825 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java @@ -29,12 +29,14 @@ import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.CheckSchemaRegionUsingTemplateRPCHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.ConsumerGroupPushMetaRPCHandler; +import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.PullCommitProgressRPCHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.TopicPushMetaRPCHandler; import org.apache.iotdb.mpp.rpc.thrift.TCheckSchemaRegionUsingTemplateResp; import org.apache.iotdb.mpp.rpc.thrift.TCheckTimeSeriesExistenceResp; import org.apache.iotdb.mpp.rpc.thrift.TCountPathsUsingTemplateResp; import org.apache.iotdb.mpp.rpc.thrift.TDeviceViewResp; import org.apache.iotdb.mpp.rpc.thrift.TFetchSchemaBlackListResp; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushPipeMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaResp; @@ -169,6 +171,14 @@ 
public static DataNodeAsyncRequestRPCHandler buildHandler( dataNodeLocationMap, (Map) responseMap, countDownLatch); + case PULL_COMMIT_PROGRESS: + return new PullCommitProgressRPCHandler( + requestType, + requestId, + targetDataNode, + dataNodeLocationMap, + (Map) responseMap, + countDownLatch); case CHANGE_REGION_LEADER: return new TransferLeaderRPCHandler( requestType, diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java index 7c93f363dd4b8..bd8042071480a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java @@ -48,22 +48,19 @@ public DataNodeTSStatusRPCHandler( @Override public void onComplete(TSStatus response) { - // Put response responseMap.put(requestId, response); if (response.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - // Remove only if success nodeLocationMap.remove(requestId); LOGGER.info("Successfully {} on DataNode: {}", requestType, formattedTargetLocation); } else { - LOGGER.error( + logFailure( "Failed to {} on DataNode: {}, response: {}", requestType, formattedTargetLocation, response); } - // Always CountDown countDownLatch.countDown(); } @@ -76,14 +73,21 @@ public void onError(Exception e) { + formattedTargetLocation + ", exception: " + e.getMessage(); - LOGGER.error(errorMsg); + logFailure(errorMsg); responseMap.put( requestId, new TSStatus( RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode(), errorMsg))); - // Always CountDown countDownLatch.countDown(); } + + private void logFailure(final String format, final Object... 
args) { + if (requestType == CnToDnAsyncRequestType.SUBSCRIPTION_PUSH_RUNTIME) { + LOGGER.warn(format, args); + } else { + LOGGER.error(format, args); + } + } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/ConsumerGroupPushMetaRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/ConsumerGroupPushMetaRPCHandler.java index 2938d4f85b7cd..67ee9f372d747 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/ConsumerGroupPushMetaRPCHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/ConsumerGroupPushMetaRPCHandler.java @@ -49,23 +49,19 @@ public ConsumerGroupPushMetaRPCHandler( @Override public void onComplete(TPushConsumerGroupMetaResp response) { - // Put response responseMap.put(requestId, response); if (response.getStatus().getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.info("Successfully {} on DataNode: {}", requestType, formattedTargetLocation); + LOGGER.debug("Successfully {} on DataNode: {}", requestType, formattedTargetLocation); } else { - LOGGER.error( + LOGGER.warn( "Failed to {} on DataNode: {}, response: {}", requestType, formattedTargetLocation, response); } - // Always remove to avoid retrying nodeLocationMap.remove(requestId); - - // Always CountDown countDownLatch.countDown(); } @@ -78,14 +74,13 @@ public void onError(Exception e) { + formattedTargetLocation + ", exception: " + e.getMessage(); - LOGGER.error(errorMsg, e); + LOGGER.warn(errorMsg); responseMap.put( requestId, new TPushConsumerGroupMetaResp( RpcUtils.getStatus(TSStatusCode.CONSUMER_PUSH_META_ERROR, errorMsg))); - // Always CountDown countDownLatch.countDown(); } } diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/PullCommitProgressRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/PullCommitProgressRPCHandler.java new file mode 100644 index 0000000000000..a34dd627f320f --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/PullCommitProgressRPCHandler.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.client.async.handlers.rpc.subscription; + +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; +import org.apache.iotdb.confignode.client.async.handlers.rpc.DataNodeAsyncRequestRPCHandler; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; +import org.apache.iotdb.rpc.RpcUtils; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; +import java.util.concurrent.CountDownLatch; + +public class PullCommitProgressRPCHandler + extends DataNodeAsyncRequestRPCHandler { + private static final Logger LOGGER = LoggerFactory.getLogger(PullCommitProgressRPCHandler.class); + + public PullCommitProgressRPCHandler( + CnToDnAsyncRequestType requestType, + int requestId, + TDataNodeLocation targetDataNode, + Map dataNodeLocationMap, + Map responseMap, + CountDownLatch countDownLatch) { + super(requestType, requestId, targetDataNode, dataNodeLocationMap, responseMap, countDownLatch); + } + + @Override + public void onComplete(TPullCommitProgressResp response) { + responseMap.put(requestId, response); + + if (response.getStatus().getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + logSuspiciousRegionProgressPayloads(response); + LOGGER.info("Successfully {} on DataNode: {}", requestType, formattedTargetLocation); + } else { + LOGGER.error( + "Failed to {} on DataNode: {}, response: {}", + requestType, + formattedTargetLocation, + response); + } + + nodeLocationMap.remove(requestId); + countDownLatch.countDown(); + } + + @Override + public void onError(Exception e) { + String errorMsg = + "Failed to " + + requestType + + " on DataNode: " + + formattedTargetLocation + + ", exception: " + + e.getMessage(); + LOGGER.error(errorMsg, e); + + responseMap.put( + requestId, + new TPullCommitProgressResp( + RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, 
errorMsg))); + + countDownLatch.countDown(); + } + + private void logSuspiciousRegionProgressPayloads(final TPullCommitProgressResp response) { + if (response == null || !response.isSetCommitRegionProgress()) { + return; + } + for (final Map.Entry entry : + response.getCommitRegionProgress().entrySet()) { + if (isSuspiciousRegionProgressPayload(entry.getValue())) { + LOGGER.warn( + "PULL_COMMIT_PROGRESS confignode recv suspicious payload from DataNode {}, key={}, summary={}", + formattedTargetLocation, + entry.getKey(), + summarizeRegionProgressPayload(entry.getValue())); + } + } + } + + private boolean isSuspiciousRegionProgressPayload(final java.nio.ByteBuffer buffer) { + if (buffer == null) { + return true; + } + final java.nio.ByteBuffer duplicate = buffer.slice(); + if (duplicate.remaining() < Integer.BYTES) { + return true; + } + final int firstInt = duplicate.getInt(); + return firstInt < 0 || firstInt > 1_000_000; + } + + private String summarizeRegionProgressPayload(final java.nio.ByteBuffer buffer) { + if (buffer == null) { + return "null"; + } + final int position = buffer.position(); + final int limit = buffer.limit(); + final int capacity = buffer.capacity(); + final java.nio.ByteBuffer duplicate = buffer.slice(); + final int remaining = duplicate.remaining(); + final String firstIntSummary; + if (remaining >= Integer.BYTES) { + final int firstInt = duplicate.getInt(); + firstIntSummary = firstInt + "(0x" + String.format("%08x", firstInt) + ")"; + duplicate.position(0); + } else { + firstIntSummary = "n/a"; + } + final int sampleLength = Math.min(16, remaining); + final byte[] sample = new byte[sampleLength]; + duplicate.get(sample, 0, sampleLength); + return "pos=" + + position + + ", limit=" + + limit + + ", capacity=" + + capacity + + ", remaining=" + + remaining + + ", firstInt=" + + firstIntSummary + + ", firstBytes=" + + bytesToHex(sample); + } + + private String bytesToHex(final byte[] bytes) { + if (bytes == null || bytes.length == 0) { + 
return ""; + } + final StringBuilder builder = new StringBuilder(bytes.length * 2); + for (final byte b : bytes) { + builder.append(String.format("%02x", b)); + } + return builder.toString(); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/TopicPushMetaRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/TopicPushMetaRPCHandler.java index 91ffdd7232b3f..2f5e609f0cfec 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/TopicPushMetaRPCHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/TopicPushMetaRPCHandler.java @@ -48,23 +48,19 @@ public TopicPushMetaRPCHandler( @Override public void onComplete(TPushTopicMetaResp response) { - // Put response responseMap.put(requestId, response); if (response.getStatus().getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.info("Successfully {} on DataNode: {}", requestType, formattedTargetLocation); + LOGGER.debug("Successfully {} on DataNode: {}", requestType, formattedTargetLocation); } else { - LOGGER.error( + LOGGER.warn( "Failed to {} on DataNode: {}, response: {}", requestType, formattedTargetLocation, response); } - // Always remove to avoid retrying nodeLocationMap.remove(requestId); - - // Always CountDown countDownLatch.countDown(); } @@ -77,13 +73,12 @@ public void onError(Exception e) { + formattedTargetLocation + ", exception: " + e.getMessage(); - LOGGER.error(errorMsg, e); + LOGGER.warn(errorMsg); responseMap.put( requestId, new TPushTopicMetaResp(RpcUtils.getStatus(TSStatusCode.TOPIC_PUSH_META_ERROR, errorMsg))); - // Always CountDown countDownLatch.countDown(); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java index ffe333b56dd78..1eb46fdc330c4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java @@ -87,6 +87,7 @@ import org.apache.iotdb.confignode.consensus.request.write.region.PollRegionMaintainTaskPlan; import org.apache.iotdb.confignode.consensus.request.write.region.PollSpecificRegionMaintainTaskPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.AlterConsumerGroupPlan; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.ConsumerGroupHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterMultipleTopicsPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterTopicPlan; @@ -542,6 +543,9 @@ public static ConfigPhysicalPlan create(final ByteBuffer buffer) throws IOExcept case ConsumerGroupHandleMetaChange: plan = new ConsumerGroupHandleMetaChangePlan(); break; + case CommitProgressHandleMetaChange: + plan = new CommitProgressHandleMetaChangePlan(); + break; case PipeUnsetTemplate: plan = new PipeUnsetSchemaTemplatePlan(); break; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java index fe04b93d9ad4b..ae10e9898f251 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java @@ -324,6 +324,8 @@ public enum ConfigPhysicalPlanType { ShowSubscription((short) 2000), + CommitProgressHandleMetaChange((short) 2001), + // Authority version after and equal 2.0 DropUserV2((short) 2100), UpdateUserV2((short) 2101), diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/write/subscription/consumer/runtime/CommitProgressHandleMetaChangePlan.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/write/subscription/consumer/runtime/CommitProgressHandleMetaChangePlan.java new file mode 100644 index 0000000000000..387b0a43b4a61 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/write/subscription/consumer/runtime/CommitProgressHandleMetaChangePlan.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime; + +import org.apache.iotdb.commons.subscription.meta.consumer.CommitProgressKeeper; +import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan; +import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlanType; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +/** Consensus plan for handling per-region commit progress meta changes. */ +public class CommitProgressHandleMetaChangePlan extends ConfigPhysicalPlan { + + private Map regionProgressMap = new HashMap<>(); + + public CommitProgressHandleMetaChangePlan() { + super(ConfigPhysicalPlanType.CommitProgressHandleMetaChange); + } + + public CommitProgressHandleMetaChangePlan(final Map regionProgressMap) { + super(ConfigPhysicalPlanType.CommitProgressHandleMetaChange); + this.regionProgressMap = regionProgressMap; + } + + public Map getRegionProgressMap() { + return regionProgressMap; + } + + @Override + protected void serializeImpl(final DataOutputStream stream) throws IOException { + stream.writeShort(getType().getPlanType()); + stream.writeInt(regionProgressMap.size()); + for (final Map.Entry entry : regionProgressMap.entrySet()) { + final byte[] keyBytes = entry.getKey().getBytes("UTF-8"); + final ByteBuffer valueBuffer = entry.getValue().asReadOnlyBuffer(); + valueBuffer.rewind(); + final byte[] valueBytes = new byte[valueBuffer.remaining()]; + valueBuffer.get(valueBytes); + stream.writeInt(keyBytes.length); + stream.write(keyBytes); + stream.writeInt(valueBytes.length); + stream.write(valueBytes); + } + } + + @Override + protected void deserializeImpl(final ByteBuffer buffer) throws IOException { + regionProgressMap = CommitProgressKeeper.deserializeRegionProgressFromBuffer(buffer); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + 
return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + final CommitProgressHandleMetaChangePlan that = (CommitProgressHandleMetaChangePlan) obj; + return Objects.equals(this.regionProgressMap, that.regionProgressMap); + } + + @Override + public int hashCode() { + return Objects.hash(regionProgressMap); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java index 182dc2f9fb249..d8c06062756a3 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java @@ -191,6 +191,8 @@ import org.apache.iotdb.confignode.rpc.thrift.TGetAllSubscriptionInfoResp; import org.apache.iotdb.confignode.rpc.thrift.TGetAllTemplatesResp; import org.apache.iotdb.confignode.rpc.thrift.TGetAllTopicInfoResp; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressReq; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDataNodeLocationsResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDatabaseReq; import org.apache.iotdb.confignode.rpc.thrift.TGetJarInListReq; @@ -254,6 +256,9 @@ import org.apache.iotdb.db.schemaengine.template.alter.TemplateAlterOperationUtil; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; import org.apache.iotdb.service.rpc.thrift.TPipeTransferReq; import org.apache.iotdb.service.rpc.thrift.TPipeTransferResp; @@ -264,6 +269,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.ByteArrayOutputStream; +import 
java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.net.URL; @@ -276,8 +283,10 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; @@ -2508,6 +2517,83 @@ public TGetAllSubscriptionInfoResp getAllSubscriptionInfo() { : new TGetAllSubscriptionInfoResp(status, Collections.emptyList()); } + public TGetCommitProgressResp getCommitProgress(TGetCommitProgressReq req) { + TSStatus status = confirmLeader(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + return new TGetCommitProgressResp(status); + } + final String key = + req.getConsumerGroupId() + + "##" + + req.getTopicName() + + "##" + + req.getRegionId() + + "##" + + req.getDataNodeId(); + final String keyPrefix = + req.getConsumerGroupId() + "##" + req.getTopicName() + "##" + req.getRegionId() + "##"; + final org.apache.iotdb.commons.subscription.meta.consumer.CommitProgressKeeper keeper = + subscriptionManager + .getSubscriptionCoordinator() + .getSubscriptionInfo() + .getCommitProgressKeeper(); + final Map mergedWriterPositions = new LinkedHashMap<>(); + + for (final Map.Entry entry : keeper.getAllRegionProgress().entrySet()) { + if (!entry.getKey().startsWith(keyPrefix)) { + continue; + } + final RegionProgress regionProgress = deserializeRegionProgress(entry.getValue()); + if (Objects.isNull(regionProgress)) { + continue; + } + for (final Map.Entry writerEntry : + regionProgress.getWriterPositions().entrySet()) { + mergedWriterPositions.merge( + writerEntry.getKey(), + writerEntry.getValue(), + (oldProgress, newProgress) -> + compareWriterProgress(newProgress, oldProgress) > 0 ? 
newProgress : oldProgress); + } + } + final TGetCommitProgressResp resp = + new TGetCommitProgressResp(new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode())); + if (!mergedWriterPositions.isEmpty()) { + resp.setCommittedRegionProgress( + serializeRegionProgress(new RegionProgress(mergedWriterPositions))); + } + return resp; + } + + private static RegionProgress deserializeRegionProgress(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return null; + } + final ByteBuffer duplicate = buffer.asReadOnlyBuffer(); + duplicate.rewind(); + return RegionProgress.deserialize(duplicate); + } + + private static ByteBuffer serializeRegionProgress(final RegionProgress regionProgress) { + try (final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()).asReadOnlyBuffer(); + } catch (final IOException e) { + throw new RuntimeException("Failed to serialize region progress " + regionProgress, e); + } + } + + private static int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } + @Override public TPipeConfigTransferResp handleTransferConfigPlan(TPipeConfigTransferReq req) { TSStatus status = confirmLeader(); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java index 2c5a77303d9b9..3354948d0e552 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java @@ -115,7 
+115,9 @@ import org.apache.iotdb.confignode.procedure.impl.schema.table.view.SetViewPropertiesProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.CreateConsumerProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.DropConsumerProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime.CommitProgressSyncProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime.ConsumerGroupMetaSyncProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.runtime.SubscriptionHandleLeaderChangeProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.subscription.CreateSubscriptionProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.subscription.DropSubscriptionProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.topic.CreateTopicProcedure; @@ -1666,6 +1668,21 @@ public void pipeHandleLeaderChange( } } + public void subscriptionHandleLeaderChange( + Map> regionGroupToOldAndNewLeaderPairMap, + long runtimeVersion) { + try { + final long procedureId = + executor.submitProcedure( + new SubscriptionHandleLeaderChangeProcedure( + regionGroupToOldAndNewLeaderPairMap, runtimeVersion)); + LOGGER.info( + "SubscriptionHandleLeaderChangeProcedure was submitted, procedureId: {}.", procedureId); + } catch (Exception e) { + LOGGER.warn("SubscriptionHandleLeaderChangeProcedure was failed to submit.", e); + } + } + public void pipeHandleMetaChange( boolean needWriteConsensusOnConfigNodes, boolean needPushPipeMetaToDataNodes) { try { @@ -1815,6 +1832,23 @@ public TSStatus consumerGroupMetaSync() { } } + public TSStatus commitProgressSync() { + try { + CommitProgressSyncProcedure procedure = new CommitProgressSyncProcedure(); + executor.submitProcedure(procedure); + TSStatus status = waitingProcedureFinished(procedure); + if (status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + return 
status; + } else { + return new TSStatus(TSStatusCode.CONSUMER_PUSH_META_ERROR.getStatusCode()) + .setMessage(wrapTimeoutMessageForPipeProcedure(status.getMessage())); + } + } catch (Exception e) { + return new TSStatus(TSStatusCode.CONSUMER_PUSH_META_ERROR.getStatusCode()) + .setMessage(e.getMessage()); + } + } + public TSStatus createSubscription(TSubscribeReq req) { try { CreateSubscriptionProcedure procedure = new CreateSubscriptionProcedure(req); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/LoadManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/LoadManager.java index 993bfc0e40066..55d9417f30a2b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/LoadManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/LoadManager.java @@ -88,6 +88,8 @@ public LoadManager(IManager configManager) { this.topologyService = new TopologyService(configManager, loadCache::updateTopology); this.eventService = new EventService(loadCache); this.eventService.register(configManager.getPipeManager().getPipeRuntimeCoordinator()); + this.eventService.register( + configManager.getSubscriptionManager().getSubscriptionLeaderChangeHandler()); this.eventService.register(routeBalancer); this.eventService.register(topologyService); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionManager.java index 1080b067fae82..ff06e20cf2dc7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionManager.java @@ -20,17 +20,32 @@ package org.apache.iotdb.confignode.manager.subscription; import 
org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.subscription.runtime.SubscriptionLeaderChangeHandler; +import org.apache.iotdb.confignode.manager.subscription.runtime.SubscriptionRuntimeCoordinator; import org.apache.iotdb.confignode.persistence.subscription.SubscriptionInfo; public class SubscriptionManager { private final SubscriptionCoordinator subscriptionCoordinator; + private final SubscriptionRuntimeCoordinator subscriptionRuntimeCoordinator; + private final SubscriptionLeaderChangeHandler subscriptionLeaderChangeHandler; public SubscriptionManager(ConfigManager configManager, SubscriptionInfo subscriptionInfo) { this.subscriptionCoordinator = new SubscriptionCoordinator(configManager, subscriptionInfo); + this.subscriptionRuntimeCoordinator = new SubscriptionRuntimeCoordinator(configManager); + this.subscriptionLeaderChangeHandler = + new SubscriptionLeaderChangeHandler(subscriptionRuntimeCoordinator); } public SubscriptionCoordinator getSubscriptionCoordinator() { return subscriptionCoordinator; } + + public SubscriptionRuntimeCoordinator getSubscriptionRuntimeCoordinator() { + return subscriptionRuntimeCoordinator; + } + + public SubscriptionLeaderChangeHandler getSubscriptionLeaderChangeHandler() { + return subscriptionLeaderChangeHandler; + } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java index de49987e13fbe..4931a2948fc61 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java @@ -106,6 +106,13 @@ private synchronized void sync() { return; } + // sync commit progress if syncing consumer group meta successfully + final TSStatus 
commitProgressSyncStatus = procedureManager.commitProgressSync(); + if (commitProgressSyncStatus.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn("Failed to sync commit progress. Result status: {}.", commitProgressSyncStatus); + return; + } + LOGGER.info( "After this successful sync, if SubscriptionInfo is empty during this sync and has not been modified afterwards, all subsequent syncs will be skipped"); isLastSubscriptionSyncSuccessful = true; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionLeaderChangeHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionLeaderChangeHandler.java new file mode 100644 index 0000000000000..6b888e424aa9c --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionLeaderChangeHandler.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.manager.subscription.runtime; + +import org.apache.iotdb.confignode.manager.load.subscriber.ConsensusGroupStatisticsChangeEvent; +import org.apache.iotdb.confignode.manager.load.subscriber.IClusterStatusSubscriber; +import org.apache.iotdb.confignode.manager.load.subscriber.NodeStatisticsChangeEvent; + +public class SubscriptionLeaderChangeHandler implements IClusterStatusSubscriber { + + private final SubscriptionRuntimeCoordinator runtimeCoordinator; + + public SubscriptionLeaderChangeHandler(final SubscriptionRuntimeCoordinator runtimeCoordinator) { + this.runtimeCoordinator = runtimeCoordinator; + } + + @Override + public void onNodeStatisticsChanged(final NodeStatisticsChangeEvent event) { + runtimeCoordinator.handleNodeStatisticsChange(event); + } + + @Override + public void onConsensusGroupStatisticsChanged(final ConsensusGroupStatisticsChangeEvent event) { + runtimeCoordinator.handleLeaderChangeEvent(event); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionRuntimeCoordinator.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionRuntimeCoordinator.java new file mode 100644 index 0000000000000..399327b0119e4 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionRuntimeCoordinator.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.subscription.runtime; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.commons.cluster.NodeStatus; +import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; +import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.load.cache.node.NodeStatistics; +import org.apache.iotdb.confignode.manager.load.subscriber.ConsensusGroupStatisticsChangeEvent; +import org.apache.iotdb.confignode.manager.load.subscriber.NodeStatisticsChangeEvent; + +import org.apache.tsfile.utils.Pair; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +public class SubscriptionRuntimeCoordinator { + + private final ConfigManager configManager; + private final Map> regionGroupToRuntimeLeaderPairMap = + new HashMap<>(); + private final AtomicLong runtimeVersionGenerator = new AtomicLong(System.currentTimeMillis()); + + public SubscriptionRuntimeCoordinator(final ConfigManager configManager) { + this.configManager = configManager; + } + + public synchronized void handleLeaderChangeEvent( + final ConsensusGroupStatisticsChangeEvent event) { + if (!hasAnyConsensusBasedTopic()) { + return; + } + + final Map> refreshMap = new HashMap<>(); + event + .getDifferentConsensusGroupStatisticsMap() + .forEach( + (regionGroupId, pair) -> { + if (regionGroupId.getType() != TConsensusGroupType.DataRegion) { + return; + } + final int 
oldLeaderNodeId = pair.left == null ? -1 : pair.left.getLeaderId(); + final int newLeaderNodeId = pair.right == null ? -1 : pair.right.getLeaderId(); + if (oldLeaderNodeId == newLeaderNodeId) { + return; + } + updateRuntimeLeaderPair(regionGroupId, oldLeaderNodeId, newLeaderNodeId, refreshMap); + }); + + submitRuntimeRefresh(refreshMap); + } + + public synchronized void handleNodeStatisticsChange(final NodeStatisticsChangeEvent event) { + if (!hasAnyConsensusBasedTopic()) { + return; + } + + final boolean shouldRefreshRuntime = + event.getDifferentNodeStatisticsMap().values().stream() + .anyMatch( + pair -> { + final NodeStatus oldStatus = getNodeStatus(pair.getLeft()); + final NodeStatus newStatus = getNodeStatus(pair.getRight()); + return oldStatus != newStatus + && (isRuntimeSensitiveStatus(oldStatus) + || isRuntimeSensitiveStatus(newStatus)); + }); + if (!shouldRefreshRuntime) { + return; + } + + seedRuntimeLeaderPairsFromCurrentLeaders(); + submitRuntimeRefresh(new HashMap<>(regionGroupToRuntimeLeaderPairMap)); + } + + public boolean hasAnyConsensusBasedTopic() { + for (final TopicMeta topicMeta : + configManager + .getSubscriptionManager() + .getSubscriptionCoordinator() + .getSubscriptionInfo() + .getAllTopicMeta()) { + if (topicMeta.getConfig().isConsensusMode()) { + return true; + } + } + return false; + } + + private void updateRuntimeLeaderPair( + final TConsensusGroupId regionGroupId, + final int oldLeaderNodeId, + final int newLeaderNodeId, + final Map> refreshMap) { + if (newLeaderNodeId < 0) { + regionGroupToRuntimeLeaderPairMap.remove(regionGroupId); + return; + } + final Pair runtimeLeaderPair = new Pair<>(oldLeaderNodeId, newLeaderNodeId); + regionGroupToRuntimeLeaderPairMap.put(regionGroupId, runtimeLeaderPair); + refreshMap.put(regionGroupId, runtimeLeaderPair); + } + + private void seedRuntimeLeaderPairsFromCurrentLeaders() { + configManager + .getLoadManager() + .getRegionLeaderMap() + .forEach( + (regionGroupId, leaderId) -> { + if 
(regionGroupId.getType() == TConsensusGroupType.DataRegion && leaderId >= 0) { + regionGroupToRuntimeLeaderPairMap.putIfAbsent( + regionGroupId, new Pair<>(-1, leaderId)); + } + }); + } + + private void submitRuntimeRefresh( + final Map> regionGroupToOldAndNewLeaderPairMap) { + if (regionGroupToOldAndNewLeaderPairMap.isEmpty()) { + return; + } + configManager + .getProcedureManager() + .subscriptionHandleLeaderChange( + regionGroupToOldAndNewLeaderPairMap, + runtimeVersionGenerator.updateAndGet( + currentRuntimeVersion -> + Math.max(currentRuntimeVersion + 1, System.currentTimeMillis()))); + } + + private static NodeStatus getNodeStatus(final NodeStatistics statistics) { + return statistics == null ? NodeStatus.Unknown : statistics.getStatus(); + } + + private static boolean isRuntimeSensitiveStatus(final NodeStatus status) { + return status == NodeStatus.Unknown || status == NodeStatus.Removing; + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java index 6c9351e881aa8..9613bc9c57073 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java @@ -111,6 +111,7 @@ import org.apache.iotdb.confignode.consensus.request.write.region.OfferRegionMaintainTasksPlan; import org.apache.iotdb.confignode.consensus.request.write.region.PollSpecificRegionMaintainTaskPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.AlterConsumerGroupPlan; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.ConsumerGroupHandleMetaChangePlan; 
import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterMultipleTopicsPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterTopicPlan; @@ -639,6 +640,9 @@ public TSStatus executeNonQueryPlan(ConfigPhysicalPlan physicalPlan) case ConsumerGroupHandleMetaChange: return subscriptionInfo.handleConsumerGroupMetaChanges( (ConsumerGroupHandleMetaChangePlan) physicalPlan); + case CommitProgressHandleMetaChange: + return subscriptionInfo.handleCommitProgressChanges( + (CommitProgressHandleMetaChangePlan) physicalPlan); case AlterConsumerGroup: return subscriptionInfo.alterConsumerGroup((AlterConsumerGroupPlan) physicalPlan); case TopicHandleMetaChange: diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfo.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfo.java index 0c262655156d3..dc45c56b706d9 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfo.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfo.java @@ -21,12 +21,16 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.snapshot.SnapshotProcessor; +import org.apache.iotdb.commons.subscription.meta.consumer.CommitProgressKeeper; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMetaKeeper; import org.apache.iotdb.commons.subscription.meta.subscription.SubscriptionMeta; import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; import org.apache.iotdb.commons.subscription.meta.topic.TopicMetaKeeper; +import org.apache.iotdb.confignode.conf.ConfigNodeConfig; +import org.apache.iotdb.confignode.conf.ConfigNodeDescriptor; import 
org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.AlterConsumerGroupPlan; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.ConsumerGroupHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterMultipleTopicsPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterTopicPlan; @@ -40,8 +44,11 @@ import org.apache.iotdb.confignode.rpc.thrift.TCreateTopicReq; import org.apache.iotdb.confignode.rpc.thrift.TSubscribeReq; import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq; +import org.apache.iotdb.consensus.ConsensusFactory; import org.apache.iotdb.consensus.common.DataSet; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.config.TopicConfig; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import org.apache.thrift.annotation.Nullable; @@ -54,13 +61,17 @@ import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Predicate; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -68,10 +79,15 @@ public class SubscriptionInfo implements SnapshotProcessor { private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionInfo.class); + private static final ConfigNodeConfig CONF = ConfigNodeDescriptor.getInstance().getConf(); + private static final String 
SNAPSHOT_FILE_NAME = "subscription_info.bin"; + private static final String DATA_REGION_CONSENSUS_PROTOCOL_CLASS_KEY = + "data_region_consensus_protocol_class"; private final TopicMetaKeeper topicMetaKeeper; private final ConsumerGroupMetaKeeper consumerGroupMetaKeeper; + private final CommitProgressKeeper commitProgressKeeper; private final ReentrantReadWriteLock subscriptionInfoLock = new ReentrantReadWriteLock(true); @@ -81,6 +97,7 @@ public class SubscriptionInfo implements SnapshotProcessor { public SubscriptionInfo() { this.topicMetaKeeper = new TopicMetaKeeper(); this.consumerGroupMetaKeeper = new ConsumerGroupMetaKeeper(); + this.commitProgressKeeper = new CommitProgressKeeper(); this.subscriptionInfoVersion = new SubscriptionInfoVersion(); } @@ -158,6 +175,8 @@ public boolean validateBeforeCreatingTopic(TCreateTopicReq createTopicReq) private boolean checkBeforeCreateTopicInternal(TCreateTopicReq createTopicReq) throws SubscriptionException { + validateTopicConfig(new TopicConfig(safeTopicAttributes(createTopicReq.getTopicAttributes()))); + if (!isTopicExisted(createTopicReq.getTopicName())) { return true; } @@ -247,7 +266,12 @@ public void validateBeforeAlteringTopic(TopicMeta topicMeta) throws Subscription } private void checkBeforeAlteringTopicInternal(TopicMeta topicMeta) throws SubscriptionException { + validateTopicConfig(topicMeta.getConfig()); + if (isTopicExisted(topicMeta.getTopicName())) { + final TopicMeta existedTopicMeta = topicMetaKeeper.getTopicMeta(topicMeta.getTopicName()); + validateUnsupportedHotUpdatedTopicConfig( + topicMeta.getTopicName(), existedTopicMeta.getConfig(), topicMeta.getConfig()); return; } @@ -258,6 +282,228 @@ private void checkBeforeAlteringTopicInternal(TopicMeta topicMeta) throws Subscr throw new SubscriptionException(exceptionMessage); } + private Map safeTopicAttributes(@Nullable final Map attributes) { + return Objects.nonNull(attributes) ? 
attributes : Collections.emptyMap(); + } + + private void validateTopicConfig(final TopicConfig topicConfig) throws SubscriptionException { + final String mode = topicConfig.getMode(); + if (!TopicConfig.isValidMode(mode)) { + final String exceptionMessage = + String.format( + "Failed to create or alter topic, unsupported %s=%s, expected one of [%s, %s, %s]", + TopicConstant.MODE_KEY, + mode, + TopicConstant.MODE_SNAPSHOT_VALUE, + TopicConstant.MODE_LIVE_VALUE, + TopicConstant.MODE_CONSENSUS_VALUE); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + + validateConsensusProtocolSupport(topicConfig); + + if (topicConfig.isConsensusMode() && !topicConfig.isRecordFormat()) { + final String exceptionMessage = + String.format( + "Failed to create or alter topic, %s=%s only supports %s=%s", + TopicConstant.MODE_KEY, + TopicConstant.MODE_CONSENSUS_VALUE, + TopicConstant.FORMAT_KEY, + TopicConstant.FORMAT_RECORD_HANDLER_VALUE); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + + final String orderMode = topicConfig.getOrderMode(); + if (!TopicConfig.isValidOrderMode(orderMode)) { + final String exceptionMessage = + String.format( + "Failed to create or alter topic, unsupported %s=%s, expected one of [%s, %s, %s]", + TopicConstant.ORDER_MODE_KEY, + orderMode, + TopicConstant.ORDER_MODE_LEADER_ONLY_VALUE, + TopicConstant.ORDER_MODE_MULTI_WRITER_VALUE, + TopicConstant.ORDER_MODE_PER_WRITER_VALUE); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + + validateConsensusTableColumnPattern(topicConfig); + validateConsensusTopicRetentionConfig(topicConfig); + } + + private void validateConsensusProtocolSupport(final TopicConfig topicConfig) + throws SubscriptionException { + if (!topicConfig.isConsensusMode()) { + return; + } + + final String actualProtocol = String.valueOf(CONF.getDataRegionConsensusProtocolClass()); + if 
(ConsensusFactory.IOT_CONSENSUS.equals(actualProtocol)) { + return; + } + + final String exceptionMessage = + String.format( + "Failed to create or alter topic, %s=%s is only supported when %s=%s, but current value is %s", + TopicConstant.MODE_KEY, + TopicConstant.MODE_CONSENSUS_VALUE, + DATA_REGION_CONSENSUS_PROTOCOL_CLASS_KEY, + ConsensusFactory.IOT_CONSENSUS, + actualProtocol); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + + private void validateConsensusTableColumnPattern(final TopicConfig topicConfig) + throws SubscriptionException { + if (!topicConfig.hasAttribute(TopicConstant.COLUMN_KEY)) { + return; + } + + if (!topicConfig.isTableTopic()) { + final String exceptionMessage = + String.format( + "Failed to create or alter topic, %s is only supported for table topics", + TopicConstant.COLUMN_KEY); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + + if (!isConsensusBasedTopicConfig(topicConfig)) { + final String exceptionMessage = + String.format( + "Failed to create or alter topic, %s is only supported for consensus table topics", + TopicConstant.COLUMN_KEY); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + + final String columnPattern = + topicConfig.getStringOrDefault( + TopicConstant.COLUMN_KEY, TopicConstant.COLUMN_DEFAULT_VALUE); + try { + Pattern.compile(columnPattern); + } catch (final PatternSyntaxException e) { + final String exceptionMessage = + String.format( + "Failed to create or alter topic, illegal %s=%s, detail: %s", + TopicConstant.COLUMN_KEY, columnPattern, e.getMessage()); + LOGGER.warn(exceptionMessage, e); + throw new SubscriptionException(exceptionMessage); + } + } + + private boolean isConsensusBasedTopicConfig(final TopicConfig topicConfig) { + return topicConfig.isConsensusMode(); + } + + private void validateConsensusTopicRetentionConfig(final TopicConfig topicConfig) + throws SubscriptionException { + if 
(!topicConfig.hasAttribute(TopicConstant.RETENTION_BYTES_KEY) + && !topicConfig.hasAttribute(TopicConstant.RETENTION_MS_KEY)) { + return; + } + + if (!isConsensusBasedTopicConfig(topicConfig)) { + final String exceptionMessage = + String.format( + "Failed to create or alter topic, %s and %s are only supported for consensus topics", + TopicConstant.RETENTION_BYTES_KEY, TopicConstant.RETENTION_MS_KEY); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + + validateRetentionValue(topicConfig, TopicConstant.RETENTION_BYTES_KEY); + validateRetentionValue(topicConfig, TopicConstant.RETENTION_MS_KEY); + } + + private void validateRetentionValue(final TopicConfig topicConfig, final String key) + throws SubscriptionException { + if (!topicConfig.hasAttribute(key)) { + return; + } + + final String rawValue = topicConfig.getAttribute().get(key); + try { + final long parsedValue = Long.parseLong(rawValue); + if (parsedValue == 0 || parsedValue < -1) { + throw new SubscriptionException( + String.format( + "Failed to create or alter topic, illegal %s=%s, expected -1 or a positive long value", + key, rawValue)); + } + } catch (final NumberFormatException e) { + final String exceptionMessage = + String.format( + "Failed to create or alter topic, illegal %s=%s, expected a long value", + key, rawValue); + LOGGER.warn(exceptionMessage, e); + throw new SubscriptionException(exceptionMessage); + } catch (final SubscriptionException e) { + LOGGER.warn(e.getMessage()); + throw e; + } + } + + private void validateUnsupportedHotUpdatedTopicConfig( + final String topicName, final TopicConfig existedConfig, final TopicConfig updatedConfig) + throws SubscriptionException { + final String existedMode = existedConfig.getMode(); + final String updatedMode = updatedConfig.getMode(); + if (!Objects.equals(existedMode, updatedMode)) { + final String exceptionMessage = + String.format( + "Failed to alter topic %s, changing %s is not supported because existing 
subscription runtimes do not hot-refresh source mode", + topicName, TopicConstant.MODE_KEY); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + + final String existedColumnPattern = + existedConfig.getStringOrDefault( + TopicConstant.COLUMN_KEY, TopicConstant.COLUMN_DEFAULT_VALUE); + final String updatedColumnPattern = + updatedConfig.getStringOrDefault( + TopicConstant.COLUMN_KEY, TopicConstant.COLUMN_DEFAULT_VALUE); + if (!Objects.equals(existedColumnPattern, updatedColumnPattern)) { + final String exceptionMessage = + String.format( + "Failed to alter topic %s, changing %s is not supported because existing consensus queues do not hot-refresh converter state", + topicName, TopicConstant.COLUMN_KEY); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + + validateUnsupportedHotUpdatedRetentionConfig( + topicName, + existedConfig, + updatedConfig, + TopicConstant.RETENTION_BYTES_KEY, + TopicConstant.RETENTION_MS_KEY); + } + + private void validateUnsupportedHotUpdatedRetentionConfig( + final String topicName, + final TopicConfig existedConfig, + final TopicConfig updatedConfig, + final String... 
retentionKeys) + throws SubscriptionException { + for (final String retentionKey : retentionKeys) { + final String existedValue = existedConfig.getAttribute().get(retentionKey); + final String updatedValue = updatedConfig.getAttribute().get(retentionKey); + if (!Objects.equals(existedValue, updatedValue)) { + final String exceptionMessage = + String.format( + "Failed to alter topic %s, changing %s is not supported because existing consensus queues do not hot-refresh retention state", + topicName, retentionKey); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + } + } + public boolean isTopicExisted(String topicName) { acquireReadLock(); try { @@ -566,6 +812,21 @@ public TSStatus handleConsumerGroupMetaChanges(ConsumerGroupHandleMetaChangePlan } } + public TSStatus handleCommitProgressChanges(CommitProgressHandleMetaChangePlan plan) { + acquireWriteLock(); + try { + LOGGER.info("Handling commit progress meta changes ..."); + commitProgressKeeper.replaceAll(plan.getRegionProgressMap()); + return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); + } finally { + releaseWriteLock(); + } + } + + public CommitProgressKeeper getCommitProgressKeeper() { + return commitProgressKeeper; + } + ///////////////////////////////// Subscription ///////////////////////////////// public void validateBeforeSubscribe(TSubscribeReq subscribeReq) throws SubscriptionException { @@ -740,6 +1001,7 @@ public boolean processTakeSnapshot(File snapshotDir) throws IOException { try (final FileOutputStream fileOutputStream = new FileOutputStream(snapshotFile)) { topicMetaKeeper.processTakeSnapshot(fileOutputStream); consumerGroupMetaKeeper.processTakeSnapshot(fileOutputStream); + commitProgressKeeper.processTakeSnapshot(fileOutputStream); fileOutputStream.getFD().sync(); } @@ -764,6 +1026,7 @@ public void processLoadSnapshot(File snapshotDir) throws IOException { try (final FileInputStream fileInputStream = new FileInputStream(snapshotFile)) { 
topicMetaKeeper.processLoadSnapshot(fileInputStream); consumerGroupMetaKeeper.processLoadSnapshot(fileInputStream); + commitProgressKeeper.processLoadSnapshot(fileInputStream); } } finally { releaseWriteLock(); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java index 960d0a7977f51..d271d5ef33b9c 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java @@ -70,6 +70,8 @@ import org.apache.iotdb.mpp.rpc.thrift.TInactiveTriggerInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TNotifyRegionMigrationReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushMultiPipeMetaReq; @@ -79,12 +81,15 @@ import org.apache.iotdb.mpp.rpc.thrift.TPushSingleConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSinglePipeMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSingleTopicMetaReq; +import org.apache.iotdb.mpp.rpc.thrift.TPushSubscriptionRuntimeReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaResp; +import org.apache.iotdb.mpp.rpc.thrift.TSubscriptionRuntimeStateEntry; import org.apache.iotdb.rpc.TSStatusCode; import org.apache.thrift.TException; import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.utils.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -93,8 +98,10 @@ import java.util.ArrayList; import 
java.util.Arrays; import java.util.HashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; @@ -848,6 +855,85 @@ public List dropSingleConsumerGroupOnDataNode(String consumerGroupName .collect(Collectors.toList()); } + public Map pullCommitProgressFromDataNodes() { + final Map dataNodeLocationMap = + configManager.getNodeManager().getRegisteredDataNodeLocations(); + final TPullCommitProgressReq request = new TPullCommitProgressReq(); + + final DataNodeAsyncRequestContext + clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.PULL_COMMIT_PROGRESS, request, dataNodeLocationMap); + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestToNodeWithRetryAndTimeoutInMs( + clientHandler, + PipeConfig.getInstance().getPipeMetaSyncerSyncIntervalMinutes() * 60 * 1000 * 2 / 3); + return clientHandler.getResponseMap(); + } + + public Map pushSubscriptionRuntimeStatesToDataNodes( + final Map> regionGroupToOldAndNewLeaderPairMap, + final long runtimeVersion) { + final Map dataNodeLocationMap = + configManager.getNodeManager().getRegisteredDataNodeLocations(); + final Map dataRegionReplicaSetMap = + getPartitionManager().getAllReplicaSetsMap(TConsensusGroupType.DataRegion); + final Set readableDataNodeIds = + getLoadManager().filterDataNodeThroughStatus(NodeStatus::isReadable).stream() + .collect(Collectors.toSet()); + final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>(CnToDnAsyncRequestType.SUBSCRIPTION_PUSH_RUNTIME); + + dataNodeLocationMap.forEach( + (dataNodeId, dataNodeLocation) -> { + final List runtimeStates = new ArrayList<>(); + regionGroupToOldAndNewLeaderPairMap.forEach( + (regionId, leaderPair) -> { + final int oldLeaderNodeId = leaderPair.getLeft(); + final int preferredWriterNodeId = leaderPair.getRight(); + final 
LinkedHashSet activeWriterNodeIds = new LinkedHashSet<>(); + final TRegionReplicaSet replicaSet = dataRegionReplicaSetMap.get(regionId); + if (replicaSet != null) { + replicaSet.getDataNodeLocations().stream() + .map(TDataNodeLocation::getDataNodeId) + .filter(readableDataNodeIds::contains) + .forEach(activeWriterNodeIds::add); + } + if (activeWriterNodeIds.isEmpty()) { + if (isRuntimeActiveWriterNode(preferredWriterNodeId)) { + activeWriterNodeIds.add(preferredWriterNodeId); + } + if (oldLeaderNodeId != preferredWriterNodeId + && isRuntimeActiveWriterNode(oldLeaderNodeId)) { + activeWriterNodeIds.add(oldLeaderNodeId); + } + } + runtimeStates.add( + new TSubscriptionRuntimeStateEntry( + regionId, + runtimeVersion, + preferredWriterNodeId, + preferredWriterNodeId == dataNodeId, + new ArrayList<>(activeWriterNodeIds))); + }); + clientHandler.putNodeLocation(dataNodeId, dataNodeLocation); + clientHandler.putRequest( + dataNodeId, new TPushSubscriptionRuntimeReq().setRuntimeStates(runtimeStates)); + }); + + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestToNodeWithRetryAndTimeoutInMs( + clientHandler, + PipeConfig.getInstance().getPipeMetaSyncerSyncIntervalMinutes() * 60 * 1000 * 2 / 3); + return clientHandler.getResponseMap(); + } + + private boolean isRuntimeActiveWriterNode(final int dataNodeId) { + return dataNodeId >= 0 + && getLoadManager().getNodeStatus(dataNodeId) != NodeStatus.Unknown + && getLoadManager().getNodeStatus(dataNodeId) != NodeStatus.Removing; + } + public LockQueue getNodeLock() { return nodeLock; } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/AbstractOperateSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/AbstractOperateSubscriptionProcedure.java index 0b246ac4ef7d5..927c306ae5587 100644 --- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/AbstractOperateSubscriptionProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/AbstractOperateSubscriptionProcedure.java @@ -240,6 +240,7 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, OperateSubscriptionS String.format( "ProcedureId %s: Fail to %s because %s", getProcId(), getOperation().name(), e.getMessage()))); + return Flow.NO_MORE_STATE; } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/SubscriptionOperation.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/SubscriptionOperation.java index 4428a7ee4d305..84b94ead22cbf 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/SubscriptionOperation.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/SubscriptionOperation.java @@ -28,8 +28,10 @@ public enum SubscriptionOperation { ALTER_CONSUMER_GROUP("alter consumer group"), CREATE_SUBSCRIPTION("create subscription"), DROP_SUBSCRIPTION("drop subscription"), + HANDLE_LEADER_CHANGE("handle leader change"), SYNC_CONSUMER_GROUP_META("sync consumer group meta"), SYNC_TOPIC_META("sync topic meta"), + SYNC_COMMIT_PROGRESS("sync commit progress"), ; private final String name; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/consumer/runtime/CommitProgressSyncProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/consumer/runtime/CommitProgressSyncProcedure.java new file mode 100644 index 0000000000000..e9b3056e66211 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/consumer/runtime/CommitProgressSyncProcedure.java @@ -0,0 +1,316 
@@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime; + +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.pipe.config.PipeConfig; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; +import org.apache.iotdb.confignode.persistence.subscription.SubscriptionInfo; +import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; +import org.apache.iotdb.confignode.procedure.impl.subscription.AbstractOperateSubscriptionProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.SubscriptionOperation; +import org.apache.iotdb.confignode.procedure.state.ProcedureLockState; +import org.apache.iotdb.confignode.procedure.store.ProcedureType; +import org.apache.iotdb.consensus.exception.ConsensusException; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; +import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import 
org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; + +/** + * Periodically pulls commit progress from all DataNodes and persists the merged result to + * ConfigNode consensus. + */ +public class CommitProgressSyncProcedure extends AbstractOperateSubscriptionProcedure { + + private static final Logger LOGGER = LoggerFactory.getLogger(CommitProgressSyncProcedure.class); + + private static final long MIN_EXECUTION_INTERVAL_MS = + PipeConfig.getInstance().getPipeMetaSyncerSyncIntervalMinutes() * 60 * 1000 / 2; + private static final AtomicLong LAST_EXECUTION_TIME = new AtomicLong(0); + + public CommitProgressSyncProcedure() { + super(); + } + + @Override + protected AtomicReference acquireLockInternal( + ConfigNodeProcedureEnv configNodeProcedureEnv) { + return configNodeProcedureEnv + .getConfigManager() + .getSubscriptionManager() + .getSubscriptionCoordinator() + .tryLock(); + } + + @Override + protected ProcedureLockState acquireLock(ConfigNodeProcedureEnv configNodeProcedureEnv) { + if (System.currentTimeMillis() - LAST_EXECUTION_TIME.get() < MIN_EXECUTION_INTERVAL_MS) { + subscriptionInfo = null; + LOGGER.info( + "CommitProgressSyncProcedure: acquireLock, skip the procedure due to the last execution time {}", + LAST_EXECUTION_TIME.get()); + return ProcedureLockState.LOCK_ACQUIRED; + } + return super.acquireLock(configNodeProcedureEnv); + } + + @Override + protected SubscriptionOperation getOperation() { + return SubscriptionOperation.SYNC_COMMIT_PROGRESS; + } + + @Override + public boolean 
executeFromValidate(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: executeFromValidate"); + LAST_EXECUTION_TIME.set(System.currentTimeMillis()); + return true; + } + + @Override + public void executeFromOperateOnConfigNodes(ConfigNodeProcedureEnv env) + throws SubscriptionException { + LOGGER.info("CommitProgressSyncProcedure: executeFromOperateOnConfigNodes"); + + // 1. Pull commit progress from all DataNodes + final Map respMap = env.pullCommitProgressFromDataNodes(); + + // 2. Merge all DataNode responses with existing progress using Math::max + final Map mergedRegionProgress = + deserializeRegionProgressMap( + subscriptionInfo.get().getCommitProgressKeeper().getAllRegionProgress()); + + for (Map.Entry entry : respMap.entrySet()) { + final TPullCommitProgressResp resp = entry.getValue(); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn( + "Failed to pull commit progress from DataNode {}, status: {}", + entry.getKey(), + resp.getStatus()); + continue; + } + if (resp.isSetCommitRegionProgress()) { + for (final Map.Entry progressEntry : + resp.getCommitRegionProgress().entrySet()) { + final RegionProgress incomingProgress = + deserializeRegionProgress(progressEntry.getKey(), progressEntry.getValue()); + if (Objects.nonNull(incomingProgress)) { + mergedRegionProgress.merge( + progressEntry.getKey(), + incomingProgress, + CommitProgressSyncProcedure::mergeRegionProgress); + } + } + } + } + + // 3. 
Write the merged progress to consensus + TSStatus response; + try { + response = + env.getConfigManager() + .getConsensusManager() + .write( + new CommitProgressHandleMetaChangePlan( + serializeRegionProgressMap(mergedRegionProgress))); + } catch (ConsensusException e) { + LOGGER.warn("Failed in the write API executing the consensus layer due to: ", e); + response = new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + response.setMessage(e.getMessage()); + } + if (response.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + throw new SubscriptionException(response.getMessage()); + } + } + + @Override + public void executeFromOperateOnDataNodes(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: executeFromOperateOnDataNodes (no-op)"); + // No need to push back to DataNodes + } + + @Override + public void rollbackFromValidate(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: rollbackFromValidate"); + } + + @Override + public void rollbackFromOperateOnConfigNodes(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: rollbackFromOperateOnConfigNodes"); + } + + @Override + public void rollbackFromOperateOnDataNodes(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: rollbackFromOperateOnDataNodes"); + } + + @Override + public void serialize(DataOutputStream stream) throws IOException { + stream.writeShort(ProcedureType.COMMIT_PROGRESS_SYNC_PROCEDURE.getTypeCode()); + super.serialize(stream); + } + + @Override + public boolean equals(Object o) { + return o instanceof CommitProgressSyncProcedure; + } + + @Override + public int hashCode() { + return 0; + } + + private static Map deserializeRegionProgressMap( + final Map serializedRegionProgressMap) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : serializedRegionProgressMap.entrySet()) { + final RegionProgress regionProgress = + deserializeRegionProgress(entry.getKey(), 
entry.getValue()); + if (Objects.nonNull(regionProgress)) { + result.put(entry.getKey(), regionProgress); + } + } + return result; + } + + private static Map serializeRegionProgressMap( + final Map regionProgressMap) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : regionProgressMap.entrySet()) { + final ByteBuffer serialized = serializeRegionProgress(entry.getValue()); + if (Objects.nonNull(serialized)) { + result.put(entry.getKey(), serialized); + } + } + return result; + } + + private static RegionProgress deserializeRegionProgress( + final String key, final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return null; + } + final ByteBuffer duplicate = buffer.slice(); + try { + return RegionProgress.deserialize(duplicate); + } catch (final RuntimeException e) { + LOGGER.warn( + "CommitProgressSyncProcedure: failed to deserialize region progress, key={}, summary={}", + key, + summarizeRegionProgressPayload(buffer), + e); + throw e; + } + } + + private static String summarizeRegionProgressPayload(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return "null"; + } + final int position = buffer.position(); + final int limit = buffer.limit(); + final int capacity = buffer.capacity(); + final ByteBuffer duplicate = buffer.slice(); + final int remaining = duplicate.remaining(); + final String firstIntSummary; + if (remaining >= Integer.BYTES) { + final int firstInt = duplicate.getInt(); + firstIntSummary = firstInt + "(0x" + String.format("%08x", firstInt) + ")"; + duplicate.position(0); + } else { + firstIntSummary = "n/a"; + } + final int sampleLength = Math.min(16, remaining); + final byte[] sample = new byte[sampleLength]; + duplicate.get(sample, 0, sampleLength); + return "pos=" + + position + + ", limit=" + + limit + + ", capacity=" + + capacity + + ", remaining=" + + remaining + + ", firstInt=" + + firstIntSummary + + ", firstBytes=" + + bytesToHex(sample); + } + + private static String bytesToHex(final byte[] bytes) { + 
if (Objects.isNull(bytes) || bytes.length == 0) { + return ""; + } + final StringBuilder builder = new StringBuilder(bytes.length * 2); + for (final byte b : bytes) { + builder.append(String.format("%02x", b)); + } + return builder.toString(); + } + + private static ByteBuffer serializeRegionProgress(final RegionProgress regionProgress) { + try (final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()).asReadOnlyBuffer(); + } catch (final IOException e) { + throw new RuntimeException("Failed to serialize region progress " + regionProgress, e); + } + } + + private static RegionProgress mergeRegionProgress( + final RegionProgress left, final RegionProgress right) { + final Map merged = new LinkedHashMap<>(left.getWriterPositions()); + for (final Map.Entry entry : right.getWriterPositions().entrySet()) { + merged.merge( + entry.getKey(), + entry.getValue(), + (oldProgress, newProgress) -> + compareWriterProgress(newProgress, oldProgress) > 0 ? 
newProgress : oldProgress); + } + return new RegionProgress(merged); + } + + private static int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/runtime/SubscriptionHandleLeaderChangeProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/runtime/SubscriptionHandleLeaderChangeProcedure.java new file mode 100644 index 0000000000000..5337b719d207f --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/runtime/SubscriptionHandleLeaderChangeProcedure.java @@ -0,0 +1,444 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.procedure.impl.subscription.runtime; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.cluster.NodeStatus; +import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; +import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; +import org.apache.iotdb.confignode.procedure.impl.subscription.AbstractOperateSubscriptionProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.SubscriptionOperation; +import org.apache.iotdb.confignode.procedure.store.ProcedureType; +import org.apache.iotdb.consensus.exception.ConsensusException; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; +import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaResp; +import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaResp; +import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.apache.tsfile.utils.Pair; +import org.apache.tsfile.utils.ReadWriteIOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Handles subscription runtime leader changes. 
The first version focuses on pulling the latest + * commit progress during leader migration so the new runtime owner starts from a fresher frontier. + */ +public class SubscriptionHandleLeaderChangeProcedure extends AbstractOperateSubscriptionProcedure { + + private static final Logger LOGGER = + LoggerFactory.getLogger(SubscriptionHandleLeaderChangeProcedure.class); + + private Map> regionGroupToOldAndNewLeaderPairMap = + new HashMap<>(); + private long runtimeVersion; + + public SubscriptionHandleLeaderChangeProcedure() { + super(); + } + + public SubscriptionHandleLeaderChangeProcedure( + final Map> regionGroupToOldAndNewLeaderPairMap, + final long runtimeVersion) { + super(); + this.regionGroupToOldAndNewLeaderPairMap = regionGroupToOldAndNewLeaderPairMap; + this.runtimeVersion = runtimeVersion; + } + + @Override + protected SubscriptionOperation getOperation() { + return SubscriptionOperation.HANDLE_LEADER_CHANGE; + } + + @Override + public boolean executeFromValidate(final ConfigNodeProcedureEnv env) { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: executeFromValidate"); + if (regionGroupToOldAndNewLeaderPairMap.isEmpty()) { + return false; + } + for (final TopicMeta topicMeta : subscriptionInfo.get().getAllTopicMeta()) { + if (topicMeta.getConfig().isConsensusMode()) { + return true; + } + } + return false; + } + + @Override + public void executeFromOperateOnConfigNodes(final ConfigNodeProcedureEnv env) + throws SubscriptionException { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: executeFromOperateOnConfigNodes"); + + final Map respMap = env.pullCommitProgressFromDataNodes(); + final Map mergedRegionProgress = + deserializeRegionProgressMap( + subscriptionInfo.get().getCommitProgressKeeper().getAllRegionProgress()); + + for (final Map.Entry entry : respMap.entrySet()) { + final TPullCommitProgressResp resp = entry.getValue(); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn( + 
"SubscriptionHandleLeaderChangeProcedure: failed to pull commit progress from DataNode {}, status: {}", + entry.getKey(), + resp.getStatus()); + continue; + } + if (resp.isSetCommitRegionProgress()) { + for (final Map.Entry progressEntry : + resp.getCommitRegionProgress().entrySet()) { + final RegionProgress incomingProgress = + deserializeRegionProgress(progressEntry.getKey(), progressEntry.getValue()); + if (Objects.nonNull(incomingProgress)) { + mergedRegionProgress.merge( + progressEntry.getKey(), + incomingProgress, + SubscriptionHandleLeaderChangeProcedure::mergeRegionProgress); + } + } + } + } + + final TSStatus response; + try { + response = + env.getConfigManager() + .getConsensusManager() + .write( + new CommitProgressHandleMetaChangePlan( + serializeRegionProgressMap(mergedRegionProgress))); + } catch (final ConsensusException e) { + LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: failed in the write API executing the consensus layer due to: ", + e); + throw new SubscriptionException(e.getMessage()); + } + + if (response.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + throw new SubscriptionException(response.getMessage()); + } + } + + @Override + public void executeFromOperateOnDataNodes(final ConfigNodeProcedureEnv env) + throws SubscriptionException, IOException { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: executeFromOperateOnDataNodes"); + + final Map topicRespMap = pushTopicMetaToDataNodes(env); + topicRespMap.forEach( + (dataNodeId, resp) -> { + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: ignored failed topic meta push to DataNode {}, status: {}", + dataNodeId, + resp.getStatus()); + } + }); + + final Map consumerGroupRespMap = + pushConsumerGroupMetaToDataNodes(env); + consumerGroupRespMap.forEach( + (dataNodeId, resp) -> { + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + 
LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: ignored failed consumer group meta push to DataNode {}, status: {}", + dataNodeId, + resp.getStatus()); + } + }); + + final Map> runtimeLeaderPairMap = + regionGroupToOldAndNewLeaderPairMap.entrySet().stream() + .filter(entry -> entry.getValue().getRight() >= 0) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + if (!runtimeLeaderPairMap.isEmpty()) { + final Set readableDataNodeIds = getReadableDataNodeIds(env); + final Map runtimeRespMap = + env.pushSubscriptionRuntimeStatesToDataNodes(runtimeLeaderPairMap, runtimeVersion); + final String runtimePushError = + collectRequiredRuntimePushFailures(readableDataNodeIds, runtimeRespMap); + if (!runtimePushError.isEmpty()) { + throw new SubscriptionException( + String.format( + "Failed to push subscription runtime state to readable DataNodes during leader change, details: %s", + runtimePushError)); + } + runtimeRespMap.forEach( + (dataNodeId, status) -> { + if (!readableDataNodeIds.contains(dataNodeId) + && status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: ignored failed subscription runtime push to unreadable DataNode {}, status: {}", + dataNodeId, + status); + } + }); + } + } + + @Override + public void rollbackFromValidate(final ConfigNodeProcedureEnv env) { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: rollbackFromValidate"); + } + + @Override + public void rollbackFromOperateOnConfigNodes(final ConfigNodeProcedureEnv env) { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: rollbackFromOperateOnConfigNodes"); + } + + @Override + public void rollbackFromOperateOnDataNodes(final ConfigNodeProcedureEnv env) { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: rollbackFromOperateOnDataNodes"); + } + + @Override + public void serialize(final DataOutputStream stream) throws IOException { + 
stream.writeShort(ProcedureType.SUBSCRIPTION_HANDLE_LEADER_CHANGE_PROCEDURE.getTypeCode()); + super.serialize(stream); + ReadWriteIOUtils.write(runtimeVersion, stream); + ReadWriteIOUtils.write(regionGroupToOldAndNewLeaderPairMap.size(), stream); + for (final Map.Entry> entry : + regionGroupToOldAndNewLeaderPairMap.entrySet()) { + ReadWriteIOUtils.write(entry.getKey().getId(), stream); + ReadWriteIOUtils.write(entry.getValue().getLeft(), stream); + ReadWriteIOUtils.write(entry.getValue().getRight(), stream); + } + } + + @Override + public void deserialize(final ByteBuffer byteBuffer) { + super.deserialize(byteBuffer); + runtimeVersion = ReadWriteIOUtils.readLong(byteBuffer); + final int size = ReadWriteIOUtils.readInt(byteBuffer); + for (int i = 0; i < size; ++i) { + final int dataRegionGroupId = ReadWriteIOUtils.readInt(byteBuffer); + final int oldLeaderId = ReadWriteIOUtils.readInt(byteBuffer); + final int newLeaderId = ReadWriteIOUtils.readInt(byteBuffer); + regionGroupToOldAndNewLeaderPairMap.put( + new TConsensusGroupId(TConsensusGroupType.DataRegion, dataRegionGroupId), + new Pair<>(oldLeaderId, newLeaderId)); + } + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final SubscriptionHandleLeaderChangeProcedure that = + (SubscriptionHandleLeaderChangeProcedure) o; + return getProcId() == that.getProcId() + && getCurrentState().equals(that.getCurrentState()) + && getCycles() == that.getCycles() + && runtimeVersion == that.runtimeVersion + && regionGroupToOldAndNewLeaderPairMap.equals(that.regionGroupToOldAndNewLeaderPairMap); + } + + @Override + public int hashCode() { + return Objects.hash( + getProcId(), + getCurrentState(), + getCycles(), + runtimeVersion, + regionGroupToOldAndNewLeaderPairMap); + } + + private static Map deserializeRegionProgressMap( + final Map serializedRegionProgressMap) { + final Map result = new HashMap<>(); + for (final 
Map.Entry entry : serializedRegionProgressMap.entrySet()) { + final RegionProgress regionProgress = + deserializeRegionProgress(entry.getKey(), entry.getValue()); + if (Objects.nonNull(regionProgress)) { + result.put(entry.getKey(), regionProgress); + } + } + return result; + } + + private static Map serializeRegionProgressMap( + final Map regionProgressMap) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : regionProgressMap.entrySet()) { + final ByteBuffer serialized = serializeRegionProgress(entry.getValue()); + if (Objects.nonNull(serialized)) { + result.put(entry.getKey(), serialized); + } + } + return result; + } + + private static RegionProgress deserializeRegionProgress( + final String key, final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return null; + } + final ByteBuffer duplicate = buffer.slice(); + try { + return RegionProgress.deserialize(duplicate); + } catch (final RuntimeException e) { + LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: failed to deserialize region progress, key={}, summary={}", + key, + summarizeRegionProgressPayload(buffer), + e); + throw e; + } + } + + private static String summarizeRegionProgressPayload(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return "null"; + } + final int position = buffer.position(); + final int limit = buffer.limit(); + final int capacity = buffer.capacity(); + final ByteBuffer duplicate = buffer.slice(); + final int remaining = duplicate.remaining(); + final String firstIntSummary; + if (remaining >= Integer.BYTES) { + final int firstInt = duplicate.getInt(); + firstIntSummary = firstInt + "(0x" + String.format("%08x", firstInt) + ")"; + duplicate.position(0); + } else { + firstIntSummary = "n/a"; + } + final int sampleLength = Math.min(16, remaining); + final byte[] sample = new byte[sampleLength]; + duplicate.get(sample, 0, sampleLength); + return "pos=" + + position + + ", limit=" + + limit + + ", capacity=" + + capacity + + ", remaining=" + + 
remaining + + ", firstInt=" + + firstIntSummary + + ", firstBytes=" + + bytesToHex(sample); + } + + private static String bytesToHex(final byte[] bytes) { + if (Objects.isNull(bytes) || bytes.length == 0) { + return ""; + } + final StringBuilder builder = new StringBuilder(bytes.length * 2); + for (final byte b : bytes) { + builder.append(String.format("%02x", b)); + } + return builder.toString(); + } + + private static ByteBuffer serializeRegionProgress(final RegionProgress regionProgress) { + try (final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()).asReadOnlyBuffer(); + } catch (final IOException e) { + throw new RuntimeException("Failed to serialize region progress " + regionProgress, e); + } + } + + private static RegionProgress mergeRegionProgress( + final RegionProgress left, final RegionProgress right) { + final Map merged = new LinkedHashMap<>(left.getWriterPositions()); + for (final Map.Entry entry : right.getWriterPositions().entrySet()) { + merged.merge( + entry.getKey(), + entry.getValue(), + (oldProgress, newProgress) -> + compareWriterProgress(newProgress, oldProgress) > 0 ? 
newProgress : oldProgress); + } + return new RegionProgress(merged); + } + + private static int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } + + private Set getReadableDataNodeIds(final ConfigNodeProcedureEnv env) + throws SubscriptionException { + final Set readableDataNodeIds = + env + .getConfigManager() + .getLoadManager() + .filterDataNodeThroughStatus(NodeStatus::isReadable) + .stream() + .collect(Collectors.toSet()); + if (readableDataNodeIds.isEmpty()) { + throw new SubscriptionException( + "No readable DataNode is available to accept subscription metadata/runtime updates during leader change"); + } + return readableDataNodeIds; + } + + private String collectRequiredRuntimePushFailures( + final Set readableDataNodeIds, final Map respMap) { + final StringBuilder errorMessageBuilder = new StringBuilder(); + for (final Integer dataNodeId : readableDataNodeIds) { + final TSStatus status = respMap.get(dataNodeId); + if (Objects.isNull(status)) { + errorMessageBuilder + .append("DataNode ") + .append(dataNodeId) + .append(": missing subscription runtime push response; "); + continue; + } + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + errorMessageBuilder + .append("DataNode ") + .append(dataNodeId) + .append(": ") + .append(status) + .append("; "); + } + } + return errorMessageBuilder.toString(); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java index cb5edd8cd91a3..a62ab2c2e2843 100644 --- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java @@ -52,6 +52,7 @@ import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.Set; import java.util.stream.Collectors; public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndPipeProcedure { @@ -66,6 +67,8 @@ public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndP private AlterConsumerGroupProcedure alterConsumerGroupProcedure; private List createPipeProcedures = new ArrayList<>(); + private Set consensusTopicNames = new HashSet<>(); + // TODO: remove this variable later private final List alterTopicProcedures = new ArrayList<>(); // unused now @@ -103,15 +106,30 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env) alterConsumerGroupProcedure = new AlterConsumerGroupProcedure(updatedConsumerGroupMeta, subscriptionInfo); - // Construct CreatePipeProcedureV2s + // Construct CreatePipeProcedureV2s (for non-consensus topics) for (final String topicName : subscribeReq.getTopicNames()) { + final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName); + + final String topicMode = topicMeta.getConfig().getMode(); + final boolean isConsensusBasedTopic = topicMeta.getConfig().isConsensusMode(); + + if (isConsensusBasedTopic) { + // skip pipe creation + consensusTopicNames.add(topicName); + LOGGER.info( + "CreateSubscriptionProcedure: topic [{}] uses consensus subscription mode " + + "(mode={}), skipping pipe creation", + topicName, + topicMode); + continue; + } + final String pipeName = PipeStaticMeta.generateSubscriptionPipeName(topicName, consumerGroupId); if (!subscriptionInfo.get().isTopicSubscribedByConsumerGroup(topicName, consumerGroupId) // even if there existed 
subscription meta, if there is no corresponding pipe meta, it // will try to create the pipe || !pipeTaskInfo.get().isPipeExisted(pipeName)) { - final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName); createPipeProcedures.add( new CreatePipeProcedureV2( new TCreatePipeReq() @@ -177,20 +195,29 @@ protected void executeFromOperateOnDataNodes(final ConfigNodeProcedureEnv env) // Push consumer group meta to data nodes alterConsumerGroupProcedure.executeFromOperateOnDataNodes(env); - // Push pipe meta to data nodes - final List pipeNames = - createPipeProcedures.stream() - .map(CreatePipeProcedureV2::getPipeName) - .collect(Collectors.toList()); - final String exceptionMessage = - AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe( - null, pushMultiPipeMetaToDataNodes(pipeNames, env)); - if (!exceptionMessage.isEmpty()) { - // throw exception instead of logging warn, do not rely on metadata synchronization - throw new SubscriptionException( - String.format( - "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.", - pipeNames, subscribeReq, exceptionMessage)); + if (!consensusTopicNames.isEmpty()) { + LOGGER.info( + "CreateSubscriptionProcedure: consensus-based topics {} will be handled by DataNode " + + "via consumer group meta push (no pipe creation needed)", + consensusTopicNames); + } + + // Push pipe meta to data nodes (only for non-consensus pipe-based topics) + if (!createPipeProcedures.isEmpty()) { + final List pipeNames = + createPipeProcedures.stream() + .map(CreatePipeProcedureV2::getPipeName) + .collect(Collectors.toList()); + final String exceptionMessage = + AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe( + null, pushMultiPipeMetaToDataNodes(pipeNames, env)); + if (!exceptionMessage.isEmpty()) { + // throw exception instead of logging warn, do not rely on metadata synchronization + throw new SubscriptionException( + String.format( + 
"Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.", + pipeNames, subscribeReq, exceptionMessage)); + } } } @@ -297,6 +324,12 @@ public void serialize(final DataOutputStream stream) throws IOException { } else { ReadWriteIOUtils.write(false, stream); } + + // Serialize consensus topic names + ReadWriteIOUtils.write(consensusTopicNames.size(), stream); + for (final String consensusTopicName : consensusTopicNames) { + ReadWriteIOUtils.write(consensusTopicName, stream); + } } @Override @@ -348,6 +381,14 @@ public void deserialize(final ByteBuffer byteBuffer) { } } } + + // Deserialize consensus topic names + if (byteBuffer.hasRemaining()) { + size = ReadWriteIOUtils.readInt(byteBuffer); + for (int i = 0; i < size; ++i) { + consensusTopicNames.add(ReadWriteIOUtils.readString(byteBuffer)); + } + } } @Override @@ -364,7 +405,8 @@ public boolean equals(final Object o) { && getCycles() == that.getCycles() && Objects.equals(subscribeReq, that.subscribeReq) && Objects.equals(alterConsumerGroupProcedure, that.alterConsumerGroupProcedure) - && Objects.equals(createPipeProcedures, that.createPipeProcedures); + && Objects.equals(createPipeProcedures, that.createPipeProcedures) + && Objects.equals(consensusTopicNames, that.consensusTopicNames); } @Override @@ -375,7 +417,8 @@ public int hashCode() { getCycles(), subscribeReq, alterConsumerGroupProcedure, - createPipeProcedures); + createPipeProcedures, + consensusTopicNames); } @TestOnly diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java index 6741a6c1e2a84..08d47d82cbece 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java @@ -22,6 +22,7 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.pipe.agent.task.meta.PipeStaticMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; +import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; import org.apache.iotdb.commons.utils.TestOnly; import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.task.DropPipePlanV2; @@ -100,6 +101,19 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env) for (final String topic : unsubscribeReq.getTopicNames()) { if (topicsUnsubByGroup.contains(topic)) { + final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topic); + final String topicMode = topicMeta.getConfig().getMode(); + final boolean isConsensusBasedTopic = topicMeta.getConfig().isConsensusMode(); + + if (isConsensusBasedTopic) { + LOGGER.info( + "DropSubscriptionProcedure: topic [{}] uses consensus subscription mode " + + "(mode={}), skipping pipe removal", + topic, + topicMode); + continue; + } + // Topic will be subscribed by no consumers in this group dropPipeProcedures.add( new DropPipeProcedureV2( diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java index 140fffa852ccc..2af973e0c4425 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java @@ -72,7 +72,9 @@ import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.AlterConsumerGroupProcedure; import 
org.apache.iotdb.confignode.procedure.impl.subscription.consumer.CreateConsumerProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.DropConsumerProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime.CommitProgressSyncProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime.ConsumerGroupMetaSyncProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.runtime.SubscriptionHandleLeaderChangeProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.subscription.CreateSubscriptionProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.subscription.DropSubscriptionProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.topic.AlterTopicProcedure; @@ -396,6 +398,12 @@ public Procedure create(ByteBuffer buffer) throws IOException { case CONSUMER_GROUP_META_SYNC_PROCEDURE: procedure = new ConsumerGroupMetaSyncProcedure(); break; + case COMMIT_PROGRESS_SYNC_PROCEDURE: + procedure = new CommitProgressSyncProcedure(); + break; + case SUBSCRIPTION_HANDLE_LEADER_CHANGE_PROCEDURE: + procedure = new SubscriptionHandleLeaderChangeProcedure(); + break; case CREATE_MANY_DATABASES_PROCEDURE: procedure = new CreateManyDatabasesProcedure(); break; @@ -544,6 +552,10 @@ public static ProcedureType getProcedureType(final Procedure procedure) { return ProcedureType.ALTER_CONSUMER_GROUP_PROCEDURE; } else if (procedure instanceof ConsumerGroupMetaSyncProcedure) { return ProcedureType.CONSUMER_GROUP_META_SYNC_PROCEDURE; + } else if (procedure instanceof CommitProgressSyncProcedure) { + return ProcedureType.COMMIT_PROGRESS_SYNC_PROCEDURE; + } else if (procedure instanceof SubscriptionHandleLeaderChangeProcedure) { + return ProcedureType.SUBSCRIPTION_HANDLE_LEADER_CHANGE_PROCEDURE; } else if (procedure instanceof DeleteLogicalViewProcedure) { return ProcedureType.DELETE_LOGICAL_VIEW_PROCEDURE; } else if (procedure instanceof 
AlterLogicalViewProcedure) { diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java index 839c8ace0984d..1cd6a46a4dcd1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java @@ -167,6 +167,8 @@ public enum ProcedureType { ALTER_CONSUMER_GROUP_PROCEDURE((short) 1507), TOPIC_META_SYNC_PROCEDURE((short) 1508), CONSUMER_GROUP_META_SYNC_PROCEDURE((short) 1509), + COMMIT_PROGRESS_SYNC_PROCEDURE((short) 1510), + SUBSCRIPTION_HANDLE_LEADER_CHANGE_PROCEDURE((short) 1511), /** Other */ @TestOnly diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java index 4d01f3770c218..d4c4b141916d6 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java @@ -159,6 +159,8 @@ import org.apache.iotdb.confignode.rpc.thrift.TGetAllTemplatesResp; import org.apache.iotdb.confignode.rpc.thrift.TGetAllTopicInfoResp; import org.apache.iotdb.confignode.rpc.thrift.TGetClusterIdResp; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressReq; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDataNodeLocationsResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDatabaseReq; import org.apache.iotdb.confignode.rpc.thrift.TGetJarInListReq; @@ -1313,6 +1315,11 @@ public TGetAllSubscriptionInfoResp getAllSubscriptionInfo() { return 
configManager.getAllSubscriptionInfo(); } + @Override + public TGetCommitProgressResp getCommitProgress(TGetCommitProgressReq req) { + return configManager.getCommitProgress(req); + } + @Override public TGetRegionIdResp getRegionId(TGetRegionIdReq req) { return configManager.getRegionId(req); diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfoTopicValidationTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfoTopicValidationTest.java new file mode 100644 index 0000000000000..144b1e9530da1 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfoTopicValidationTest.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.persistence.subscription; + +import org.apache.iotdb.commons.pipe.config.constant.SystemConstant; +import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; +import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.CreateTopicPlan; +import org.apache.iotdb.confignode.rpc.thrift.TCreateTopicReq; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; + +public class SubscriptionInfoTopicValidationTest { + + @Test + public void testValidateConsensusTableColumnPatternOnCreate() throws Exception { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = newConsensusTableTopicAttributes(); + attributes.put(TopicConstant.COLUMN_KEY, "(id1|m1)"); + + Assert.assertTrue( + subscriptionInfo.validateBeforeCreatingTopic( + new TCreateTopicReq("table_topic").setTopicAttributes(attributes))); + } + + @Test + public void testRejectColumnPatternOnTreeTopic() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = new HashMap<>(); + attributes.put(TopicConstant.COLUMN_KEY, "id1"); + + assertCreateRejected(subscriptionInfo, attributes, "only supported for table topics"); + } + + @Test + public void testRejectColumnPatternOnTsFileTopic() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = newConsensusTableTopicAttributes(); + attributes.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_TS_FILE_VALUE); + attributes.put(TopicConstant.COLUMN_KEY, "id1"); + + assertCreateRejected(subscriptionInfo, attributes, "mode=consensus only supports format"); + } + + @Test + public void testRejectLegacyTsFileAliasOnConsensusTopic() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = 
newConsensusTableTopicAttributes(); + attributes.put(TopicConstant.FORMAT_KEY, "TsFileHandler"); + + assertCreateRejected(subscriptionInfo, attributes, "mode=consensus only supports format"); + } + + @Test + public void testRejectIllegalColumnRegex() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = newConsensusTableTopicAttributes(); + attributes.put(TopicConstant.COLUMN_KEY, "["); + + assertCreateRejected(subscriptionInfo, attributes, "illegal column"); + } + + @Test + public void testRejectAlteringColumnPattern() throws Exception { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map originalAttributes = newConsensusTableTopicAttributes(); + originalAttributes.put(TopicConstant.COLUMN_KEY, "id1"); + subscriptionInfo.createTopic( + new CreateTopicPlan(new TopicMeta("table_topic", 1L, originalAttributes))); + + final Map updatedAttributes = newConsensusTableTopicAttributes(); + updatedAttributes.put(TopicConstant.COLUMN_KEY, "m1"); + + try { + subscriptionInfo.validateBeforeAlteringTopic( + new TopicMeta("table_topic", 2L, updatedAttributes)); + Assert.fail("Expected altering the column pattern to be rejected"); + } catch (final SubscriptionException e) { + Assert.assertTrue(e.getMessage().contains("changing column is not supported")); + } + } + + @Test + public void testValidateRetentionConfigOnCreate() throws Exception { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = newConsensusTableTopicAttributes(); + attributes.put(TopicConstant.RETENTION_BYTES_KEY, "1048576"); + attributes.put(TopicConstant.RETENTION_MS_KEY, "-1"); + + Assert.assertTrue( + subscriptionInfo.validateBeforeCreatingTopic( + new TCreateTopicReq("table_topic").setTopicAttributes(attributes))); + } + + @Test + public void testRejectRetentionOnTsFileTopic() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = 
newConsensusTableTopicAttributes(); + attributes.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_TS_FILE_VALUE); + attributes.put(TopicConstant.RETENTION_BYTES_KEY, "1024"); + + assertCreateRejected(subscriptionInfo, attributes, "mode=consensus only supports format"); + } + + @Test + public void testRejectIllegalRetentionValue() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = newConsensusTableTopicAttributes(); + attributes.put(TopicConstant.RETENTION_BYTES_KEY, "0"); + + assertCreateRejected(subscriptionInfo, attributes, "expected -1 or a positive long value"); + } + + @Test + public void testRejectIllegalRetentionFormat() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = newConsensusTableTopicAttributes(); + attributes.put(TopicConstant.RETENTION_MS_KEY, "1h"); + + assertCreateRejected(subscriptionInfo, attributes, "expected a long value"); + } + + @Test + public void testRejectAlteringRetentionConfig() throws Exception { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map originalAttributes = newConsensusTableTopicAttributes(); + originalAttributes.put(TopicConstant.RETENTION_BYTES_KEY, "1024"); + subscriptionInfo.createTopic( + new CreateTopicPlan(new TopicMeta("table_topic", 1L, originalAttributes))); + + final Map updatedAttributes = newConsensusTableTopicAttributes(); + updatedAttributes.put(TopicConstant.RETENTION_BYTES_KEY, "2048"); + + try { + subscriptionInfo.validateBeforeAlteringTopic( + new TopicMeta("table_topic", 2L, updatedAttributes)); + Assert.fail("Expected altering retention.bytes to be rejected"); + } catch (final SubscriptionException e) { + Assert.assertTrue(e.getMessage().contains("changing retention.bytes is not supported")); + } + } + + @Test + public void testRejectIllegalMode() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = new HashMap<>(); + 
attributes.put(TopicConstant.MODE_KEY, "wal"); + + assertCreateRejected(subscriptionInfo, attributes, "unsupported mode"); + } + + @Test + public void testRejectConsensusOnlyColumnOnLiveTopic() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = newLiveTableTopicAttributes(); + attributes.put(TopicConstant.COLUMN_KEY, "id1"); + + assertCreateRejected(subscriptionInfo, attributes, "only supported for consensus table topics"); + } + + @Test + public void testRejectConsensusOnlyRetentionOnLiveTopic() { + final SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + final Map attributes = newLiveTableTopicAttributes(); + attributes.put(TopicConstant.RETENTION_BYTES_KEY, "1024"); + + assertCreateRejected(subscriptionInfo, attributes, "only supported for consensus topics"); + } + + private static Map newConsensusTableTopicAttributes() { + final Map attributes = new HashMap<>(); + attributes.put(SystemConstant.SQL_DIALECT_KEY, SystemConstant.SQL_DIALECT_TABLE_VALUE); + attributes.put(TopicConstant.MODE_KEY, TopicConstant.MODE_CONSENSUS_VALUE); + attributes.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_RECORD_HANDLER_VALUE); + return attributes; + } + + private static Map newLiveTableTopicAttributes() { + final Map attributes = new HashMap<>(); + attributes.put(SystemConstant.SQL_DIALECT_KEY, SystemConstant.SQL_DIALECT_TABLE_VALUE); + attributes.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE); + attributes.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_RECORD_HANDLER_VALUE); + return attributes; + } + + private static void assertCreateRejected( + final SubscriptionInfo subscriptionInfo, + final Map attributes, + final String expectedMessagePart) { + try { + subscriptionInfo.validateBeforeCreatingTopic( + new TCreateTopicReq("table_topic").setTopicAttributes(attributes)); + Assert.fail("Expected topic validation to fail"); + } catch (final SubscriptionException e) { + 
Assert.assertTrue(e.getMessage().contains(expectedMessagePart)); + } + } +} diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/DeserializedBatchIndexedConsensusRequest.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/DeserializedBatchIndexedConsensusRequest.java index 9cdeaf60c3029..332d5c2d6ef16 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/DeserializedBatchIndexedConsensusRequest.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/DeserializedBatchIndexedConsensusRequest.java @@ -28,13 +28,24 @@ public class DeserializedBatchIndexedConsensusRequest implements IConsensusRequest, Comparable { private final long startSyncIndex; private final long endSyncIndex; + private final int writerNodeId; + private final long writerEpoch; + private final long endPhysicalTime; private final List insertNodes; private long memorySize; public DeserializedBatchIndexedConsensusRequest( - long startSyncIndex, long endSyncIndex, int size) { + long startSyncIndex, + long endSyncIndex, + int size, + int writerNodeId, + long writerEpoch, + long endPhysicalTime) { this.startSyncIndex = startSyncIndex; this.endSyncIndex = endSyncIndex; + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + this.endPhysicalTime = endPhysicalTime; // use arraylist here because we know the number of requests this.insertNodes = new ArrayList<>(size); } @@ -47,6 +58,18 @@ public long getEndSyncIndex() { return endSyncIndex; } + public int getWriterNodeId() { + return writerNodeId; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + public long getEndPhysicalTime() { + return endPhysicalTime; + } + public List getInsertNodes() { return insertNodes; } @@ -72,12 +95,16 @@ public boolean equals(Object o) { DeserializedBatchIndexedConsensusRequest request = (DeserializedBatchIndexedConsensusRequest) o; return startSyncIndex == 
request.startSyncIndex && endSyncIndex == request.endSyncIndex + && writerNodeId == request.writerNodeId + && writerEpoch == request.writerEpoch + && endPhysicalTime == request.endPhysicalTime && Objects.equals(insertNodes, request.insertNodes); } @Override public int hashCode() { - return Objects.hash(startSyncIndex, endSyncIndex, insertNodes); + return Objects.hash( + startSyncIndex, endSyncIndex, writerNodeId, writerEpoch, endPhysicalTime, insertNodes); } @Override diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/IndexedConsensusRequest.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/IndexedConsensusRequest.java index 2bf01d4ef868c..4e5391fa04f18 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/IndexedConsensusRequest.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/IndexedConsensusRequest.java @@ -32,6 +32,19 @@ public class IndexedConsensusRequest implements IConsensusRequest { private final long searchIndex; private final long syncIndex; + + /** routing epoch from ConfigNode broadcast for ordered consensus subscription */ + private long routingEpoch = 0; + + /** Millisecond physical time used as the first ordering key in the new subscription progress. */ + private long physicalTime = 0; + + /** Writer node id used as the second ordering key across multiple writers. */ + private int nodeId = -1; + + /** Writer-local lifecycle id. */ + private long writerEpoch = 0; + private final List requests; private final List serializedRequests; private long memorySize = 0; @@ -86,6 +99,56 @@ public long getSyncIndex() { return syncIndex; } + /** + * Returns the writer-local sequence used by the new subscription progress model. + * + *

For locally generated requests this is the request searchIndex. For replicated requests this + * is the source leader's propagated localSeq carried in syncIndex. + */ + public long getProgressLocalSeq() { + return syncIndex >= 0 ? syncIndex : searchIndex; + } + + public long getRoutingEpoch() { + return routingEpoch; + } + + public IndexedConsensusRequest setRoutingEpoch(long routingEpoch) { + this.routingEpoch = routingEpoch; + return this; + } + + public long getPhysicalTime() { + return physicalTime; + } + + public IndexedConsensusRequest setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + return this; + } + + public int getNodeId() { + return nodeId; + } + + public IndexedConsensusRequest setNodeId(int nodeId) { + this.nodeId = nodeId; + return this; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + public IndexedConsensusRequest setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + return this; + } + + public long getLocalSeq() { + return searchIndex; + } + @Override public boolean equals(Object o) { if (this == o) { diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java index 32c4664b60dfd..7720cf59f055f 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java @@ -323,6 +323,8 @@ public static class Replication { private final IMemoryBlock consensusMemoryBlock; private final double maxMemoryRatioForQueue; private final long regionMigrationSpeedLimitBytesPerSecond; + private final long subscriptionWalRetentionSizeInBytes; + private final long subscriptionWalRetentionTimeMs; private Replication( int maxLogEntriesNumPerBatch, @@ -338,7 +340,9 @@ private Replication( long checkpointGap, IMemoryBlock 
consensusMemoryBlock, double maxMemoryRatioForQueue, - long regionMigrationSpeedLimitBytesPerSecond) { + long regionMigrationSpeedLimitBytesPerSecond, + long subscriptionWalRetentionSizeInBytes, + long subscriptionWalRetentionTimeMs) { this.maxLogEntriesNumPerBatch = maxLogEntriesNumPerBatch; this.maxSizePerBatch = maxSizePerBatch; this.maxPendingBatchesNum = maxPendingBatchesNum; @@ -353,6 +357,8 @@ private Replication( this.consensusMemoryBlock = consensusMemoryBlock; this.maxMemoryRatioForQueue = maxMemoryRatioForQueue; this.regionMigrationSpeedLimitBytesPerSecond = regionMigrationSpeedLimitBytesPerSecond; + this.subscriptionWalRetentionSizeInBytes = subscriptionWalRetentionSizeInBytes; + this.subscriptionWalRetentionTimeMs = subscriptionWalRetentionTimeMs; } public int getMaxLogEntriesNumPerBatch() { @@ -411,6 +417,14 @@ public long getRegionMigrationSpeedLimitBytesPerSecond() { return regionMigrationSpeedLimitBytesPerSecond; } + public long getSubscriptionWalRetentionSizeInBytes() { + return subscriptionWalRetentionSizeInBytes; + } + + public long getSubscriptionWalRetentionTimeMs() { + return subscriptionWalRetentionTimeMs; + } + public static Replication.Builder newBuilder() { return new Replication.Builder(); } @@ -434,6 +448,8 @@ public static class Builder { "Consensus-Default", null, Runtime.getRuntime().maxMemory() / 10); private double maxMemoryRatioForQueue = 0.6; private long regionMigrationSpeedLimitBytesPerSecond = 32 * 1024 * 1024L; + private long subscriptionWalRetentionSizeInBytes = 0; + private long subscriptionWalRetentionTimeMs = -1L; public Replication.Builder setMaxLogEntriesNumPerBatch(int maxLogEntriesNumPerBatch) { this.maxLogEntriesNumPerBatch = maxLogEntriesNumPerBatch; @@ -508,6 +524,17 @@ public Builder setRegionMigrationSpeedLimitBytesPerSecond( return this; } + public Builder setSubscriptionWalRetentionSizeInBytes( + long subscriptionWalRetentionSizeInBytes) { + this.subscriptionWalRetentionSizeInBytes = 
subscriptionWalRetentionSizeInBytes; + return this; + } + + public Builder setSubscriptionWalRetentionTimeMs(long subscriptionWalRetentionTimeMs) { + this.subscriptionWalRetentionTimeMs = subscriptionWalRetentionTimeMs; + return this; + } + public Replication build() { return new Replication( maxLogEntriesNumPerBatch, @@ -523,7 +550,9 @@ public Replication build() { checkpointGap, consensusMemoryBlock, maxMemoryRatioForQueue, - regionMigrationSpeedLimitBytesPerSecond); + regionMigrationSpeedLimitBytesPerSecond, + subscriptionWalRetentionSizeInBytes, + subscriptionWalRetentionTimeMs); } } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java index 959191ca2d6d3..8cb168272b295 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java @@ -82,6 +82,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.BiConsumer; +import java.util.function.Consumer; import java.util.stream.Collectors; public class IoTConsensus implements IConsensus { @@ -98,6 +99,19 @@ public class IoTConsensus implements IConsensus { private final IoTConsensusRPCService service; private final RegisterManager registerManager = new RegisterManager(); private IoTConsensusConfig config; + + /** + * Optional callback invoked after a new local peer is created via {@link #createLocalPeer}. Used + * by the subscription system to auto-bind prefetching queues to new DataRegions. + */ + public static volatile BiConsumer onNewPeerCreated; + + /** + * Optional callback invoked before a local peer is deleted via {@link #deleteLocalPeer}. Used by + * the subscription system to unbind and clean up prefetching queues before the region is removed. 
+ */ + public static volatile Consumer onPeerRemoved; + private final IClientManager clientManager; private final IClientManager syncClientManager; private final ScheduledExecutorService backgroundTaskService; @@ -299,11 +313,33 @@ public void createLocalPeer(ConsensusGroupId groupId, List peers) if (exist.get()) { throw new ConsensusGroupAlreadyExistException(groupId); } + + // Notify subscription system about new peer creation for auto-binding + final BiConsumer callback = onNewPeerCreated; + if (callback != null) { + try { + callback.accept(groupId, stateMachineMap.get(groupId)); + } catch (final Exception e) { + logger.warn("onNewPeerCreated callback failed for group {}", groupId, e); + } + } } @Override public void deleteLocalPeer(ConsensusGroupId groupId) throws ConsensusException { KillPoint.setKillPoint(IoTConsensusDeleteLocalPeerKillPoints.BEFORE_DELETE); + + // Notify subscription system before stopping the peer, so that subscription queues can + // properly unregister from the still-alive serverImpl. 
+ final Consumer removeCallback = onPeerRemoved; + if (removeCallback != null) { + try { + removeCallback.accept(groupId); + } catch (final Exception e) { + logger.warn("onPeerRemoved callback failed for group {}", groupId, e); + } + } + AtomicBoolean exist = new AtomicBoolean(false); stateMachineMap.computeIfPresent( groupId, diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java index 567261efffffa..62f68368ae499 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java @@ -46,6 +46,9 @@ import org.apache.iotdb.consensus.iot.logdispatcher.LogDispatcher; import org.apache.iotdb.consensus.iot.snapshot.IoTConsensusRateLimiter; import org.apache.iotdb.consensus.iot.snapshot.SnapshotFragmentReader; +import org.apache.iotdb.consensus.iot.subscription.SubscriptionQueueRegistry; +import org.apache.iotdb.consensus.iot.subscription.SubscriptionWalRetentionCalculator; +import org.apache.iotdb.consensus.iot.subscription.SubscriptionWalRetentionCalculator.SubscriptionRetentionBound; import org.apache.iotdb.consensus.iot.thrift.TActivatePeerReq; import org.apache.iotdb.consensus.iot.thrift.TActivatePeerRes; import org.apache.iotdb.consensus.iot.thrift.TBuildSyncLogChannelReq; @@ -58,6 +61,8 @@ import org.apache.iotdb.consensus.iot.thrift.TRemoveSyncLogChannelRes; import org.apache.iotdb.consensus.iot.thrift.TSendSnapshotFragmentReq; import org.apache.iotdb.consensus.iot.thrift.TSendSnapshotFragmentRes; +import org.apache.iotdb.consensus.iot.thrift.TSyncSafeHlcReq; +import org.apache.iotdb.consensus.iot.thrift.TSyncSafeHlcRes; import org.apache.iotdb.consensus.iot.thrift.TTriggerSnapshotLoadReq; import org.apache.iotdb.consensus.iot.thrift.TTriggerSnapshotLoadRes; import 
org.apache.iotdb.consensus.iot.thrift.TWaitReleaseAllRegionRelatedResourceReq; @@ -86,9 +91,11 @@ import java.util.LinkedList; import java.util.List; import java.util.Objects; +import java.util.Optional; import java.util.PriorityQueue; import java.util.TreeSet; import java.util.UUID; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @@ -103,6 +110,7 @@ public class IoTConsensusServerImpl { public static final String SNAPSHOT_DIR_NAME = "snapshot"; + private static final String WRITER_META_FILE_NAME = "writer.meta"; private static final Pattern SNAPSHOT_INDEX_PATTEN = Pattern.compile(".*[^\\d](?=(\\d+))"); private static final PerformanceOverviewMetrics PERFORMANCE_OVERVIEW_METRICS = PerformanceOverviewMetrics.getInstance(); @@ -128,6 +136,28 @@ public class IoTConsensusServerImpl { IoTConsensusRateLimiter.getInstance(); private IndexedConsensusRequest lastConsensusRequest; + // Subscription queues receive IndexedConsensusRequest in real-time from write(), + // similar to LogDispatcher, enabling in-memory data delivery without waiting for WAL flush. + private final SubscriptionQueueRegistry subscriptionQueueRegistry; + private final SubscriptionWalRetentionCalculator subscriptionWalRetentionCalculator; + + /** Current routing epoch for ordered consensus subscription. Set by external routing changes. */ + private volatile long currentRoutingEpoch = 0; + + /** Lifecycle identifier of the local writer for this region replica. */ + private volatile long currentWriterEpoch = 1; + + /** + * Maximum physical time known to this replica. Local writes assign from it; remote replication + * can also raise it so future local writes do not regress behind observed remote events. 
+ */ + private final AtomicLong lastAssignedPhysicalTime = new AtomicLong(0); + + private final WriterSafeFrontierTracker writerSafeFrontierTracker = + new WriterSafeFrontierTracker(); + + private final Path writerMetaPath; + public IoTConsensusServerImpl( String storageDir, Peer thisNode, @@ -150,6 +180,11 @@ public IoTConsensusServerImpl( this.consensusReqReader = (ConsensusReqReader) stateMachine.read(new GetConsensusReqReaderPlan()); this.searchIndex = new AtomicLong(consensusReqReader.getCurrentSearchIndex()); + this.subscriptionQueueRegistry = new SubscriptionQueueRegistry(consensusGroupId); + this.subscriptionWalRetentionCalculator = + new SubscriptionWalRetentionCalculator(consensusReqReader); + this.writerMetaPath = Paths.get(storageDir, WRITER_META_FILE_NAME); + initializeWriterMeta(); this.ioTConsensusServerMetrics = new IoTConsensusServerMetrics(this); this.logDispatcher = new LogDispatcher(this, clientManager); } @@ -209,6 +244,7 @@ public TSStatus write(IConsensusRequest request) { writeToStateMachineStartTime - getStateMachineLockTime); IndexedConsensusRequest indexedConsensusRequest = buildIndexedConsensusRequestForLocalRequest(request); + indexedConsensusRequest.setRoutingEpoch(currentRoutingEpoch); lastConsensusRequest = indexedConsensusRequest; if (indexedConsensusRequest.getSearchIndex() % 100000 == 0) { logger.info( @@ -228,6 +264,11 @@ public TSStatus write(IConsensusRequest request) { ioTConsensusServerMetrics.recordWriteStateMachineTime( writeToStateMachineEndTime - writeToStateMachineStartTime); if (result.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + writerSafeFrontierTracker.recordAppliedProgress( + thisNode.getNodeId(), + currentWriterEpoch, + indexedConsensusRequest.getPhysicalTime(), + indexedConsensusRequest.getLocalSeq()); // The index is used when constructing batch in LogDispatcher. 
If its value // increases but the corresponding request does not exist or is not put into // the queue, the dispatcher will try to find the request in WAL. This behavior @@ -236,17 +277,38 @@ public TSStatus write(IConsensusRequest request) { // in one transaction. synchronized (searchIndex) { logDispatcher.offer(indexedConsensusRequest); + // Deliver to subscription queues for real-time in-memory consumption. + // Offer AFTER stateMachine.write() so that InsertNode has inferred types + // and properly typed values (same timing as LogDispatcher). + final int sqCount = subscriptionQueueRegistry.size(); + if (sqCount > 0) { + subscriptionQueueRegistry.offer(indexedConsensusRequest); + } else { + // Log periodically when no subscription queues are registered + if (indexedConsensusRequest.getSearchIndex() % 50 == 0) { + logger.debug( + "write() no subscription queues registered, " + + "group={}, searchIndex={}, this={}", + consensusGroupId, + indexedConsensusRequest.getSearchIndex(), + System.identityHashCode(this)); + } + } searchIndex.incrementAndGet(); } + persistWriterMetaOnSuccess(indexedConsensusRequest); // statistic the time of offering request into queue ioTConsensusServerMetrics.recordOfferRequestToQueueTime( System.nanoTime() - writeToStateMachineEndTime); } else { logger.debug( - "{}: write operation failed. searchIndex: {}. Code: {}", + "write operation FAILED. 
group={}, searchIndex={}, code={}, " + + "subscriptionQueues={}, this={}", thisNode.getGroupId(), indexedConsensusRequest.getSearchIndex(), - result.getCode()); + result.getCode(), + subscriptionQueueRegistry.size(), + System.identityHashCode(this)); } // statistic the time of total write process ioTConsensusServerMetrics.recordConsensusWriteTime( @@ -435,7 +497,7 @@ public interface ThrowableFunction { public void inactivatePeer(Peer peer, boolean forDeletionPurpose) throws ConsensusGroupModifyPeerException { ConsensusGroupModifyPeerException lastException = null; - // In region migration, if the target node restarts before the “addRegionPeer” phase within 1 + // In region migration, if the target node restarts before the "addRegionPeer" phase within 1 // minutes, // the client in the ClientManager will become invalid. // This PR adds 1 retry at this point to ensure that region migration can still proceed @@ -659,6 +721,38 @@ private boolean isSuccess(TSStatus status) { return status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode(); } + public TSStatus syncSafeHlcToPeer( + final Peer targetPeer, + final int writerNodeId, + final long writerEpoch, + final long safePhysicalTime, + final long barrierLocalSeq) { + try (SyncIoTConsensusServiceClient client = + syncClientManager.borrowClient(targetPeer.getEndpoint())) { + final TSyncSafeHlcRes res = + client.syncSafeHlc( + new TSyncSafeHlcReq() + .setConsensusGroupId(thisNode.getGroupId().convertToTConsensusGroupId()) + .setWriterNodeId(writerNodeId) + .setWriterEpoch(writerEpoch) + .setSafePhysicalTime(safePhysicalTime) + .setBarrierLocalSeq(barrierLocalSeq)); + return res.getStatus(); + } catch (Exception e) { + logger.debug( + "Failed to sync safeHLC to peer {} for group {}, writer=({}, {}), safePt={}, barrier={}", + targetPeer, + consensusGroupId, + writerNodeId, + writerEpoch, + safePhysicalTime, + barrierLocalSeq, + e); + return new TSStatus(TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode()) + 
.setMessage(e.getMessage()); + } + } + /** build SyncLog channel with safeIndex as the default initial sync index. */ public void buildSyncLogChannel(Peer targetPeer, boolean startNow) { buildSyncLogChannel(targetPeer, getMinSyncIndex(), startNow); @@ -720,13 +814,152 @@ public IndexedConsensusRequest buildIndexedConsensusRequestForLocalRequest( new IoTProgressIndex(thisNode.getNodeId(), searchIndex.get() + 1); ((ComparableConsensusRequest) request).setProgressIndex(iotProgressIndex); } - return new IndexedConsensusRequest(searchIndex.get() + 1, Collections.singletonList(request)); + return new IndexedConsensusRequest(searchIndex.get() + 1, Collections.singletonList(request)) + .setPhysicalTime(assignPhysicalTimeInMs()) + .setNodeId(thisNode.getNodeId()) + .setWriterEpoch(currentWriterEpoch); } public IndexedConsensusRequest buildIndexedConsensusRequestForRemoteRequest( - long syncIndex, List requests) { - return new IndexedConsensusRequest( - ConsensusReqReader.DEFAULT_SEARCH_INDEX, syncIndex, requests); + long syncIndex, + long routingEpoch, + long physicalTime, + int nodeId, + long writerEpoch, + List requests) { + observePhysicalTimeLowerBound(physicalTime); + IndexedConsensusRequest req = + new IndexedConsensusRequest(ConsensusReqReader.DEFAULT_SEARCH_INDEX, syncIndex, requests); + req.setRoutingEpoch(routingEpoch); + req.setPhysicalTime(physicalTime); + req.setNodeId(nodeId); + req.setWriterEpoch(writerEpoch); + return req; + } + + public WriterSafeFrontierTracker.SafeHlc createIdleSafeHlcForCurrentWriter() { + final long safePhysicalTime = assignPhysicalTimeInMs(); + final long barrierLocalSeq = searchIndex.get(); + writerSafeFrontierTracker.recordAppliedProgress( + thisNode.getNodeId(), currentWriterEpoch, safePhysicalTime, barrierLocalSeq); + return new WriterSafeFrontierTracker.SafeHlc(safePhysicalTime, barrierLocalSeq); + } + + public void observeRemoteSafeHlc( + final int writerNodeId, + final long writerEpoch, + final long safePhysicalTime, + final 
long barrierLocalSeq) { + observePhysicalTimeLowerBound(safePhysicalTime); + writerSafeFrontierTracker.observePendingSafeHlc( + writerNodeId, writerEpoch, safePhysicalTime, barrierLocalSeq); + } + + public void recordRemoteAppliedWriterProgress( + final int writerNodeId, + final long writerEpoch, + final long physicalTime, + final long appliedLocalSeq) { + writerSafeFrontierTracker.recordAppliedProgress( + writerNodeId, writerEpoch, physicalTime, appliedLocalSeq); + } + + public long getEffectiveSafePhysicalTime(final int writerNodeId, final long writerEpoch) { + return writerSafeFrontierTracker.getEffectiveSafePt(writerNodeId, writerEpoch); + } + + public WriterSafeFrontierTracker getWriterSafeFrontierTracker() { + return writerSafeFrontierTracker; + } + + public boolean hasSubscriptionConsumers() { + return !subscriptionQueueRegistry.isEmpty(); + } + + private long assignPhysicalTimeInMs() { + while (true) { + final long previous = lastAssignedPhysicalTime.get(); + final long candidate = Math.max(System.currentTimeMillis(), previous); + if (lastAssignedPhysicalTime.compareAndSet(previous, candidate)) { + return candidate; + } + } + } + + private void observePhysicalTimeLowerBound(final long observedPhysicalTime) { + if (observedPhysicalTime <= 0) { + return; + } + while (true) { + final long previous = lastAssignedPhysicalTime.get(); + final long candidate = Math.max(previous, observedPhysicalTime); + if (candidate == previous || lastAssignedPhysicalTime.compareAndSet(previous, candidate)) { + return; + } + } + } + + private void initializeWriterMeta() { + final long recoveredSearchIndex = searchIndex.get(); + try { + final Optional writerMetaOptional = WriterMeta.load(writerMetaPath); + if (writerMetaOptional.isPresent()) { + final WriterMeta writerMeta = writerMetaOptional.get(); + if (recoveredSearchIndex >= writerMeta.getLastAllocatedLocalSeq()) { + currentWriterEpoch = writerMeta.getWriterEpoch(); + logger.info( + "Recovered writer meta for group {} from {}, 
writerEpoch={}, recoveredLocalSeq={}, " + + "persistedLocalSeq={}", + consensusGroupId, + writerMetaPath, + currentWriterEpoch, + recoveredSearchIndex, + writerMeta.getLastAllocatedLocalSeq()); + } else { + currentWriterEpoch = writerMeta.getWriterEpoch() + 1; + logger.warn( + "Recovered searchIndex {} is behind persisted writer localSeq {} for group {}. " + + "Starting a new writerEpoch {}.", + recoveredSearchIndex, + writerMeta.getLastAllocatedLocalSeq(), + consensusGroupId, + currentWriterEpoch); + } + lastAssignedPhysicalTime.set( + Math.max(writerMeta.getLastAssignedPhysicalTimeMs(), System.currentTimeMillis())); + return; + } + } catch (IOException e) { + logger.warn( + "Failed to load writer meta for group {} from {}. Starting with writerEpoch=1.", + consensusGroupId, + writerMetaPath, + e); + } + currentWriterEpoch = 1; + lastAssignedPhysicalTime.set(System.currentTimeMillis()); + logger.info( + "Initialized fresh writer meta for group {}, writerEpoch={}, recoveredLocalSeq={}", + consensusGroupId, + currentWriterEpoch, + recoveredSearchIndex); + } + + private void persistWriterMetaOnSuccess(final IndexedConsensusRequest indexedConsensusRequest) { + try { + new WriterMeta( + currentWriterEpoch, + indexedConsensusRequest.getLocalSeq(), + indexedConsensusRequest.getPhysicalTime()) + .persist(writerMetaPath); + } catch (IOException e) { + logger.warn( + "Failed to persist writer meta for group {} at localSeq={}, pt={}", + consensusGroupId, + indexedConsensusRequest.getLocalSeq(), + indexedConsensusRequest.getPhysicalTime(), + e); + } } /** @@ -757,6 +990,47 @@ public long getSearchIndex() { return searchIndex.get(); } + public long getCurrentWriterEpoch() { + return currentWriterEpoch; + } + + public ConsensusReqReader getConsensusReqReader() { + return consensusReqReader; + } + + /** + * Registers a subscription pending queue for real-time in-memory data delivery. 
When {@link + * #write(IConsensusRequest)} succeeds, the IndexedConsensusRequest is offered to all registered + * subscription queues, enabling subscription consumers to receive data without waiting for WAL + * flush. + * + * @param queue the blocking queue to receive IndexedConsensusRequest entries + */ + public void registerSubscriptionQueue( + final BlockingQueue queue, + final SubscriptionWalRetentionPolicy retentionPolicy) { + subscriptionQueueRegistry.register(queue, retentionPolicy); + // Immediately re-evaluate the safe delete index with new subscription awareness + checkAndUpdateSafeDeletedSearchIndex(); + logger.info( + "Registered subscription queue for group {}, " + + "total subscription queues: {}, currentSearchIndex={}, this={}", + consensusGroupId, + subscriptionQueueRegistry.size(), + searchIndex.get(), + System.identityHashCode(this)); + } + + public void deregisterSubscriptionQueue(final BlockingQueue queue) { + subscriptionQueueRegistry.unregister(queue); + // Re-evaluate: with fewer subscribers, more WAL may be deletable + checkAndUpdateSafeDeletedSearchIndex(); + logger.info( + "Deregistered subscription queue for group {}, remaining subscription queues: {}", + consensusGroupId, + subscriptionQueueRegistry.size()); + } + public long getSyncLag() { long minSyncIndex = getMinSyncIndex(); return getSearchIndex() - minSyncIndex; @@ -872,18 +1146,41 @@ void checkAndUpdateIndex() { } /** - * If there is only one replica, set it to Long.MAX_VALUE. If there are multiple replicas, get the - * latest SafelyDeletedSearchIndex again. This enables wal to be deleted in a timely manner. + * Computes and updates the safe-to-delete WAL search index based on replication progress and + * subscription WAL retention policy. + * + *

Because multiple subscription topics share one region WAL, the effective per-region + * retention policy is the most conservative policy across all active subscription queues on this + * region. Retention is applied at rolled WAL-file granularity and may therefore lag behind the + * configured thresholds. */ - void checkAndUpdateSafeDeletedSearchIndex() { + public void checkAndUpdateSafeDeletedSearchIndex() { if (configuration.isEmpty()) { logger.error( "Configuration is empty, which is unexpected. Safe deleted search index won't be updated this time."); - } else if (configuration.size() == 1) { + return; + } + + final boolean hasSubscriptions = !subscriptionQueueRegistry.isEmpty(); + + if (configuration.size() == 1 && !hasSubscriptions) { + // Single replica, no subscription consumers => delete all WAL freely consensusReqReader.setSafelyDeletedSearchIndex(Long.MAX_VALUE); - } else { - consensusReqReader.setSafelyDeletedSearchIndex(getMinFlushedSyncIndex()); + consensusReqReader.setSubscriptionRetainedMinVersionId(Long.MAX_VALUE); + return; } + + final long replicationIndex = + configuration.size() > 1 ? 
getMinFlushedSyncIndex() : Long.MAX_VALUE; + + final SubscriptionRetentionBound subscriptionRetentionBound = + subscriptionWalRetentionCalculator.calculate( + subscriptionQueueRegistry.getRetentionPolicies()); + + consensusReqReader.setSafelyDeletedSearchIndex( + Math.min(replicationIndex, subscriptionRetentionBound.getSafelyDeletedSearchIndex())); + consensusReqReader.setSubscriptionRetainedMinVersionId( + subscriptionRetentionBound.getRetainedMinVersionId()); } public void checkAndUpdateSearchIndex() { @@ -1019,6 +1316,14 @@ private TSStatus cacheAndInsertLatestNode(DeserializedBatchIndexedConsensusReque insertNode.markAsGeneratedByRemoteConsensusLeader(); subStatus.add(stateMachine.write(insertNode)); } + if (subStatus.stream() + .allMatch(status -> status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode())) { + recordRemoteAppliedWriterProgress( + request.getWriterNodeId(), + request.getWriterEpoch(), + request.getEndPhysicalTime(), + request.getEndSyncIndex()); + } long applyTime = System.nanoTime(); ioTConsensusServerMetrics.recordApplyCost(applyTime - sortTime); queueSortCondition.signalAll(); diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/SubscriptionWalRetentionPolicy.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/SubscriptionWalRetentionPolicy.java new file mode 100644 index 0000000000000..da32fbed49cc8 --- /dev/null +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/SubscriptionWalRetentionPolicy.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
/**
 * Immutable per-topic WAL retention policy for subscription consumers.
 *
 * <p>A policy bounds how much region WAL a subscription may force the region to retain, either by
 * total size ({@code retentionBytes}) or by age ({@code retentionMs}). A value of
 * {@link #UNBOUNDED} ({@code -1}) disables the corresponding limit. Because multiple topics share
 * one region WAL, the effective per-region policy is the most conservative across all registered
 * policies (applied by the retention calculator, not by this class).
 */
public class SubscriptionWalRetentionPolicy {

  /** Sentinel meaning "no limit" for either retention dimension. */
  public static final long UNBOUNDED = -1L;

  private final String topicName;
  private final long retentionBytes;
  private final long retentionMs;

  /**
   * @param topicName name of the subscription topic this policy belongs to
   * @param retentionBytes maximum WAL bytes to retain, or {@link #UNBOUNDED}
   * @param retentionMs maximum WAL age in milliseconds to retain, or {@link #UNBOUNDED}
   */
  public SubscriptionWalRetentionPolicy(
      final String topicName, final long retentionBytes, final long retentionMs) {
    this.topicName = topicName;
    this.retentionBytes = retentionBytes;
    this.retentionMs = retentionMs;
  }

  public String getTopicName() {
    return topicName;
  }

  public long getRetentionBytes() {
    return retentionBytes;
  }

  public long getRetentionMs() {
    return retentionMs;
  }

  /** @return {@code true} if no byte-size limit is configured */
  public boolean isBytesUnbounded() {
    return retentionBytes == UNBOUNDED;
  }

  /** @return {@code true} if no age limit is configured */
  public boolean isTimeUnbounded() {
    return retentionMs == UNBOUNDED;
  }

  // Added for diagnosability: retention policies appear in retention-recalculation logs.
  @Override
  public String toString() {
    return "SubscriptionWalRetentionPolicy{topicName="
        + topicName
        + ", retentionBytes="
        + retentionBytes
        + ", retentionMs="
        + retentionMs
        + '}';
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.consensus.iot; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; +import java.util.Optional; + +final class WriterMeta { + + private static final int FORMAT_VERSION = 1; + + private final long writerEpoch; + private final long lastAllocatedLocalSeq; + private final long lastAssignedPhysicalTimeMs; + + WriterMeta(long writerEpoch, long lastAllocatedLocalSeq, long lastAssignedPhysicalTimeMs) { + this.writerEpoch = writerEpoch; + this.lastAllocatedLocalSeq = lastAllocatedLocalSeq; + this.lastAssignedPhysicalTimeMs = lastAssignedPhysicalTimeMs; + } + + long getWriterEpoch() { + return writerEpoch; + } + + long getLastAllocatedLocalSeq() { + return lastAllocatedLocalSeq; + } + + long getLastAssignedPhysicalTimeMs() { + return lastAssignedPhysicalTimeMs; + } + + static Optional load(Path path) throws IOException { + if (!Files.exists(path)) { + return Optional.empty(); + } + try (InputStream inputStream = Files.newInputStream(path, StandardOpenOption.READ); + DataInputStream dataInputStream = new DataInputStream(inputStream)) { + final int version = dataInputStream.readInt(); + if (version != FORMAT_VERSION) { + throw new IOException( + String.format( + "Unsupported writer meta version %d in %s", version, path.toAbsolutePath())); + } + return 
Optional.of( + new WriterMeta( + dataInputStream.readLong(), dataInputStream.readLong(), dataInputStream.readLong())); + } + } + + void persist(Path path) throws IOException { + final Path parent = path.getParent(); + if (parent != null && !Files.exists(parent)) { + Files.createDirectories(parent); + } + final Path tempPath = + parent == null + ? Paths.get(path + ".tmp") + : parent.resolve(path.getFileName().toString() + ".tmp"); + try (OutputStream outputStream = + Files.newOutputStream( + tempPath, + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING, + StandardOpenOption.WRITE); + DataOutputStream dataOutputStream = new DataOutputStream(outputStream)) { + dataOutputStream.writeInt(FORMAT_VERSION); + dataOutputStream.writeLong(writerEpoch); + dataOutputStream.writeLong(lastAllocatedLocalSeq); + dataOutputStream.writeLong(lastAssignedPhysicalTimeMs); + dataOutputStream.flush(); + } + Files.move(tempPath, path, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); + } +} diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTracker.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTracker.java new file mode 100644 index 0000000000000..f48c258c4dd3a --- /dev/null +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTracker.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.consensus.iot; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +/** + * Tracks per-writer safe frontier on the receiving side. + * + *

Each writer keeps at most one pending safeHLC because generated safeHLC for the same writer is + * expected to be totally ordered by both safePt and barrierLocalSeq. + */ +public class WriterSafeFrontierTracker { + + private static final Logger LOGGER = LoggerFactory.getLogger(WriterSafeFrontierTracker.class); + + private final Map states = new HashMap<>(); + + public synchronized void recordAppliedProgress( + final int writerNodeId, + final long writerEpoch, + final long physicalTime, + final long appliedLocalSeq) { + final WriterIdentity writerIdentity = new WriterIdentity(writerNodeId, writerEpoch); + final WriterFrontierState state = + states.computeIfAbsent(writerIdentity, ignored -> new WriterFrontierState()); + state.appliedLocalSeq = Math.max(state.appliedLocalSeq, appliedLocalSeq); + if (physicalTime > 0) { + state.effectiveSafePt = Math.max(state.effectiveSafePt, physicalTime); + } + promotePendingIfReady(state); + } + + public synchronized void observePendingSafeHlc( + final int writerNodeId, + final long writerEpoch, + final long safePhysicalTime, + final long barrierLocalSeq) { + if (safePhysicalTime <= 0) { + return; + } + final WriterIdentity writerIdentity = new WriterIdentity(writerNodeId, writerEpoch); + final WriterFrontierState state = + states.computeIfAbsent(writerIdentity, ignored -> new WriterFrontierState()); + final SafeHlc candidate = new SafeHlc(safePhysicalTime, barrierLocalSeq); + if (state.appliedLocalSeq >= barrierLocalSeq) { + state.effectiveSafePt = Math.max(state.effectiveSafePt, safePhysicalTime); + state.pendingSafeHlc = null; + return; + } + if (state.pendingSafeHlc == null) { + state.pendingSafeHlc = candidate; + return; + } + final SafeHlc pending = state.pendingSafeHlc; + if (dominates(candidate, pending)) { + state.pendingSafeHlc = candidate; + return; + } + if (dominates(pending, candidate)) { + return; + } + LOGGER.warn( + "Observed incomparable safeHLC for writer {}. 
keep pending={}, ignore candidate={}", + writerIdentity, + pending, + candidate); + } + + public synchronized long getEffectiveSafePt(final int writerNodeId, final long writerEpoch) { + final WriterFrontierState state = states.get(new WriterIdentity(writerNodeId, writerEpoch)); + return Objects.nonNull(state) ? state.effectiveSafePt : 0L; + } + + public synchronized SafeHlc getPendingSafeHlc(final int writerNodeId, final long writerEpoch) { + final WriterFrontierState state = states.get(new WriterIdentity(writerNodeId, writerEpoch)); + return Objects.nonNull(state) ? state.pendingSafeHlc : null; + } + + public synchronized Map snapshotEffectiveSafePts() { + final Map snapshot = new HashMap<>(); + for (final Map.Entry entry : states.entrySet()) { + snapshot.put(entry.getKey(), entry.getValue().effectiveSafePt); + } + return Collections.unmodifiableMap(snapshot); + } + + private void promotePendingIfReady(final WriterFrontierState state) { + if (state.pendingSafeHlc == null) { + return; + } + if (state.appliedLocalSeq >= state.pendingSafeHlc.getBarrierLocalSeq()) { + state.effectiveSafePt = + Math.max(state.effectiveSafePt, state.pendingSafeHlc.getSafePhysicalTime()); + state.pendingSafeHlc = null; + } + } + + private static boolean dominates(final SafeHlc left, final SafeHlc right) { + return left.safePhysicalTime >= right.safePhysicalTime + && left.barrierLocalSeq >= right.barrierLocalSeq; + } + + public static final class WriterIdentity { + private final int writerNodeId; + private final long writerEpoch; + + public WriterIdentity(final int writerNodeId, final long writerEpoch) { + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + } + + public int getWriterNodeId() { + return writerNodeId; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof WriterIdentity)) { + return false; + } + final WriterIdentity that = 
(WriterIdentity) obj; + return writerNodeId == that.writerNodeId && writerEpoch == that.writerEpoch; + } + + @Override + public int hashCode() { + return Objects.hash(writerNodeId, writerEpoch); + } + + @Override + public String toString() { + return "WriterIdentity{" + + "writerNodeId=" + + writerNodeId + + ", writerEpoch=" + + writerEpoch + + '}'; + } + } + + public static final class SafeHlc { + private final long safePhysicalTime; + private final long barrierLocalSeq; + + public SafeHlc(final long safePhysicalTime, final long barrierLocalSeq) { + this.safePhysicalTime = safePhysicalTime; + this.barrierLocalSeq = barrierLocalSeq; + } + + public long getSafePhysicalTime() { + return safePhysicalTime; + } + + public long getBarrierLocalSeq() { + return barrierLocalSeq; + } + + @Override + public String toString() { + return "SafeHlc{" + + "safePhysicalTime=" + + safePhysicalTime + + ", barrierLocalSeq=" + + barrierLocalSeq + + '}'; + } + } + + private static final class WriterFrontierState { + private long appliedLocalSeq = 0L; + private long effectiveSafePt = 0L; + private SafeHlc pendingSafeHlc; + } +} diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/client/DispatchLogHandler.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/client/DispatchLogHandler.java index bb0326d7473e7..bd3650fc9c231 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/client/DispatchLogHandler.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/client/DispatchLogHandler.java @@ -71,12 +71,16 @@ public void onComplete(TSyncLogEntriesRes response) { .collect(Collectors.toList()); String messages = String.join(", ", retryStatusMessages); - logger.warn( - "Can not send {} to peer {} for {} times because {}", - batch, - thread.getPeer(), - ++retryCount, - messages); + if (++retryCount == 1) { + logger.warn("Can not send {} to peer {} because {}", batch, thread.getPeer(), messages); + } else 
{ + logger.debug( + "Can not send {} to peer {} for {} times because {}", + batch, + thread.getPeer(), + retryCount, + messages); + } sleepCorrespondingTimeAndRetryAsynchronous(); } else { if (logger.isDebugEnabled()) { @@ -105,14 +109,19 @@ public void onComplete(TSyncLogEntriesRes response) { public void onError(Exception exception) { ++retryCount; Throwable rootCause = ExceptionUtils.getRootCause(exception); - logger.warn( - "Can not send {} to peer for {} times {} because {}", - batch, - thread.getPeer(), - retryCount, - rootCause.toString()); + final Throwable actualCause = rootCause == null ? exception : rootCause; + if (retryCount == 1) { + logger.warn("Can not send {} to peer {} because {}", batch, thread.getPeer(), actualCause); + } else { + logger.debug( + "Can not send {} to peer for {} times {} because {}", + batch, + thread.getPeer(), + retryCount, + actualCause.toString()); + } // skip TApplicationException caused by follower - if (rootCause instanceof TApplicationException) { + if (actualCause instanceof TApplicationException) { completeBatch(batch); logger.warn("Skip retrying this Batch {} because of TApplicationException.", batch); logDispatcherThreadMetrics.recordSyncLogTimePerRequest(System.nanoTime() - createTime); diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/log/ConsensusReqReader.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/log/ConsensusReqReader.java index 6959b56b674d3..ab66ac4bc3316 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/log/ConsensusReqReader.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/log/ConsensusReqReader.java @@ -90,4 +90,73 @@ interface ReqIterator { /** Get total size of wal files. */ long getTotalSize(); + + /** + * Get disk usage of this specific WAL node (region-local), as opposed to {@link #getTotalSize()} + * which returns the global WAL disk usage across all WAL nodes. 
+ */ + default long getRegionDiskUsage() { + return getTotalSize(); + } + + /** + * Calculate the search index boundary that, if used as safelyDeletedSearchIndex, would free at + * least {@code bytesToFree} bytes of WAL files from the oldest files of this WAL node. + * + * @param bytesToFree the minimum number of bytes to free + * @return the startSearchIndex of the WAL file just after the freed range, or {@link + * #DEFAULT_SAFELY_DELETED_SEARCH_INDEX} if no files need to be freed + */ + default long getSearchIndexToFreeAtLeast(long bytesToFree) { + // Default implementation: if any freeing is needed, allow deleting everything. + return bytesToFree > 0 ? Long.MAX_VALUE : DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + + /** + * Set the minimum WAL file versionId that must be retained for subscription consumers. Files with + * versionId >= this value will not be deleted, regardless of their WALFileStatus. This protects + * Follower WAL files (CONTAINS_NONE_SEARCH_INDEX) from being deleted while subscriptions need + * them. + * + * @param minVersionId the minimum versionId to retain; Long.MAX_VALUE means no retention + */ + default void setSubscriptionRetainedMinVersionId(long minVersionId) { + // no-op by default + } + + /** + * Calculate the minimum WAL file versionId to retain such that freeing all files with versionId + * below that value would release at least {@code bytesToFree} bytes. + * + * @param bytesToFree the minimum number of bytes to free + * @return the versionId boundary; files with versionId < this can be freed + */ + default long getVersionIdToFreeAtLeast(long bytesToFree) { + return bytesToFree > 0 ? Long.MAX_VALUE : 0; + } + + /** + * Calculate the search index boundary that, if used as safelyDeletedSearchIndex, would free the + * oldest rolled WAL files whose lastModified time is earlier than {@code cutoffTimeMs}. The + * currently written WAL file is never considered deletable by this method. 
+ * + * @param cutoffTimeMs files strictly older than this timestamp may be freed + * @return the search index boundary of the first retained file, or Long.MIN_VALUE + 1 when no + * rolled WAL files are old enough to be freed + */ + default long getSearchIndexToFreeBeforeTimestamp(long cutoffTimeMs) { + return Long.MIN_VALUE + 1; + } + + /** + * Calculate the minimum retained WAL versionId after freeing the oldest rolled WAL files whose + * lastModified time is earlier than {@code cutoffTimeMs}. + * + * @param cutoffTimeMs files strictly older than this timestamp may be freed + * @return the versionId boundary of the first retained file, or 0 when no rolled WAL files are + * old enough to be freed + */ + default long getVersionIdToFreeBeforeTimestamp(long cutoffTimeMs) { + return 0; + } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java index 374691bf38bf1..78602692be300 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java @@ -20,14 +20,17 @@ package org.apache.iotdb.consensus.iot.logdispatcher; import org.apache.iotdb.common.rpc.thrift.TEndPoint; +import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.client.IClientManager; import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.service.metric.MetricService; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; import org.apache.iotdb.consensus.common.Peer; import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; import org.apache.iotdb.consensus.config.IoTConsensusConfig; import 
org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.WriterSafeFrontierTracker; import org.apache.iotdb.consensus.iot.client.AsyncIoTConsensusServiceClient; import org.apache.iotdb.consensus.iot.client.DispatchLogHandler; import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; @@ -167,16 +170,17 @@ public synchronized OptionalLong getMinFlushedSyncIndex() { return threads.stream().mapToLong(LogDispatcherThread::getLastFlushedSyncIndex).min(); } - public void checkAndFlushIndex() { + public synchronized void checkAndFlushIndex() { if (!threads.isEmpty()) { threads.forEach( thread -> { IndexController controller = thread.getController(); controller.update(controller.getCurrentIndex(), true); }); - // do not set SafelyDeletedSearchIndex as it is Long.MAX_VALUE when replica is 1 - reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex()); } + // Single-replica regions do not have dispatcher threads, but still need periodic retention + // recalculation for subscription-aware WAL cleanup. 
+ impl.checkAndUpdateSafeDeletedSearchIndex(); } public void offer(IndexedConsensusRequest request) { @@ -213,7 +217,7 @@ public long getLogEntriesFromQueue() { public class LogDispatcherThread implements Runnable { - private static final long PENDING_REQUEST_TAKING_TIME_OUT_IN_SEC = 10; + private static final long PENDING_REQUEST_TAKING_TIME_OUT_IN_MS = 10_000L; private static final long START_INDEX = 1; private final IoTConsensusConfig config; private final Peer peer; @@ -236,6 +240,7 @@ public class LogDispatcherThread implements Runnable { private final LogDispatcherThreadMetrics logDispatcherThreadMetrics; private final CountDownLatch runFinished = new CountDownLatch(1); + private volatile long lastIdleSafeHlcSentTimeMs = 0L; public LogDispatcherThread(Peer peer, IoTConsensusConfig config, long initialSyncIndex) { this.peer = peer; @@ -354,9 +359,10 @@ public void run() { while (!Thread.interrupted() && !stopped) { long startTime = System.nanoTime(); while ((batch = getBatch()).isEmpty()) { + maybeSendIdleSafeHlc(); // we may block here if there is no requests in the queue IndexedConsensusRequest request = - pendingEntries.poll(PENDING_REQUEST_TAKING_TIME_OUT_IN_SEC, TimeUnit.SECONDS); + pendingEntries.poll(calculateIdlePollTimeoutInMs(), TimeUnit.MILLISECONDS); if (request != null) { bufferedEntries.add(request); // If write pressure is low, we simply sleep a little to reduce the number of RPC @@ -364,6 +370,8 @@ public void run() { && bufferedEntries.isEmpty()) { Thread.sleep(config.getReplication().getMaxWaitingTimeForAccumulatingBatchInMs()); } + } else { + maybeSendIdleSafeHlc(); } // Immediately check for interrupts after poll and sleep if (Thread.interrupted() || stopped) { @@ -397,8 +405,9 @@ public void updateSafelyDeletedSearchIndex() { // indicating that insert nodes whose search index are before this value can be deleted // safely. // - // Use minFlushedSyncIndex here to reserve the WAL which are not flushed and support kill -9. 
- reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex()); + // Use subscription-aware safe-delete to avoid deleting WAL entries + // still needed by subscription consumers. + impl.checkAndUpdateSafeDeletedSearchIndex(); // notify if (impl.unblockWrite()) { impl.signal(); @@ -406,6 +415,7 @@ public void updateSafelyDeletedSearchIndex() { } public Batch getBatch() { + long startIndex = syncStatus.getNextSendingIndex(); long maxIndex; synchronized (impl.getIndexObject()) { @@ -504,6 +514,56 @@ public Batch getBatch() { return batches; } + private void maybeSendIdleSafeHlc() { + if (!shouldSendIdleSafeHlc()) { + return; + } + final long now = System.currentTimeMillis(); + if (now - lastIdleSafeHlcSentTimeMs + < SubscriptionConfig.getInstance().getSubscriptionConsensusIdleSafeHlcIntervalMs()) { + return; + } + final WriterSafeFrontierTracker.SafeHlc safeHlc = impl.createIdleSafeHlcForCurrentWriter(); + final TSStatus status = + impl.syncSafeHlcToPeer( + peer, + impl.getThisNode().getNodeId(), + impl.getCurrentWriterEpoch(), + safeHlc.getSafePhysicalTime(), + safeHlc.getBarrierLocalSeq()); + if (status.getCode() == org.apache.iotdb.rpc.TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + lastIdleSafeHlcSentTimeMs = now; + } else { + logger.debug( + "{}: Failed to send idle safeHLC to {}. 
status={}", + impl.getThisNode().getGroupId(), + peer, + status); + } + } + + private long calculateIdlePollTimeoutInMs() { + if (!shouldSendIdleSafeHlc()) { + return PENDING_REQUEST_TAKING_TIME_OUT_IN_MS; + } + final long elapsedSinceLastIdleSafeHlcMs = + System.currentTimeMillis() - lastIdleSafeHlcSentTimeMs; + final long untilNextIdleSafeHlcMs = + Math.max( + 1L, + SubscriptionConfig.getInstance().getSubscriptionConsensusIdleSafeHlcIntervalMs() + - elapsedSinceLastIdleSafeHlcMs); + return Math.min(PENDING_REQUEST_TAKING_TIME_OUT_IN_MS, untilNextIdleSafeHlcMs); + } + + private boolean shouldSendIdleSafeHlc() { + return impl.hasSubscriptionConsumers() + && pendingEntries.isEmpty() + && bufferedEntries.isEmpty() + && !syncStatus.hasPendingBatches() + && syncStatus.getNextSendingIndex() > impl.getSearchIndex(); + } + public void sendBatchAsync(Batch batch, DispatchLogHandler handler) { try { AsyncIoTConsensusServiceClient client = clientManager.borrowClient(peer.getEndpoint()); @@ -565,9 +625,13 @@ private boolean constructBatchFromWAL(long currentIndex, long maxIndex, Batch lo targetIndex = data.getSearchIndex() + 1; data.buildSerializedRequests(); // construct request from wal - logBatches.addTLogEntry( + TLogEntry logEntry = new TLogEntry( - data.getSerializedRequests(), data.getSearchIndex(), true, data.getMemorySize())); + data.getSerializedRequests(), data.getSearchIndex(), true, data.getMemorySize()); + logEntry.setRoutingEpoch(data.getRoutingEpoch()); + logEntry.setPhysicalTime(data.getPhysicalTime()); + logEntry.setWriterEpoch(writerEpochToShort(data.getWriterEpoch())); + logBatches.addTLogEntry(logEntry); } // In the case of corrupt Data, we return true so that we can send a batch as soon as // possible, avoiding potential duplication @@ -576,12 +640,16 @@ private boolean constructBatchFromWAL(long currentIndex, long maxIndex, Batch lo private void constructBatchIndexedFromConsensusRequest( IndexedConsensusRequest request, Batch logBatches) { - 
logBatches.addTLogEntry( + TLogEntry logEntry = new TLogEntry( request.getSerializedRequests(), request.getSearchIndex(), false, - request.getMemorySize())); + request.getMemorySize()); + logEntry.setRoutingEpoch(request.getRoutingEpoch()); + logEntry.setPhysicalTime(request.getPhysicalTime()); + logEntry.setWriterEpoch(writerEpochToShort(request.getWriterEpoch())); + logBatches.addTLogEntry(logEntry); } } @@ -592,4 +660,11 @@ public static AtomicLong getReceiverMemSizeSum() { public static AtomicLong getSenderMemSizeSum() { return senderMemSizeSum; } + + private static short writerEpochToShort(long writerEpoch) { + if (writerEpoch < Short.MIN_VALUE || writerEpoch > Short.MAX_VALUE) { + throw new IllegalArgumentException("writerEpoch exceeds short range: " + writerEpoch); + } + return (short) writerEpoch; + } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/SyncStatus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/SyncStatus.java index accc9f7667d21..35304b82406c1 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/SyncStatus.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/SyncStatus.java @@ -109,4 +109,8 @@ public synchronized long getNextSendingIndex() { public synchronized List getPendingBatches() { return pendingBatches; } + + public synchronized boolean hasPendingBatches() { + return !pendingBatches.isEmpty(); + } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/service/IoTConsensusRPCServiceProcessor.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/service/IoTConsensusRPCServiceProcessor.java index 71c14aebaa139..51571ffa0c9ad 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/service/IoTConsensusRPCServiceProcessor.java +++ 
b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/service/IoTConsensusRPCServiceProcessor.java @@ -48,6 +48,8 @@ import org.apache.iotdb.consensus.iot.thrift.TSendSnapshotFragmentRes; import org.apache.iotdb.consensus.iot.thrift.TSyncLogEntriesReq; import org.apache.iotdb.consensus.iot.thrift.TSyncLogEntriesRes; +import org.apache.iotdb.consensus.iot.thrift.TSyncSafeHlcReq; +import org.apache.iotdb.consensus.iot.thrift.TSyncSafeHlcRes; import org.apache.iotdb.consensus.iot.thrift.TTriggerSnapshotLoadReq; import org.apache.iotdb.consensus.iot.thrift.TTriggerSnapshotLoadRes; import org.apache.iotdb.consensus.iot.thrift.TWaitReleaseAllRegionRelatedResourceReq; @@ -107,11 +109,19 @@ public TSyncLogEntriesRes syncLogEntries(TSyncLogEntriesReq req) { } BatchIndexedConsensusRequest logEntriesInThisBatch = new BatchIndexedConsensusRequest(req.peerId); + final int sourceNodeId = req.peerId; // We use synchronized to ensure atomicity of executing multiple logs for (TLogEntry entry : req.getLogEntries()) { + long routingEpoch = entry.isSetRoutingEpoch() ? entry.getRoutingEpoch() : 0L; + long physicalTime = entry.isSetPhysicalTime() ? entry.getPhysicalTime() : 0L; + long writerEpoch = entry.isSetWriterEpoch() ? 
entry.getWriterEpoch() : 0L; logEntriesInThisBatch.add( impl.buildIndexedConsensusRequestForRemoteRequest( entry.getSearchIndex(), + routingEpoch, + physicalTime, + sourceNodeId, + writerEpoch, entry.getData().stream() .map( entry.isFromWAL() @@ -133,6 +143,28 @@ public TSyncLogEntriesRes syncLogEntries(TSyncLogEntriesReq req) { .setReceiverMemSize(deserializedRequest.getMemorySize()); } + @Override + public TSyncSafeHlcRes syncSafeHlc(final TSyncSafeHlcReq req) { + final ConsensusGroupId groupId = + ConsensusGroupId.Factory.createFromTConsensusGroupId(req.getConsensusGroupId()); + final IoTConsensusServerImpl impl = consensus.getImpl(groupId); + if (impl == null) { + final String message = + String.format("unexpected consensusGroupId %s for TSyncSafeHlcReq", groupId); + LOGGER.error(message); + final TSStatus status = new TSStatus(TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode()); + status.setMessage(message); + return new TSyncSafeHlcRes().setStatus(status); + } + impl.observeRemoteSafeHlc( + req.getWriterNodeId(), + req.getWriterEpoch(), + req.getSafePhysicalTime(), + req.getBarrierLocalSeq()); + return new TSyncSafeHlcRes() + .setStatus(new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode())); + } + @Override public TInactivatePeerRes inactivatePeer(TInactivatePeerReq req) throws TException { if (req.isForDeletionPurpose()) { diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/subscription/SubscriptionQueueRegistry.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/subscription/SubscriptionQueueRegistry.java new file mode 100644 index 0000000000000..c61f2f9479dea --- /dev/null +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/subscription/SubscriptionQueueRegistry.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.consensus.iot.subscription; + +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.consensus.iot.SubscriptionWalRetentionPolicy; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collection; +import java.util.Map; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +public class SubscriptionQueueRegistry { + + private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionQueueRegistry.class); + + private static final long QUEUE_FULL_LOG_INTERVAL_MS = TimeUnit.SECONDS.toMillis(10); + + private final String consensusGroupId; + private final Map, SubscriptionWalRetentionPolicy> queues = + new ConcurrentHashMap<>(); + private final AtomicLong droppedEntries = new AtomicLong(); + private final AtomicLong lastDropLogTimeMs = new AtomicLong(); + + public SubscriptionQueueRegistry(final String consensusGroupId) { + this.consensusGroupId = consensusGroupId; + } + + public void register( + final BlockingQueue queue, + final SubscriptionWalRetentionPolicy retentionPolicy) { + queues.put(queue, retentionPolicy); + } + + public void 
unregister(final BlockingQueue queue) { + queues.remove(queue); + } + + public boolean isEmpty() { + return queues.isEmpty(); + } + + public int size() { + return queues.size(); + } + + public Collection getRetentionPolicies() { + return queues.values(); + } + + public void offer(final IndexedConsensusRequest indexedConsensusRequest) { + final int queueCount = queues.size(); + if (queueCount <= 0) { + return; + } + + LOGGER.debug( + "write() offering to {} subscription queue(s), group={}, searchIndex={}, requestType={}", + queueCount, + consensusGroupId, + indexedConsensusRequest.getSearchIndex(), + indexedConsensusRequest.getRequests().isEmpty() + ? "EMPTY" + : indexedConsensusRequest.getRequests().get(0).getClass().getSimpleName()); + + for (final BlockingQueue queue : queues.keySet()) { + final boolean offered = queue.offer(indexedConsensusRequest); + LOGGER.debug( + "offer result={}, queueSize={}, queueRemaining={}", + offered, + queue.size(), + queue.remainingCapacity()); + if (!offered) { + final long droppedCount = droppedEntries.incrementAndGet(); + final long now = System.currentTimeMillis(); + final long lastLogTime = lastDropLogTimeMs.get(); + if (now - lastLogTime >= QUEUE_FULL_LOG_INTERVAL_MS + && lastDropLogTimeMs.compareAndSet(lastLogTime, now)) { + LOGGER.warn( + "Subscription queue full, dropped {} entry(s) in the last {} ms, latest " + + "searchIndex={}, queueSize={}, queueRemaining={}", + droppedEntries.getAndSet(0), + QUEUE_FULL_LOG_INTERVAL_MS, + indexedConsensusRequest.getSearchIndex(), + queue.size(), + queue.remainingCapacity()); + } else { + LOGGER.debug( + "Subscription queue full, dropped entry searchIndex={}, droppedCount={}", + indexedConsensusRequest.getSearchIndex(), + droppedCount); + } + } + } + } +} diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/subscription/SubscriptionWalRetentionCalculator.java 
b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/subscription/SubscriptionWalRetentionCalculator.java new file mode 100644 index 0000000000000..e3caa892f131c --- /dev/null +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/subscription/SubscriptionWalRetentionCalculator.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.consensus.iot.subscription; + +import org.apache.iotdb.consensus.iot.SubscriptionWalRetentionPolicy; +import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; + +import java.util.Collection; + +public class SubscriptionWalRetentionCalculator { + + public static final class SubscriptionRetentionBound { + + private static final long RETAIN_ALL_SEARCH_INDEX = Long.MIN_VALUE + 1; + private static final long RETAIN_ALL_VERSION_ID = 0L; + + private final long safelyDeletedSearchIndex; + private final long retainedMinVersionId; + + private SubscriptionRetentionBound( + final long safelyDeletedSearchIndex, final long retainedMinVersionId) { + this.safelyDeletedSearchIndex = safelyDeletedSearchIndex; + this.retainedMinVersionId = retainedMinVersionId; + } + + private static SubscriptionRetentionBound noConstraint() { + return new SubscriptionRetentionBound(Long.MAX_VALUE, Long.MAX_VALUE); + } + + private static SubscriptionRetentionBound retainAll() { + return new SubscriptionRetentionBound(RETAIN_ALL_SEARCH_INDEX, RETAIN_ALL_VERSION_ID); + } + + private static SubscriptionRetentionBound of( + final long safelyDeletedSearchIndex, final long retainedMinVersionId) { + return new SubscriptionRetentionBound(safelyDeletedSearchIndex, retainedMinVersionId); + } + + private SubscriptionRetentionBound mergeDeleteEither(final SubscriptionRetentionBound other) { + return new SubscriptionRetentionBound( + Math.max(safelyDeletedSearchIndex, other.safelyDeletedSearchIndex), + Math.max(retainedMinVersionId, other.retainedMinVersionId)); + } + + private SubscriptionRetentionBound mergeDeleteOnlyIfBoth( + final SubscriptionRetentionBound other) { + return new SubscriptionRetentionBound( + Math.min(safelyDeletedSearchIndex, other.safelyDeletedSearchIndex), + Math.min(retainedMinVersionId, other.retainedMinVersionId)); + } + + public long getSafelyDeletedSearchIndex() { + return safelyDeletedSearchIndex; + } + + public long getRetainedMinVersionId() { + 
return retainedMinVersionId; + } + } + + private final ConsensusReqReader consensusReqReader; + + public SubscriptionWalRetentionCalculator(final ConsensusReqReader consensusReqReader) { + this.consensusReqReader = consensusReqReader; + } + + public SubscriptionRetentionBound calculate( + final Collection retentionPolicies) { + SubscriptionRetentionBound mergedBound = SubscriptionRetentionBound.noConstraint(); + for (final SubscriptionWalRetentionPolicy policy : retentionPolicies) { + // For each topic, data can be deleted once either its size retention or its time retention + // allows reclamation. Across topics sharing the same region WAL, we can only delete data + // that every active topic already allows us to reclaim. + final SubscriptionRetentionBound perQueueBound = + buildSizeRetentionBound(policy.getRetentionBytes()) + .mergeDeleteEither(buildTimeRetentionBound(policy.getRetentionMs())); + mergedBound = mergedBound.mergeDeleteOnlyIfBoth(perQueueBound); + } + return mergedBound; + } + + private SubscriptionRetentionBound buildSizeRetentionBound(final long retentionSizeLimit) { + if (retentionSizeLimit == SubscriptionWalRetentionPolicy.UNBOUNDED || retentionSizeLimit <= 0) { + return SubscriptionRetentionBound.retainAll(); + } + + final long regionWalSize = consensusReqReader.getRegionDiskUsage(); + if (regionWalSize <= retentionSizeLimit) { + return SubscriptionRetentionBound.retainAll(); + } + + final long excess = regionWalSize - retentionSizeLimit; + return SubscriptionRetentionBound.of( + consensusReqReader.getSearchIndexToFreeAtLeast(excess), + consensusReqReader.getVersionIdToFreeAtLeast(excess)); + } + + private SubscriptionRetentionBound buildTimeRetentionBound(final long retentionMs) { + if (retentionMs == SubscriptionWalRetentionPolicy.UNBOUNDED || retentionMs <= 0) { + return SubscriptionRetentionBound.retainAll(); + } + + final long cutoffTimeMs = System.currentTimeMillis() - retentionMs; + return SubscriptionRetentionBound.of( + 
consensusReqReader.getSearchIndexToFreeBeforeTimestamp(cutoffTimeMs), + consensusReqReader.getVersionIdToFreeBeforeTimestamp(cutoffTimeMs)); + } +} diff --git a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTrackerTest.java b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTrackerTest.java new file mode 100644 index 0000000000000..a368750cc7916 --- /dev/null +++ b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTrackerTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.consensus.iot; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +public class WriterSafeFrontierTrackerTest { + + @Test + public void testPendingSafeHlcPromotesWhenBarrierIsApplied() { + final WriterSafeFrontierTracker tracker = new WriterSafeFrontierTracker(); + + tracker.recordAppliedProgress(7, 2L, 100L, 10L); + assertEquals(100L, tracker.getEffectiveSafePt(7, 2L)); + + tracker.observePendingSafeHlc(7, 2L, 130L, 20L); + assertEquals(100L, tracker.getEffectiveSafePt(7, 2L)); + assertEquals(130L, tracker.getPendingSafeHlc(7, 2L).getSafePhysicalTime()); + + tracker.recordAppliedProgress(7, 2L, 125L, 19L); + assertEquals(125L, tracker.getEffectiveSafePt(7, 2L)); + + tracker.recordAppliedProgress(7, 2L, 126L, 20L); + assertEquals(130L, tracker.getEffectiveSafePt(7, 2L)); + assertNull(tracker.getPendingSafeHlc(7, 2L)); + } + + @Test + public void testSameWriterKeepsOnlyNewestPendingSafeHlc() { + final WriterSafeFrontierTracker tracker = new WriterSafeFrontierTracker(); + + tracker.observePendingSafeHlc(9, 3L, 200L, 30L); + tracker.observePendingSafeHlc(9, 3L, 220L, 35L); + + assertEquals(220L, tracker.getPendingSafeHlc(9, 3L).getSafePhysicalTime()); + assertEquals(35L, tracker.getPendingSafeHlc(9, 3L).getBarrierLocalSeq()); + + tracker.observePendingSafeHlc(9, 3L, 210L, 32L); + assertEquals(220L, tracker.getPendingSafeHlc(9, 3L).getSafePhysicalTime()); + assertEquals(35L, tracker.getPendingSafeHlc(9, 3L).getBarrierLocalSeq()); + } +} diff --git a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/FakeConsensusReqReader.java b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/FakeConsensusReqReader.java index 733df885e48fe..76e6e798c0f11 100644 --- a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/FakeConsensusReqReader.java +++ 
b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/FakeConsensusReqReader.java @@ -57,6 +57,26 @@ public long getTotalSize() { return 0; } + @Override + public long getRegionDiskUsage() { + return 0; + } + + @Override + public long getSearchIndexToFreeAtLeast(long bytesToFree) { + return bytesToFree > 0 ? Long.MAX_VALUE : DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + + @Override + public long getSearchIndexToFreeBeforeTimestamp(long cutoffTimeMs) { + return Long.MIN_VALUE + 1; + } + + @Override + public long getVersionIdToFreeBeforeTimestamp(long cutoffTimeMs) { + return 0; + } + private class FakeConsensusReqIterator implements ConsensusReqReader.ReqIterator { private long nextSearchIndex; diff --git a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/TestStateMachine.java b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/TestStateMachine.java index a515010a3497a..9aa27d79ff645 100644 --- a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/TestStateMachine.java +++ b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/TestStateMachine.java @@ -97,11 +97,18 @@ public TSStatus write(IConsensusRequest request) { public IConsensusRequest deserializeRequest(IConsensusRequest request) { if (request instanceof BatchIndexedConsensusRequest) { BatchIndexedConsensusRequest consensusRequest = (BatchIndexedConsensusRequest) request; + final IndexedConsensusRequest lastIndexedRequest = + consensusRequest.getRequests().isEmpty() + ? null + : consensusRequest.getRequests().get(consensusRequest.getRequests().size() - 1); DeserializedBatchIndexedConsensusRequest result = new DeserializedBatchIndexedConsensusRequest( consensusRequest.getStartSyncIndex(), consensusRequest.getEndSyncIndex(), - consensusRequest.getRequests().size()); + consensusRequest.getRequests().size(), + consensusRequest.getSourcePeerId(), + lastIndexedRequest != null ? 
lastIndexedRequest.getWriterEpoch() : 0L, + lastIndexedRequest != null ? lastIndexedRequest.getPhysicalTime() : 0L); for (IndexedConsensusRequest r : consensusRequest.getRequests()) { result.add(r); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java index b221d8810a5ed..21ed712961fa0 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java @@ -2206,6 +2206,11 @@ public synchronized void loadHotModifiedProps(TrimProperties properties) conf.setWALCompressionAlgorithm( enableWALCompression ? CompressionType.LZ4 : CompressionType.UNCOMPRESSED); + // update subscription consensus config: + // - batching properties take effect on running queues because they are read dynamically + // - retention defaults only affect queues created after reload + commonDescriptor.loadHotModifiedSubscriptionConsensusProps(properties); + // update Consensus config reloadConsensusProps(properties); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java index 8b3eb5ffd2fe4..e9398043c6b10 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java @@ -158,6 +158,10 @@ private static ConsensusConfig buildConsensusConfig() { .setMaxMemoryRatioForQueue(CONF.getMaxMemoryRatioForQueue()) .setRegionMigrationSpeedLimitBytesPerSecond( CONF.getRegionMigrationSpeedLimitBytesPerSecond()) + .setSubscriptionWalRetentionSizeInBytes( + COMMON_CONF.getSubscriptionConsensusWalRetentionSizeInBytes()) + .setSubscriptionWalRetentionTimeMs( + 
COMMON_CONF.getSubscriptionConsensusWalRetentionTimeMs()) .build()) .build()) .setIoTConsensusV2Config( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataExecutionVisitor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataExecutionVisitor.java index e0184b8595d23..f97c0194425f6 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataExecutionVisitor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataExecutionVisitor.java @@ -75,7 +75,7 @@ public TSStatus visitRelationalInsertRows(RelationalInsertRowsNode node, DataReg public TSStatus visitInsertRow(InsertRowNode node, DataRegion dataRegion) { try { dataRegion.insert(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (OutOfTTLException e) { LOGGER.warn("Error in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -99,7 +99,7 @@ public TSStatus visitRelationalInsertTablet( public TSStatus visitInsertTablet(final InsertTabletNode node, final DataRegion dataRegion) { try { dataRegion.insertTablet(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (final OutOfTTLException e) { LOGGER.debug("Error in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -136,7 +136,7 @@ public TSStatus visitInsertTablet(final InsertTabletNode node, final DataRegion public TSStatus visitInsertRows(InsertRowsNode node, DataRegion dataRegion) { try { dataRegion.insert(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (WriteProcessRejectException e) { LOGGER.warn("Reject in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -173,7 +173,7 @@ public TSStatus visitInsertRows(InsertRowsNode node, DataRegion 
dataRegion) { public TSStatus visitInsertMultiTablets(InsertMultiTabletsNode node, DataRegion dataRegion) { try { dataRegion.insertTablets(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (WriteProcessRejectException e) { LOGGER.warn("Reject in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -208,7 +208,7 @@ public TSStatus visitInsertRowsOfOneDevice( InsertRowsOfOneDeviceNode node, DataRegion dataRegion) { try { dataRegion.insert(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (WriteProcessRejectException e) { LOGGER.warn("Reject in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -264,7 +264,7 @@ public TSStatus visitDeleteData(DeleteDataNode node, DataRegion dataRegion) { dataRegion.deleteByDevice(path, node); } } - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (IOException | IllegalPathException e) { LOGGER.error("Error in executing plan node: {}", node, e); @@ -279,7 +279,7 @@ public TSStatus visitDeleteData( final RelationalDeleteDataNode node, final DataRegion dataRegion) { try { dataRegion.deleteByTable(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (final IOException e) { LOGGER.error("Error in executing plan node: {}", node, e); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataRegionStateMachine.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataRegionStateMachine.java index 5fa375406b896..edafc3d597b5f 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataRegionStateMachine.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataRegionStateMachine.java @@ -154,6 
+154,10 @@ protected PlanNode grabPlanNode(IndexedConsensusRequest indexedRequest) { PlanNode planNode = getPlanNode(req); if (planNode instanceof SearchNode) { ((SearchNode) planNode).setSearchIndex(indexedRequest.getSearchIndex()); + ((SearchNode) planNode).setPhysicalTime(indexedRequest.getPhysicalTime()); + ((SearchNode) planNode).setNodeId(indexedRequest.getNodeId()); + ((SearchNode) planNode).setWriterEpoch(indexedRequest.getWriterEpoch()); + ((SearchNode) planNode).setSyncIndex(indexedRequest.getSyncIndex()); searchNodes.add((SearchNode) planNode); } else { logger.warn("Unexpected PlanNode type {}, which is not SearchNode", planNode.getClass()); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/IoTConsensusDataRegionStateMachine.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/IoTConsensusDataRegionStateMachine.java index 240c1b1caa0fe..a835335aa81b2 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/IoTConsensusDataRegionStateMachine.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/IoTConsensusDataRegionStateMachine.java @@ -82,11 +82,18 @@ public IConsensusRequest deserializeRequest(IConsensusRequest request) { result = grabPlanNode(indexedRequest); } else if (request instanceof BatchIndexedConsensusRequest) { BatchIndexedConsensusRequest batchRequest = (BatchIndexedConsensusRequest) request; + final IndexedConsensusRequest lastIndexedRequest = + batchRequest.getRequests().isEmpty() + ? 
null + : batchRequest.getRequests().get(batchRequest.getRequests().size() - 1); DeserializedBatchIndexedConsensusRequest deserializedRequest = new DeserializedBatchIndexedConsensusRequest( batchRequest.getStartSyncIndex(), batchRequest.getEndSyncIndex(), - batchRequest.getRequests().size()); + batchRequest.getRequests().size(), + batchRequest.getSourcePeerId(), + lastIndexedRequest != null ? lastIndexedRequest.getWriterEpoch() : 0L, + lastIndexedRequest != null ? lastIndexedRequest.getPhysicalTime() : 0L); for (IndexedConsensusRequest indexedRequest : batchRequest.getRequests()) { final PlanNode planNode = grabPlanNode(indexedRequest); if (planNode instanceof ComparableConsensusRequest) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/agent/task/execution/PipeSubtaskExecutorManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/agent/task/execution/PipeSubtaskExecutorManager.java index 45f86a4706c0e..f83c23871f516 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/agent/task/execution/PipeSubtaskExecutorManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/agent/task/execution/PipeSubtaskExecutorManager.java @@ -33,7 +33,7 @@ public class PipeSubtaskExecutorManager { private final PipeProcessorSubtaskExecutor processorExecutor; private final Supplier connectorExecutorSupplier; - private final SubscriptionSubtaskExecutor subscriptionExecutor; + private volatile SubscriptionSubtaskExecutor subscriptionExecutor; public PipeProcessorSubtaskExecutor getProcessorExecutor() { return processorExecutor; @@ -49,6 +49,7 @@ public IoTConsensusV2SubtaskExecutor getConsensusExecutor() { } public SubscriptionSubtaskExecutor getSubscriptionExecutor() { + ensureSubscriptionExecutors(); return subscriptionExecutor; } @@ -57,15 +58,28 @@ public SubscriptionSubtaskExecutor getSubscriptionExecutor() { private PipeSubtaskExecutorManager() { processorExecutor = new PipeProcessorSubtaskExecutor(); 
connectorExecutorSupplier = PipeSinkSubtaskExecutor::new; - subscriptionExecutor = - SubscriptionConfig.getInstance().getSubscriptionEnabled() - ? new SubscriptionSubtaskExecutor() - : null; + ensureSubscriptionExecutors(); // IoTV2 uses global singleton executor pool. IoTV2GlobalComponentContainer.getInstance() .setConsensusExecutor(new IoTConsensusV2SubtaskExecutor()); } + public synchronized void ensureSubscriptionExecutors() { + if (!SubscriptionConfig.getInstance().getSubscriptionEnabled()) { + return; + } + if (subscriptionExecutor == null || subscriptionExecutor.isShutdown()) { + subscriptionExecutor = new SubscriptionSubtaskExecutor(); + } + } + + public synchronized void shutdownSubscriptionExecutors() { + if (subscriptionExecutor != null) { + subscriptionExecutor.shutdown(); + subscriptionExecutor = null; + } + } + private static class PipeTaskExecutorHolder { private static PipeSubtaskExecutorManager instance = null; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java index e2c04caedfb20..e0dce94b1dda7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java @@ -117,6 +117,8 @@ import org.apache.iotdb.confignode.rpc.thrift.TGetAllTemplatesResp; import org.apache.iotdb.confignode.rpc.thrift.TGetAllTopicInfoResp; import org.apache.iotdb.confignode.rpc.thrift.TGetClusterIdResp; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressReq; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDataNodeLocationsResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDatabaseReq; import org.apache.iotdb.confignode.rpc.thrift.TGetJarInListReq; @@ -1265,6 +1267,12 @@ public 
TGetAllSubscriptionInfoResp getAllSubscriptionInfo() throws TException { () -> client.getAllSubscriptionInfo(), resp -> !updateConfigNodeLeader(resp.status)); } + @Override + public TGetCommitProgressResp getCommitProgress(TGetCommitProgressReq req) throws TException { + return executeRemoteCallWithRetry( + () -> client.getCommitProgress(req), resp -> !updateConfigNodeLeader(resp.status)); + } + @Override public TPipeConfigTransferResp handleTransferConfigPlan(TPipeConfigTransferReq req) throws TException { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 9fc5fcb436c89..536fa4355eb01 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -211,6 +211,8 @@ import org.apache.iotdb.db.storageengine.rescon.quotas.DataNodeSpaceQuotaManager; import org.apache.iotdb.db.storageengine.rescon.quotas.DataNodeThrottleQuotaManager; import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusRegionRuntimeState; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.db.trigger.executor.TriggerExecutor; import org.apache.iotdb.db.trigger.executor.TriggerFireResult; import org.apache.iotdb.db.trigger.service.TriggerManagementService; @@ -283,6 +285,8 @@ import org.apache.iotdb.mpp.rpc.thrift.TMaintainPeerReq; import org.apache.iotdb.mpp.rpc.thrift.TNotifyRegionMigrationReq; import org.apache.iotdb.mpp.rpc.thrift.TPipeHeartbeatReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; import 
org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaRespExceptionMessage; @@ -294,6 +298,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TPushSingleConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSinglePipeMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSingleTopicMetaReq; +import org.apache.iotdb.mpp.rpc.thrift.TPushSubscriptionRuntimeReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaRespExceptionMessage; @@ -312,6 +317,8 @@ import org.apache.iotdb.mpp.rpc.thrift.TSendFragmentInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TSendFragmentInstanceResp; import org.apache.iotdb.mpp.rpc.thrift.TSendSinglePlanNodeResp; +import org.apache.iotdb.mpp.rpc.thrift.TSubscriptionRuntimeStateEntry; +import org.apache.iotdb.mpp.rpc.thrift.TSyncSubscriptionProgressReq; import org.apache.iotdb.mpp.rpc.thrift.TTableDeviceDeletionWithPatternAndFilterReq; import org.apache.iotdb.mpp.rpc.thrift.TTableDeviceDeletionWithPatternOrModReq; import org.apache.iotdb.mpp.rpc.thrift.TTableDeviceInvalidateCacheReq; @@ -357,6 +364,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -1551,6 +1559,132 @@ public TPushConsumerGroupMetaResp pushSingleConsumerGroupMeta( } } + @Override + public TPullCommitProgressResp pullCommitProgress(TPullCommitProgressReq req) { + try { + final int dataNodeId = IoTDBDescriptor.getInstance().getConfig().getDataNodeId(); + final Map regionProgress = + SubscriptionAgent.broker().collectAllRegionCommitProgress(dataNodeId); + logSuspiciousRegionProgressPayloads(regionProgress); + return new TPullCommitProgressResp(new 
TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode())) + .setCommitRegionProgress(regionProgress); + } catch (Exception e) { + LOGGER.warn("Error occurred when pulling commit progress", e); + return new TPullCommitProgressResp( + new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode())); + } + } + + @Override + public TSStatus syncSubscriptionProgress(TSyncSubscriptionProgressReq req) { + try { + SubscriptionAgent.broker() + .receiveSubscriptionProgress( + req.getConsumerGroupId(), + req.getTopicName(), + req.getRegionId(), + req.getPhysicalTime(), + req.getLocalSeq(), + req.isSetWriterNodeId() ? req.getWriterNodeId() : -1, + req.isSetWriterEpoch() ? req.getWriterEpoch() : 0L); + return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); + } catch (Exception e) { + LOGGER.warn("Error occurred when receiving subscription progress broadcast", e); + return new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + } + } + + private static void logSuspiciousRegionProgressPayloads( + final Map regionProgress) { + if (Objects.isNull(regionProgress) || regionProgress.isEmpty()) { + return; + } + for (final Map.Entry entry : regionProgress.entrySet()) { + if (isSuspiciousRegionProgressPayload(entry.getValue())) { + LOGGER.warn( + "PULL_COMMIT_PROGRESS datanode send suspicious payload, key={}, summary={}", + entry.getKey(), + summarizeRegionProgressPayload(entry.getValue())); + } + } + } + + private static boolean isSuspiciousRegionProgressPayload(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return true; + } + final ByteBuffer duplicate = buffer.slice(); + if (duplicate.remaining() < Integer.BYTES) { + return true; + } + final int firstInt = duplicate.getInt(); + return firstInt < 0 || firstInt > 1_000_000; + } + + private static String summarizeRegionProgressPayload(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return "null"; + } + final int position = buffer.position(); + final int limit = buffer.limit(); + final 
int capacity = buffer.capacity(); + final ByteBuffer duplicate = buffer.slice(); + final int remaining = duplicate.remaining(); + final String firstIntSummary; + if (remaining >= Integer.BYTES) { + final int firstInt = duplicate.getInt(); + firstIntSummary = firstInt + "(0x" + String.format("%08x", firstInt) + ")"; + duplicate.position(0); + } else { + firstIntSummary = "n/a"; + } + final int sampleLength = Math.min(16, remaining); + final byte[] sample = new byte[sampleLength]; + duplicate.get(sample, 0, sampleLength); + return "pos=" + + position + + ", limit=" + + limit + + ", capacity=" + + capacity + + ", remaining=" + + remaining + + ", firstInt=" + + firstIntSummary + + ", firstBytes=" + + bytesToHex(sample); + } + + private static String bytesToHex(final byte[] bytes) { + if (Objects.isNull(bytes) || bytes.length == 0) { + return ""; + } + final StringBuilder builder = new StringBuilder(bytes.length * 2); + for (final byte b : bytes) { + builder.append(String.format("%02x", b)); + } + return builder.toString(); + } + + @Override + public TSStatus pushSubscriptionRuntime(TPushSubscriptionRuntimeReq req) { + try { + for (final TSubscriptionRuntimeStateEntry runtimeStateEntry : req.getRuntimeStates()) { + ConsensusSubscriptionSetupHandler.applyRuntimeState( + runtimeStateEntry.getRegionId(), + new ConsensusRegionRuntimeState( + runtimeStateEntry.getRuntimeVersion(), + runtimeStateEntry.getPreferredWriterNodeId(), + runtimeStateEntry.isActive(), + new LinkedHashSet<>(runtimeStateEntry.getActiveWriterNodeIds()))); + } + return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); + } catch (Exception e) { + LOGGER.warn("Error occurred when pushing subscription runtime state", e); + return new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + } + } + @Override public TPipeHeartbeatResp pipeHeartbeat(TPipeHeartbeatReq req) throws TException { final TPipeHeartbeatResp resp = new TPipeHeartbeatResp(new ArrayList<>()); @@ -2239,6 +2373,13 @@ 
public TDataNodeHeartbeatResp getDataNodeHeartBeat(TDataNodeHeartbeatReq req) th public TSStatus updateRegionCache(TRegionRouteReq req) { boolean result = ClusterPartitionFetcher.getInstance().updateRegionCache(req); if (result) { + // Notify consensus subscription queues of any preferred-writer changes + try { + ConsensusSubscriptionSetupHandler.onRegionRouteChanged( + req.getRegionRouteMap(), req.getTimestamp()); + } catch (final Exception e) { + LOGGER.warn("Failed to process consensus subscription route update", e); + } return RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS); } else { return RpcUtils.getStatus(TSStatusCode.PARTITION_CACHE_UPDATE_ERROR); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/ClusterConfigTaskExecutor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/ClusterConfigTaskExecutor.java index 5325560301c64..ec917d2b9d4f5 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/ClusterConfigTaskExecutor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/execution/config/executor/ClusterConfigTaskExecutor.java @@ -2818,19 +2818,21 @@ public SettableFuture createTopic( // Validate topic config final TopicMeta temporaryTopicMeta = new TopicMeta(topicName, System.currentTimeMillis(), topicAttributes); - try { - PipeDataNodeAgent.plugin() - .validate( - "fakePipeName", - // TODO: currently use root to create topic - temporaryTopicMeta.generateExtractorAttributes( - CommonDescriptor.getInstance().getConfig().getDefaultAdminName()), - temporaryTopicMeta.generateProcessorAttributes(), - temporaryTopicMeta.generateConnectorAttributes("fakeConsumerGroupId")); - } catch (final Exception e) { - future.setException( - new IoTDBException(e.getMessage(), TSStatusCode.CREATE_TOPIC_ERROR.getStatusCode())); - return future; + if 
(!temporaryTopicMeta.getConfig().isConsensusMode()) { + try { + PipeDataNodeAgent.plugin() + .validate( + "fakePipeName", + // TODO: currently use root to create topic + temporaryTopicMeta.generateExtractorAttributes( + CommonDescriptor.getInstance().getConfig().getDefaultAdminName()), + temporaryTopicMeta.generateProcessorAttributes(), + temporaryTopicMeta.generateConnectorAttributes("fakeConsumerGroupId")); + } catch (final Exception e) { + future.setException( + new IoTDBException(e.getMessage(), TSStatusCode.CREATE_TOPIC_ERROR.getStatusCode())); + return future; + } } try (final ConfigNodeClient configNodeClient = diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedDeleteDataNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedDeleteDataNode.java index 11d70e0daa755..eb668a206a1b3 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedDeleteDataNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedDeleteDataNode.java @@ -96,6 +96,72 @@ public void setProgressIndex(ProgressIndex progressIndex) { deleteDataNode.setProgressIndex(progressIndex); } + @Override + public SearchNode setSearchIndex(final long searchIndex) { + deleteDataNode.setSearchIndex(searchIndex); + return this; + } + + @Override + public long getSearchIndex() { + return deleteDataNode.getSearchIndex(); + } + + @Override + public long getRoutingEpoch() { + return deleteDataNode.getRoutingEpoch(); + } + + @Override + public SearchNode setRoutingEpoch(final long routingEpoch) { + deleteDataNode.setRoutingEpoch(routingEpoch); + return this; + } + + @Override + public long getPhysicalTime() { + return deleteDataNode.getPhysicalTime(); + } + + @Override + public SearchNode setPhysicalTime(final long physicalTime) { + 
deleteDataNode.setPhysicalTime(physicalTime); + return this; + } + + @Override + public int getNodeId() { + return deleteDataNode.getNodeId(); + } + + @Override + public SearchNode setNodeId(final int nodeId) { + deleteDataNode.setNodeId(nodeId); + return this; + } + + @Override + public long getWriterEpoch() { + return deleteDataNode.getWriterEpoch(); + } + + @Override + public SearchNode setWriterEpoch(final long writerEpoch) { + deleteDataNode.setWriterEpoch(writerEpoch); + return this; + } + + @Override + public long getSyncIndex() { + return deleteDataNode.getSyncIndex(); + } + + @Override + public SearchNode setSyncIndex(final long syncIndex) { + deleteDataNode.setSyncIndex(syncIndex); + return this; + } + @Override public List getChildren() { return deleteDataNode.getChildren(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedInsertNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedInsertNode.java index 2e517700217b7..f8c7ee9a17415 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedInsertNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedInsertNode.java @@ -233,6 +233,61 @@ public SearchNode setSearchIndex(final long searchIndex) { return this; } + @Override + public long getRoutingEpoch() { + return insertNode.getRoutingEpoch(); + } + + @Override + public SearchNode setRoutingEpoch(final long routingEpoch) { + insertNode.setRoutingEpoch(routingEpoch); + return this; + } + + @Override + public long getPhysicalTime() { + return insertNode.getPhysicalTime(); + } + + @Override + public SearchNode setPhysicalTime(final long physicalTime) { + insertNode.setPhysicalTime(physicalTime); + return this; + } + + @Override + public int getNodeId() { + return insertNode.getNodeId(); + } + + 
@Override + public SearchNode setNodeId(final int nodeId) { + insertNode.setNodeId(nodeId); + return this; + } + + @Override + public long getWriterEpoch() { + return insertNode.getWriterEpoch(); + } + + @Override + public SearchNode setWriterEpoch(final long writerEpoch) { + insertNode.setWriterEpoch(writerEpoch); + return this; + } + + @Override + public long getSyncIndex() { + return insertNode.getSyncIndex(); + } + + @Override + public SearchNode setSyncIndex(final long syncIndex) { + insertNode.setSyncIndex(syncIndex); + return this; + } + @Override protected void serializeAttributes(final ByteBuffer byteBuffer) { PlanNodeType.PIPE_ENRICHED_INSERT_DATA.serialize(byteBuffer); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/DeleteDataNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/DeleteDataNode.java index cfba72d66db62..7c0bc25dfaa55 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/DeleteDataNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/DeleteDataNode.java @@ -398,6 +398,10 @@ public SearchNode merge(List searchNodes) { pathList, firstOne.getDeleteStartTime(), firstOne.getDeleteEndTime()) - .setSearchIndex(firstOne.searchIndex); + .setSearchIndex(firstOne.searchIndex) + .setPhysicalTime(firstOne.getPhysicalTime()) + .setNodeId(firstOne.getNodeId()) + .setWriterEpoch(firstOne.getWriterEpoch()) + .setSyncIndex(firstOne.getSyncIndex()); } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertMultiTabletsNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertMultiTabletsNode.java index b41d178b396c6..bf842e862b447 100644 --- 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertMultiTabletsNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertMultiTabletsNode.java @@ -142,6 +142,34 @@ public SearchNode setSearchIndex(long index) { return this; } + @Override + public SearchNode setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + insertTabletNodeList.forEach(plan -> plan.setPhysicalTime(physicalTime)); + return this; + } + + @Override + public SearchNode setNodeId(int nodeId) { + this.nodeId = nodeId; + insertTabletNodeList.forEach(plan -> plan.setNodeId(nodeId)); + return this; + } + + @Override + public SearchNode setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + insertTabletNodeList.forEach(plan -> plan.setWriterEpoch(writerEpoch)); + return this; + } + + @Override + public SearchNode setSyncIndex(long syncIndex) { + this.syncIndex = syncIndex; + insertTabletNodeList.forEach(plan -> plan.setSyncIndex(syncIndex)); + return this; + } + @Override public List splitByPartition(IAnalysis analysis) { Map splitMap = new HashMap<>(); @@ -156,6 +184,10 @@ public List splitByPartition(IAnalysis analysis) { } else { tmpNode = new InsertMultiTabletsNode(this.getPlanNodeId()); tmpNode.setDataRegionReplicaSet(dataRegionReplicaSet); + tmpNode.setPhysicalTime(getPhysicalTime()); + tmpNode.setNodeId(getNodeId()); + tmpNode.setWriterEpoch(getWriterEpoch()); + tmpNode.setSyncIndex(getSyncIndex()); tmpNode.addInsertTabletNode((InsertTabletNode) subNode, i); splitMap.put(dataRegionReplicaSet, tmpNode); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertNode.java index 88a6faa004745..9aac99f485cef 100644 --- 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertNode.java @@ -110,6 +110,10 @@ public final SearchNode merge(List searchNodes) { .collect(Collectors.toList()); InsertNode result = mergeInsertNode(insertNodes); result.setSearchIndex(insertNodes.get(0).getSearchIndex()); + result.setPhysicalTime(insertNodes.get(0).getPhysicalTime()); + result.setNodeId(insertNodes.get(0).getNodeId()); + result.setWriterEpoch(insertNodes.get(0).getWriterEpoch()); + result.setSyncIndex(insertNodes.get(0).getSyncIndex()); result.setTargetPath(insertNodes.get(0).getTargetPath()); return result; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsNode.java index 7392b7612705e..7a22085285cc5 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsNode.java @@ -136,6 +136,34 @@ public SearchNode setSearchIndex(long index) { return this; } + @Override + public SearchNode setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + insertRowNodeList.forEach(plan -> plan.setPhysicalTime(physicalTime)); + return this; + } + + @Override + public SearchNode setNodeId(int nodeId) { + this.nodeId = nodeId; + insertRowNodeList.forEach(plan -> plan.setNodeId(nodeId)); + return this; + } + + @Override + public SearchNode setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + insertRowNodeList.forEach(plan -> plan.setWriterEpoch(writerEpoch)); + return this; + } + + @Override + public SearchNode setSyncIndex(long syncIndex) { + this.syncIndex 
= syncIndex; + insertRowNodeList.forEach(plan -> plan.setSyncIndex(syncIndex)); + return this; + } + public Map getResults() { return results; } @@ -287,6 +315,10 @@ public List splitByPartition(IAnalysis analysis) { } else { tmpNode = new InsertRowsNode(this.getPlanNodeId()); tmpNode.setDataRegionReplicaSet(dataRegionReplicaSet); + tmpNode.setPhysicalTime(getPhysicalTime()); + tmpNode.setNodeId(getNodeId()); + tmpNode.setWriterEpoch(getWriterEpoch()); + tmpNode.setSyncIndex(getSyncIndex()); tmpNode.addOneInsertRowNode(insertRowNode, i); splitMap.put(dataRegionReplicaSet, tmpNode); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsOfOneDeviceNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsOfOneDeviceNode.java index f1e28d32b104d..d3b9329bf756b 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsOfOneDeviceNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsOfOneDeviceNode.java @@ -106,6 +106,34 @@ public SearchNode setSearchIndex(long index) { return this; } + @Override + public SearchNode setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + insertRowNodeList.forEach(plan -> plan.setPhysicalTime(physicalTime)); + return this; + } + + @Override + public SearchNode setNodeId(int nodeId) { + this.nodeId = nodeId; + insertRowNodeList.forEach(plan -> plan.setNodeId(nodeId)); + return this; + } + + @Override + public SearchNode setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + insertRowNodeList.forEach(plan -> plan.setWriterEpoch(writerEpoch)); + return this; + } + + @Override + public SearchNode setSyncIndex(long syncIndex) { + this.syncIndex = syncIndex; + insertRowNodeList.forEach(plan -> plan.setSyncIndex(syncIndex)); + return this; + } + public TSStatus[] 
getFailingStatus() { return StatusUtils.getFailingStatus(results, insertRowNodeList.size()); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalDeleteDataNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalDeleteDataNode.java index 632d7c9ee1e0a..78117076ba5de 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalDeleteDataNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalDeleteDataNode.java @@ -330,6 +330,10 @@ public SearchNode merge(List searchNodes) { .flatMap(Collection::stream) .collect(Collectors.toList()); return new RelationalDeleteDataNode(this.getPlanNodeId(), allTableDeletionEntries, databaseName) - .setSearchIndex(getSearchIndex()); + .setSearchIndex(getSearchIndex()) + .setPhysicalTime(getPhysicalTime()) + .setNodeId(getNodeId()) + .setWriterEpoch(getWriterEpoch()) + .setSyncIndex(getSyncIndex()); } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalInsertRowsNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalInsertRowsNode.java index 594ccf50471f9..c8bcf04808ff7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalInsertRowsNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalInsertRowsNode.java @@ -184,6 +184,10 @@ public List splitByPartition(IAnalysis analysis) { } else { tmpNode = new RelationalInsertRowsNode(this.getPlanNodeId()); tmpNode.setDataRegionReplicaSet(dataRegionReplicaSet); + tmpNode.setPhysicalTime(getPhysicalTime()); + tmpNode.setNodeId(getNodeId()); + tmpNode.setWriterEpoch(getWriterEpoch()); + 
tmpNode.setSyncIndex(getSyncIndex()); tmpNode.addOneInsertRowNode(insertRowNode, i); splitMap.put(dataRegionReplicaSet, tmpNode); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/SearchNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/SearchNode.java index d506d1414e15e..7c0a9fec2bfe5 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/SearchNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/SearchNode.java @@ -23,11 +23,17 @@ import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeId; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.WritePlanNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.IWALByteBufferView; +import java.io.DataInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; import java.util.List; public abstract class SearchNode extends WritePlanNode implements ComparableConsensusRequest { + protected static final int WAL_POSITION_SERIALIZED_SIZE = Long.BYTES; + /** this insert node doesn't need to participate in iot consensus */ public static final long NO_CONSENSUS_INDEX = ConsensusReqReader.DEFAULT_SEARCH_INDEX; @@ -37,6 +43,25 @@ public abstract class SearchNode extends WritePlanNode implements ComparableCons */ protected long searchIndex = NO_CONSENSUS_INDEX; + /** routing epoch from ConfigNode broadcast, used for ordered consensus subscription */ + protected long routingEpoch = 0; + + /** Millisecond physical time used as the first ordering key in the new subscription progress. */ + protected long physicalTime = 0; + + /** Writer node id used as the second ordering key across multiple writers. */ + protected int nodeId = -1; + + /** Writer-local lifecycle id. 
*/ + protected long writerEpoch = 0; + + /** + * syncIndex carries the source Leader's searchIndex for replicated (Follower) writes. On Leader + * nodes this stays at NO_CONSENSUS_INDEX (-1). Only stored in WALMetaData V3, never changes the + * WAL entry's own searchIndex. + */ + protected long syncIndex = NO_CONSENSUS_INDEX; + protected SearchNode(PlanNodeId id) { super(id); } @@ -51,5 +76,71 @@ public SearchNode setSearchIndex(long searchIndex) { return this; } + public long getRoutingEpoch() { + return routingEpoch; + } + + public SearchNode setRoutingEpoch(long routingEpoch) { + this.routingEpoch = routingEpoch; + return this; + } + + public long getPhysicalTime() { + return physicalTime; + } + + public SearchNode setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + return this; + } + + public int getNodeId() { + return nodeId; + } + + public SearchNode setNodeId(int nodeId) { + this.nodeId = nodeId; + return this; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + public SearchNode setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + return this; + } + + public long getSyncIndex() { + return syncIndex; + } + + public SearchNode setSyncIndex(long syncIndex) { + this.syncIndex = syncIndex; + return this; + } + + public long getLocalSeq() { + return searchIndex; + } + + public SearchNode setLocalSeq(long localSeq) { + this.searchIndex = localSeq; + return this; + } + + protected final void serializeWalPosition(IWALByteBufferView buffer) { + buffer.putLong(searchIndex); + } + + protected final void deserializeWalPosition(DataInputStream stream) throws IOException { + this.searchIndex = stream.readLong(); + } + + protected final void deserializeWalPosition(ByteBuffer buffer) { + this.searchIndex = buffer.getLong(); + } + public abstract SearchNode merge(List searchNodes); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java index c65105ae54c8d..f5695b2d413ca 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java @@ -87,6 +87,7 @@ import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalDeleteDataNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode; import org.apache.iotdb.db.queryengine.plan.relational.metadata.TableMetadataImpl; import org.apache.iotdb.db.queryengine.plan.relational.metadata.fetcher.cache.LastCacheLoadStrategy; import org.apache.iotdb.db.queryengine.plan.relational.metadata.fetcher.cache.TableDeviceSchemaCache; @@ -1729,6 +1730,10 @@ private List insertToTsFileProcessors( if (v == null) { v = insertRowsNode.emptyClone(); v.setSearchIndex(insertRowNode.getSearchIndex()); + v.setPhysicalTime(insertRowsNode.getPhysicalTime()); + v.setNodeId(insertRowsNode.getNodeId()); + v.setWriterEpoch(insertRowsNode.getWriterEpoch()); + v.setSyncIndex(insertRowsNode.getSyncIndex()); v.setAligned(insertRowNode.isAligned()); if (insertRowNode.isGeneratedByPipe()) { v.markAsGeneratedByPipe(); @@ -2852,8 +2857,7 @@ public void deleteByDevice(final MeasurementPath pattern, final DeleteDataNode n } TreeDeviceSchemaCacheManager.getInstance().invalidateLastCache(pattern); // write log to impacted working TsFileProcessors - List walListeners = - logDeletionInWAL(startTime, endTime, searchIndex, pattern); + List walListeners = logDeletionInWAL(node, pattern); for (WALFlushListener walFlushListener : walListeners) { if (walFlushListener.waitForResult() == WALFlushListener.Status.FAILURE) { @@ -3018,8 +3022,7 @@ public 
void deleteDataDirectly(MeasurementPath pathToDelete, DeleteDataNode node } TreeDeviceSchemaCacheManager.getInstance().invalidateDatabaseLastCache(getDatabaseName()); // write log to impacted working TsFileProcessors - List walListeners = - logDeletionInWAL(startTime, endTime, searchIndex, pathToDelete); + List walListeners = logDeletionInWAL(node, pathToDelete); for (WALFlushListener walFlushListener : walListeners) { if (walFlushListener.waitForResult() == WALFlushListener.Status.FAILURE) { @@ -3095,22 +3098,37 @@ private List logDeletionInWAL(RelationalDeleteDataNode deleteD } private List logDeletionInWAL( - long startTime, long endTime, long searchIndex, MeasurementPath path) { + DeleteDataNode templateDeleteDataNode, MeasurementPath path) { if (config.getWalMode() == WALMode.DISABLE) { return Collections.emptyList(); } List walFlushListeners = new ArrayList<>(); DeleteDataNode deleteDataNode = - new DeleteDataNode(new PlanNodeId(""), Collections.singletonList(path), startTime, endTime); - deleteDataNode.setSearchIndex(searchIndex); + new DeleteDataNode( + new PlanNodeId(""), + Collections.singletonList(path), + templateDeleteDataNode.getDeleteStartTime(), + templateDeleteDataNode.getDeleteEndTime()); + deleteDataNode + .setSearchIndex(templateDeleteDataNode.getSearchIndex()) + .setPhysicalTime(templateDeleteDataNode.getPhysicalTime()) + .setNodeId(templateDeleteDataNode.getNodeId()) + .setWriterEpoch(templateDeleteDataNode.getWriterEpoch()) + .setSyncIndex(templateDeleteDataNode.getSyncIndex()); for (Map.Entry entry : workSequenceTsFileProcessors.entrySet()) { - if (TimePartitionUtils.satisfyPartitionId(startTime, endTime, entry.getKey())) { + if (TimePartitionUtils.satisfyPartitionId( + templateDeleteDataNode.getDeleteStartTime(), + templateDeleteDataNode.getDeleteEndTime(), + entry.getKey())) { WALFlushListener walFlushListener = entry.getValue().logDeleteDataNodeInWAL(deleteDataNode); walFlushListeners.add(walFlushListener); } } for (Map.Entry entry : 
workUnsequenceTsFileProcessors.entrySet()) { - if (TimePartitionUtils.satisfyPartitionId(startTime, endTime, entry.getKey())) { + if (TimePartitionUtils.satisfyPartitionId( + templateDeleteDataNode.getDeleteStartTime(), + templateDeleteDataNode.getDeleteEndTime(), + entry.getKey())) { WALFlushListener walFlushListener = entry.getValue().logDeleteDataNodeInWAL(deleteDataNode); walFlushListeners.add(walFlushListener); } @@ -3187,17 +3205,27 @@ private void deleteObjectFiles(List matchedObjectDirs, List for details. */ public void insertSeparatorToWAL() { + insertSeparatorToWAL(null); + } + + public void insertSeparatorToWAL(final SearchNode sourceNode) { writeLock("insertSeparatorToWAL"); try { if (deleted) { return; } + final ContinuousSameSearchIndexSeparatorNode separatorNode = + new ContinuousSameSearchIndexSeparatorNode(); + if (Objects.nonNull(sourceNode)) { + separatorNode + .setRoutingEpoch(sourceNode.getRoutingEpoch()) + .setPhysicalTime(sourceNode.getPhysicalTime()) + .setNodeId(sourceNode.getNodeId()) + .setWriterEpoch(sourceNode.getWriterEpoch()) + .setSyncIndex(sourceNode.getSyncIndex()); + } getWALNode() - .ifPresent( - walNode -> - walNode.log( - TsFileProcessor.MEMTABLE_NOT_EXIST, - new ContinuousSameSearchIndexSeparatorNode())); + .ifPresent(walNode -> walNode.log(TsFileProcessor.MEMTABLE_NOT_EXIST, separatorNode)); } finally { writeUnlock(); } @@ -4501,6 +4529,10 @@ public void insert(InsertRowsOfOneDeviceNode insertRowsOfOneDeviceNode) if (v == null) { v = new InsertRowsNode(insertRowsOfOneDeviceNode.getPlanNodeId()); v.setSearchIndex(insertRowNode.getSearchIndex()); + v.setPhysicalTime(insertRowsOfOneDeviceNode.getPhysicalTime()); + v.setNodeId(insertRowsOfOneDeviceNode.getNodeId()); + v.setWriterEpoch(insertRowsOfOneDeviceNode.getWriterEpoch()); + v.setSyncIndex(insertRowsOfOneDeviceNode.getSyncIndex()); v.setAligned(insertRowNode.isAligned()); if (insertRowNode.isGeneratedByPipe()) { v.markAsGeneratedByPipe(); diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/buffer/WALBuffer.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/buffer/WALBuffer.java index a7d79f92b5753..1eed5f2a5f16a 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/buffer/WALBuffer.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/buffer/WALBuffer.java @@ -25,6 +25,7 @@ import org.apache.iotdb.commons.utils.TestOnly; import org.apache.iotdb.db.conf.IoTDBConfig; import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.ContinuousSameSearchIndexSeparatorNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.DeleteDataNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.ObjectNode; @@ -35,6 +36,7 @@ import org.apache.iotdb.db.storageengine.dataregion.wal.exception.BrokenWALFileException; import org.apache.iotdb.db.storageengine.dataregion.wal.exception.WALNodeClosedException; import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALWriter; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.MemoryControlledWALEntryQueue; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileStatus; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; @@ -326,26 +328,64 @@ private void handleInfoEntry(WALEntry walEntry) { walEntry.getWalFlushListener().fail(e); return; } - // parse search index + // parse search index and writer-progress metadata long searchIndex = DEFAULT_SEARCH_INDEX; - if (walEntry.getType().needSearch()) { + long syncIndex = DEFAULT_SEARCH_INDEX; + long physicalTime = 0; + int nodeId = -1; + long writerEpoch = 0; + if (walEntry.getType() == 
WALEntryType.CONTINUOUS_SAME_SEARCH_INDEX_SEPARATOR_NODE) { + final ContinuousSameSearchIndexSeparatorNode separatorNode = + (ContinuousSameSearchIndexSeparatorNode) walEntry.getValue(); + syncIndex = separatorNode.getSyncIndex(); + physicalTime = separatorNode.getPhysicalTime(); + nodeId = separatorNode.getNodeId(); + writerEpoch = separatorNode.getWriterEpoch(); + } else if (walEntry.getType().needSearch()) { if (walEntry.getType() == WALEntryType.DELETE_DATA_NODE) { searchIndex = ((DeleteDataNode) walEntry.getValue()).getSearchIndex(); + syncIndex = ((DeleteDataNode) walEntry.getValue()).getSyncIndex(); + physicalTime = ((DeleteDataNode) walEntry.getValue()).getPhysicalTime(); + nodeId = ((DeleteDataNode) walEntry.getValue()).getNodeId(); + writerEpoch = ((DeleteDataNode) walEntry.getValue()).getWriterEpoch(); } else if (walEntry.getType() == WALEntryType.RELATIONAL_DELETE_DATA_NODE) { searchIndex = ((RelationalDeleteDataNode) walEntry.getValue()).getSearchIndex(); + syncIndex = ((RelationalDeleteDataNode) walEntry.getValue()).getSyncIndex(); + physicalTime = ((RelationalDeleteDataNode) walEntry.getValue()).getPhysicalTime(); + nodeId = ((RelationalDeleteDataNode) walEntry.getValue()).getNodeId(); + writerEpoch = ((RelationalDeleteDataNode) walEntry.getValue()).getWriterEpoch(); } else if (walEntry.getType() == WALEntryType.OBJECT_FILE_NODE) { searchIndex = ((ObjectNode) walEntry.getValue()).getSearchIndex(); + syncIndex = ((ObjectNode) walEntry.getValue()).getSyncIndex(); + physicalTime = ((ObjectNode) walEntry.getValue()).getPhysicalTime(); + nodeId = ((ObjectNode) walEntry.getValue()).getNodeId(); + writerEpoch = ((ObjectNode) walEntry.getValue()).getWriterEpoch(); } else { searchIndex = ((InsertNode) walEntry.getValue()).getSearchIndex(); + syncIndex = ((InsertNode) walEntry.getValue()).getSyncIndex(); + physicalTime = ((InsertNode) walEntry.getValue()).getPhysicalTime(); + nodeId = ((InsertNode) walEntry.getValue()).getNodeId(); + writerEpoch = 
((InsertNode) walEntry.getValue()).getWriterEpoch(); } if (searchIndex != DEFAULT_SEARCH_INDEX) { currentSearchIndex = searchIndex; currentFileStatus = WALFileStatus.CONTAINS_SEARCH_INDEX; } } + // For Leader writes: syncIndex stays -1, use searchIndex as the ordering key + // For Follower writes: searchIndex is -1, syncIndex carries source's searchIndex + long effectiveSyncIndex = (syncIndex >= 0) ? syncIndex : searchIndex; + long effectiveLocalSeq = (syncIndex >= 0) ? syncIndex : searchIndex; // update related info totalSize += size; - info.metaData.add(size, searchIndex, walEntry.getMemTableId()); + info.metaData.add( + size, + searchIndex, + walEntry.getMemTableId(), + physicalTime, + nodeId, + writerEpoch, + effectiveLocalSeq); info.memTableId2WalDiskUsage.compute( walEntry.getMemTableId(), (k, v) -> v == null ? size : v + size); info.fsyncListeners.add(walEntry.getWalFlushListener()); @@ -748,6 +788,11 @@ public boolean isAllWALEntriesConsumed() { } } + public WALMetaData getCurrentWALMetaDataSnapshot() { + final WALWriter writer = currentWALFileWriter; + return writer == null ? 
new WALMetaData() : writer.snapshotMetaData(); + } + public CheckpointManager getCheckpointManager() { return checkpointManager; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/LogWriter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/LogWriter.java index 95721f846ccca..8ad62c8a395a0 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/LogWriter.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/LogWriter.java @@ -69,7 +69,8 @@ protected LogWriter(File logFile, WALFileVersion version) throws IOException { this.logFile = logFile; this.logStream = new FileOutputStream(logFile, true); this.logChannel = this.logStream.getChannel(); - if ((!logFile.exists() || logFile.length() == 0) && version == WALFileVersion.V2) { + if ((!logFile.exists() || logFile.length() == 0) + && (version == WALFileVersion.V2 || version == WALFileVersion.V3)) { this.logChannel.write(ByteBuffer.wrap(version.getVersionBytes())); } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReader.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReader.java new file mode 100644 index 0000000000000..7b2d8485efbed --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReader.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.storageengine.dataregion.wal.io; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Reader dedicated to the new writer-based subscription progress model. + * + *

It keeps the original WAL entry body untouched and exposes per-entry writer metadata from WAL + * footer arrays alongside the current entry buffer. + */ +public class ProgressWALReader implements Closeable { + + private final WALByteBufReader delegate; + + public ProgressWALReader(File logFile) throws IOException { + this.delegate = new WALByteBufReader(logFile); + } + + public ProgressWALReader(File logFile, WALMetaData metaDataSnapshot) throws IOException { + this.delegate = new WALByteBufReader(logFile, metaDataSnapshot); + } + + public boolean hasNext() { + return delegate.hasNext(); + } + + public ByteBuffer next() throws IOException { + return delegate.next(); + } + + public WALMetaData getMetaData() { + return delegate.getMetaData(); + } + + public long getCurrentEntryPhysicalTime() { + return delegate.getCurrentEntryPhysicalTime(); + } + + public int getCurrentEntryNodeId() { + return delegate.getCurrentEntryNodeId(); + } + + public long getCurrentEntryWriterEpoch() { + return delegate.getCurrentEntryWriterEpoch(); + } + + public long getCurrentEntryLocalSeq() { + return delegate.getCurrentEntryLocalSeq(); + } + + public int getCurrentEntryIndex() { + return delegate.getCurrentEntryIndex(); + } + + @Override + public void close() throws IOException { + delegate.close(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALByteBufReader.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALByteBufReader.java index 2f257da9adc4a..d14ea69cf6c5f 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALByteBufReader.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALByteBufReader.java @@ -27,6 +27,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Iterator; +import java.util.List; /** * This reader returns {@link WALEntry} as {@link ByteBuffer}, 
the usage of WALByteBufReader is like @@ -36,6 +37,8 @@ public class WALByteBufReader implements Closeable { private WALMetaData metaData; private DataInputStream logStream; private Iterator sizeIterator; + // V3: track current entry index to provide per-entry progress metadata + private int currentEntryIndex = -1; public WALByteBufReader(File logFile) throws IOException { WALInputStream walInputStream = new WALInputStream(logFile); @@ -49,6 +52,18 @@ public WALByteBufReader(File logFile) throws IOException { } } + public WALByteBufReader(File logFile, WALMetaData metaDataSnapshot) throws IOException { + WALInputStream walInputStream = new WALInputStream(logFile); + try { + this.logStream = new DataInputStream(walInputStream); + this.metaData = metaDataSnapshot == null ? new WALMetaData() : metaDataSnapshot; + this.sizeIterator = this.metaData.getBuffersSize().iterator(); + } catch (Exception e) { + walInputStream.close(); + throw e; + } + } + /** Like {@link Iterator#hasNext()}. */ public boolean hasNext() { return sizeIterator.hasNext(); @@ -60,6 +75,7 @@ public boolean hasNext() { * @throws IOException when failing to read from channel. 
*/ public ByteBuffer next() throws IOException { + currentEntryIndex++; int size = sizeIterator.next(); // TODO: Reuse this buffer ByteBuffer buffer = ByteBuffer.allocate(size); @@ -84,4 +100,41 @@ public void close() throws IOException { public long getFirstSearchIndex() { return metaData.getFirstSearchIndex(); } + + public long getCurrentEntryPhysicalTime() { + List physicalTimes = metaData.getPhysicalTimes(); + if (currentEntryIndex >= 0 && currentEntryIndex < physicalTimes.size()) { + return physicalTimes.get(currentEntryIndex); + } + return 0L; + } + + public int getCurrentEntryNodeId() { + List nodeIds = metaData.getNodeIds(); + if (currentEntryIndex >= 0 && currentEntryIndex < nodeIds.size()) { + return nodeIds.get(currentEntryIndex); + } + return -1; + } + + public long getCurrentEntryWriterEpoch() { + List writerEpochs = metaData.getWriterEpochs(); + if (currentEntryIndex >= 0 && currentEntryIndex < writerEpochs.size()) { + return writerEpochs.get(currentEntryIndex); + } + return 0L; + } + + public long getCurrentEntryLocalSeq() { + List localSeqs = metaData.getLocalSeqs(); + if (currentEntryIndex >= 0 && currentEntryIndex < localSeqs.size()) { + return localSeqs.get(currentEntryIndex); + } + return metaData.getFirstSearchIndex() + currentEntryIndex; + } + + /** Returns the current entry index (0-based). 
*/ + public int getCurrentEntryIndex() { + return currentEntryIndex; + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALFileVersion.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALFileVersion.java index e3d374551b115..fc09c34b6508e 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALFileVersion.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALFileVersion.java @@ -26,7 +26,8 @@ public enum WALFileVersion { V1("WAL"), - V2("V2-WAL"); + V2("V2-WAL"), + V3("V3-WAL"); private final String versionString; private byte[] versionBytes; @@ -56,7 +57,7 @@ public static WALFileVersion getVersion(FileChannel channel) throws IOException long originalPosition = channel.position(); try { // head magic string starts to exist since V2 - WALFileVersion[] versions = {V2}; + WALFileVersion[] versions = {V3, V2}; for (WALFileVersion version : versions) { channel.position(0); if (channel.size() < version.versionBytes.length) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALInputStream.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALInputStream.java index 0a7dbb5463c1a..906002b5922fd 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALInputStream.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALInputStream.java @@ -82,7 +82,7 @@ private void getEndOffset() throws IOException { } ByteBuffer metadataSizeBuf = ByteBuffer.allocate(Integer.BYTES); long position; - if (version == WALFileVersion.V2) { + if (version == WALFileVersion.V2 || version == WALFileVersion.V3) { // New Version ByteBuffer magicStringBuffer = ByteBuffer.allocate(version.getVersionBytes().length); 
channel.read(magicStringBuffer, channel.size() - version.getVersionBytes().length); @@ -122,7 +122,7 @@ private void getEndOffset() throws IOException { int metadataSize = metadataSizeBuf.getInt(); endOffset = channel.size() - version.getVersionBytes().length - Integer.BYTES - metadataSize; } finally { - if (version == WALFileVersion.V2) { + if (version == WALFileVersion.V2 || version == WALFileVersion.V3) { // Set the position back to the end of head magic string channel.position(version.getVersionBytes().length); } else { @@ -191,7 +191,7 @@ private void loadNextSegment() throws IOException { } long startTime = System.nanoTime(); long startPosition = channel.position(); - if (version == WALFileVersion.V2) { + if (version == WALFileVersion.V2 || version == WALFileVersion.V3) { loadNextSegmentV2(); } else if (version == WALFileVersion.V1) { loadNextSegmentV1(); @@ -295,7 +295,7 @@ private void tryLoadSegment() throws IOException { * @throws IOException If the file is broken or the given position is invalid */ public void skipToGivenLogicalPosition(long pos) throws IOException { - if (version == WALFileVersion.V2) { + if (version == WALFileVersion.V2 || version == WALFileVersion.V3) { channel.position(version.getVersionBytes().length); long posRemain = pos; SegmentInfo segmentInfo; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaData.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaData.java index ba9211656ef03..4668ae75a3d70 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaData.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaData.java @@ -32,13 +32,18 @@ import java.nio.channels.FileChannel; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import 
java.util.Map; import java.util.Set; /** * Metadata exists at the end of each wal file, including each entry's size, search index of first * entry and the number of entries. + * + *

V3 extension stores per-entry writer progress metadata, plus file-level timestamp range, to + * support consensus subscription recovery. */ public class WALMetaData implements SerializedSize { @@ -54,6 +59,20 @@ public class WALMetaData implements SerializedSize { private final Set memTablesId; private long truncateOffSet = 0; + // V3 fields: file-level data timestamp range for timestamp-based seek + private long minDataTs = Long.MAX_VALUE; + private long maxDataTs = Long.MIN_VALUE; + // V3 extension for writer-based subscription progress. + private final List physicalTimes; + private final List nodeIds; + private final List writerEpochs; + private final List localSeqs; + + private static final short DEFAULT_NODE_ID = (short) -1; + private static final short DEFAULT_WRITER_EPOCH = 0; + private static final int V3_EMPTY_METADATA_REMAINING_WITHOUT_MEMTABLE_COUNT = + Long.BYTES * 2 + Short.BYTES * 2 + Integer.BYTES; + public WALMetaData() { this(ConsensusReqReader.DEFAULT_SEARCH_INDEX, new ArrayList<>(), new HashSet<>()); } @@ -62,14 +81,52 @@ public WALMetaData(long firstSearchIndex, List buffersSize, Set m this.firstSearchIndex = firstSearchIndex; this.buffersSize = buffersSize; this.memTablesId = memTablesId; + this.physicalTimes = new ArrayList<>(); + this.nodeIds = new ArrayList<>(); + this.writerEpochs = new ArrayList<>(); + this.localSeqs = new ArrayList<>(); } + /** Adds an entry without explicit writer progress metadata. 
*/ public void add(int size, long searchIndex, long memTableId) { + add(size, searchIndex, memTableId, 0L, DEFAULT_NODE_ID, DEFAULT_WRITER_EPOCH, searchIndex); + } + + public void add( + int size, + long searchIndex, + long memTableId, + long physicalTime, + int nodeId, + long writerEpoch, + long localSeq) { if (buffersSize.isEmpty()) { firstSearchIndex = searchIndex; } buffersSize.add(size); memTablesId.add(memTableId); + physicalTimes.add(physicalTime); + nodeIds.add(toShortExact(nodeId, "nodeId")); + writerEpochs.add(toShortExact(writerEpoch, "writerEpoch")); + localSeqs.add(localSeq); + } + + private static short toShortExact(long value, String fieldName) { + if (value < Short.MIN_VALUE || value > Short.MAX_VALUE) { + throw new IllegalArgumentException( + String.format("%s %s exceeds short range", fieldName, value)); + } + return (short) value; + } + + /** Update file-level timestamp range with a data point's timestamp. */ + public void updateTimestampRange(long dataTs) { + if (dataTs < minDataTs) { + minDataTs = dataTs; + } + if (dataTs > maxDataTs) { + maxDataTs = dataTs; + } } public void addAll(WALMetaData metaData) { @@ -78,16 +135,47 @@ public void addAll(WALMetaData metaData) { } buffersSize.addAll(metaData.getBuffersSize()); memTablesId.addAll(metaData.getMemTablesId()); + physicalTimes.addAll(metaData.getPhysicalTimes()); + nodeIds.addAll(metaData.getNodeIds()); + writerEpochs.addAll(metaData.getWriterEpochs()); + localSeqs.addAll(metaData.getLocalSeqs()); + if (metaData.minDataTs < this.minDataTs) { + this.minDataTs = metaData.minDataTs; + } + if (metaData.maxDataTs > this.maxDataTs) { + this.maxDataTs = metaData.maxDataTs; + } } @Override public int serializedSize() { - return FIXED_SERIALIZED_SIZE - + buffersSize.size() * Integer.BYTES - + (memTablesId.isEmpty() ? 
0 : Integer.BYTES + memTablesId.size() * Long.BYTES); + return serializedSize(WALFileVersion.V2); + } + + public int serializedSize(WALFileVersion version) { + int size = + FIXED_SERIALIZED_SIZE + + buffersSize.size() * Integer.BYTES + + (memTablesId.isEmpty() ? 0 : Integer.BYTES + memTablesId.size() * Long.BYTES); + if (version == WALFileVersion.V3) { + // minDataTs(long) + maxDataTs(long) + size += Long.BYTES * 2; + // physicalTimes(long[]) + localSeqs(long[]) + size += buffersSize.size() * Long.BYTES * 2; + // defaultNodeId(short) + defaultWriterEpoch(short) + overrideCount(int) + // + override ordinals(int[]) + override nodeIds(short[]) + override writerEpochs(short[]) + final int overrideCount = getWriterOverrideCount(); + size += Short.BYTES * 2 + Integer.BYTES; + size += overrideCount * (Integer.BYTES + Short.BYTES + Short.BYTES); + } + return size; } public void serialize(ByteBuffer buffer) { + serialize(buffer, WALFileVersion.V2); + } + + public void serialize(ByteBuffer buffer, WALFileVersion version) { buffer.putLong(firstSearchIndex); buffer.putInt(buffersSize.size()); for (int size : buffersSize) { @@ -99,9 +187,49 @@ public void serialize(ByteBuffer buffer) { buffer.putLong(memTableId); } } + if (version == WALFileVersion.V3) { + buffer.putLong(minDataTs); + buffer.putLong(maxDataTs); + for (long physicalTime : physicalTimes) { + buffer.putLong(physicalTime); + } + for (long localSeq : localSeqs) { + buffer.putLong(localSeq); + } + final short defaultNodeId = computeDefaultNodeId(); + final short defaultWriterEpoch = computeDefaultWriterEpoch(); + final List overrideIndexes = new ArrayList<>(); + final List overrideNodeIds = new ArrayList<>(); + final List overrideWriterEpochs = new ArrayList<>(); + for (int i = 0; i < buffersSize.size(); i++) { + final short nodeId = nodeIds.get(i); + final short writerEpoch = writerEpochs.get(i); + if (nodeId != defaultNodeId || writerEpoch != defaultWriterEpoch) { + overrideIndexes.add(i); + 
overrideNodeIds.add(nodeId); + overrideWriterEpochs.add(writerEpoch); + } + } + buffer.putShort(defaultNodeId); + buffer.putShort(defaultWriterEpoch); + buffer.putInt(overrideIndexes.size()); + for (int overrideIndex : overrideIndexes) { + buffer.putInt(overrideIndex); + } + for (short nodeId : overrideNodeIds) { + buffer.putShort(nodeId); + } + for (short writerEpoch : overrideWriterEpochs) { + buffer.putShort(writerEpoch); + } + } } public static WALMetaData deserialize(ByteBuffer buffer) { + return deserialize(buffer, WALFileVersion.V2); + } + + public static WALMetaData deserialize(ByteBuffer buffer, WALFileVersion version) { long firstSearchIndex = buffer.getLong(); int entriesNum = buffer.getInt(); List buffersSize = new ArrayList<>(entriesNum); @@ -109,13 +237,56 @@ public static WALMetaData deserialize(ByteBuffer buffer) { buffersSize.add(buffer.getInt()); } Set memTablesId = new HashSet<>(); - if (buffer.hasRemaining()) { + final boolean serializedEmptyV3WithoutMemTableCount = + version == WALFileVersion.V3 + && entriesNum == 0 + && buffer.remaining() == V3_EMPTY_METADATA_REMAINING_WITHOUT_MEMTABLE_COUNT; + if (buffer.hasRemaining() && !serializedEmptyV3WithoutMemTableCount) { int memTablesIdNum = buffer.getInt(); for (int i = 0; i < memTablesIdNum; ++i) { memTablesId.add(buffer.getLong()); } } - return new WALMetaData(firstSearchIndex, buffersSize, memTablesId); + WALMetaData result = new WALMetaData(firstSearchIndex, buffersSize, memTablesId); + // V3 extension: file-level timestamp range + per-entry writer progress metadata + if (version == WALFileVersion.V3 && buffer.hasRemaining()) { + result.minDataTs = buffer.getLong(); + result.maxDataTs = buffer.getLong(); + if (buffer.remaining() >= entriesNum * Long.BYTES * 2 + Short.BYTES * 2 + Integer.BYTES) { + for (int i = 0; i < entriesNum; i++) { + result.physicalTimes.add(buffer.getLong()); + } + for (int i = 0; i < entriesNum; i++) { + result.localSeqs.add(buffer.getLong()); + } + final short 
defaultNodeId = buffer.getShort(); + final short defaultWriterEpoch = buffer.getShort(); + final int overrideCount = buffer.getInt(); + final int[] overrideIndexes = new int[overrideCount]; + final short[] overrideNodeIds = new short[overrideCount]; + final short[] overrideWriterEpochs = new short[overrideCount]; + for (int i = 0; i < overrideCount; i++) { + overrideIndexes[i] = buffer.getInt(); + } + for (int i = 0; i < overrideCount; i++) { + overrideNodeIds[i] = buffer.getShort(); + } + for (int i = 0; i < overrideCount; i++) { + overrideWriterEpochs[i] = buffer.getShort(); + } + for (int i = 0; i < entriesNum; i++) { + result.nodeIds.add(defaultNodeId); + result.writerEpochs.add(defaultWriterEpoch); + } + for (int i = 0; i < overrideCount; i++) { + result.nodeIds.set(overrideIndexes[i], overrideNodeIds[i]); + result.writerEpochs.set(overrideIndexes[i], overrideWriterEpochs[i]); + } + } else { + result.rebuildWriterMetadataWithDefaults(); + } + } + return result; } public List getBuffersSize() { @@ -130,6 +301,106 @@ public long getFirstSearchIndex() { return firstSearchIndex; } + public List getPhysicalTimes() { + return physicalTimes; + } + + public List getNodeIds() { + return nodeIds; + } + + public List getWriterEpochs() { + return writerEpochs; + } + + public List getLocalSeqs() { + return localSeqs; + } + + private short computeDefaultNodeId() { + return unpackNodeId(computeDefaultWriterIdentity()); + } + + private short computeDefaultWriterEpoch() { + return unpackWriterEpoch(computeDefaultWriterIdentity()); + } + + private int getWriterOverrideCount() { + final short defaultNodeId = computeDefaultNodeId(); + final short defaultWriterEpoch = computeDefaultWriterEpoch(); + int count = 0; + for (int i = 0; i < buffersSize.size(); i++) { + if (nodeIds.get(i) != defaultNodeId || writerEpochs.get(i) != defaultWriterEpoch) { + count++; + } + } + return count; + } + + private int computeDefaultWriterIdentity() { + if (nodeIds.isEmpty()) { + return 
packWriterIdentity(DEFAULT_NODE_ID, DEFAULT_WRITER_EPOCH); + } + final Map counts = new HashMap<>(); + int bestIdentity = packWriterIdentity(nodeIds.get(0), writerEpochs.get(0)); + int bestCount = 0; + for (int i = 0; i < nodeIds.size(); i++) { + final int identity = packWriterIdentity(nodeIds.get(i), writerEpochs.get(i)); + final int count = counts.merge(identity, 1, Integer::sum); + if (count > bestCount) { + bestCount = count; + bestIdentity = identity; + } + } + return bestIdentity; + } + + private static int packWriterIdentity(short nodeId, short writerEpoch) { + return ((nodeId & 0xFFFF) << 16) | (writerEpoch & 0xFFFF); + } + + private static short unpackNodeId(int identity) { + return (short) (identity >>> 16); + } + + private static short unpackWriterEpoch(int identity) { + return (short) identity; + } + + public WALMetaData copy() { + WALMetaData copy = + new WALMetaData(firstSearchIndex, new ArrayList<>(buffersSize), new HashSet<>(memTablesId)); + copy.truncateOffSet = truncateOffSet; + copy.physicalTimes.addAll(physicalTimes); + copy.nodeIds.addAll(nodeIds); + copy.writerEpochs.addAll(writerEpochs); + copy.localSeqs.addAll(localSeqs); + copy.minDataTs = minDataTs; + copy.maxDataTs = maxDataTs; + return copy; + } + + public long getMinDataTs() { + return minDataTs; + } + + public long getMaxDataTs() { + return maxDataTs; + } + + private void rebuildWriterMetadataWithDefaults() { + physicalTimes.clear(); + nodeIds.clear(); + writerEpochs.clear(); + localSeqs.clear(); + for (int i = 0; i < buffersSize.size(); i++) { + physicalTimes.add(0L); + nodeIds.add(DEFAULT_NODE_ID); + writerEpochs.add(DEFAULT_WRITER_EPOCH); + localSeqs.add(firstSearchIndex + i); + } + } + public static WALMetaData readFromWALFile(File logFile, FileChannel channel) throws IOException { if (channel.size() < WALFileVersion.V2.getVersionBytes().length || !isValidMagicString(channel)) { @@ -150,7 +421,7 @@ public static WALMetaData readFromWALFile(File logFile, FileChannel channel) thr 
ByteBuffer metadataBuf = ByteBuffer.allocate(metadataSize); channel.read(metadataBuf, position - metadataSize); metadataBuf.flip(); - metaData = WALMetaData.deserialize(metadataBuf); + metaData = WALMetaData.deserialize(metadataBuf, version); // versions before V1.3, should recover memTable ids from entries if (metaData.memTablesId.isEmpty()) { int offset = Byte.BYTES; @@ -174,11 +445,16 @@ public static WALMetaData readFromWALFile(File logFile, FileChannel channel) thr } private static boolean isValidMagicString(FileChannel channel) throws IOException { - ByteBuffer magicStringBytes = ByteBuffer.allocate(WALFileVersion.V2.getVersionBytes().length); - channel.read(magicStringBytes, channel.size() - WALFileVersion.V2.getVersionBytes().length); + // V3 magic string is the longest; read enough bytes to check all versions + int maxMagicLen = + Math.max( + WALFileVersion.V3.getVersionBytes().length, WALFileVersion.V2.getVersionBytes().length); + ByteBuffer magicStringBytes = ByteBuffer.allocate(maxMagicLen); + channel.read(magicStringBytes, channel.size() - maxMagicLen); magicStringBytes.flip(); String magicString = new String(magicStringBytes.array(), StandardCharsets.UTF_8); - return magicString.equals(WALFileVersion.V2.getVersionString()) + return magicString.contains(WALFileVersion.V3.getVersionString()) + || magicString.contains(WALFileVersion.V2.getVersionString()) || magicString.contains(WALFileVersion.V1.getVersionString()); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALWriter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALWriter.java index 6f13040bec8b4..10d164f3851cd 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALWriter.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALWriter.java @@ -34,11 +34,11 @@ public class WALWriter extends LogWriter { private 
WALFileStatus walFileStatus = WALFileStatus.CONTAINS_NONE_SEARCH_INDEX; // wal files' metadata protected final WALMetaData metaData = new WALMetaData(); - // By default is V2 - private WALFileVersion version = WALFileVersion.V2; + // By default is V3 for writer-progress metadata support. + private WALFileVersion version = WALFileVersion.V3; public WALWriter(File logFile) throws IOException { - this(logFile, WALFileVersion.V2); + this(logFile, WALFileVersion.V3); } public WALWriter(File logFile, WALFileVersion version) throws IOException { @@ -58,12 +58,16 @@ public double write(ByteBuffer buffer, WALMetaData metaData) throws IOException return write(buffer); } - public void updateMetaData(WALMetaData metaData) { + public synchronized void updateMetaData(WALMetaData metaData) { this.metaData.addAll(metaData); } - private void endFile() throws IOException { - if (logFile.length() == WALFileVersion.V2.getVersionBytes().length) { + public synchronized WALMetaData snapshotMetaData() { + return metaData.copy(); + } + + private synchronized void endFile() throws IOException { + if (logFile.length() == version.getVersionBytes().length) { super.close(); return; } @@ -72,12 +76,12 @@ private void endFile() throws IOException { // mark info part ends endMarker.serialize(markerBuffer); write(markerBuffer, false); - int metaDataSize = metaData.serializedSize(); + int metaDataSize = metaData.serializedSize(version); ByteBuffer buffer = ByteBuffer.allocate(metaDataSize + Integer.BYTES + version.getVersionBytes().length); - // flush meta data - metaData.serialize(buffer); + // flush meta data with version-aware serialization + metaData.serialize(buffer, version); buffer.putInt(metaDataSize); // add magic string buffer.put(version.getVersionBytes()); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALFakeNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALFakeNode.java index 
e35d5e79fc019..0002aad9748bd 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALFakeNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALFakeNode.java @@ -143,6 +143,36 @@ public long getTotalSize() { return 0; } + @Override + public long getRegionDiskUsage() { + return 0; + } + + @Override + public long getSearchIndexToFreeAtLeast(long bytesToFree) { + return bytesToFree > 0 ? Long.MAX_VALUE : DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + + @Override + public void setSubscriptionRetainedMinVersionId(long minVersionId) { + // do nothing + } + + @Override + public long getVersionIdToFreeAtLeast(long bytesToFree) { + return bytesToFree > 0 ? Long.MAX_VALUE : 0; + } + + @Override + public long getSearchIndexToFreeBeforeTimestamp(long cutoffTimeMs) { + return Long.MIN_VALUE + 1; + } + + @Override + public long getVersionIdToFreeBeforeTimestamp(long cutoffTimeMs) { + return 0; + } + public static WALFakeNode getFailureInstance(Exception e) { return new WALFakeNode( Status.FAILURE, new WALException("Cannot write wal into a fake node. 
", e)); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNode.java index 07dd4d78f6605..c0f0445f9cbed 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNode.java @@ -52,6 +52,7 @@ import org.apache.iotdb.db.storageengine.dataregion.wal.checkpoint.CheckpointType; import org.apache.iotdb.db.storageengine.dataregion.wal.checkpoint.MemTableInfo; import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALByteBufReader; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileStatus; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.listener.AbstractResultListener; @@ -82,6 +83,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; @@ -112,6 +114,8 @@ public class WALNode implements IWALNode { private final Map memTableSnapshotCount = new ConcurrentHashMap<>(); // insert nodes whose search index are before this value can be deleted safely private volatile long safelyDeletedSearchIndex = DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + // WAL files with versionId >= this value are retained for subscription consumers + private volatile long subscriptionRetainedMinVersionId = Long.MAX_VALUE; private volatile boolean deleted = false; @@ -572,6 +576,7 @@ public boolean isContainsActiveOrPinnedMemTable(Long versionId) { private boolean canDeleteFile(long fileArrIdx, WALFileStatus 
walFileStatus, long versionId) { return (fileArrIdx < fileIndexAfterFilterSafelyDeleteIndex || walFileStatus == WALFileStatus.CONTAINS_NONE_SEARCH_INDEX) + && versionId < subscriptionRetainedMinVersionId && !isContainsActiveOrPinnedMemTable(versionId); } } @@ -584,6 +589,11 @@ public void setSafelyDeletedSearchIndex(long safelyDeletedSearchIndex) { this.safelyDeletedSearchIndex = safelyDeletedSearchIndex; } + @Override + public void setSubscriptionRetainedMinVersionId(long minVersionId) { + this.subscriptionRetainedMinVersionId = minVersionId; + } + /** This iterator is not concurrency-safe, cannot read the current-writing wal file. */ @Override public ReqIterator getReqIterator(long startIndex) { @@ -654,6 +664,11 @@ public boolean hasNext() { AtomicReference> tmpNodes = new AtomicReference<>(new ArrayList<>()); AtomicBoolean notFirstFile = new AtomicBoolean(false); AtomicBoolean hasCollectedSufficientData = new AtomicBoolean(false); + // V3: track writer progress metadata for current entry group + AtomicLong currentEntryLocalSeq = new AtomicLong(-1); + AtomicLong currentEntryPhysicalTime = new AtomicLong(0); + AtomicLong currentEntryWriterEpoch = new AtomicLong(0); + AtomicLong currentEntryNodeId = new AtomicLong(-1); long memorySize = 0; @@ -662,7 +677,15 @@ public boolean hasNext() { Runnable tryToCollectInsertNodeAndBumpIndex = () -> { if (!tmpNodes.get().isEmpty()) { - insertNodes.add(new IndexedConsensusRequest(nextSearchIndex, tmpNodes.get())); + long localSeq = currentEntryLocalSeq.get(); + IndexedConsensusRequest req = + (localSeq >= 0) + ? 
new IndexedConsensusRequest(nextSearchIndex, localSeq, tmpNodes.get()) + : new IndexedConsensusRequest(nextSearchIndex, tmpNodes.get()); + req.setPhysicalTime(currentEntryPhysicalTime.get()) + .setNodeId((int) currentEntryNodeId.get()) + .setWriterEpoch(currentEntryWriterEpoch.get()); + insertNodes.add(req); tmpNodes.set(new ArrayList<>()); nextSearchIndex++; if (notFirstFile.get()) { @@ -695,6 +718,10 @@ public boolean hasNext() { } else if (currentWalEntryIndex < nextSearchIndex) { // WAL entry is outdated, do nothing, continue to see next WAL entry } else if (currentWalEntryIndex == nextSearchIndex) { + currentEntryLocalSeq.set(walByteBufReader.getCurrentEntryLocalSeq()); + currentEntryPhysicalTime.set(walByteBufReader.getCurrentEntryPhysicalTime()); + currentEntryWriterEpoch.set(walByteBufReader.getCurrentEntryWriterEpoch()); + currentEntryNodeId.set(walByteBufReader.getCurrentEntryNodeId()); if (type == WALEntryType.OBJECT_FILE_NODE) { WALEntry walEntry = WALEntry.deserialize( @@ -723,6 +750,10 @@ public boolean hasNext() { currentWalEntryIndex); nextSearchIndex = currentWalEntryIndex; } + currentEntryLocalSeq.set(walByteBufReader.getCurrentEntryLocalSeq()); + currentEntryPhysicalTime.set(walByteBufReader.getCurrentEntryPhysicalTime()); + currentEntryWriterEpoch.set(walByteBufReader.getCurrentEntryWriterEpoch()); + currentEntryNodeId.set(walByteBufReader.getCurrentEntryNodeId()); if (type == WALEntryType.OBJECT_FILE_NODE) { WALEntry walEntry = WALEntry.deserialize( @@ -898,11 +929,117 @@ public long getCurrentWALFileVersion() { return buffer.getCurrentWALFileVersion(); } + public WALMetaData getCurrentWALMetaDataSnapshot() { + return buffer.getCurrentWALMetaDataSnapshot(); + } + @Override public long getTotalSize() { return WALManager.getInstance().getTotalDiskUsage(); } + @Override + public long getRegionDiskUsage() { + return buffer.getDiskUsage(); + } + + @Override + public long getSearchIndexToFreeAtLeast(long bytesToFree) { + if (bytesToFree <= 0) { + 
return DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + File[] walFiles = WALFileUtils.listAllWALFiles(logDirectory); + if (walFiles == null || walFiles.length <= 1) { + // No files or only the current-writing file — cannot free anything + return DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + WALFileUtils.ascSortByVersionId(walFiles); + // Exclude the last file (currently being written) + long accumulated = 0; + for (int i = 0; i < walFiles.length - 1; i++) { + accumulated += walFiles[i].length(); + if (accumulated >= bytesToFree) { + // The next file's startSearchIndex is the boundary: everything before it can be deleted + if (i + 1 < walFiles.length) { + return WALFileUtils.parseStartSearchIndex(walFiles[i + 1].getName()); + } + break; + } + } + // Could not free enough even by deleting all non-current files — allow deleting all + return Long.MAX_VALUE; + } + + @Override + public long getVersionIdToFreeAtLeast(long bytesToFree) { + if (bytesToFree <= 0) { + return 0; + } + File[] walFiles = WALFileUtils.listAllWALFiles(logDirectory); + if (walFiles == null || walFiles.length <= 1) { + return 0; + } + WALFileUtils.ascSortByVersionId(walFiles); + long accumulated = 0; + for (int i = 0; i < walFiles.length - 1; i++) { + accumulated += walFiles[i].length(); + if (accumulated >= bytesToFree) { + // Return the versionId of the next file — files before it can be freed + if (i + 1 < walFiles.length) { + return WALFileUtils.parseVersionId(walFiles[i + 1].getName()); + } + break; + } + } + return Long.MAX_VALUE; + } + + @Override + public long getSearchIndexToFreeBeforeTimestamp(long cutoffTimeMs) { + File[] walFiles = WALFileUtils.listAllWALFiles(logDirectory); + if (walFiles == null || walFiles.length <= 1) { + return Long.MIN_VALUE + 1; + } + WALFileUtils.ascSortByVersionId(walFiles); + int expiredPrefixLength = countExpiredRolledWalFiles(walFiles, cutoffTimeMs); + if (expiredPrefixLength == 0) { + return Long.MIN_VALUE + 1; + } + if (expiredPrefixLength >= walFiles.length - 1) { 
+ return Long.MAX_VALUE; + } + return WALFileUtils.parseStartSearchIndex(walFiles[expiredPrefixLength].getName()); + } + + @Override + public long getVersionIdToFreeBeforeTimestamp(long cutoffTimeMs) { + File[] walFiles = WALFileUtils.listAllWALFiles(logDirectory); + if (walFiles == null || walFiles.length <= 1) { + return 0; + } + WALFileUtils.ascSortByVersionId(walFiles); + int expiredPrefixLength = countExpiredRolledWalFiles(walFiles, cutoffTimeMs); + if (expiredPrefixLength == 0) { + return 0; + } + if (expiredPrefixLength >= walFiles.length - 1) { + return Long.MAX_VALUE; + } + return WALFileUtils.parseVersionId(walFiles[expiredPrefixLength].getName()); + } + + private int countExpiredRolledWalFiles(File[] walFiles, long cutoffTimeMs) { + int expiredPrefixLength = 0; + for (int i = 0; i < walFiles.length - 1; i++) { + if (walFiles[i].lastModified() < cutoffTimeMs) { + expiredPrefixLength++; + } else { + break; + } + } + return expiredPrefixLength; + } + // endregion @Override diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtils.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtils.java index 117f06c764440..db55b092dae6b 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtils.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtils.java @@ -19,8 +19,18 @@ package org.apache.iotdb.db.storageengine.dataregion.wal.utils; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntryType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALInfoEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.ProgressWALReader; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.File; import 
java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Path; import java.util.Arrays; import java.util.Comparator; @@ -35,6 +45,11 @@ import static org.apache.iotdb.commons.conf.IoTDBConstant.WAL_VERSION_ID; public class WALFileUtils { + + private static final Logger logger = LoggerFactory.getLogger(WALFileUtils.class); + private static final int SEARCH_INDEX_OFFSET = + WALInfoEntry.FIXED_SERIALIZED_SIZE + PlanNodeType.BYTES; + /** * versionId is a self-incremented id number, helping to maintain the order of wal files. * startSearchIndex is the valid search index of last flushed wal entry. statusCode is the. For @@ -182,4 +197,267 @@ public static String getTsFileRelativePath(String absolutePath) { Path path = new File(absolutePath).toPath(); return path.subpath(path.getNameCount() - 5, path.getNameCount()).toString(); } + + /** + * Locate the first local searchIndex whose writer progress is equal to or strictly greater than + * the given writer-local frontier. This is currently used by single-writer recovery paths, so it + * matches only entries from the supplied (nodeId, writerEpoch) pair. 
+ * + * @return [targetSearchIndex, exactMatchFlag], or null if no matching/later entry exists + */ + public static long[] locateByWriterProgress( + final File logDir, + final int nodeId, + final long writerEpoch, + final long physicalTime, + final long localSeq) { + final long[] exactSearchIndex = new long[] {-1L}; + final long[] firstAfterSearchIndex = new long[] {-1L}; + final long[] firstAfterPhysicalTime = new long[] {Long.MAX_VALUE}; + final long[] firstAfterLocalSeq = new long[] {Long.MAX_VALUE}; + + forEachSealedSearchableRequest( + logDir, + request -> { + if (request.nodeId != nodeId || request.writerEpoch != writerEpoch) { + return true; + } + final int cmp = + compareWriterProgress( + request.physicalTime, + request.nodeId, + request.localSeq, + physicalTime, + nodeId, + localSeq); + if (cmp == 0) { + exactSearchIndex[0] = request.searchIndex; + return false; + } + if (cmp > 0 + && (firstAfterSearchIndex[0] < 0L + || compareWriterProgress( + request.physicalTime, + request.nodeId, + request.localSeq, + firstAfterPhysicalTime[0], + nodeId, + firstAfterLocalSeq[0]) + < 0)) { + firstAfterSearchIndex[0] = request.searchIndex; + firstAfterPhysicalTime[0] = request.physicalTime; + firstAfterLocalSeq[0] = request.localSeq; + } + return true; + }); + + if (exactSearchIndex[0] >= 0L) { + return new long[] {exactSearchIndex[0], 1L}; + } + if (firstAfterSearchIndex[0] >= 0L) { + return new long[] {firstAfterSearchIndex[0], 0L}; + } + return null; + } + + public static long findSearchIndexByWriterProgress( + final File logDir, + final int nodeId, + final long writerEpoch, + final long physicalTime, + final long localSeq) { + final long[] located = + locateByWriterProgress(logDir, nodeId, writerEpoch, physicalTime, localSeq); + return located != null && located[1] == 1L ? 
located[0] : -1L; + } + + public static long findSearchIndexAfterWriterProgress( + final File logDir, + final int nodeId, + final long writerEpoch, + final long physicalTime, + final long localSeq) { + final long[] bestSearchIndex = new long[] {-1L}; + final long[] bestPhysicalTime = new long[] {Long.MAX_VALUE}; + final long[] bestLocalSeq = new long[] {Long.MAX_VALUE}; + forEachSealedSearchableRequest( + logDir, + request -> { + if (request.nodeId != nodeId || request.writerEpoch != writerEpoch) { + return true; + } + if (compareWriterProgress( + request.physicalTime, + request.nodeId, + request.localSeq, + physicalTime, + nodeId, + localSeq) + <= 0) { + return true; + } + if (bestSearchIndex[0] < 0L + || compareWriterProgress( + request.physicalTime, + request.nodeId, + request.localSeq, + bestPhysicalTime[0], + nodeId, + bestLocalSeq[0]) + < 0) { + bestSearchIndex[0] = request.searchIndex; + bestPhysicalTime[0] = request.physicalTime; + bestLocalSeq[0] = request.localSeq; + } + return true; + }); + return bestSearchIndex[0]; + } + + private interface SearchableRequestVisitor { + boolean onRequest(SearchableRequestMeta request); + } + + private static final class SearchableRequestMeta { + private final long searchIndex; + private final long physicalTime; + private final int nodeId; + private final long writerEpoch; + private final long localSeq; + + private SearchableRequestMeta( + final long searchIndex, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final long localSeq) { + this.searchIndex = searchIndex; + this.physicalTime = physicalTime; + this.nodeId = nodeId; + this.writerEpoch = writerEpoch; + this.localSeq = localSeq; + } + } + + private static void forEachSealedSearchableRequest( + final File logDir, final SearchableRequestVisitor visitor) { + final File[] walFiles = listSealedWALFiles(logDir); + if (walFiles == null || walFiles.length == 0) { + return; + } + + for (final File walFile : walFiles) { + try (final 
ProgressWALReader reader = new ProgressWALReader(walFile)) { + long pendingSearchIndex = Long.MIN_VALUE; + long pendingPhysicalTime = 0L; + int pendingNodeId = -1; + long pendingWriterEpoch = 0L; + long pendingLocalSeq = Long.MIN_VALUE; + boolean hasPending = false; + + while (reader.hasNext()) { + final ByteBuffer buffer = reader.next(); + final WALEntryType type = WALEntryType.valueOf(buffer.get()); + buffer.clear(); + if (!type.needSearch()) { + continue; + } + + final long currentLocalSeq = reader.getCurrentEntryLocalSeq(); + final long currentPhysicalTime = reader.getCurrentEntryPhysicalTime(); + final int currentNodeId = reader.getCurrentEntryNodeId(); + final long currentWriterEpoch = reader.getCurrentEntryWriterEpoch(); + + buffer.position(SEARCH_INDEX_OFFSET); + final long bodySearchIndex = buffer.getLong(); + buffer.clear(); + final long currentSearchIndex = bodySearchIndex >= 0 ? bodySearchIndex : currentLocalSeq; + + if (hasPending + && pendingLocalSeq == currentLocalSeq + && pendingNodeId == currentNodeId + && pendingWriterEpoch == currentWriterEpoch) { + if (pendingSearchIndex < 0 && currentSearchIndex >= 0) { + pendingSearchIndex = currentSearchIndex; + } + continue; + } + + if (hasPending + && !visitor.onRequest( + new SearchableRequestMeta( + pendingSearchIndex >= 0 ? pendingSearchIndex : pendingLocalSeq, + pendingPhysicalTime, + pendingNodeId, + pendingWriterEpoch, + pendingLocalSeq))) { + return; + } + + hasPending = true; + pendingSearchIndex = currentSearchIndex; + pendingPhysicalTime = currentPhysicalTime; + pendingNodeId = currentNodeId; + pendingWriterEpoch = currentWriterEpoch; + pendingLocalSeq = currentLocalSeq; + } + + if (hasPending + && !visitor.onRequest( + new SearchableRequestMeta( + pendingSearchIndex >= 0 ? 
pendingSearchIndex : pendingLocalSeq, + pendingPhysicalTime, + pendingNodeId, + pendingWriterEpoch, + pendingLocalSeq))) { + return; + } + } catch (final IOException e) { + logger.warn("Failed to scan WAL file {} for searchable request metadata", walFile, e); + } + } + } + + private static int compareCompatibleProgress( + final long leftPhysicalTime, + final int leftNodeId, + final long leftLocalSeq, + final long rightPhysicalTime, + final long rightLocalSeq) { + if (leftPhysicalTime != rightPhysicalTime) { + return Long.compare(leftPhysicalTime, rightPhysicalTime); + } + if (leftLocalSeq != rightLocalSeq) { + return Long.compare(leftLocalSeq, rightLocalSeq); + } + return 0; + } + + private static int compareWriterProgress( + final long leftPhysicalTime, + final int leftNodeId, + final long leftLocalSeq, + final long rightPhysicalTime, + final int rightNodeId, + final long rightLocalSeq) { + if (leftPhysicalTime != rightPhysicalTime) { + return Long.compare(leftPhysicalTime, rightPhysicalTime); + } + if (leftNodeId != rightNodeId) { + return Integer.compare(leftNodeId, rightNodeId); + } + return Long.compare(leftLocalSeq, rightLocalSeq); + } + + private static File[] listSealedWALFiles(final File logDir) { + final File[] walFiles = listAllWALFiles(logDir); + if (walFiles == null || walFiles.length == 0) { + return walFiles; + } + ascSortByVersionId(walFiles); + if (walFiles.length == 1) { + return new File[0]; + } + return Arrays.copyOf(walFiles, walFiles.length - 1); + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java index 00007f921b260..33918e96cdc6c 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java @@ -19,17 +19,37 @@ 
package org.apache.iotdb.db.subscription.agent; +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.consensus.ConsensusFactory; +import org.apache.iotdb.consensus.IConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.SubscriptionWalRetentionPolicy; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.consensus.DataRegionConsensusImpl; +import org.apache.iotdb.db.subscription.broker.ConsensusSubscriptionBroker; import org.apache.iotdb.db.subscription.broker.SubscriptionBroker; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusRegionRuntimeState; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.db.subscription.event.SubscriptionEvent; import org.apache.iotdb.db.subscription.resource.SubscriptionDataNodeResourceManager; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutorManager; import org.apache.iotdb.db.subscription.task.subtask.SubscriptionSinkSubtask; import org.apache.iotdb.rpc.subscription.config.ConsumerConfig; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import 
java.util.Map; @@ -43,7 +63,12 @@ public class SubscriptionBrokerAgent { private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBrokerAgent.class); - private final Map consumerGroupIdToSubscriptionBroker = + /** Pipe-based subscription brokers, one per consumer group. */ + private final Map consumerGroupIdToPipeBroker = + new ConcurrentHashMap<>(); + + /** Consensus-based subscription brokers, one per consumer group. */ + private final Map consumerGroupIdToConsensusBroker = new ConcurrentHashMap<>(); private final Cache prefetchingQueueCount = @@ -53,18 +78,73 @@ public class SubscriptionBrokerAgent { public List poll( final ConsumerConfig consumerConfig, final Set topicNames, final long maxBytes) { + return poll(consumerConfig, topicNames, maxBytes, Collections.emptyMap()); + } + + public List poll( + final ConsumerConfig consumerConfig, + final Set topicNames, + final long maxBytes, + final Map progressByTopic) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final List unsupportedConsensusTopics = getUnsupportedConsensusTopics(topicNames); + if (!unsupportedConsensusTopics.isEmpty()) { final String errorMessage = - String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + buildUnsupportedConsensusRuntimeMessage( + consumerGroupId, unsupportedConsensusTopics, "poll"); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - // TODO: currently we fetch messages from all topics - final String consumerId = consumerConfig.getConsumerId(); - return broker.poll(consumerId, topicNames, maxBytes); + + final List allEvents = new ArrayList<>(); + long remainingBytes = maxBytes; + + // Poll from pipe-based broker + final SubscriptionBroker pipeBroker = 
consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.nonNull(pipeBroker)) { + final List pipeEvents = + pipeBroker.poll(consumerId, topicNames, remainingBytes); + allEvents.addAll(pipeEvents); + for (final SubscriptionEvent event : pipeEvents) { + try { + remainingBytes -= event.getCurrentResponseSize(); + } catch (final IOException ignored) { + // best effort + } + } + } + + // Poll from consensus-based broker + if (remainingBytes > 0) { + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker)) { + LOGGER.debug( + "SubscriptionBrokerAgent: polling consensus broker for consumer group [{}], " + + "topicNames={}, remainingBytes={}", + consumerGroupId, + topicNames, + remainingBytes); + allEvents.addAll( + consensusBroker.poll(consumerId, topicNames, remainingBytes, progressByTopic)); + } else { + LOGGER.debug( + "SubscriptionBrokerAgent: no consensus broker for consumer group [{}]", + consumerGroupId); + } + } + + if (allEvents.isEmpty() + && Objects.isNull(pipeBroker) + && Objects.isNull(consumerGroupIdToConsensusBroker.get(consumerGroupId))) { + final String errorMessage = + String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + return allEvents; } public List pollTsFile( @@ -72,16 +152,18 @@ public List pollTsFile( final SubscriptionCommitContext commitContext, final long writingOffset) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // TsFile polling can only be called by pipe-based subscriptions + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { final String errorMessage = String.format( - "Subscription: broker bound to 
consumer group [%s] does not exist", consumerGroupId); + "Subscription: pipe broker bound to consumer group [%s] does not exist", + consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } final String consumerId = consumerConfig.getConsumerId(); - return broker.pollTsFile(consumerId, commitContext, writingOffset); + return pipeBroker.pollTsFile(consumerId, commitContext, writingOffset); } public List pollTablets( @@ -89,16 +171,26 @@ public List pollTablets( final SubscriptionCommitContext commitContext, final int offset) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final String topicName = commitContext.getTopicName(); + + // Try consensus-based broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.pollTablets(consumerId, commitContext, offset); + } + + // Fall back to pipe-based broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { final String errorMessage = String.format( "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - final String consumerId = consumerConfig.getConsumerId(); - return broker.pollTablets(consumerId, commitContext, offset); + return pipeBroker.pollTablets(consumerId, commitContext, offset); } /** @@ -109,46 +201,257 @@ public List commit( final List commitContexts, final boolean nack) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = 
consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final List allSuccessful = new ArrayList<>(); + + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + + if (Objects.isNull(pipeBroker) && Objects.isNull(consensusBroker)) { + final String errorMessage = + String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + // Partition commit contexts by which broker owns the topic. + final List pipeContexts = new ArrayList<>(); + final List consensusContexts = new ArrayList<>(); + for (final SubscriptionCommitContext ctx : commitContexts) { + final String topicName = ctx.getTopicName(); + if (Objects.nonNull(consensusBroker) + && ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { + consensusContexts.add(ctx); + } else { + pipeContexts.add(ctx); + } + } + + if (Objects.nonNull(pipeBroker) && !pipeContexts.isEmpty()) { + allSuccessful.addAll(pipeBroker.commit(consumerId, pipeContexts, nack)); + } + if (Objects.nonNull(consensusBroker) && !consensusContexts.isEmpty()) { + allSuccessful.addAll(consensusBroker.commit(consumerId, consensusContexts, nack)); + } + + return allSuccessful; + } + + public void seek( + final ConsumerConfig consumerConfig, final String topicName, final short seekType) { + final String consumerGroupId = consumerConfig.getConsumerGroupId(); + + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + ensureConsensusSeekRuntimeAvailable(consumerGroupId, topicName, "seek"); + if (seekType != PipeSubscribeSeekReq.SEEK_TO_BEGINNING + && seekType != 
PipeSubscribeSeekReq.SEEK_TO_END) { + final String errorMessage = + String.format( + "Subscription: consensus seek only supports beginning/end or topic progress, " + + "consumerGroup=%s, topic=%s, seekType=%s", + consumerGroupId, topicName, seekType); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + consensusBroker.seek(topicName, seekType); + return; + } + + if (isConsensusRuntimeUnsupported(topicName)) { + final String errorMessage = + buildUnsupportedConsensusRuntimeMessage(consumerGroupId, topicName, "seek"); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + final String errorMessage = + String.format( + "Subscription: seek is only supported for consensus-based subscriptions, " + + "consumerGroup=%s, topic=%s", + consumerGroupId, topicName); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + public void seekToTopicProgress( + final ConsumerConfig consumerConfig, + final String topicName, + final TopicProgress topicProgress) { + final String consumerGroupId = consumerConfig.getConsumerGroupId(); + + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + ensureConsensusSeekRuntimeAvailable(consumerGroupId, topicName, "seek(topicProgress)"); + consensusBroker.seek(topicName, topicProgress); + return; + } + + if (isConsensusRuntimeUnsupported(topicName)) { + final String errorMessage = + buildUnsupportedConsensusRuntimeMessage( + consumerGroupId, topicName, "seek(topicProgress)"); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + final String errorMessage = + String.format( + "Subscription: seek(topicProgress) is only supported for consensus-based subscriptions, " + + "consumerGroup=%s, topic=%s", + consumerGroupId, topicName); + LOGGER.warn(errorMessage); + throw new 
SubscriptionException(errorMessage); + } + + public void seekAfterTopicProgress( + final ConsumerConfig consumerConfig, + final String topicName, + final TopicProgress topicProgress) { + final String consumerGroupId = consumerConfig.getConsumerGroupId(); + + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + ensureConsensusSeekRuntimeAvailable(consumerGroupId, topicName, "seekAfter(topicProgress)"); + consensusBroker.seekAfter(topicName, topicProgress); + return; + } + + if (isConsensusRuntimeUnsupported(topicName)) { + final String errorMessage = + buildUnsupportedConsensusRuntimeMessage( + consumerGroupId, topicName, "seekAfter(topicProgress)"); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + final String errorMessage = + String.format( + "Subscription: seekAfter(topicProgress) is only supported for consensus-based subscriptions, " + + "consumerGroup=%s, topic=%s", + consumerGroupId, topicName); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + private void ensureConsensusSeekRuntimeAvailable( + final String consumerGroupId, final String topicName, final String operation) { + if (!ConsensusSubscriptionPrefetchExecutorManager.getInstance().isStarted() + || SubscriptionAgent.runtime().isShutdown()) { final String errorMessage = String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + "Subscription: consensus %s is unavailable because subscription runtime is stopped, " + + "consumerGroup=%s, topic=%s", + operation, consumerGroupId, topicName); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - final String consumerId = consumerConfig.getConsumerId(); - return broker.commit(consumerId, commitContexts, nack); + } + + private boolean isConsensusRuntimeUnsupported(final String 
topicName) { + return !(DataRegionConsensusImpl.getInstance() instanceof IoTConsensus) + && ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName); + } + + private List getUnsupportedConsensusTopics(final Set topicNames) { + if (DataRegionConsensusImpl.getInstance() instanceof IoTConsensus) { + return Collections.emptyList(); + } + + final List unsupportedConsensusTopics = new ArrayList<>(); + for (final String topicName : topicNames) { + if (ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { + unsupportedConsensusTopics.add(topicName); + } + } + return unsupportedConsensusTopics; + } + + private String buildUnsupportedConsensusRuntimeMessage( + final String consumerGroupId, final String topicName, final String operation) { + return buildUnsupportedConsensusRuntimeMessage( + consumerGroupId, Collections.singletonList(topicName), operation); + } + + private String buildUnsupportedConsensusRuntimeMessage( + final String consumerGroupId, final List topicNames, final String operation) { + final IConsensus dataRegionConsensus = DataRegionConsensusImpl.getInstance(); + final String configuredProtocol = + IoTDBDescriptor.getInstance().getConfig().getDataRegionConsensusProtocolClass(); + final String runtimeConsensusImplementation = + Objects.nonNull(dataRegionConsensus) ? 
dataRegionConsensus.getClass().getName() : "null"; + return String.format( + "Subscription: cannot %s consensus-based topic(s) %s in consumer group [%s] because " + + "mode=consensus only supports data_region_consensus_protocol_class=%s, but current " + + "configured value is %s (runtime consensus implementation: %s)", + operation, + topicNames, + consumerGroupId, + ConsensusFactory.IOT_CONSENSUS, + configuredProtocol, + runtimeConsensusImplementation); } public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { final String consumerGroupId = commitContext.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String topicName = commitContext.getTopicName(); + + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.isCommitContextOutdated(commitContext); + } + + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { return true; } - return broker.isCommitContextOutdated(commitContext); + return pipeBroker.isCommitContextOutdated(commitContext); } public List fetchTopicNamesToUnsubscribe( final ConsumerConfig consumerConfig, final Set topicNames) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + + // Consensus-based subscription topics are unbounded streams, so they do not trigger + // auto-unsubscribe. 
+ final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + final Set pipeOnlyTopicNames; + if (Objects.nonNull(consensusBroker)) { + pipeOnlyTopicNames = new java.util.HashSet<>(topicNames); + pipeOnlyTopicNames.removeIf(consensusBroker::hasQueue); + } else { + pipeOnlyTopicNames = topicNames; + } + + if (pipeOnlyTopicNames.isEmpty()) { + return Collections.emptyList(); + } + + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { return Collections.emptyList(); } - return broker.fetchTopicNamesToUnsubscribe(topicNames); + return pipeBroker.fetchTopicNamesToUnsubscribe(pipeOnlyTopicNames); } /////////////////////////////// broker /////////////////////////////// public boolean isBrokerExist(final String consumerGroupId) { - return consumerGroupIdToSubscriptionBroker.containsKey(consumerGroupId); + return consumerGroupIdToPipeBroker.containsKey(consumerGroupId) + || consumerGroupIdToConsensusBroker.containsKey(consumerGroupId); } public void createBrokerIfNotExist(final String consumerGroupId) { - consumerGroupIdToSubscriptionBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new); - LOGGER.info("Subscription: create broker bound to consumer group [{}]", consumerGroupId); + consumerGroupIdToPipeBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new); + LOGGER.info("Subscription: create pipe broker bound to consumer group [{}]", consumerGroupId); } /** @@ -156,26 +459,46 @@ public void createBrokerIfNotExist(final String consumerGroupId) { */ public boolean dropBroker(final String consumerGroupId) { final AtomicBoolean dropped = new AtomicBoolean(false); - consumerGroupIdToSubscriptionBroker.compute( + + // Drop pipe broker + consumerGroupIdToPipeBroker.compute( consumerGroupId, (id, broker) -> { if (Objects.isNull(broker)) { + dropped.set(true); + return null; + } + if (!broker.isEmpty()) { LOGGER.warn( - "Subscription: broker 
bound to consumer group [{}] does not exist", + "Subscription: pipe broker bound to consumer group [{}] is not empty when dropping", consumerGroupId); - dropped.set(true); + return broker; + } + dropped.set(true); + LOGGER.info( + "Subscription: drop pipe broker bound to consumer group [{}]", consumerGroupId); + return null; + }); + + // Drop consensus broker + consumerGroupIdToConsensusBroker.compute( + consumerGroupId, + (id, broker) -> { + if (Objects.isNull(broker)) { return null; } if (!broker.isEmpty()) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] is not empty when dropping", + "Subscription: consensus broker bound to consumer group [{}] is not empty when dropping", consumerGroupId); return broker; } dropped.set(true); - LOGGER.info("Subscription: drop broker bound to consumer group [{}]", consumerGroupId); - return null; // remove this entry + LOGGER.info( + "Subscription: drop consensus broker bound to consumer group [{}]", consumerGroupId); + return null; }); + return dropped.get(); } @@ -183,15 +506,14 @@ public boolean dropBroker(final String consumerGroupId) { public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) { final String consumerGroupId = subtask.getConsumerGroupId(); - consumerGroupIdToSubscriptionBroker + consumerGroupIdToPipeBroker .compute( consumerGroupId, (id, broker) -> { if (Objects.isNull(broker)) { LOGGER.info( - "Subscription: broker bound to consumer group [{}] does not exist, create new for binding prefetching queue", + "Subscription: pipe broker bound to consumer group [{}] does not exist, create new for binding prefetching queue", consumerGroupId); - // TODO: consider more robust metadata semantics return new SubscriptionBroker(consumerGroupId); } return broker; @@ -200,41 +522,183 @@ public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) { prefetchingQueueCount.invalidate(); } - public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) { - 
final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); + public void bindConsensusPrefetchingQueue( + final String consumerGroupId, + final String topicName, + final String orderMode, + final ConsensusGroupId consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final SubscriptionWalRetentionPolicy retentionPolicy, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final RegionProgress fallbackCommittedRegionProgress, + final long tailStartSearchIndex, + final long initialRuntimeVersion, + final boolean initialActive) { + consumerGroupIdToConsensusBroker + .compute( + consumerGroupId, + (id, broker) -> { + if (Objects.isNull(broker)) { + LOGGER.info( + "Subscription: consensus broker bound to consumer group [{}] does not exist, create new for binding consensus prefetching queue", + consumerGroupId); + return new ConsensusSubscriptionBroker(consumerGroupId); + } + return broker; + }) + .bindConsensusPrefetchingQueue( + topicName, + orderMode, + consensusGroupId, + serverImpl, + retentionPolicy, + converter, + commitManager, + fallbackCommittedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + prefetchingQueueCount.invalidate(); + } + + public void refreshConsensusQueueOrderMode(final String topicName, final String orderMode) { + LOGGER.info( + "SubscriptionBrokerAgent: refreshing consensus queue order-mode for topic [{}] to [{}]", + topicName, + orderMode); + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + broker.refreshConsensusQueueOrderMode(topicName, orderMode); + } + } + + public void unbindConsensusPrefetchingQueue( + final String consumerGroupId, final String topicName) { + final ConsensusSubscriptionBroker broker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); if (Objects.isNull(broker)) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] does not 
exist", consumerGroupId); + "Subscription: consensus broker bound to consumer group [{}] does not exist", + consumerGroupId); return; } - broker.updateCompletedTopicNames(topicName); + broker.unbindConsensusPrefetchingQueue(topicName); + prefetchingQueueCount.invalidate(); + } + + public void unbindByRegion(final ConsensusGroupId regionId) { + int totalClosed = 0; + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + totalClosed += broker.unbindByRegion(regionId); + } + if (totalClosed > 0) { + prefetchingQueueCount.invalidate(); + LOGGER.info( + "Subscription: unbound {} consensus prefetching queue(s) for removed region [{}]", + totalClosed, + regionId); + } + } + + /** + * Activates or deactivates all consensus prefetching queues bound to {@code regionId} across all + * consumer groups. Called on leader migration to ensure only the preferred writer serves + * subscription data. + */ + public void setActiveForRegion(final ConsensusGroupId regionId, final boolean active) { + LOGGER.info( + "SubscriptionBrokerAgent: setActiveForRegion regionId={}, active={}", regionId, active); + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + broker.setActiveForRegion(regionId, active); + } + } + + public void setActiveWritersForRegion( + final ConsensusGroupId regionId, final Set activeWriterNodeIds) { + LOGGER.info( + "SubscriptionBrokerAgent: setActiveWritersForRegion regionId={}, activeWriterNodeIds={}", + regionId, + activeWriterNodeIds); + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + broker.setActiveWritersForRegion(regionId, activeWriterNodeIds); + } + } + + public void applyRuntimeStateForRegion( + final ConsensusGroupId regionId, final ConsensusRegionRuntimeState runtimeState) { + LOGGER.info( + "SubscriptionBrokerAgent: applyRuntimeStateForRegion regionId={}, runtimeState={}", + regionId, + runtimeState); + for (final 
ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + broker.applyRuntimeStateForRegion(regionId, runtimeState); + } + } + + public void abortConsensusPendingSeeksForRuntimeStop() { + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + broker.abortPendingSeeksForRuntimeStop(); + } + } + + public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) { + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { + LOGGER.warn( + "Subscription: pipe broker bound to consumer group [{}] does not exist", consumerGroupId); + return; + } + pipeBroker.updateCompletedTopicNames(topicName); } public void unbindPrefetchingQueue(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + consensusBroker.unbindConsensusPrefetchingQueue(topicName); + prefetchingQueueCount.invalidate(); + return; + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return; } - broker.unbindPrefetchingQueue(topicName); + pipeBroker.unbindPrefetchingQueue(topicName); prefetchingQueueCount.invalidate(); } public void removePrefetchingQueue(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker + final ConsensusSubscriptionBroker 
consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + consensusBroker.removeQueue(topicName); + prefetchingQueueCount.invalidate(); + return; + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return; } - broker.removePrefetchingQueue(topicName); + pipeBroker.removePrefetchingQueue(topicName); prefetchingQueueCount.invalidate(); } public boolean executePrefetch(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + if (ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { + return false; + } + + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { SubscriptionDataNodeResourceManager.log() .schedule(SubscriptionBrokerAgent.class, consumerGroupId, topicName) .ifPresent( @@ -244,27 +708,81 @@ public boolean executePrefetch(final String consumerGroupId, final String topicN consumerGroupId)); return false; } - return broker.executePrefetch(topicName); + return pipeBroker.executePrefetch(topicName); } public int getPipeEventCount(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.getEventCount(topicName); + } + // Fall back to pipe broker + final 
SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return 0; } - return broker.getPipeEventCount(topicName); + return pipeBroker.getPipeEventCount(topicName); } public int getPrefetchingQueueCount() { return prefetchingQueueCount.get(); } + public Map getConsensusLagSummary() { + final Map result = new ConcurrentHashMap<>(); + for (final Map.Entry entry : + consumerGroupIdToConsensusBroker.entrySet()) { + final String groupId = entry.getKey(); + for (final Map.Entry lag : entry.getValue().getLagSummary().entrySet()) { + result.put(groupId + "/" + lag.getKey(), lag.getValue()); + } + } + return result; + } + private int getPrefetchingQueueCountInternal() { - return consumerGroupIdToSubscriptionBroker.values().stream() - .map(SubscriptionBroker::getPrefetchingQueueCount) - .reduce(0, Integer::sum); + int count = + consumerGroupIdToPipeBroker.values().stream() + .map(SubscriptionBroker::getPrefetchingQueueCount) + .reduce(0, Integer::sum); + count += + consumerGroupIdToConsensusBroker.values().stream() + .map(ConsensusSubscriptionBroker::getQueueCount) + .reduce(0, Integer::sum); + return count; + } + + /////////////////////////////// Commit Progress /////////////////////////////// + + public Map collectAllRegionCommitProgress(final int dataNodeId) { + return ConsensusSubscriptionCommitManager.getInstance().collectAllRegionProgress(dataNodeId); + } + + /** + * Receives a committed progress broadcast from another DataNode (Leader → Follower). Delegates to + * CommitManager to update local progress state. 
+ */ + public void receiveSubscriptionProgress( + final String consumerGroupId, + final String topicName, + final String regionId, + final long physicalTime, + final long localSeq, + final int writerNodeId, + final long writerEpoch) { + ConsensusSubscriptionCommitManager.getInstance() + .receiveProgressBroadcast( + consumerGroupId, + topicName, + regionId, + physicalTime, + localSeq, + writerNodeId, + writerEpoch); } /////////////////////////////// Cache /////////////////////////////// @@ -272,8 +790,9 @@ private int getPrefetchingQueueCountInternal() { /** * A simple generic cache that computes and stores a value on demand. * - *

Note that since the get() and invalidate() methods are not modified with synchronized, the - * value obtained may not be entirely accurate. + *

Both {@code value} and {@code valid} are volatile to ensure visibility across threads. The + * {@code get()} method uses a local snapshot of {@code valid} to avoid double-read reordering. + * Concurrent recomputation by multiple threads is benign (idempotent supplier). * * @param the type of the cached value */ @@ -304,8 +823,10 @@ private void invalidate() { */ private T get() { if (!valid) { - value = supplier.get(); + final T computed = supplier.get(); + value = computed; valid = true; + return computed; } return value; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java index 4ee6b191a2478..67d5fe875d3cf 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java @@ -21,12 +21,14 @@ import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMetaKeeper; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaRespExceptionMessage; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.LinkedHashSet; import java.util.List; import java.util.Objects; import java.util.Set; @@ -128,14 +130,46 @@ private void handleSingleConsumerGroupMetaChangesInternal( // remove prefetching queues for topics unsubscribed by the consumer group final Set topicsUnsubByGroup = ConsumerGroupMeta.getTopicsUnsubByGroup(metaInAgent, metaFromCoordinator); + final Set pipeTopicsUnsubByGroup = new LinkedHashSet<>(); + final Set consensusTopicsUnsubByGroup = new LinkedHashSet<>(); for 
(final String topicName : topicsUnsubByGroup) { + if (ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { + consensusTopicsUnsubByGroup.add(topicName); + continue; + } + pipeTopicsUnsubByGroup.add(topicName); + } + for (final String topicName : pipeTopicsUnsubByGroup) { SubscriptionAgent.broker().removePrefetchingQueue(consumerGroupId, topicName); } + // Tear down consensus-based subscriptions for unsubscribed topics + if (!consensusTopicsUnsubByGroup.isEmpty()) { + ConsensusSubscriptionSetupHandler.teardownConsensusSubscriptions( + consumerGroupId, consensusTopicsUnsubByGroup); + } + + // Detect newly subscribed topics (present in new meta but not in old meta) + final Set newlySubscribedTopics = + ConsumerGroupMeta.getTopicsNewlySubByGroup(metaInAgent, metaFromCoordinator); + + LOGGER.info( + "Subscription: consumer group [{}] meta change detected, " + + "topicsUnsubByGroup={}, newlySubscribedTopics={}", + consumerGroupId, + topicsUnsubByGroup, + newlySubscribedTopics); // TODO: Currently we fully replace the entire ConsumerGroupMeta without carefully checking the // changes in its fields. consumerGroupMetaKeeper.removeConsumerGroupMeta(consumerGroupId); consumerGroupMetaKeeper.addConsumerGroupMeta(consumerGroupId, metaFromCoordinator); + + // Set up consensus-based subscription for newly subscribed consensus-mode topics. + // This must happen after the meta is updated so that the broker can find the topic config. + if (!newlySubscribedTopics.isEmpty()) { + ConsensusSubscriptionSetupHandler.handleNewSubscriptions( + consumerGroupId, newlySubscribedTopics); + } } public TPushConsumerGroupMetaRespExceptionMessage handleConsumerGroupMetaChanges( @@ -221,4 +255,24 @@ public Set getTopicNamesSubscribedByConsumer( releaseReadLock(); } } + + /** + * Get all active subscriptions: consumerGroupId → set of subscribed topic names. Used by + * consensus subscription auto-binding when a new DataRegion is created. 
+ */ + public java.util.Map> getAllSubscriptions() { + acquireReadLock(); + try { + final java.util.Map> result = new java.util.HashMap<>(); + for (final ConsumerGroupMeta meta : consumerGroupMetaKeeper.getAllConsumerGroupMeta()) { + final Set topics = meta.getSubscribedTopicNames(); + if (!topics.isEmpty()) { + result.put(meta.getConsumerGroupId(), new java.util.HashSet<>(topics)); + } + } + return result; + } finally { + releaseReadLock(); + } + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionRuntimeAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionRuntimeAgent.java index aec165684635a..e942453f7bd6c 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionRuntimeAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionRuntimeAgent.java @@ -23,6 +23,7 @@ import org.apache.iotdb.commons.service.IService; import org.apache.iotdb.commons.service.ServiceType; import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutorManager; import java.util.concurrent.atomic.AtomicBoolean; @@ -67,6 +68,7 @@ public void start() throws StartupException { } SubscriptionConfig.getInstance().printAllConfigs(); + ConsensusSubscriptionPrefetchExecutorManager.getInstance().start(); SubscriptionAgentLauncher.launchSubscriptionTopicAgent(); SubscriptionAgentLauncher.launchSubscriptionConsumerAgent(); @@ -80,8 +82,8 @@ public void stop() { return; } isShutdown.set(true); - - // let PipeDataNodeRuntimeAgent to drop all related pipe tasks + SubscriptionAgent.broker().abortConsensusPendingSeeksForRuntimeStop(); + ConsensusSubscriptionPrefetchExecutorManager.getInstance().stop(); } public boolean isShutdown() { diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionTopicAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionTopicAgent.java index 37cdaa72690be..5db4401df65bb 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionTopicAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionTopicAgent.java @@ -88,6 +88,8 @@ private void handleSingleTopicMetaChangesInternal(final TopicMeta metaFromCoordi final String topicName = metaFromCoordinator.getTopicName(); topicMetaKeeper.removeTopicMeta(topicName); topicMetaKeeper.addTopicMeta(topicName, metaFromCoordinator); + SubscriptionAgent.broker() + .refreshConsensusQueueOrderMode(topicName, metaFromCoordinator.getConfig().getOrderMode()); } public TPushTopicMetaRespExceptionMessage handleTopicMetaChanges( @@ -164,16 +166,22 @@ public String getTopicMode(final String topicName) { acquireReadLock(); try { return topicMetaKeeper.containsTopicMeta(topicName) - ? topicMetaKeeper - .getTopicMeta(topicName) - .getConfig() - .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE) + ? 
topicMetaKeeper.getTopicMeta(topicName).getConfig().getMode() : null; } finally { releaseReadLock(); } } + public String getTopicOrderMode(final String topicName) { + acquireReadLock(); + try { + return topicMetaKeeper.containsTopicMeta(topicName) ? topicMetaKeeper.getTopicMeta(topicName).getConfig().getOrderMode() : null; + } finally { + releaseReadLock(); + } + } + public Map getTopicConfigs(final Set topicNames) { acquireReadLock(); try { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java new file mode 100644 index 0000000000000..85b89b06c7fb7 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java @@ -0,0 +1,811 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker; + +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.SubscriptionWalRetentionPolicy; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusRegionRuntimeState; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager; +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +/** + * Consensus-based subscription broker that reads data directly from IoTConsensus WAL. Each instance + * manages consensus prefetching queues for a single consumer group. 
+ */ +public class ConsensusSubscriptionBroker implements ISubscriptionBroker { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusSubscriptionBroker.class); + + private final String brokerId; // consumer group id + + /** Maps topic name to a list of ConsensusPrefetchingQueues, one per data region. */ + private final Map> topicNameToConsensusPrefetchingQueues; + + /** Round-robin counter for fair polling among region queues already assigned to this consumer. */ + private final AtomicInteger pollRoundRobinIndex = new AtomicInteger(0); + + private final Map> topicConsumerLastPollMs = + new ConcurrentHashMap<>(); + + private final Map topicOwnershipSnapshots = + new ConcurrentHashMap<>(); + + public ConsensusSubscriptionBroker(final String brokerId) { + this.brokerId = brokerId; + this.topicNameToConsensusPrefetchingQueues = new ConcurrentHashMap<>(); + } + + @Override + public boolean isEmpty() { + return topicNameToConsensusPrefetchingQueues.isEmpty(); + } + + @Override + public boolean hasQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + return Objects.nonNull(queues) + && !queues.isEmpty() + && queues.stream().anyMatch(q -> !q.isClosed()); + } + + //////////////////////////// poll //////////////////////////// + + @Override + public List poll( + final String consumerId, final Set topicNames, final long maxBytes) { + return poll(consumerId, topicNames, maxBytes, Collections.emptyMap()); + } + + public List poll( + final String consumerId, + final Set topicNames, + final long maxBytes, + final Map progressByTopic) { + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: poll called, consumerId={}, topicNames={}, " + + "queueCount={}, maxBytes={}", + brokerId, + consumerId, + topicNames, + topicNameToConsensusPrefetchingQueues.size(), + maxBytes); + + final List eventsToPoll = new ArrayList<>(); + final List eventsToNack = new ArrayList<>(); + long totalSize = 0; + + for (final String 
topicName : topicNames) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + continue; + } + + final TopicOwnershipSnapshot ownershipSnapshot = + refreshAndGetTopicOwnership(topicName, queues, consumerId); + final List assignedQueues = + getAssignedQueues(queues, consumerId, ownershipSnapshot); + if (assignedQueues.isEmpty()) { + continue; + } + + final List pollQueues = + buildPollOrderForAssignedQueues(assignedQueues, topicName); + final int eventsBeforeTopicPoll = eventsToPoll.size(); + + for (final ConsensusPrefetchingQueue consensusQueue : pollQueues) { + if (consensusQueue.isClosed()) { + continue; + } + + final String regionIdStr = consensusQueue.getConsensusGroupId().toString(); + final TopicProgress topicProgress = progressByTopic.get(topicName); + final RegionProgress regionProgress = + Objects.nonNull(topicProgress) + ? topicProgress.getRegionProgress().get(regionIdStr) + : null; + + final SubscriptionEvent event = consensusQueue.poll(consumerId, regionProgress); + if (Objects.isNull(event)) { + continue; + } + + final long currentSize; + try { + currentSize = event.getCurrentResponseSize(); + } catch (final IOException e) { + eventsToNack.add(event); + continue; + } + + eventsToPoll.add(event); + totalSize += currentSize; + + if (totalSize >= maxBytes) { + break; + } + } + if (totalSize >= maxBytes) { + break; + } + } + + // Nack any events that had errors + if (!eventsToNack.isEmpty()) { + commit( + consumerId, + eventsToNack.stream() + .map(SubscriptionEvent::getCommitContext) + .collect(Collectors.toList()), + true); + } + + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: poll result, consumerId={}, eventsPolled={}, eventsNacked={}", + brokerId, + consumerId, + eventsToPoll.size(), + eventsToNack.size()); + + return eventsToPoll; + } + + @Override + public List pollTablets( + final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { + 
final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return Collections.emptyList(); + } + + final ConsensusPrefetchingQueue assignedQueue = + getAssignedQueueForConsumer( + queues, topicName, consumerId, commitContext.getRegionId(), "pollTablets"); + if (Objects.isNull(assignedQueue)) { + return Collections.emptyList(); + } + + final SubscriptionEvent event = assignedQueue.pollTablets(consumerId, commitContext, offset); + if (Objects.nonNull(event)) { + return Collections.singletonList(event); + } + return Collections.emptyList(); + } + + //////////////////////////// commit //////////////////////////// + + @Override + public List commit( + final String consumerId, + final List commitContexts, + final boolean nack) { + final List successfulCommitContexts = new ArrayList<>(); + for (final SubscriptionCommitContext commitContext : commitContexts) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to commit", + brokerId, + topicName); + continue; + } + + final ConsensusPrefetchingQueue assignedQueue = + getAssignedQueueForConsumer( + queues, topicName, consumerId, commitContext.getRegionId(), nack ? 
"nack" : "ack"); + boolean handled = false; + if (Objects.nonNull(assignedQueue)) { + final boolean success; + if (!nack) { + success = assignedQueue.ackSilent(consumerId, commitContext); + } else { + success = assignedQueue.nackSilent(consumerId, commitContext); + } + if (success) { + successfulCommitContexts.add(commitContext); + handled = true; + } + } + if (!handled) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: commit context {} not found in any of {} region queue(s) for topic [{}]", + brokerId, + commitContext, + queues.size(), + topicName); + } + } + return successfulCommitContexts; + } + + @Override + public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return true; + } + // Route directly to the correct region queue using regionId + final String regionId = commitContext.getRegionId(); + for (final ConsensusPrefetchingQueue q : queues) { + if (!regionId.isEmpty() && !regionId.equals(q.getConsensusGroupId().toString())) { + continue; + } + return q.isCommitContextOutdated(commitContext); + } + return true; + } + + //////////////////////////// seek //////////////////////////// + + public void seek(final String topicName, final short seekType) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to seek", + brokerId, + topicName); + return; + } + + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + switch (seekType) { + case PipeSubscribeSeekReq.SEEK_TO_BEGINNING: + queue.seekToBeginning(); + break; + case PipeSubscribeSeekReq.SEEK_TO_END: + queue.seekToEnd(); + break; + default: + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: 
unsupported seekType {} for topic [{}]", + brokerId, + seekType, + topicName); + break; + } + } + } + + public void seek(final String topicName, final TopicProgress topicProgress) { + final TopicProgress safeProgress = + topicProgress != null ? topicProgress : new TopicProgress(Collections.emptyMap()); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to seek(topicProgress)", + brokerId, + topicName); + return; + } + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + final RegionProgress regionProgress = + safeProgress.getRegionProgress().get(queue.getConsensusGroupId().toString()); + seekQueueToRegionProgress(queue, regionProgress, false); + } + } + + public void seekAfter(final String topicName, final TopicProgress topicProgress) { + final TopicProgress safeProgress = + topicProgress != null ? 
topicProgress : new TopicProgress(Collections.emptyMap()); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to seekAfter(topicProgress)", + brokerId, + topicName); + return; + } + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + final RegionProgress regionProgress = + safeProgress.getRegionProgress().get(queue.getConsensusGroupId().toString()); + seekQueueToRegionProgress(queue, regionProgress, true); + } + } + + private void seekQueueToRegionProgress( + final ConsensusPrefetchingQueue queue, + final RegionProgress regionProgress, + final boolean seekAfter) { + if (Objects.isNull(regionProgress) || regionProgress.getWriterPositions().isEmpty()) { + return; + } + if (seekAfter) { + queue.seekAfterRegionProgress(regionProgress); + } else { + queue.seekToRegionProgress(regionProgress); + } + } + + //////////////////////////// prefetching //////////////////////////// + + @Override + public boolean executePrefetch(final String topicName) { + // Consensus prefetch is fully driven by queue-local wakeup sources and the dedicated delayed + // scheduler. This interface remains only to satisfy the shared broker contract used by + // pipe-based subscription. + return false; + } + + @Override + public int getEventCount(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues)) { + return 0; + } + return queues.stream() + .filter(queue -> !queue.isClosed()) + .mapToInt(ConsensusPrefetchingQueue::getPrefetchedEventCount) + .sum(); + } + + @Override + public int getQueueCount() { + return topicNameToConsensusPrefetchingQueues.size(); + } + + /** + * Returns per-region lag information for all topics managed by this broker. 
The result maps + * "topicName/regionId" to the lag (number of WAL entries behind). + */ + public Map getLagSummary() { + final Map lagMap = new ConcurrentHashMap<>(); + for (final Map.Entry> entry : + topicNameToConsensusPrefetchingQueues.entrySet()) { + for (final ConsensusPrefetchingQueue queue : entry.getValue()) { + if (!queue.isClosed()) { + lagMap.put(entry.getKey() + "/" + queue.getConsensusGroupId().toString(), queue.getLag()); + } + } + } + return lagMap; + } + + private TopicOwnershipSnapshot refreshAndGetTopicOwnership( + final String topicName, + final List queues, + final String consumerId) { + final ConcurrentHashMap consumerTimestamps = + topicConsumerLastPollMs.computeIfAbsent(topicName, ignored -> new ConcurrentHashMap<>()); + consumerTimestamps.put(consumerId, System.currentTimeMillis()); + evictInactiveConsumers(consumerTimestamps); + final List sortedConsumers = new ArrayList<>(consumerTimestamps.keySet()); + Collections.sort(sortedConsumers); + + final List activeRegionIds = + queues.stream() + .filter(q -> !q.isClosed()) + .map(q -> q.getConsensusGroupId().toString()) + .sorted() + .collect(Collectors.toList()); + + final TopicOwnershipSnapshot existingSnapshot = topicOwnershipSnapshots.get(topicName); + if (Objects.nonNull(existingSnapshot) + && existingSnapshot.hasSameConsumers(sortedConsumers) + && existingSnapshot.hasSameRegions(activeRegionIds)) { + return existingSnapshot; + } + + final TopicOwnershipSnapshot refreshedSnapshot = + TopicOwnershipSnapshot.create(sortedConsumers, activeRegionIds); + topicOwnershipSnapshots.put(topicName, refreshedSnapshot); + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: refreshed ownership for topic [{}], consumers={}, regions={}, generation={}", + brokerId, + topicName, + sortedConsumers, + activeRegionIds, + refreshedSnapshot.getGeneration()); + return refreshedSnapshot; + } + + private List getAssignedQueues( + final List queues, + final String consumerId, + final TopicOwnershipSnapshot 
ownershipSnapshot) { + if (Objects.isNull(ownershipSnapshot) || ownershipSnapshot.isEmpty()) { + return Collections.emptyList(); + } + final List assignedQueues = new ArrayList<>(); + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + if (consumerId.equals( + ownershipSnapshot.getOwnerConsumerId(queue.getConsensusGroupId().toString()))) { + assignedQueues.add(queue); + } + } + return assignedQueues; + } + + private List buildPollOrderForAssignedQueues( + final List assignedQueues, final String topicName) { + if (assignedQueues.size() <= 1) { + return assignedQueues; + } + final List pollQueues = new ArrayList<>(assignedQueues); + if (SubscriptionConfig.getInstance().isSubscriptionConsensusLagBasedPriority()) { + pollQueues.sort( + Comparator.comparingLong(ConsensusPrefetchingQueue::getLag) + .reversed() + .thenComparing(q -> q.getConsensusGroupId().toString())); + return pollQueues; + } + + final int startOffset = Math.floorMod(pollRoundRobinIndex.getAndIncrement(), pollQueues.size()); + final List orderedQueues = new ArrayList<>(pollQueues.size()); + for (int i = 0; i < pollQueues.size(); i++) { + orderedQueues.add(pollQueues.get((startOffset + i) % pollQueues.size())); + } + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: stable ownership poll order for topic [{}], assignedQueueCount={}", + brokerId, + topicName, + orderedQueues.size()); + return orderedQueues; + } + + private ConsensusPrefetchingQueue getAssignedQueueForConsumer( + final List queues, + final String topicName, + final String consumerId, + final String regionId, + final String action) { + final TopicOwnershipSnapshot ownershipSnapshot = + refreshAndGetTopicOwnership(topicName, queues, consumerId); + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + if (!regionId.isEmpty() && !regionId.equals(queue.getConsensusGroupId().toString())) { + continue; + } + if (consumerId.equals( + 
ownershipSnapshot.getOwnerConsumerId(queue.getConsensusGroupId().toString()))) { + return queue; + } + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: consumer [{}] skipped {} on topic [{}], region [{}] is currently owned by [{}]", + brokerId, + consumerId, + action, + topicName, + queue.getConsensusGroupId(), + ownershipSnapshot.getOwnerConsumerId(queue.getConsensusGroupId().toString())); + return null; + } + return null; + } + + /** Evicts consumers that have not polled within the configured eviction timeout. */ + private void evictInactiveConsumers(final ConcurrentHashMap consumerTimestamps) { + final long now = System.currentTimeMillis(); + final long timeout = + SubscriptionConfig.getInstance().getSubscriptionConsensusConsumerEvictionTimeoutMs(); + consumerTimestamps.entrySet().removeIf(entry -> (now - entry.getValue()) > timeout); + } + + //////////////////////////// queue management //////////////////////////// + + public void bindConsensusPrefetchingQueue( + final String topicName, + final String orderMode, + final ConsensusGroupId consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final SubscriptionWalRetentionPolicy retentionPolicy, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final RegionProgress fallbackCommittedRegionProgress, + final long tailStartSearchIndex, + final long initialRuntimeVersion, + final boolean initialActive) { + // Get or create the list of queues for this topic + final List queues = + topicNameToConsensusPrefetchingQueues.computeIfAbsent( + topicName, k -> new CopyOnWriteArrayList<>()); + + // Check for duplicate region binding + for (final ConsensusPrefetchingQueue existing : queues) { + if (consensusGroupId.equals(existing.getConsensusGroupId()) && !existing.isClosed()) { + LOGGER.info( + "Subscription: consensus prefetching queue for topic [{}], region [{}] " + + "in consumer group [{}] already exists, skipping", + topicName, + consensusGroupId, + 
brokerId); + return; + } + } + + // Create the per-region consensus queue for this topic. + final ConsensusPrefetchingQueue consensusQueue = + new ConsensusPrefetchingQueue( + brokerId, + topicName, + orderMode, + consensusGroupId, + serverImpl, + retentionPolicy, + converter, + commitManager, + fallbackCommittedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + queues.add(consensusQueue); + LOGGER.info( + "Subscription: create consensus prefetching queue bound to topic [{}] for consumer group [{}], " + + "consensusGroupId={}, fallbackCommittedRegionProgress={}, " + + "tailStartSearchIndex={}, initialRuntimeVersion={}, initialActive={}, totalRegionQueues={}", + topicName, + brokerId, + consensusGroupId, + fallbackCommittedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive, + queues.size()); + } + + public void refreshConsensusQueueOrderMode(final String topicName, final String orderMode) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return; + } + + for (final ConsensusPrefetchingQueue queue : queues) { + queue.setOrderMode(orderMode); + } + } + + public void unbindConsensusPrefetchingQueue(final String topicName) { + closeAndRemoveConsensusPrefetchingQueues(topicName, true); + } + + public int unbindByRegion(final ConsensusGroupId regionId) { + int closedCount = 0; + for (final Map.Entry> entry : + topicNameToConsensusPrefetchingQueues.entrySet()) { + final List queues = entry.getValue(); + final int beforeSize = queues.size(); + queues.removeIf( + q -> { + if (!regionId.equals(q.getConsensusGroupId())) { + return false; + } + q.close(); + LOGGER.info( + "Subscription: closed consensus prefetching queue for topic [{}] region [{}] " + + "in consumer group [{}] due to region removal", + entry.getKey(), + regionId, + brokerId); + return true; + }); + closedCount += beforeSize - queues.size(); + if (queues.isEmpty()) { 
+ topicNameToConsensusPrefetchingQueues.remove(entry.getKey(), queues); + topicConsumerLastPollMs.remove(entry.getKey()); + topicOwnershipSnapshots.remove(entry.getKey()); + } else { + topicOwnershipSnapshots.remove(entry.getKey()); + } + } + return closedCount; + } + + /** + * Activates or deactivates all queues bound to {@code regionId}. Called on leader migration: + * {@code false} on old leader, {@code true} on new leader. Inactive queues skip prefetching and + * return null on poll, ensuring only the preferred writer serves subscription data. + */ + public void setActiveForRegion(final ConsensusGroupId regionId, final boolean active) { + for (final List queues : + topicNameToConsensusPrefetchingQueues.values()) { + for (final ConsensusPrefetchingQueue q : queues) { + if (regionId.equals(q.getConsensusGroupId())) { + q.setActive(active); + } + } + } + } + + public void setActiveWritersForRegion( + final ConsensusGroupId regionId, final Set activeWriterNodeIds) { + final Set normalizedActiveWriterNodeIds = + Collections.unmodifiableSet(new LinkedHashSet<>(activeWriterNodeIds)); + for (final List queues : + topicNameToConsensusPrefetchingQueues.values()) { + for (final ConsensusPrefetchingQueue q : queues) { + if (regionId.equals(q.getConsensusGroupId())) { + q.setActiveWriterNodeIds(normalizedActiveWriterNodeIds); + } + } + } + } + + public void applyRuntimeStateForRegion( + final ConsensusGroupId regionId, final ConsensusRegionRuntimeState runtimeState) { + for (final List queues : + topicNameToConsensusPrefetchingQueues.values()) { + for (final ConsensusPrefetchingQueue q : queues) { + if (regionId.equals(q.getConsensusGroupId())) { + q.applyRuntimeState(runtimeState); + } + } + } + } + + public void abortPendingSeeksForRuntimeStop() { + for (final List queues : + topicNameToConsensusPrefetchingQueues.values()) { + for (final ConsensusPrefetchingQueue q : queues) { + if (!q.isClosed()) { + q.abortPendingSeekForRuntimeStop(); + } + } + } + } + + @Override + 
public void removeQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.nonNull(queues) && !queues.isEmpty()) { + LOGGER.info( + "Subscription: consensus prefetching queue(s) bound to topic [{}] for consumer group [{}] still exist, unbind before closing", + topicName, + brokerId); + } + closeAndRemoveConsensusPrefetchingQueues(topicName, false); + } + + private void closeAndRemoveConsensusPrefetchingQueues( + final String topicName, final boolean warnIfMissing) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + if (warnIfMissing) { + LOGGER.warn( + "Subscription: consensus prefetching queues bound to topic [{}] for consumer group [{}] do not exist", + topicName, + brokerId); + } + return; + } + + for (final ConsensusPrefetchingQueue q : queues) { + q.close(); + } + topicNameToConsensusPrefetchingQueues.remove(topicName); + topicConsumerLastPollMs.remove(topicName); + topicOwnershipSnapshots.remove(topicName); + LOGGER.info( + "Subscription: drop all {} consensus prefetching queue(s) bound to topic [{}] for consumer group [{}]", + queues.size(), + topicName, + brokerId); + } + + private static final class TopicOwnershipSnapshot { + + private final List activeConsumers; + private final List activeRegionIds; + private final Map ownerByRegionId; + private final int generation; + + private TopicOwnershipSnapshot( + final List activeConsumers, + final List activeRegionIds, + final Map ownerByRegionId, + final int generation) { + this.activeConsumers = activeConsumers; + this.activeRegionIds = activeRegionIds; + this.ownerByRegionId = ownerByRegionId; + this.generation = generation; + } + + private static TopicOwnershipSnapshot create( + final List activeConsumers, final List activeRegionIds) { + if (activeConsumers.isEmpty() || activeRegionIds.isEmpty()) { + return new TopicOwnershipSnapshot( + Collections.emptyList(), 
Collections.emptyList(), Collections.emptyMap(), 0); + } + + final Map ownerByRegionId = new ConcurrentHashMap<>(); + final int consumerCount = activeConsumers.size(); + for (final String regionId : activeRegionIds) { + final int ownerIdx = Math.floorMod(regionId.hashCode(), consumerCount); + ownerByRegionId.put(regionId, activeConsumers.get(ownerIdx)); + } + return new TopicOwnershipSnapshot( + Collections.unmodifiableList(new ArrayList<>(activeConsumers)), + Collections.unmodifiableList(new ArrayList<>(activeRegionIds)), + Collections.unmodifiableMap(ownerByRegionId), + ownerByRegionId.hashCode()); + } + + private boolean isEmpty() { + return activeConsumers.isEmpty() || activeRegionIds.isEmpty(); + } + + private boolean hasSameConsumers(final List consumers) { + return activeConsumers.equals(consumers); + } + + private boolean hasSameRegions(final List regionIds) { + return activeRegionIds.equals(regionIds); + } + + private String getOwnerConsumerId(final String regionId) { + return ownerByRegionId.get(regionId); + } + + private int getGeneration() { + return generation; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java new file mode 100644 index 0000000000000..aaa88a5f84777 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker; + +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; + +import java.util.List; +import java.util.Set; + +public interface ISubscriptionBroker { + + List poll(String consumerId, Set topicNames, long maxBytes); + + List pollTablets( + String consumerId, SubscriptionCommitContext commitContext, int offset); + + List commit( + String consumerId, List commitContexts, boolean nack); + + boolean isCommitContextOutdated(SubscriptionCommitContext commitContext); + + boolean executePrefetch(String topicName); + + int getEventCount(String topicName); + + int getQueueCount(); + + void removeQueue(String topicName); + + boolean isEmpty(); + + boolean hasQueue(String topicName); +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java index cc03f7261419b..8f9d05324e905 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java @@ -56,7 +56,7 @@ import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; -public class SubscriptionBroker { +public class SubscriptionBroker implements ISubscriptionBroker { private static final Logger LOGGER = 
LoggerFactory.getLogger(SubscriptionBroker.class); @@ -83,14 +83,23 @@ public SubscriptionBroker(final String brokerId) { .build(consumerId -> new SubscriptionStates()); } + @Override public boolean isEmpty() { return topicNameToPrefetchingQueue.isEmpty() && completedTopicNames.isEmpty() && topicNameToCommitIdGenerator.isEmpty(); } + @Override + public boolean hasQueue(final String topicName) { + final SubscriptionPrefetchingQueue prefetchingQueue = + topicNameToPrefetchingQueue.get(topicName); + return Objects.nonNull(prefetchingQueue) && !prefetchingQueue.isClosed(); + } + //////////////////////////// provided for SubscriptionBrokerAgent //////////////////////////// + @Override public List poll( final String consumerId, final Set topicNames, final long maxBytes) { final List eventsToPoll = new ArrayList<>(); @@ -112,9 +121,10 @@ public List poll( // Iterate over each sorted topic name and poll the corresponding events int remainingTopicSize = sortedTopicNames.size(); for (final String topicName : sortedTopicNames) { + remainingTopicSize -= 1; + // Check pipe-based queue final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); - remainingTopicSize -= 1; // Recheck if (Objects.isNull(prefetchingQueue) || prefetchingQueue.isClosed()) { @@ -182,6 +192,7 @@ private Set prepareCandidateTopicNames( final List eventsToPoll /* output parameter */) { final Set candidateTopicNames = new HashSet<>(); for (final String topicName : topicNames) { + // Check pipe-based queue final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); // If there is no prefetching queue for the topic, check if it's completed @@ -271,6 +282,7 @@ public List pollTsFile( return Collections.emptyList(); } + @Override public List pollTablets( final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { final String topicName = commitContext.getTopicName(); @@ -312,6 +324,7 @@ public List 
pollTablets( /** * @return list of successful commit contexts */ + @Override public List commit( final String consumerId, final List commitContexts, @@ -348,6 +361,7 @@ public List commit( return successfulCommitContexts; } + @Override public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { final String topicName = commitContext.getTopicName(); final SubscriptionPrefetchingQueue prefetchingQueue = @@ -457,6 +471,11 @@ public void unbindPrefetchingQueue(final String topicName) { brokerId); } + @Override + public void removeQueue(final String topicName) { + removePrefetchingQueue(topicName); + } + public void removePrefetchingQueue(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -473,6 +492,7 @@ public void removePrefetchingQueue(final String topicName) { topicNameToCommitIdGenerator.remove(topicName); } + @Override public boolean executePrefetch(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -505,6 +525,11 @@ public boolean executePrefetch(final String topicName) { : prefetchingQueue.executePrefetchV2(); } + @Override + public int getEventCount(final String topicName) { + return getPipeEventCount(topicName); + } + public int getPipeEventCount(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -525,6 +550,11 @@ public int getPipeEventCount(final String topicName) { return prefetchingQueue.getPipeEventCount(); } + @Override + public int getQueueCount() { + return getPrefetchingQueueCount(); + } + public int getPrefetchingQueueCount() { return topicNameToPrefetchingQueue.size(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionPrefetchingQueue.java 
index b8bdc4e802ff5..b325d0938c499 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionPrefetchingQueue.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionPrefetchingQueue.java @@ -849,6 +849,18 @@ public boolean nackInternal( ev.nack(); // now pollable nacked.set(true); + if (ev.isPoisoned()) { + LOGGER.error( + "Subscription: poison message detected (nackCount={}), force-acking event {} in prefetching queue: {}", + ev.getNackCount(), + ev, + this); + ev.ack(); + ev.recordCommittedTimestamp(); + ev.cleanUp(false); + return null; // remove from inFlightEvents + } + // no need to update inFlightEvents and prefetchingQueue return ev; }); @@ -1017,11 +1029,33 @@ private static RemappingFunction COMBINER( (ev) -> { if (ev.eagerlyPollable()) { ev.nack(); // now pollable (the nack operation here is actually unnecessary) + if (ev.isPoisoned()) { + LOGGER.error( + "Subscription: poison message detected (nackCount={}), force-acking eagerly pollable event {} in prefetching queue: {}", + ev.getNackCount(), + ev, + this); + ev.ack(); + ev.recordCommittedTimestamp(); + ev.cleanUp(false); + return null; + } prefetchEvent(ev); // no need to log warn for eagerly pollable event return null; // remove this entry } else if (ev.pollable()) { ev.nack(); // now pollable + if (ev.isPoisoned()) { + LOGGER.error( + "Subscription: poison message detected (nackCount={}), force-acking pollable event {} in prefetching queue: {}", + ev.getNackCount(), + ev, + this); + ev.ack(); + ev.recordCommittedTimestamp(); + ev.cleanUp(false); + return null; + } prefetchEvent(ev); LOGGER.warn( "Subscription: SubscriptionPrefetchingQueue {} recycle event {} from in flight events, nack and enqueue it to prefetching queue", diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java new file mode 100644 index 0000000000000..c5bbeb320fb65 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java @@ -0,0 +1,606 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern; +import org.apache.iotdb.commons.schema.table.column.TsTableColumnCategory; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertMultiTabletsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertTabletNode; + +import org.apache.tsfile.enums.ColumnCategory; +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.file.metadata.IDeviceID; +import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.utils.BitMap; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.regex.Pattern; + +/** Converts IoTConsensus WAL log entries (InsertNode) to Tablet format for subscription. 
*/ +public class ConsensusLogToTabletConverter { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusLogToTabletConverter.class); + + private final TreePattern treePattern; + private final TablePattern tablePattern; + private final Pattern tableColumnPattern; + + /** + * The actual database name of the DataRegion this converter processes (table-model format without + * "root." prefix). Null for tree-model topics. + */ + private final String databaseName; + + public ConsensusLogToTabletConverter( + final TreePattern treePattern, + final TablePattern tablePattern, + final Pattern tableColumnPattern, + final String databaseName) { + this.treePattern = treePattern; + this.tablePattern = tablePattern; + this.tableColumnPattern = tableColumnPattern; + this.databaseName = databaseName; + } + + public String getDatabaseName() { + return databaseName; + } + + static String safeDeviceIdForLog(final InsertNode node) { + try { + final Object deviceId = node.getDeviceID(); + return deviceId != null ? 
deviceId.toString() : "null"; + } catch (final Exception e) { + return "N/A(" + node.getType() + ")"; + } + } + + public List convert(final InsertNode insertNode) { + if (Objects.isNull(insertNode)) { + return Collections.emptyList(); + } + + final PlanNodeType nodeType = insertNode.getType(); + if (nodeType == null) { + LOGGER.warn("InsertNode type is null, skipping conversion"); + return Collections.emptyList(); + } + + LOGGER.debug( + "ConsensusLogToTabletConverter: converting InsertNode type={}, deviceId={}", + nodeType, + safeDeviceIdForLog(insertNode)); + + switch (nodeType) { + case INSERT_ROW: + return convertInsertRowNode((InsertRowNode) insertNode); + case INSERT_TABLET: + return convertInsertTabletNode((InsertTabletNode) insertNode); + case INSERT_ROWS: + return convertInsertRowsNode((InsertRowsNode) insertNode); + case INSERT_ROWS_OF_ONE_DEVICE: + return convertInsertRowsOfOneDeviceNode((InsertRowsOfOneDeviceNode) insertNode); + case INSERT_MULTI_TABLET: + return convertInsertMultiTabletsNode((InsertMultiTabletsNode) insertNode); + case RELATIONAL_INSERT_ROW: + return convertRelationalInsertRowNode((RelationalInsertRowNode) insertNode); + case RELATIONAL_INSERT_TABLET: + return convertRelationalInsertTabletNode((RelationalInsertTabletNode) insertNode); + case RELATIONAL_INSERT_ROWS: + return convertRelationalInsertRowsNode((RelationalInsertRowsNode) insertNode); + default: + LOGGER.debug("Unsupported InsertNode type for subscription: {}", nodeType); + return Collections.emptyList(); + } + } + + // ======================== Tree Model Conversion ======================== + + private List convertInsertRowNode(final InsertRowNode node) { + final IDeviceID deviceId = node.getDeviceID(); + + // Device-level path filtering + if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) { + return Collections.emptyList(); + } + + final long time = node.getTime(); + + // Determine which columns match the pattern + final String[] measurements = 
node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final Object[] values = node.getValues(); + final List matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements); + + if (matchedColumnIndices.isEmpty()) { + return Collections.emptyList(); + } + + // Build Tablet with matched columns + final int columnCount = matchedColumnIndices.size(); + final List schemas = new ArrayList<>(columnCount); + for (final int colIdx : matchedColumnIndices) { + schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx])); + } + + final Tablet tablet = new Tablet(deviceId.toString(), schemas, 1 /* maxRowNumber */); + tablet.addTimestamp(0, time); + + for (int i = 0; i < columnCount; i++) { + final int originalColIdx = matchedColumnIndices.get(i); + final Object value = values[originalColIdx]; + if (value == null) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[i].mark(0); + } else { + addValueToTablet(tablet, 0, i, dataTypes[originalColIdx], value); + } + } + tablet.setRowSize(1); + + return Collections.singletonList(tablet); + } + + private List convertInsertTabletNode(final InsertTabletNode node) { + if (node instanceof RelationalInsertTabletNode) { + return convertRelationalInsertTabletNode((RelationalInsertTabletNode) node); + } + + final IDeviceID deviceId = node.getDeviceID(); + + // Device-level path filtering + if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) { + return Collections.emptyList(); + } + + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final long[] times = node.getTimes(); + final Object[] columns = node.getColumns(); + final BitMap[] bitMaps = node.getBitMaps(); + final int rowCount = node.getRowCount(); + + // Column filtering + final List matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements); + if (matchedColumnIndices.isEmpty()) { + return 
Collections.emptyList(); + } + + final int columnCount = matchedColumnIndices.size(); + final boolean allColumnsMatch = (columnCount == measurements.length); + + // Build schemas (always needed) + final List schemas = new ArrayList<>(columnCount); + for (final int colIdx : matchedColumnIndices) { + schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx])); + } + + // Build column arrays and bitmaps using bulk copy + final long[] newTimes = Arrays.copyOf(times, rowCount); + final Object[] newColumns = new Object[columnCount]; + final BitMap[] newBitMaps = new BitMap[columnCount]; + + for (int i = 0; i < columnCount; i++) { + final int originalColIdx = allColumnsMatch ? i : matchedColumnIndices.get(i); + newColumns[i] = copyColumnArray(dataTypes[originalColIdx], columns[originalColIdx], rowCount); + if (bitMaps != null && bitMaps[originalColIdx] != null) { + newBitMaps[i] = new BitMap(rowCount); + BitMap.copyOfRange(bitMaps[originalColIdx], 0, newBitMaps[i], 0, rowCount); + } + } + + final Tablet tablet = + new Tablet(deviceId.toString(), schemas, newTimes, newColumns, newBitMaps, rowCount); + + return Collections.singletonList(tablet); + } + + private List convertInsertRowsNode(final InsertRowsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + // Handle merge bug: RelationalInsertRowNode.mergeInsertNode() is not overridden, + // so merged relational nodes arrive as InsertRowsNode (tree) with RelationalInsertRowNode + // children. Dispatch correctly by checking the actual child type. 
+ if (rowNode instanceof RelationalInsertRowNode) { + tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode)); + } else { + tablets.addAll(convertInsertRowNode(rowNode)); + } + } + return tablets; + } + + private List convertInsertRowsOfOneDeviceNode(final InsertRowsOfOneDeviceNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + tablets.addAll(convertInsertRowNode(rowNode)); + } + return tablets; + } + + private List convertInsertMultiTabletsNode(final InsertMultiTabletsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertTabletNode tabletNode : node.getInsertTabletNodeList()) { + // Handle merge bug: RelationalInsertTabletNode.mergeInsertNode() is not overridden, + // so merged relational tablets arrive as InsertMultiTabletsNode (tree) with + // RelationalInsertTabletNode children. Dispatch correctly by checking the actual child type. + if (tabletNode instanceof RelationalInsertTabletNode) { + tablets.addAll(convertRelationalInsertTabletNode((RelationalInsertTabletNode) tabletNode)); + } else { + tablets.addAll(convertInsertTabletNode(tabletNode)); + } + } + return tablets; + } + + // ======================== Table Model Conversion ======================== + + private List convertRelationalInsertRowNode(final RelationalInsertRowNode node) { + final String tableName = node.getTableName(); + + // Table-level pattern filtering + if (tablePattern != null) { + if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) { + return Collections.emptyList(); + } + if (tableName != null && !tablePattern.matchesTable(tableName)) { + return Collections.emptyList(); + } + } + + final long time = node.getTime(); + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final Object[] values = node.getValues(); + final List matchedColumnIndices = getMatchedTableColumnIndices(measurements); + if 
(matchedColumnIndices.isEmpty()) { + return Collections.emptyList(); + } + + final int columnCount = matchedColumnIndices.size(); + final List columnNames = new ArrayList<>(columnCount); + final List columnDataTypes = new ArrayList<>(columnCount); + final List columnTypes = new ArrayList<>(columnCount); + for (final int originalColIdx : matchedColumnIndices) { + columnNames.add(measurements[originalColIdx]); + columnDataTypes.add(dataTypes[originalColIdx]); + columnTypes.add(toTsFileColumnCategory(node.getColumnCategories(), originalColIdx)); + } + + final Tablet tablet = + new Tablet( + tableName != null ? tableName : "", columnNames, columnDataTypes, columnTypes, 1); + tablet.addTimestamp(0, time); + + for (int i = 0; i < columnCount; i++) { + final int originalColIdx = matchedColumnIndices.get(i); + final Object value = values[originalColIdx]; + if (value == null) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[i].mark(0); + } else { + addValueToTablet(tablet, 0, i, dataTypes[originalColIdx], value); + } + } + tablet.setRowSize(1); + + return Collections.singletonList(tablet); + } + + private List convertRelationalInsertTabletNode(final RelationalInsertTabletNode node) { + final String tableName = node.getTableName(); + + // Table-level pattern filtering + if (tablePattern != null) { + if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) { + return Collections.emptyList(); + } + if (tableName != null && !tablePattern.matchesTable(tableName)) { + return Collections.emptyList(); + } + } + + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final long[] times = node.getTimes(); + final Object[] columns = node.getColumns(); + final BitMap[] bitMaps = node.getBitMaps(); + final int rowCount = node.getRowCount(); + final List matchedColumnIndices = getMatchedTableColumnIndices(measurements); + if (matchedColumnIndices.isEmpty()) { + return 
Collections.emptyList(); + } + + final int columnCount = matchedColumnIndices.size(); + final boolean allColumnsMatch = columnCount == measurements.length; + final List schemas = new ArrayList<>(columnCount); + final List columnTypes = new ArrayList<>(columnCount); + for (final int originalColIdx : matchedColumnIndices) { + schemas.add(new MeasurementSchema(measurements[originalColIdx], dataTypes[originalColIdx])); + columnTypes.add(toTsFileColumnCategory(node.getColumnCategories(), originalColIdx)); + } + + // Build column arrays and bitmaps using bulk copy + final long[] newTimes = Arrays.copyOf(times, rowCount); + final Object[] newColumns = new Object[columnCount]; + final BitMap[] newBitMaps = new BitMap[columnCount]; + + for (int colIdx = 0; colIdx < columnCount; colIdx++) { + final int originalColIdx = allColumnsMatch ? colIdx : matchedColumnIndices.get(colIdx); + newColumns[colIdx] = + copyColumnArray(dataTypes[originalColIdx], columns[originalColIdx], rowCount); + if (bitMaps != null && bitMaps[originalColIdx] != null) { + newBitMaps[colIdx] = new BitMap(rowCount); + BitMap.copyOfRange(bitMaps[originalColIdx], 0, newBitMaps[colIdx], 0, rowCount); + } + } + + final Tablet tablet = + new Tablet( + tableName != null ? tableName : "", + schemas, + columnTypes, + newTimes, + newColumns, + newBitMaps, + rowCount); + + return Collections.singletonList(tablet); + } + + private List convertRelationalInsertRowsNode(final RelationalInsertRowsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode)); + } + return tablets; + } + + // ======================== Helper Methods ======================== + + /** + * Returns indices of columns that match the tree pattern. If no tree pattern is specified, all + * column indices are returned. 
+ */ + private List getMatchedTreeColumnIndices( + final IDeviceID deviceId, final String[] measurements) { + if (treePattern == null || treePattern.isRoot() || treePattern.coversDevice(deviceId)) { + // All columns match + final List allIndices = new ArrayList<>(measurements.length); + for (int i = 0; i < measurements.length; i++) { + if (measurements[i] != null) { + allIndices.add(i); + } + } + return allIndices; + } + + final List matchedIndices = new ArrayList<>(); + for (int i = 0; i < measurements.length; i++) { + if (measurements[i] != null && treePattern.matchesMeasurement(deviceId, measurements[i])) { + matchedIndices.add(i); + } + } + return matchedIndices; + } + + /** + * Returns indices of table columns that match the configured column pattern. If no table column + * pattern is specified, all non-null columns are returned. + */ + private List getMatchedTableColumnIndices(final String[] measurements) { + final List matchedIndices = new ArrayList<>(measurements.length); + for (int i = 0; i < measurements.length; i++) { + if (measurements[i] == null) { + continue; + } + if (tableColumnPattern == null || tableColumnPattern.matcher(measurements[i]).matches()) { + matchedIndices.add(i); + } + } + return matchedIndices; + } + + private ColumnCategory toTsFileColumnCategory( + final TsTableColumnCategory[] columnCategories, final int columnIndex) { + return columnCategories != null && columnCategories[columnIndex] != null + ? columnCategories[columnIndex].toTsFileColumnType() + : ColumnCategory.FIELD; + } + + /** + * Bulk-copies a typed column array using System.arraycopy. Returns a new array of the same type + * containing the first {@code rowCount} elements. 
+ */ + private Object copyColumnArray( + final TSDataType dataType, final Object sourceColumn, final int rowCount) { + switch (dataType) { + case BOOLEAN: + { + final boolean[] src = (boolean[]) sourceColumn; + final boolean[] dst = new boolean[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case INT32: + case DATE: + { + final int[] src = (int[]) sourceColumn; + final int[] dst = new int[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case INT64: + case TIMESTAMP: + { + final long[] src = (long[]) sourceColumn; + final long[] dst = new long[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case FLOAT: + { + final float[] src = (float[]) sourceColumn; + final float[] dst = new float[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case DOUBLE: + { + final double[] src = (double[]) sourceColumn; + final double[] dst = new double[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case TEXT: + case BLOB: + case STRING: + { + final Binary[] src = (Binary[]) sourceColumn; + final Binary[] dst = new Binary[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + default: + LOGGER.warn("Unsupported data type for bulk copy: {}", dataType); + return sourceColumn; + } + } + + /** + * Adds a single value to the tablet at the specified position. + * + *

IMPORTANT: In tsfile-2.2.1, Tablet.addTimestamp() calls initBitMapsWithApiUsage() which + * creates bitMaps and marks ALL positions as null via markAll(). Since we write values directly + * to the underlying typed arrays (bypassing the Tablet.addValue() API which would call + * updateBitMap to unmark), we must explicitly unmark the bitmap position to indicate the value is + * NOT null. + */ + private void addValueToTablet( + final Tablet tablet, + final int rowIndex, + final int columnIndex, + final TSDataType dataType, + final Object value) { + switch (dataType) { + case BOOLEAN: + ((boolean[]) tablet.getValues()[columnIndex])[rowIndex] = (boolean) value; + break; + case INT32: + case DATE: + ((int[]) tablet.getValues()[columnIndex])[rowIndex] = (int) value; + break; + case INT64: + case TIMESTAMP: + ((long[]) tablet.getValues()[columnIndex])[rowIndex] = (long) value; + break; + case FLOAT: + ((float[]) tablet.getValues()[columnIndex])[rowIndex] = (float) value; + break; + case DOUBLE: + ((double[]) tablet.getValues()[columnIndex])[rowIndex] = (double) value; + break; + case TEXT: + case BLOB: + case STRING: + ((Binary[]) tablet.getValues()[columnIndex])[rowIndex] = (Binary) value; + break; + default: + LOGGER.warn("Unsupported data type: {}", dataType); + return; + } + // Unmark the bitmap position to indicate this value is NOT null. + // addTimestamp() triggers initBitMapsWithApiUsage() which marks all positions as null. + final BitMap[] bitMaps = tablet.getBitMaps(); + if (bitMaps != null && bitMaps[columnIndex] != null) { + bitMaps[columnIndex].unmark(rowIndex); + } + } + + /** Copies a single column value from the source column array to the tablet. 
*/ + private void copyColumnValue( + final Tablet tablet, + final int targetRowIndex, + final int targetColumnIndex, + final TSDataType dataType, + final Object sourceColumn, + final int sourceRowIndex) { + switch (dataType) { + case BOOLEAN: + ((boolean[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((boolean[]) sourceColumn)[sourceRowIndex]; + break; + case INT32: + case DATE: + ((int[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((int[]) sourceColumn)[sourceRowIndex]; + break; + case INT64: + case TIMESTAMP: + ((long[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((long[]) sourceColumn)[sourceRowIndex]; + break; + case FLOAT: + ((float[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((float[]) sourceColumn)[sourceRowIndex]; + break; + case DOUBLE: + ((double[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((double[]) sourceColumn)[sourceRowIndex]; + break; + case TEXT: + case BLOB: + case STRING: + ((Binary[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((Binary[]) sourceColumn)[sourceRowIndex]; + break; + default: + LOGGER.warn("Unsupported data type for copy: {}", dataType); + return; + } + // Unmark the bitmap position to indicate this value is NOT null. 
+ final BitMap[] bitMaps = tablet.getBitMaps(); + if (bitMaps != null && bitMaps[targetColumnIndex] != null) { + bitMaps[targetColumnIndex].unmark(targetRowIndex); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java new file mode 100644 index 0000000000000..7a6c06b1655b8 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java @@ -0,0 +1,3502 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.consensus.common.request.IConsensusRequest; +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.consensus.common.request.IoTConsensusRequest; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.SubscriptionWalRetentionPolicy; +import org.apache.iotdb.consensus.iot.WriterSafeFrontierTracker; +import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.pipe.agent.PipeDataNodeAgent; +import org.apache.iotdb.db.pipe.resource.memory.PipeMemoryWeightUtil; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertMultiTabletsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.ProgressWALReader; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; +import 
org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.db.subscription.metric.ConsensusSubscriptionPrefetchingQueueMetrics; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutor; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutorManager; +import org.apache.iotdb.db.subscription.task.subtask.ConsensusPrefetchSubtask; +import org.apache.iotdb.rpc.subscription.config.TopicConfig; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.payload.poll.ErrorPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType; +import org.apache.iotdb.rpc.subscription.payload.poll.TabletsPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.WatermarkPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.apache.tsfile.utils.Pair; +import org.apache.tsfile.write.record.Tablet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; +import 
java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.BiConsumer; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.function.Supplier; + +import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; + +public class ConsensusPrefetchingQueue { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusPrefetchingQueue.class); + + private final String brokerId; // consumer group id + private final String topicName; + private final ConsensusGroupId consensusGroupId; + + private final IoTConsensusServerImpl serverImpl; + + private final ConsensusReqReader consensusReqReader; + + private final SubscriptionWalRetentionPolicy retentionPolicy; + + private final WakeableIndexedConsensusQueue pendingEntries; + + private static final int PENDING_QUEUE_CAPACITY = 4096; + + private final ConsensusLogToTabletConverter converter; + + private final ConsensusSubscriptionCommitManager commitManager; + + private final AtomicLong seekGeneration; + + /** Internal WAL reader cursor used only for local replay positioning and deduplication. */ + private final AtomicLong nextExpectedSearchIndex; + + private final PriorityBlockingQueue prefetchingQueue; + + private final Map, SubscriptionEvent> inFlightEvents; + + private static final int MAX_PREFETCHING_QUEUE_SIZE = + SubscriptionConfig.getInstance().getSubscriptionConsensusPrefetchingQueueCapacity(); + + private final AtomicLong walGapSkippedEntries = new AtomicLong(0); + + /** Guards queue state transitions that touch replay positioning, seek state, and lane buffers. 
*/ + private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true); + + private volatile boolean isClosed = false; + + private volatile boolean closeRequested = false; + + private volatile boolean isActive = true; + + private volatile Set activeWriterNodeIds = Collections.emptySet(); + + private volatile Set runtimeActiveWriterNodeIds = Collections.emptySet(); + + private volatile int preferredWriterNodeId = -1; + + private volatile int previousPreferredWriterNodeId = -1; + + // ======================== Routing Runtime Version ======================== + + private volatile long runtimeVersion = 0; + + private final AtomicLong runtimeVersionChangeCount = new AtomicLong(0); + + // ======================== Unified WAL / Release State ======================== + + private volatile ProgressWALIterator subscriptionWALIterator; + + /** + * Seek requests must not close/reset the WAL iterator from RPC threads because the prefetch + * worker may be reading it concurrently. Instead, seek only records the latest desired reset and + * the queue's next prefetch round applies it after observing the new seek generation. + */ + private volatile long pendingSubscriptionWalResetSearchIndex = Long.MIN_VALUE; + + private volatile long pendingSubscriptionWalResetGeneration = Long.MIN_VALUE; + + // ======================== Watermark ======================== + + /** Maximum data timestamp observed across all InsertNodes processed by this queue. */ + private volatile long maxObservedTimestamp = Long.MIN_VALUE; + + /** Wall-clock time (ms) of last watermark injection. 0 means never injected. */ + private volatile long lastWatermarkEmitTimeMs = 0; + + /** Number of entries accepted from realtime pending queue. */ + private final AtomicLong pendingPathAcceptedEntries = new AtomicLong(0); + + /** Number of entries accepted from WAL-backed paths (historical or catch-up). 
*/ + private final AtomicLong walPathAcceptedEntries = new AtomicLong(0); + + private final Object prefetchBindingLock = new Object(); + + private volatile ConsensusPrefetchSubtask prefetchSubtask; + + private volatile ConsensusSubscriptionPrefetchExecutor prefetchExecutor; + + /** + * Whether the prefetch runtime has been initialized. Starts as false (dormant). Set to true on + * the first poll with a region progress hint or when a seek installs a pending reset. This keeps + * queue creation cheap: realtime entries can be buffered immediately while WAL replay state is + * only built once the queue is actually activated. + */ + private volatile boolean prefetchInitialized = false; + + private volatile PendingSeekRequest pendingSeekRequest; + + private final DeliveryBatchState lingerBatch = new DeliveryBatchState(); + + private volatile long observedSeekGeneration; + + private volatile long lastStatsLogTimeMs = System.currentTimeMillis(); + + private volatile long lastPendingAcceptedEntries = 0L; + + private volatile long lastWalAcceptedEntries = 0L; + + private volatile boolean pendingWalGapRetryRequested = false; + + private volatile long walGapWaitStartTimeMs = 0L; + + private volatile long lastWalGapWaitLogTimeMs = 0L; + + /** Fallback committed region progress from local persisted state. */ + private final RegionProgress fallbackCommittedRegionProgress; + + /** Recovery-time per-writer frontiers used to skip already committed entries after restart. */ + private final Map recoveryWriterProgressByWriter = + new ConcurrentHashMap<>(); + + /** + * Source-level dedup frontier for follower-origin entries that do not carry a local searchIndex. + * The same request may first arrive through pendingEntries and later become visible from WAL; + * once a follower-origin localSeq has already been materialized into queue state, the WAL path + * must not materialize it again. 
+ */ + private final Map materializedFollowerProgressByWriter = + new ConcurrentHashMap<>(); + + /** + * Lane state keyed by writer identity. Release gating reasons in terms of writer lanes and safe + * frontiers instead of a region-level committed frontier. + */ + private final Map writerLanes = new ConcurrentHashMap<>(); + + /** + * Realtime lane buffers used by both pending replay and WAL catch-up so queue materialization + * converges on the same per-writer lane representation before batch delivery. + */ + private final Map> realtimeEntriesByLane = + new ConcurrentHashMap<>(); + + /** + * Local tail position used only when initialization starts without any persisted region progress. + */ + private final long fallbackTailSearchIndex; + + /** Local sequence used to represent the position immediately before a writer's first record. */ + private static final long BEFORE_FIRST_LOCAL_SEQ = -1L; + + /** Writer-progress metadata for the current pending/WAL batch being assembled. */ + private volatile long batchPhysicalTime = 0L; + + private volatile int batchWriterNodeId = -1; + private volatile long batchWriterEpoch = 0L; + private volatile String orderMode = TopicConstant.ORDER_MODE_DEFAULT_VALUE; + + protected enum ReplayLocateStatus { + FOUND, + AT_END, + LOCATE_MISS + } + + protected static final class ReplayLocateDecision { + private final ReplayLocateStatus status; + private final long startSearchIndex; + private final RegionProgress recoveryRegionProgress; + private final String detail; + + private ReplayLocateDecision( + final ReplayLocateStatus status, + final long startSearchIndex, + final RegionProgress recoveryRegionProgress, + final String detail) { + this.status = status; + this.startSearchIndex = startSearchIndex; + this.recoveryRegionProgress = recoveryRegionProgress; + this.detail = detail; + } + + static ReplayLocateDecision found( + final long startSearchIndex, + final RegionProgress recoveryRegionProgress, + final String detail) { + return new 
ReplayLocateDecision( + ReplayLocateStatus.FOUND, startSearchIndex, recoveryRegionProgress, detail); + } + + static ReplayLocateDecision atEnd( + final long startSearchIndex, + final RegionProgress recoveryRegionProgress, + final String detail) { + return new ReplayLocateDecision( + ReplayLocateStatus.AT_END, startSearchIndex, recoveryRegionProgress, detail); + } + + static ReplayLocateDecision locateMiss( + final RegionProgress recoveryRegionProgress, final String detail) { + return new ReplayLocateDecision( + ReplayLocateStatus.LOCATE_MISS, Long.MIN_VALUE, recoveryRegionProgress, detail); + } + + protected ReplayLocateStatus getStatus() { + return status; + } + + protected long getStartSearchIndex() { + return startSearchIndex; + } + + protected RegionProgress getRecoveryRegionProgress() { + return recoveryRegionProgress; + } + + protected String getDetail() { + return detail; + } + } + + private static final class WakeableIndexedConsensusQueue + extends LinkedBlockingDeque { + + private final Runnable wakeupHook; + + private WakeableIndexedConsensusQueue(final int capacity, final Runnable wakeupHook) { + super(capacity); + this.wakeupHook = wakeupHook; + } + + @Override + public boolean offer(final IndexedConsensusRequest request) { + final boolean offered = super.offer(request); + if (offered) { + wakeupHook.run(); + } + return offered; + } + + @Override + public void put(final IndexedConsensusRequest request) throws InterruptedException { + super.put(request); + wakeupHook.run(); + } + } + + private static final class PendingSeekRequest { + + private final long targetSearchIndex; + private final RegionProgress committedRegionProgress; + private final String seekReason; + private final boolean previousPrefetchInitialized; + private final long previousSeekGeneration; + private final long targetSeekGeneration; + + private boolean completed = false; + private RuntimeException failure; + + private PendingSeekRequest( + final long targetSearchIndex, + final 
RegionProgress committedRegionProgress, + final String seekReason, + final boolean previousPrefetchInitialized, + final long previousSeekGeneration, + final long targetSeekGeneration) { + this.targetSearchIndex = targetSearchIndex; + this.committedRegionProgress = committedRegionProgress; + this.seekReason = seekReason; + this.previousPrefetchInitialized = previousPrefetchInitialized; + this.previousSeekGeneration = previousSeekGeneration; + this.targetSeekGeneration = targetSeekGeneration; + } + + private synchronized void complete() { + completed = true; + notifyAll(); + } + + private synchronized void fail(final RuntimeException failure) { + this.failure = failure; + completed = true; + notifyAll(); + } + + private synchronized void awaitCompletion() { + while (!completed) { + try { + wait(50L); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while waiting for seek application", e); + } + } + if (failure != null) { + throw failure; + } + } + } + + public ConsensusPrefetchingQueue( + final String brokerId, + final String topicName, + final String orderMode, + final ConsensusGroupId consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final SubscriptionWalRetentionPolicy retentionPolicy, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final RegionProgress fallbackCommittedRegionProgress, + final long tailStartSearchIndex, + final long initialRuntimeVersion, + final boolean initialActive) { + this.brokerId = brokerId; + this.topicName = topicName; + this.consensusGroupId = consensusGroupId; + this.serverImpl = serverImpl; + this.consensusReqReader = serverImpl.getConsensusReqReader(); + this.retentionPolicy = retentionPolicy; + this.converter = converter; + this.commitManager = commitManager; + this.fallbackCommittedRegionProgress = fallbackCommittedRegionProgress; + this.fallbackTailSearchIndex = tailStartSearchIndex; + 
this.runtimeVersion = initialRuntimeVersion; + this.isActive = initialActive; + this.orderMode = TopicConfig.normalizeOrderMode(orderMode); + + this.seekGeneration = new AtomicLong(0); + this.nextExpectedSearchIndex = new AtomicLong(tailStartSearchIndex); + + this.prefetchingQueue = new PriorityBlockingQueue<>(); + this.inFlightEvents = new ConcurrentHashMap<>(); + this.observedSeekGeneration = seekGeneration.get(); + + // Register pending queue early so we don't miss real-time writes + this.pendingEntries = + new WakeableIndexedConsensusQueue(PENDING_QUEUE_CAPACITY, this::requestPrefetch); + serverImpl.registerSubscriptionQueue(pendingEntries, retentionPolicy); + + LOGGER.info( + "ConsensusPrefetchingQueue created (dormant): brokerId={}, topicName={}, " + + "orderMode={}, consensusGroupId={}, fallbackCommittedRegionProgress={}, " + + "fallbackTailSearchIndex={}, initialRuntimeVersion={}, initialActive={}", + brokerId, + topicName, + this.orderMode, + consensusGroupId, + fallbackCommittedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + + // Register metrics + ConsensusSubscriptionPrefetchingQueueMetrics.getInstance().register(this); + } + + // ======================== Lock Operations ======================== + + private void acquireReadLock() { + lock.readLock().lock(); + } + + private void releaseReadLock() { + lock.readLock().unlock(); + } + + private void acquireWriteLock() { + lock.writeLock().lock(); + } + + private void releaseWriteLock() { + lock.writeLock().unlock(); + } + + private void requestPrefetch() { + if (closeRequested || isClosed) { + return; + } + final ConsensusPrefetchSubtask subtask = ensurePrefetchSubtaskBound(); + if (Objects.nonNull(subtask)) { + subtask.requestWakeupNow(); + } + } + + private ConsensusPrefetchSubtask ensurePrefetchSubtaskBound() { + if (closeRequested || isClosed) { + return null; + } + + final ConsensusSubscriptionPrefetchExecutor currentExecutor = + 
ConsensusSubscriptionPrefetchExecutorManager.getInstance().getExecutor(); + if (Objects.isNull(currentExecutor)) { + return null; + } + + final ConsensusPrefetchSubtask currentSubtask = prefetchSubtask; + if (Objects.nonNull(currentSubtask) + && prefetchExecutor == currentExecutor + && !currentSubtask.isClosed()) { + return currentSubtask; + } + + synchronized (prefetchBindingLock) { + if (closeRequested || isClosed) { + return null; + } + + if (Objects.nonNull(prefetchSubtask) + && prefetchExecutor == currentExecutor + && !prefetchSubtask.isClosed()) { + return prefetchSubtask; + } + + final ConsensusPrefetchSubtask staleSubtask = prefetchSubtask; + final ConsensusSubscriptionPrefetchExecutor staleExecutor = prefetchExecutor; + if (Objects.nonNull(staleSubtask) + && Objects.nonNull(staleExecutor) + && (staleExecutor != currentExecutor || staleSubtask.isClosed()) + && !staleExecutor.isShutdown()) { + staleExecutor.deregister(staleSubtask.getTaskId()); + } + + final ConsensusPrefetchSubtask newSubtask = new ConsensusPrefetchSubtask(this); + if (!currentExecutor.register(newSubtask)) { + return null; + } + prefetchExecutor = currentExecutor; + prefetchSubtask = newSubtask; + return newSubtask; + } + } + + private Pair + detachPrefetchSubtask() { + synchronized (prefetchBindingLock) { + final Pair detached = + new Pair<>(prefetchExecutor, prefetchSubtask); + prefetchExecutor = null; + prefetchSubtask = null; + return detached; + } + } + + private boolean shouldRecoverPrefetchBindingAfterEmptyPoll() { + if (!prefetchInitialized || isClosed || closeRequested || pendingSeekRequest != null) { + return false; + } + + final ConsensusSubscriptionPrefetchExecutor currentExecutor = + ConsensusSubscriptionPrefetchExecutorManager.getInstance().getExecutor(); + if (Objects.isNull(currentExecutor)) { + return false; + } + + final ConsensusPrefetchSubtask currentSubtask = prefetchSubtask; + final boolean bindingMissing = + Objects.isNull(currentSubtask) + || 
currentSubtask.isClosed() + || Objects.isNull(prefetchExecutor) + || prefetchExecutor.isShutdown() + || prefetchExecutor != currentExecutor; + if (!bindingMissing) { + return false; + } + + return hasImmediatePrefetchableWork() + || hasHistoricalWalLag() + || !lingerBatch.isEmpty() + || !inFlightEvents.isEmpty() + || computeWatermarkDelayMs() > 0L; + } + + // ======================== Poll ======================== + + public SubscriptionEvent poll(final String consumerId) { + return poll(consumerId, null); + } + + public SubscriptionEvent poll(final String consumerId, final RegionProgress regionProgress) { + acquireReadLock(); + try { + if (isClosed || closeRequested || !isActive) { + return null; + } + if (!prefetchInitialized) { + initPrefetch(regionProgress); + } + if (pendingSeekRequest != null) { + return null; + } + final SubscriptionEvent event = pollInternal(consumerId); + if (Objects.nonNull(event) && prefetchingQueue.size() < MAX_PREFETCHING_QUEUE_SIZE) { + requestPrefetch(); + } else if (Objects.isNull(event) && shouldRecoverPrefetchBindingAfterEmptyPoll()) { + requestPrefetch(); + } + return event; + } finally { + releaseReadLock(); + } + } + + private synchronized void initPrefetch(final RegionProgress regionProgress) { + if (prefetchInitialized) { + return; // double-check under synchronization + } + + final RegionProgress committedRegionProgress = resolveCommittedRegionProgressForInit(); + final boolean useConsumerHint = + shouldUseConsumerRegionProgressHint(regionProgress, committedRegionProgress); + final RegionProgress recoveryRegionProgress = + useConsumerHint + ? mergeRecoveryRegionProgress(committedRegionProgress, regionProgress) + : committedRegionProgress; + final String progressSource = + useConsumerHint + ? Objects.nonNull(committedRegionProgress) + && !committedRegionProgress.getWriterPositions().isEmpty() + ? 
"merged committed region progress with consumer topic progress hint" + : "consumer topic progress hint" + : "committed region progress fallback"; + final ReplayLocateDecision resolvedStart = + resolveInitReplayStartDecision(recoveryRegionProgress, progressSource); + + clearRecoveryWriterProgress(); + final RegionProgress effectiveRecoveryRegionProgress = + resolvedStart.getRecoveryRegionProgress(); + if (Objects.nonNull(effectiveRecoveryRegionProgress) + && !effectiveRecoveryRegionProgress.getWriterPositions().isEmpty()) { + installRecoveryWriterProgress(effectiveRecoveryRegionProgress); + } + + this.nextExpectedSearchIndex.set(resolvedStart.getStartSearchIndex()); + if (consensusReqReader instanceof WALNode) { + this.subscriptionWALIterator = + new ProgressWALIterator( + (WALNode) consensusReqReader, resolvedStart.getStartSearchIndex()); + } + this.prefetchInitialized = true; + this.observedSeekGeneration = seekGeneration.get(); + this.lingerBatch.reset(); + resetBatchWriterProgress(); + + LOGGER.info( + "ConsensusPrefetchingQueue {}: prefetch initialized, startSearchIndex={}, progressSource={}, recoveryWriterCount={}", + this, + resolvedStart.getStartSearchIndex(), + resolvedStart.getDetail(), + recoveryWriterProgressByWriter.size()); + + requestPrefetch(); + } + + private ReplayLocateDecision resolveInitReplayStartDecision( + final RegionProgress recoveryRegionProgress, final String progressSource) { + if (Objects.isNull(recoveryRegionProgress) + || recoveryRegionProgress.getWriterPositions().isEmpty()) { + return ReplayLocateDecision.found( + fallbackTailSearchIndex, + new RegionProgress(Collections.emptyMap()), + progressSource + " (tail start without progress)"); + } + if (!(consensusReqReader instanceof WALNode)) { + throw new IllegalStateException( + String.format( + "ConsensusPrefetchingQueue %s: cannot recover from non-empty region progress without WAL access: %s", + this, recoveryRegionProgress)); + } + + final ReplayLocateDecision replayTarget = + 
locateReplayStartForRegionProgress(recoveryRegionProgress, true); + switch (replayTarget.getStatus()) { + case FOUND: + case AT_END: + return new ReplayLocateDecision( + replayTarget.getStatus(), + replayTarget.getStartSearchIndex(), + replayTarget.getRecoveryRegionProgress(), + progressSource + " (" + replayTarget.getDetail() + ")"); + case LOCATE_MISS: + default: + throw new IllegalStateException( + String.format( + "ConsensusPrefetchingQueue %s: cannot initialize replay start from region progress %s: %s", + this, recoveryRegionProgress, replayTarget.getDetail())); + } + } + + private boolean shouldUseConsumerRegionProgressHint( + final RegionProgress regionProgress, final RegionProgress committedRegionProgress) { + if (Objects.isNull(regionProgress) || regionProgress.getWriterPositions().isEmpty()) { + return false; + } + if (Objects.isNull(committedRegionProgress) + || committedRegionProgress.getWriterPositions().isEmpty()) { + return true; + } + for (final Map.Entry entry : + regionProgress.getWriterPositions().entrySet()) { + if (Objects.isNull(entry.getKey()) || Objects.isNull(entry.getValue())) { + continue; + } + final WriterProgress committedWriterProgress = + committedRegionProgress.getWriterPositions().get(entry.getKey()); + if (Objects.isNull(committedWriterProgress) + || compareWriterProgress(entry.getValue(), committedWriterProgress) > 0) { + return true; + } + } + return false; + } + + private RegionProgress mergeRecoveryRegionProgress( + final RegionProgress committedRegionProgress, final RegionProgress consumerRegionProgress) { + if (Objects.isNull(committedRegionProgress) + || committedRegionProgress.getWriterPositions().isEmpty()) { + return consumerRegionProgress; + } + if (Objects.isNull(consumerRegionProgress) + || consumerRegionProgress.getWriterPositions().isEmpty()) { + return committedRegionProgress; + } + + final Map mergedWriterProgress = new LinkedHashMap<>(); + committedRegionProgress + .getWriterPositions() + .forEach( + (writerId, 
writerProgress) -> { + if (Objects.nonNull(writerId) && Objects.nonNull(writerProgress)) { + mergedWriterProgress.put(writerId, writerProgress); + } + }); + consumerRegionProgress + .getWriterPositions() + .forEach( + (writerId, writerProgress) -> { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + return; + } + mergedWriterProgress.merge( + writerId, + writerProgress, + (committedWriterProgress, consumerWriterProgress) -> + compareWriterProgress(consumerWriterProgress, committedWriterProgress) > 0 + ? consumerWriterProgress + : committedWriterProgress); + }); + return new RegionProgress(mergedWriterProgress); + } + + protected RegionProgress resolveCommittedRegionProgressForInit() { + commitManager.getOrCreateState(brokerId, topicName, consensusGroupId); + final RegionProgress latestCommittedRegionProgress = + commitManager.getCommittedRegionProgress(brokerId, topicName, consensusGroupId); + if (Objects.nonNull(latestCommittedRegionProgress) + && !latestCommittedRegionProgress.getWriterPositions().isEmpty()) { + return latestCommittedRegionProgress; + } + return Objects.nonNull(fallbackCommittedRegionProgress) + && !fallbackCommittedRegionProgress.getWriterPositions().isEmpty() + ? 
fallbackCommittedRegionProgress + : null; + } + + private void installRecoveryWriterProgress(final RegionProgress regionProgress) { + recoveryWriterProgressByWriter.clear(); + recoveryWriterProgressByWriter.putAll(regionProgress.getWriterPositions()); + regionProgress + .getWriterPositions() + .keySet() + .forEach(writerId -> trackWriterLane(writerId.getNodeId(), writerId.getWriterEpoch())); + } + + private void clearRecoveryWriterProgress() { + recoveryWriterProgressByWriter.clear(); + } + + private boolean shouldSkipForRecoveryProgress(final IndexedConsensusRequest request) { + if (recoveryWriterProgressByWriter.isEmpty()) { + return false; + } + return isRequestCoveredByRegionProgress(request, recoveryWriterProgressByWriter, true); + } + + private boolean hasComparableWriterProgress(final IndexedConsensusRequest request) { + return request.getNodeId() >= 0 + && request.getWriterEpoch() >= 0 + && request.getPhysicalTime() > 0 + && request.getProgressLocalSeq() >= 0; + } + + private WriterId toWriterId(final IndexedConsensusRequest request) { + return new WriterId(consensusGroupId.toString(), request.getNodeId(), request.getWriterEpoch()); + } + + private WriterProgress toWriterProgress(final IndexedConsensusRequest request) { + return new WriterProgress(request.getPhysicalTime(), request.getProgressLocalSeq()); + } + + private boolean isRequestCoveredByRegionProgress( + final IndexedConsensusRequest request, + final Map regionProgressByWriter, + final boolean seekAfter) { + if (!hasComparableWriterProgress(request)) { + return false; + } + final WriterProgress committedProgress = regionProgressByWriter.get(toWriterId(request)); + if (Objects.isNull(committedProgress)) { + return false; + } + final int cmp = compareWriterProgress(toWriterProgress(request), committedProgress); + return seekAfter ? 
cmp <= 0 : cmp < 0; + } + + private WriterProgress decrementWriterProgress(final WriterProgress writerProgress) { + return new WriterProgress( + writerProgress.getPhysicalTime(), + writerProgress.getLocalSeq() > 0L + ? writerProgress.getLocalSeq() - 1L + : BEFORE_FIRST_LOCAL_SEQ); + } + + protected ReplayLocateDecision scanReplayStartForRequests( + final Iterable requests, + final RegionProgress regionProgress, + final boolean seekAfter) { + final Map requestedWriterProgress = new LinkedHashMap<>(); + if (Objects.nonNull(regionProgress)) { + regionProgress + .getWriterPositions() + .forEach( + (writerId, writerProgress) -> { + if (Objects.nonNull(writerId) && Objects.nonNull(writerProgress)) { + requestedWriterProgress.put(writerId, writerProgress); + } + }); + } + final Map effectiveRecoveryWriterProgress = + new LinkedHashMap<>(requestedWriterProgress); + final Set exactVisibleWriterIds = new LinkedHashSet<>(); + Long firstUncoveredReplayableSearchIndex = null; + boolean sawBlockingNonReplayableUncovered = false; + + for (final IndexedConsensusRequest request : requests) { + if (!hasComparableWriterProgress(request)) { + continue; + } + + final WriterId writerId = toWriterId(request); + final WriterProgress requestProgress = toWriterProgress(request); + final WriterProgress storedWriterProgress = requestedWriterProgress.get(writerId); + if (!seekAfter + && Objects.nonNull(storedWriterProgress) + && compareWriterProgress(requestProgress, storedWriterProgress) == 0) { + exactVisibleWriterIds.add(writerId); + } + + if (isRequestCoveredByRegionProgress(request, requestedWriterProgress, seekAfter)) { + continue; + } + + if (request.getSearchIndex() >= 0) { + if (Objects.isNull(firstUncoveredReplayableSearchIndex)) { + firstUncoveredReplayableSearchIndex = request.getSearchIndex(); + } + } else if (Objects.isNull(firstUncoveredReplayableSearchIndex)) { + sawBlockingNonReplayableUncovered = true; + } + } + + if (!seekAfter && !exactVisibleWriterIds.isEmpty()) { + for 
(final WriterId writerId : exactVisibleWriterIds) { + final WriterProgress writerProgress = requestedWriterProgress.get(writerId); + if (Objects.nonNull(writerProgress)) { + effectiveRecoveryWriterProgress.put(writerId, decrementWriterProgress(writerProgress)); + } + } + } + final RegionProgress effectiveRecoveryRegionProgress = + new RegionProgress(effectiveRecoveryWriterProgress); + + if (sawBlockingNonReplayableUncovered) { + return ReplayLocateDecision.locateMiss( + effectiveRecoveryRegionProgress, + "uncovered non-replayable WAL records appear before the first local replayable record"); + } + if (Objects.nonNull(firstUncoveredReplayableSearchIndex)) { + return ReplayLocateDecision.found( + firstUncoveredReplayableSearchIndex, + effectiveRecoveryRegionProgress, + "resolved first uncovered replayable WAL record"); + } + return ReplayLocateDecision.atEnd( + consensusReqReader.getCurrentSearchIndex(), + computeTailRegionProgress(), + "all locally replayable WAL records are already covered"); + } + + protected ReplayLocateDecision locateReplayStartForRegionProgress( + final RegionProgress regionProgress, final boolean seekAfter) { + if (!(consensusReqReader instanceof WALNode)) { + return ReplayLocateDecision.locateMiss( + regionProgress, "WAL access is unavailable for region-level replay lookup"); + } + + final WALNode walNode = (WALNode) consensusReqReader; + final List replayRequests = new ArrayList<>(); + try (final ProgressWALIterator iterator = new ProgressWALIterator(walNode, Long.MIN_VALUE)) { + while (iterator.hasNext()) { + replayRequests.add(iterator.next()); + } + if (iterator.hasIncompleteScan()) { + return ReplayLocateDecision.locateMiss( + regionProgress, + "replay lookup did not complete: " + iterator.getIncompleteScanDetail()); + } + return scanReplayStartForRequests(replayRequests, regionProgress, seekAfter); + } catch (final IOException e) { + return ReplayLocateDecision.locateMiss( + regionProgress, "failed to close replay lookup iterator: " + 
e.getMessage()); + } + } + + private boolean shouldTrackFollowerProgressForDedup(final IndexedConsensusRequest request) { + return request.getSearchIndex() < 0 + && request.getNodeId() >= 0 + && request.getWriterEpoch() >= 0 + && request.getProgressLocalSeq() >= 0; + } + + private boolean shouldSkipForMaterializedFollowerProgress(final IndexedConsensusRequest request) { + if (!shouldTrackFollowerProgressForDedup(request)) { + return false; + } + final Long materializedLocalSeq = + materializedFollowerProgressByWriter.get( + new WriterLaneId(request.getNodeId(), request.getWriterEpoch())); + return Objects.nonNull(materializedLocalSeq) + && request.getProgressLocalSeq() <= materializedLocalSeq; + } + + private void markMaterializedFollowerProgress(final IndexedConsensusRequest request) { + if (!shouldTrackFollowerProgressForDedup(request)) { + return; + } + materializedFollowerProgressByWriter.merge( + new WriterLaneId(request.getNodeId(), request.getWriterEpoch()), + request.getProgressLocalSeq(), + Math::max); + } + + private int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } + + private WriterLaneState trackWriterLane(final int writerNodeId, final long writerEpoch) { + return writerLanes.computeIfAbsent( + new WriterLaneId(writerNodeId, writerEpoch), ignored -> new WriterLaneState()); + } + + private void refreshWriterLaneSafeFrontiers() { + final Map safePts = + serverImpl.getWriterSafeFrontierTracker().snapshotEffectiveSafePts(); + for (final Map.Entry entry : + safePts.entrySet()) { + final WriterLaneState laneState = + trackWriterLane(entry.getKey().getWriterNodeId(), entry.getKey().getWriterEpoch()); + laneState.effectiveSafePt = Math.max(laneState.effectiveSafePt, entry.getValue()); + } + } + + 
private PriorityQueue buildLaneFrontiers( + final Map laneEntriesByLane, final Function headSupplier) { + refreshWriterLaneSafeFrontiers(); + final PriorityQueue frontiers = new PriorityQueue<>(); + final boolean useActiveWriterBarriers = shouldUseActiveWriterBarriers(); + final Set laneIds = ConcurrentHashMap.newKeySet(); + final Set seenActiveWriterNodeIds = ConcurrentHashMap.newKeySet(); + laneIds.addAll(writerLanes.keySet()); + laneIds.addAll(laneEntriesByLane.keySet()); + for (final WriterLaneId laneId : laneIds) { + final WriterLaneState laneState = writerLanes.get(laneId); + if (Objects.nonNull(laneState) && laneState.closed) { + continue; + } + final T head = headSupplier.apply(laneId); + if (Objects.nonNull(head)) { + if (isLaneRuntimeActive(laneId)) { + seenActiveWriterNodeIds.add(laneId.writerNodeId); + } + frontiers.add(LaneFrontier.forHead(laneId, head)); + continue; + } + if (Objects.nonNull(laneState) + && laneState.effectiveSafePt > 0 + && useActiveWriterBarriers + && isLaneRuntimeActive(laneId)) { + seenActiveWriterNodeIds.add(laneId.writerNodeId); + frontiers.add(LaneFrontier.forBarrier(laneId, laneState.effectiveSafePt)); + } + } + if (useActiveWriterBarriers) { + for (final Integer activeWriterNodeId : activeWriterNodeIds) { + if (!seenActiveWriterNodeIds.contains(activeWriterNodeId)) { + frontiers.add( + LaneFrontier.forBarrier(new WriterLaneId(activeWriterNodeId, 0L), Long.MIN_VALUE)); + break; + } + } + } + return frontiers; + } + + private boolean shouldUseActiveWriterBarriers() { + return !TopicConstant.ORDER_MODE_PER_WRITER_VALUE.equals(orderMode); + } + + private void bufferRealtimeEntry(final PreparedEntry entry) { + final WriterLaneId laneId = new WriterLaneId(entry.writerNodeId, entry.writerEpoch); + realtimeEntriesByLane + .computeIfAbsent(laneId, ignored -> new TreeMap<>()) + .put(entry.localSeq, entry); + } + + private PreparedEntry peekRealtimeEntry(final WriterLaneId laneId) { + final NavigableMap laneEntries = 
realtimeEntriesByLane.get(laneId); + if (Objects.isNull(laneEntries) || laneEntries.isEmpty()) { + return null; + } + final Map.Entry firstEntry = laneEntries.firstEntry(); + return Objects.nonNull(firstEntry) ? firstEntry.getValue() : null; + } + + private void removeRealtimeEntry(final WriterLaneId laneId, final long localSeq) { + final NavigableMap laneEntries = realtimeEntriesByLane.get(laneId); + if (Objects.isNull(laneEntries)) { + return; + } + laneEntries.remove(localSeq); + if (laneEntries.isEmpty()) { + realtimeEntriesByLane.remove(laneId); + } + } + + private PriorityQueue buildRealtimeLaneFrontiers() { + return buildLaneFrontiers(realtimeEntriesByLane, this::peekRealtimeEntry); + } + + private SubscriptionEvent pollInternal(final String consumerId) { + final long size = prefetchingQueue.size(); + if (size == 0) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: prefetching queue is empty for consumerId={}, " + + "pendingEntriesSize={}, nextExpected={}, isClosed={}, prefetchInitialized={}, subtaskScheduled={}", + this, + consumerId, + pendingEntries.size(), + nextExpectedSearchIndex.get(), + isClosed, + prefetchInitialized, + Objects.nonNull(prefetchSubtask) && prefetchSubtask.isScheduledOrRunning()); + return null; + } + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: polling, queue size={}, consumerId={}", + this, + size, + consumerId); + long count = 0; + + SubscriptionEvent event; + try { + while (count++ < size + && Objects.nonNull( + event = + prefetchingQueue.poll( + SubscriptionConfig.getInstance().getSubscriptionPollMaxBlockingTimeMs(), + TimeUnit.MILLISECONDS))) { + // Metadata events (currently WATERMARK) are fire-and-forget: + // skip inFlightEvents tracking so they are not recycled and re-delivered indefinitely. 
+ if (event.getCurrentResponse().getResponseType() + == SubscriptionPollResponseType.WATERMARK.getType()) { + return event; + } + + if (event.isCommitted()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {} poll committed event {} (broken invariant), remove it", + this, + event); + continue; + } + + if (!event.pollable()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {} poll non-pollable event {} (broken invariant), nack it", + this, + event); + event.nack(); + continue; + } + + // Mark as polled before updating inFlightEvents + event.recordLastPolledTimestamp(); + inFlightEvents.put(new Pair<>(consumerId, event.getCommitContext()), event); + event.recordLastPolledConsumerId(consumerId); + return event; + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + LOGGER.warn("ConsensusPrefetchingQueue {} interrupted while polling", this, e); + } + + return null; + } + + public SubscriptionEvent pollTablets( + final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { + acquireReadLock(); + try { + if (isClosed || closeRequested || pendingSeekRequest != null) { + return null; + } + final SubscriptionEvent event = inFlightEvents.get(new Pair<>(consumerId, commitContext)); + if (Objects.isNull(event)) { + if (isCommitContextOutdated(commitContext)) { + return generateOutdatedErrorResponse(); + } + return generateErrorResponse( + String.format( + "ConsensusPrefetchingQueue %s: no in-flight event for consumer %s, commit context %s", + this, consumerId, commitContext)); + } + return event; + } finally { + releaseReadLock(); + } + } + + // ======================== Prefetch Round Drive ======================== + + private static final long WAL_GAP_RETRY_SLEEP_MS = 10L; + private static final long WAL_GAP_WAIT_LOG_INTERVAL_MS = 5_000L; + + private static final long PREFETCH_STATS_LOG_INTERVAL_MS = 5_000L; + + public PrefetchRoundResult drivePrefetchOnce() { + if (applyPendingSeekRequestIfNecessary()) { + return 
closeRequested ? PrefetchRoundResult.dormant() : PrefetchRoundResult.rescheduleNow(); + } + + acquireReadLock(); + try { + if (isClosed || closeRequested || !prefetchInitialized) { + return PrefetchRoundResult.dormant(); + } + + logPeriodicStatsIfNecessary(); + + final long currentSeekGeneration = seekGeneration.get(); + if (currentSeekGeneration != observedSeekGeneration) { + resetRoundStateForSeek(currentSeekGeneration); + } + + applyPendingSubscriptionWalReset(observedSeekGeneration); + recycleInFlightEvents(); + + if (!isActive || prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) { + return computeIdleRoundResult(); + } + + final SubscriptionConfig config = SubscriptionConfig.getInstance(); + final int maxWalEntries = config.getSubscriptionConsensusBatchMaxWalEntries(); + final int batchMaxDelayMs = config.getSubscriptionConsensusBatchMaxDelayInMs(); + final int maxTablets = config.getSubscriptionConsensusBatchMaxTabletCount(); + final long maxBatchBytes = config.getSubscriptionConsensusBatchMaxSizeInBytes(); + + final List batch = drainPendingEntries(maxWalEntries); + if (!batch.isEmpty()) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: drained {} entries from pendingEntries, " + + "first searchIndex={}, last searchIndex={}, nextExpected={}, " + + "prefetchingQueueSize={}", + this, + batch.size(), + batch.get(0).getSearchIndex(), + batch.get(batch.size() - 1).getSearchIndex(), + nextExpectedSearchIndex.get(), + prefetchingQueue.size()); + + final boolean batchAccepted = + accumulateFromPending( + batch, lingerBatch, observedSeekGeneration, maxTablets, maxBatchBytes); + if (!batchAccepted) { + if (pendingWalGapRetryRequested) { + // Once a drained batch hits an unresolved WAL gap, the affected suffix falls back to + // the WAL path on later rounds instead of being requeued into the bounded pending path. 
+ return PrefetchRoundResult.rescheduleAfter(WAL_GAP_RETRY_SLEEP_MS); + } + resetRoundStateForSeek(seekGeneration.get()); + return PrefetchRoundResult.rescheduleNow(); + } + } + + if (batch.isEmpty() && lingerBatch.isEmpty()) { + tryCatchUpFromWAL(observedSeekGeneration); + } + + if (!drainBufferedRealtimeLanes( + lingerBatch, observedSeekGeneration, maxTablets, maxBatchBytes)) { + resetRoundStateForSeek(seekGeneration.get()); + return PrefetchRoundResult.rescheduleNow(); + } + + if (!lingerBatch.isEmpty() && lingerBatch.firstTabletTimeMs > 0L) { + final long lingerElapsedMs = System.currentTimeMillis() - lingerBatch.firstTabletTimeMs; + if (lingerElapsedMs >= batchMaxDelayMs) { + if (seekGeneration.get() != observedSeekGeneration) { + resetRoundStateForSeek(seekGeneration.get()); + return PrefetchRoundResult.rescheduleNow(); + } + LOGGER.debug( + "ConsensusPrefetchingQueue {}: time-based flush, {} tablets lingered for {}ms " + + "(threshold={}ms)", + this, + lingerBatch.tablets.size(), + lingerElapsedMs, + batchMaxDelayMs); + flushBatch(lingerBatch, observedSeekGeneration); + } + } + + maybeInjectWatermark(); + return computeIdleRoundResult(); + } catch (final Throwable fatal) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: prefetch round failed " + "(type={}, message={})", + this, + fatal.getClass().getName(), + fatal.getMessage(), + fatal); + if (fatal instanceof VirtualMachineError) { + markClosed(); + return PrefetchRoundResult.dormant(); + } + return PrefetchRoundResult.rescheduleAfter(100L); + } finally { + releaseReadLock(); + } + } + + private void logPeriodicStatsIfNecessary() { + final long nowMs = System.currentTimeMillis(); + if (nowMs - lastStatsLogTimeMs < PREFETCH_STATS_LOG_INTERVAL_MS) { + return; + } + + final long currentPendingAcceptedEntries = pendingPathAcceptedEntries.get(); + final long currentWalAcceptedEntries = walPathAcceptedEntries.get(); + LOGGER.info( + "ConsensusPrefetchingQueue {}: periodic stats, lag={}, pendingDelta={}, 
walDelta={}, "
            + "pendingTotal={}, walTotal={}, pendingQueueSize={}, prefetchingQueueSize={}, "
            + "inFlightEventsSize={}, realtimeLaneCount={}, walHasNext={}, isActive={}, subtaskScheduled={}",
        this,
        getLag(),
        currentPendingAcceptedEntries - lastPendingAcceptedEntries,
        currentWalAcceptedEntries - lastWalAcceptedEntries,
        currentPendingAcceptedEntries,
        currentWalAcceptedEntries,
        pendingEntries.size(),
        prefetchingQueue.size(),
        inFlightEvents.size(),
        realtimeEntriesByLane.size(),
        hasReadableWalEntries(),
        isActive,
        Objects.nonNull(prefetchSubtask) && prefetchSubtask.isScheduledOrRunning());
    lastStatsLogTimeMs = nowMs;
    lastPendingAcceptedEntries = currentPendingAcceptedEntries;
    lastWalAcceptedEntries = currentWalAcceptedEntries;
  }

  /** Resets per-round prefetch state after a seek lands under a new seek generation. */
  private void resetRoundStateForSeek(final long newSeekGeneration) {
    restorePendingSubscriptionWalCursor(newSeekGeneration);
    lingerBatch.reset();
    resetBatchWriterProgress();
    observedSeekGeneration = newSeekGeneration;
  }

  /** Drains up to {@code maxWalEntries} requests from the pending queue into a new list. */
  private List drainPendingEntries(final int maxWalEntries) {
    // NOTE(review): generic type parameters appear stripped by extraction in this region
    // (presumably List<IndexedConsensusRequest>) — confirm against the original source.
    final List batch = new ArrayList<>();
    IndexedConsensusRequest next;
    while (batch.size() < maxWalEntries && (next = pendingEntries.poll()) != null) {
      batch.add(next);
    }
    return batch;
  }

  /**
   * Decides how to reschedule the prefetch worker when a round produced no work: run again
   * immediately, sleep a bounded delay, or go dormant until externally woken.
   */
  private PrefetchRoundResult computeIdleRoundResult() {
    if (isClosed || !prefetchInitialized || !isActive) {
      return PrefetchRoundResult.dormant();
    }
    if (prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) {
      // Queue is full; no point spinning — a later round will be triggered externally.
      return PrefetchRoundResult.dormant();
    }
    if (hasImmediatePrefetchableWork()) {
      return PrefetchRoundResult.rescheduleNow();
    }
    long delayMs = Long.MAX_VALUE;
    if (hasHistoricalWalLag()) {
      delayMs = Math.min(delayMs, WAL_GAP_RETRY_SLEEP_MS);
    }
    if (!lingerBatch.isEmpty() && lingerBatch.firstTabletTimeMs > 0L) {
      // Wake up in time to flush the linger batch once its max delay elapses.
      final long lingerDelayMs =
          SubscriptionConfig.getInstance().getSubscriptionConsensusBatchMaxDelayInMs()
              - (System.currentTimeMillis() - lingerBatch.firstTabletTimeMs);
      delayMs = Math.min(delayMs, Math.max(1L, lingerDelayMs));
    }

    final long watermarkDelayMs = computeWatermarkDelayMs();
    if (watermarkDelayMs > 0L) {
      delayMs = Math.min(delayMs, watermarkDelayMs);
    }

    if (!inFlightEvents.isEmpty()) {
      // Wake up periodically to recycle timed-out in-flight events.
      delayMs =
          Math.min(
              delayMs,
              SubscriptionConfig.getInstance().getSubscriptionRecycleUncommittedEventIntervalMs());
    }

    return delayMs == Long.MAX_VALUE
        ? PrefetchRoundResult.dormant()
        : PrefetchRoundResult.rescheduleAfter(delayMs);
  }

  /**
   * Computes the delay until the next watermark emission, or a non-positive value when watermark
   * emission is disabled (interval <= 0) or no data has been observed yet.
   */
  private long computeWatermarkDelayMs() {
    if (maxObservedTimestamp == Long.MIN_VALUE) {
      // Nothing observed yet; nothing to watermark.
      return -1L;
    }
    final long intervalMs =
        SubscriptionConfig.getInstance().getSubscriptionConsensusWatermarkIntervalMs();
    if (intervalMs <= 0L) {
      return -1L;
    }
    if (lastWatermarkEmitTimeMs == 0L) {
      // Never emitted: emit as soon as possible.
      return 1L;
    }
    final long elapsedMs = System.currentTimeMillis() - lastWatermarkEmitTimeMs;
    return elapsedMs >= intervalMs ? 1L : Math.max(1L, intervalMs - elapsedMs);
  }

  /** True when there is work the prefetch worker can act on right now. */
  private boolean hasImmediatePrefetchableWork() {
    return !pendingEntries.isEmpty() || !realtimeEntriesByLane.isEmpty() || hasReadableWalEntries();
  }

  /** True when the local replay cursor is behind the WAL's current write position. */
  private boolean hasHistoricalWalLag() {
    return nextExpectedSearchIndex.get() < consensusReqReader.getCurrentSearchIndex();
  }

  /**
   * Accumulates tablets from pending entries into the linger buffer. When pending replay outruns
   * the local WAL reader, this method backfills the local-index gap from WAL before continuing.
   *
   * @return false if the batch became stale because seek generation changed while flushing
   */
  // NOTE(review): the javadoc above describes accumulateFromPending (defined below), not the
  // small helpers that follow it — consider relocating it in the original source.
  /** True when the request carries a local (non-negative) search index. */
  private static boolean hasLocalSearchIndex(final IndexedConsensusRequest request) {
    return request.getSearchIndex() >= 0;
  }

  /** True when the request's local index is strictly behind the local replay cursor. */
  private boolean isBeforeLocalCursor(final IndexedConsensusRequest request) {
    return hasLocalSearchIndex(request) && request.getSearchIndex() < nextExpectedSearchIndex.get();
  }

  /** Advances the local replay cursor past the request, if it carries a local index. */
  private void advanceLocalCursorIfPresent(final IndexedConsensusRequest request) {
    if (hasLocalSearchIndex(request)) {
      nextExpectedSearchIndex.set(request.getSearchIndex() + 1);
    }
  }

  /**
   * Prepares a request and routes it through the realtime lane into the current batch, recording
   * acceptance stats depending on whether it came from the pending queue or from WAL.
   *
   * @return false if the batch became stale (seek generation changed) while appending
   */
  private boolean appendRealtimeRequest(
      final IndexedConsensusRequest request,
      final DeliveryBatchState batchState,
      final long expectedSeekGeneration,
      final int maxTablets,
      final long maxBatchBytes,
      final boolean fromPending) {
    final PreparedEntry preparedEntry = prepareEntry(request);
    if (Objects.isNull(preparedEntry)) {
      // Entry produced no deliverable tablets; treat as accepted and move on.
      return true;
    }
    if (!appendPreparedEntryViaRealtimeLane(
        batchState, preparedEntry, expectedSeekGeneration, maxTablets, maxBatchBytes)) {
      return false;
    }
    if (fromPending) {
      markAcceptedFromPending();
    } else {
      markAcceptedFromWal();
    }
    return true;
  }

  /**
   * Accumulates tablets from the drained pending batch into the linger buffer, backfilling
   * local-index gaps from WAL as needed.
   *
   * @return false if the batch became stale (seek generation changed) mid-way
   */
  private boolean accumulateFromPending(
      final List batch,
      final DeliveryBatchState lingerBatch,
      final long expectedSeekGeneration,
      final int maxTablets,
      final long maxBatchBytes) {

    int processedCount = 0;
    int skippedCount = 0;

    for (int index = 0; index < batch.size(); index++) {
      final IndexedConsensusRequest request = batch.get(index);
      final long searchIndex = request.getSearchIndex();

      // Only local-indexed requests participate in the internal WAL read cursor.
      final long expected = nextExpectedSearchIndex.get();
      if (hasLocalSearchIndex(request) && searchIndex > expected) {
        LOGGER.debug(
            "ConsensusPrefetchingQueue {}: gap detected, expected={}, got={}. "
                + "Filling {} entries from WAL.",
            this,
            expected,
            searchIndex,
            searchIndex - expected);
        if (!fillGapFromWAL(
            expected,
            searchIndex,
            lingerBatch,
            expectedSeekGeneration,
            maxTablets,
            maxBatchBytes)) {
          return false;
        }
      }

      if (isBeforeLocalCursor(request)) {
        skippedCount++;
        continue;
      }

      if (shouldSkipForRecoveryProgress(request)) {
        skippedCount++;
        advanceLocalCursorIfPresent(request);
        continue;
      }
      if (shouldSkipForMaterializedFollowerProgress(request)) {
        skippedCount++;
        advanceLocalCursorIfPresent(request);
        continue;
      }

      if (!appendRealtimeRequest(
          request, lingerBatch, expectedSeekGeneration, maxTablets, maxBatchBytes, true)) {
        return false;
      }
      markMaterializedFollowerProgress(request);
      processedCount++;
      advanceLocalCursorIfPresent(request);
    }

    LOGGER.debug(
        "ConsensusPrefetchingQueue {}: accumulate complete, batchSize={}, processed={}, "
            + "skipped={}, lingerTablets={}, nextExpected={}",
        this,
        batch.size(),
        processedCount,
        skippedCount,
        lingerBatch.tablets.size(),
        nextExpectedSearchIndex.get());

    return true;
  }

  /**
   * Fills a gap in the pending queue by reading entries from WAL so the internal local replay
   * cursor stays contiguous even when pending delivery jumps ahead of the WAL iterator.
   *
   * Temporary WAL visibility lag is treated as a normal back-pressure condition: once a drained
   * pending batch encounters an unresolved local-index gap, the queue backs off and lets the
   * affected suffix fall back to the WAL path on later rounds. This keeps replay contiguous without
   * requeueing the drained batch back into the bounded pending queue.
   *
   * @return false if gap fill had to stop because the current batch became stale or the queue was
   *     interrupted/closed
   */
  private boolean fillGapFromWAL(
      final long fromIndex,
      final long toIndex,
      final DeliveryBatchState batchState,
      final long expectedSeekGeneration,
      final int maxTablets,
      final long maxBatchBytes) {
    pendingWalGapRetryRequested = false;
    resetSubscriptionWALPosition(fromIndex);
    if (seekGeneration.get() != expectedSeekGeneration || isClosed) {
      return false;
    }
    if (!pumpFromSubscriptionWAL(
        batchState, expectedSeekGeneration, Integer.MAX_VALUE, maxTablets, maxBatchBytes)) {
      return false;
    }

    final long nextExpected = nextExpectedSearchIndex.get();
    if (nextExpected >= toIndex) {
      // Gap fully filled; clear wait bookkeeping.
      walGapWaitStartTimeMs = 0L;
      lastWalGapWaitLogTimeMs = 0L;
      return true;
    }

    // Gap not yet visible in WAL: log (rate-limited) and request a retry on a later round.
    final long nowMs = System.currentTimeMillis();
    if (walGapWaitStartTimeMs == 0L) {
      walGapWaitStartTimeMs = nowMs;
    }
    if (lastWalGapWaitLogTimeMs == 0L
        || nowMs - lastWalGapWaitLogTimeMs >= WAL_GAP_WAIT_LOG_INTERVAL_MS) {
      LOGGER.info(
          "ConsensusPrefetchingQueue {}: waiting {}ms for WAL gap [{}, {}) to become visible, "
              + "currentNextExpected={}, currentWalIndex={}, seekGeneration={}",
          this,
          nowMs - walGapWaitStartTimeMs,
          nextExpected,
          toIndex,
          nextExpected,
          consensusReqReader.getCurrentSearchIndex(),
          expectedSeekGeneration);
      lastWalGapWaitLogTimeMs = nowMs;
    }
    onWalGapRetryScheduled();
    pendingWalGapRetryRequested = true;
    return false;
  }

  /**
   * Try catch-up from WAL when the pending queue was empty.
   * This handles cold-start or scenarios
   * where the subscription started after data was already written.
   */
  private void tryCatchUpFromWAL(final long expectedSeekGeneration) {
    final SubscriptionConfig config = SubscriptionConfig.getInstance();
    final int maxTablets = config.getSubscriptionConsensusBatchMaxTabletCount();
    final long maxBatchBytes = config.getSubscriptionConsensusBatchMaxSizeInBytes();
    final int maxWalEntries = config.getSubscriptionConsensusBatchMaxWalEntries();

    final DeliveryBatchState batchState = new DeliveryBatchState();
    resetSubscriptionWALPosition(nextExpectedSearchIndex.get());
    final boolean accepted =
        pumpFromSubscriptionWAL(
            batchState, expectedSeekGeneration, maxWalEntries, maxTablets, maxBatchBytes);
    if (!accepted) {
      // Batch became stale (a seek happened); drop this round's work.
      return;
    }

    if (!batchState.isEmpty()) {
      flushBatch(batchState, expectedSeekGeneration);
    }
  }

  /**
   * Reads up to {@code maxWalEntries} entries from the subscription WAL iterator and appends them
   * to the batch, honoring cursor and skip rules.
   *
   * @return false if the batch became stale (seek generation changed) while appending
   */
  private boolean pumpFromSubscriptionWAL(
      final DeliveryBatchState batchState,
      final long expectedSeekGeneration,
      final int maxWalEntries,
      final int maxTablets,
      final long maxBatchBytes) {
    if (Objects.isNull(subscriptionWALIterator)) {
      return true;
    }

    subscriptionWALIterator.refresh();
    ensureSubscriptionWalReadable();

    int entriesRead = 0;
    while (entriesRead < maxWalEntries
        && subscriptionWALIterator.hasNext()
        && prefetchingQueue.size() < MAX_PREFETCHING_QUEUE_SIZE) {
      try {
        final IndexedConsensusRequest walEntry = subscriptionWALIterator.next();
        entriesRead++;

        if (isBeforeLocalCursor(walEntry)) {
          continue;
        }
        if (shouldSkipForRecoveryProgress(walEntry)) {
          advanceLocalCursorIfPresent(walEntry);
          continue;
        }
        if (shouldSkipForMaterializedFollowerProgress(walEntry)) {
          advanceLocalCursorIfPresent(walEntry);
          continue;
        }

        if (!appendRealtimeRequest(
            walEntry, batchState, expectedSeekGeneration, maxTablets, maxBatchBytes, false)) {
          return false;
        }
        markMaterializedFollowerProgress(walEntry);
        advanceLocalCursorIfPresent(walEntry);
      } catch (final Exception e) {
        // Best-effort: a bad WAL entry ends this round but does not fail the queue.
        LOGGER.warn("ConsensusPrefetchingQueue {}: error reading subscription WAL", this, e);
        break;
      }
    }

    if (entriesRead > 0) {
      LOGGER.debug(
          "ConsensusPrefetchingQueue {}: subscription WAL read {} entries, nextExpectedSearchIndex={}",
          this,
          entriesRead,
          nextExpectedSearchIndex.get());
    }
    return true;
  }

  /**
   * Rolls the WAL file when the iterator is exhausted while the WAL itself has newer entries, so
   * the current in-progress file's entries become visible to the file-based iterator.
   */
  private void ensureSubscriptionWalReadable() {
    if (Objects.isNull(subscriptionWALIterator)
        || subscriptionWALIterator.hasNext()
        || !(consensusReqReader instanceof WALNode)) {
      return;
    }

    final long currentWalIndex = consensusReqReader.getCurrentSearchIndex();
    if (nextExpectedSearchIndex.get() > currentWalIndex) {
      return;
    }

    LOGGER.debug(
        "ConsensusPrefetchingQueue {}: subscription WAL exhausted at {} while current WAL is {}. "
            + "Rolling WAL file to expose current-file entries.",
        this,
        nextExpectedSearchIndex.get(),
        currentWalIndex);
    ((WALNode) consensusReqReader).rollWALFile();
    resetSubscriptionWALPosition(nextExpectedSearchIndex.get());
    if (Objects.nonNull(subscriptionWALIterator)) {
      subscriptionWALIterator.refresh();
    }
  }

  /** Recreates the subscription WAL iterator positioned at {@code startSearchIndex}. */
  private void resetSubscriptionWALPosition(final long startSearchIndex) {
    closeSubscriptionWALIterator();
    subscriptionWALIterator = createSubscriptionWALIterator(startSearchIndex);
  }

  /**
   * Creates a ProgressWALIterator when the reader is WAL-backed, else null. Protected —
   * presumably an override point for tests; confirm.
   */
  protected ProgressWALIterator createSubscriptionWALIterator(final long startSearchIndex) {
    if (consensusReqReader instanceof WALNode) {
      return new ProgressWALIterator((WALNode) consensusReqReader, startSearchIndex);
    }
    return null;
  }

  /** Hook invoked whenever a WAL-gap retry is scheduled; no-op here, presumably for overrides. */
  protected void onWalGapRetryScheduled() {}

  /** True when the subscription WAL iterator currently has a readable entry. */
  private boolean hasReadableWalEntries() {
    return Objects.nonNull(subscriptionWALIterator) && subscriptionWALIterator.hasNext();
  }

  /** Records a WAL reset to be applied later under the matching seek generation. */
  private void requestSubscriptionWalReset(
      final long targetSearchIndex, final long seekGenerationValue) {
    pendingSubscriptionWalResetSearchIndex = targetSearchIndex;
    pendingSubscriptionWalResetGeneration = seekGenerationValue;
  }

  /**
   * Applies a previously requested WAL reset, but only if it was requested under the seek
   * generation currently being observed (stale requests are ignored), then clears the request.
   */
  private void applyPendingSubscriptionWalReset(final long observedSeekGeneration) {
    if (pendingSubscriptionWalResetGeneration != observedSeekGeneration
        || pendingSubscriptionWalResetSearchIndex == Long.MIN_VALUE) {
      return;
    }
    resetSubscriptionWALPosition(pendingSubscriptionWalResetSearchIndex);
    pendingSubscriptionWalResetSearchIndex = Long.MIN_VALUE;
    pendingSubscriptionWalResetGeneration = Long.MIN_VALUE;
  }

  /** Restores the local replay cursor to a pending seek target under the matching generation. */
  private void restorePendingSubscriptionWalCursor(final long observedSeekGeneration) {
    if (pendingSubscriptionWalResetGeneration != observedSeekGeneration
        || pendingSubscriptionWalResetSearchIndex == Long.MIN_VALUE) {
      return;
    }
    // A seek can land in the middle of a prefetch iteration. Restore the local cursor to the
    // pending seek target before resuming under the new generation so stale in-flight work does
    // not permanently advance the historical replay frontier.
    nextExpectedSearchIndex.set(pendingSubscriptionWalResetSearchIndex);
  }

  /** Closes the subscription WAL iterator, logging (not propagating) close failures. */
  private void closeSubscriptionWALIterator() {
    if (Objects.isNull(subscriptionWALIterator)) {
      return;
    }
    try {
      subscriptionWALIterator.close();
    } catch (final IOException e) {
      LOGGER.warn("ConsensusPrefetchingQueue {}: error closing subscription WAL iterator", this, e);
    } finally {
      // Null out even on failure so a fresh iterator can be created.
      subscriptionWALIterator = null;
    }
  }

  /**
   * Deserializes the IConsensusRequest entries within an IndexedConsensusRequest to produce an
   * InsertNode. WAL entries are typically stored as IoTConsensusRequest (serialized ByteBuffers),
   * and a single logical write may be split across multiple fragments (SearchNode). This method
   * handles both cases.
   *
   * The deserialization follows the same pattern as {@code
   * DataRegionStateMachine.grabPlanNode()}.
   */
  private InsertNode deserializeToInsertNode(final IndexedConsensusRequest indexedRequest) {
    // NOTE(review): generic type parameters appear stripped by extraction in this region
    // (presumably List<SearchNode>) — confirm against the original source.
    final List searchNodes = new ArrayList<>();
    PlanNode nonSearchNode = null;

    for (final IConsensusRequest req : indexedRequest.getRequests()) {
      PlanNode planNode;
      try {
        if (req instanceof IoTConsensusRequest) {
          // WAL entries read from file are wrapped as IoTConsensusRequest (ByteBuffer)
          planNode = WALEntry.deserializeForConsensus(req.serializeToByteBuffer());
        } else if (req instanceof InsertNode) {
          // In-memory entries (not yet flushed to WAL file) may already be PlanNode
          planNode = (PlanNode) req;
        } else {
          // ByteBufferConsensusRequest or unknown
          planNode = PlanNodeType.deserialize(req.serializeToByteBuffer());
        }
      } catch (final Exception e) {
        // Skip the undecodable fragment but keep processing the rest of the request.
        LOGGER.warn(
            "ConsensusPrefetchingQueue {}: failed to deserialize IConsensusRequest "
                + "(type={}) in searchIndex={}: {}",
            this,
            req.getClass().getSimpleName(),
            indexedRequest.getSearchIndex(),
            e.getMessage(),
            e);
        continue;
      }

      if (planNode instanceof SearchNode) {
        final SearchNode searchNode = (SearchNode) planNode;
        // Stamp request-level metadata onto the fragment; only overwrite with set (valid) values.
        searchNode.setSearchIndex(indexedRequest.getSearchIndex());
        if (indexedRequest.getSyncIndex() >= 0) {
          searchNode.setSyncIndex(indexedRequest.getSyncIndex());
        }
        if (indexedRequest.getPhysicalTime() > 0) {
          searchNode.setPhysicalTime(indexedRequest.getPhysicalTime());
        }
        if (indexedRequest.getNodeId() >= 0) {
          searchNode.setNodeId(indexedRequest.getNodeId());
        }
        if (indexedRequest.getWriterEpoch() > 0) {
          searchNode.setWriterEpoch(indexedRequest.getWriterEpoch());
        }
        searchNodes.add(searchNode);
      } else {
        nonSearchNode = planNode;
      }
    }

    // Merge split SearchNode fragments (same pattern as DataRegionStateMachine.grabPlanNode)
    if (!searchNodes.isEmpty()) {
      final PlanNode merged = searchNodes.get(0).merge(searchNodes);
      if (merged instanceof InsertNode) {
        final InsertNode mergedInsert = (InsertNode) merged;
        LOGGER.debug(
            "ConsensusPrefetchingQueue {}: deserialized merged InsertNode for searchIndex={}, "
                + "type={}, deviceId={}, searchNodeCount={}",
            this,
            indexedRequest.getSearchIndex(),
            mergedInsert.getType(),
            ConsensusLogToTabletConverter.safeDeviceIdForLog(mergedInsert),
            searchNodes.size());

        return mergedInsert;
      }
    }

    if (nonSearchNode != null) {
      LOGGER.debug(
          "ConsensusPrefetchingQueue {}: searchIndex={} contains non-InsertNode PlanNode: {}",
          this,
          indexedRequest.getSearchIndex(),
          nonSearchNode.getClass().getSimpleName());
    }

    return null;
  }

  /**
   * Converts a consensus request into tablets plus writer-lane metadata.
   *
   * @return the prepared entry, or null when the request yields no insert data
   */
  private PreparedEntry prepareEntry(final IndexedConsensusRequest indexedRequest) {
    final InsertNode insertNode = deserializeToInsertNode(indexedRequest);
    if (Objects.isNull(insertNode)) {
      return null;
    }

    // Prefer metadata carried on the request; fall back to what the InsertNode itself records.
    final long localSeq =
        indexedRequest.getProgressLocalSeq() >= 0
            ? indexedRequest.getProgressLocalSeq()
            : indexedRequest.getSearchIndex();
    final long searchIndex = indexedRequest.getSearchIndex();
    final long physicalTime =
        indexedRequest.getPhysicalTime() > 0
            ? indexedRequest.getPhysicalTime()
            : insertNode.getPhysicalTime();
    final int writerNodeId =
        indexedRequest.getNodeId() >= 0 ? indexedRequest.getNodeId() : insertNode.getNodeId();
    final long writerEpoch =
        indexedRequest.getWriterEpoch() > 0
            ? indexedRequest.getWriterEpoch()
            : insertNode.getWriterEpoch();

    trackWriterLane(writerNodeId, writerEpoch);
    final long maxTs = extractMaxTime(insertNode);
    if (maxTs > maxObservedTimestamp) {
      maxObservedTimestamp = maxTs;
    }
    final List tablets = converter.convert(insertNode);
    if (tablets.isEmpty()) {
      return null;
    }

    return new PreparedEntry(
        tablets, searchIndex, physicalTime, writerNodeId, writerEpoch, localSeq);
  }

  /** Estimated in-memory size of a tablet, used for batch byte budgeting. */
  private static long estimateTabletSize(final Tablet tablet) {
    return PipeMemoryWeightUtil.calculateTabletSizeInBytes(tablet);
  }

  /** Convenience overload: commit at {@code endSearchIndex} under the current seek generation. */
  private void createAndEnqueueEvent(
      final List tablets, final long startSearchIndex, final long endSearchIndex) {
    createAndEnqueueEvent(
        tablets, startSearchIndex, endSearchIndex, endSearchIndex, seekGeneration.get());
  }

  /**
   * Builds a SubscriptionEvent from the tablets and enqueues it for delivery.
   *
   * @return false when the event is stale (seek generation changed) and was dropped
   */
  private boolean createAndEnqueueEvent(
      final List tablets,
      final long startSearchIndex,
      final long endSearchIndex,
      final long commitLocalSeq,
      final long expectedSeekGeneration) {
    if (tablets.isEmpty()) {
      return true;
    }

    if (seekGeneration.get() != expectedSeekGeneration) {
      LOGGER.debug(
          "ConsensusPrefetchingQueue {}: skip stale event with searchIndex range [{}, {}], "
              + "expectedSeekGeneration={}, currentSeekGeneration={}",
          this,
          startSearchIndex,
          endSearchIndex,
          expectedSeekGeneration,
          seekGeneration.get());
      return false;
    }

    final SubscriptionCommitContext commitContext = buildWriterCommitContext(commitLocalSeq);
    final WriterId writerId = commitContext.getWriterId();
    final WriterProgress writerProgress = commitContext.getWriterProgress();
    commitManager.recordMapping(brokerId, topicName, consensusGroupId, writerId, writerProgress);

    // nextOffset <= 0 means all tablets delivered in single batch
    // -tablets.size() indicates total count
    // Use Map> constructor with actual database name for table model;
    final TabletsPayload payload =
        new TabletsPayload(
            Collections.singletonMap(converter.getDatabaseName(), tablets), -tablets.size());

    final SubscriptionEvent event =
        new SubscriptionEvent(
            SubscriptionPollResponseType.TABLETS.getType(), payload, commitContext);

    prefetchingQueue.add(event);

    LOGGER.debug(
        "ConsensusPrefetchingQueue {}: ENQUEUED event with {} tablets, "
            + "searchIndex range [{}, {}], prefetchQueueSize={}",
        this,
        tablets.size(),
        startSearchIndex,
        endSearchIndex,
        prefetchingQueue.size());

    // After enqueuing the data event, control metadata is handled separately from user data.
    return true;
  }

  /** Builds the per-writer commit context carrying the current batch's writer progress. */
  private SubscriptionCommitContext buildWriterCommitContext(final long localSeq) {
    // Fall back to this node's id when the batch never observed a writer node id.
    final int effectiveNodeId =
        batchWriterNodeId >= 0
            ? batchWriterNodeId
            : IoTDBDescriptor.getInstance().getConfig().getDataNodeId();
    final WriterId writerId =
        new WriterId(consensusGroupId.toString(), effectiveNodeId, batchWriterEpoch);
    final WriterProgress writerProgress = new WriterProgress(batchPhysicalTime, localSeq);
    return new SubscriptionCommitContext(
        IoTDBDescriptor.getInstance().getConfig().getDataNodeId(),
        PipeDataNodeAgent.runtime().getRebootTimes(),
        topicName,
        brokerId,
        seekGeneration.get(),
        writerId,
        writerProgress);
  }

  /** Updates batch-level writer progress, ignoring unset (non-positive / negative) inputs. */
  private void updateBatchWriterProgress(
      final long physicalTime, final int writerNodeId, final long writerEpoch) {
    if (physicalTime > 0) {
      this.batchPhysicalTime = physicalTime;
    }
    if (writerNodeId >= 0) {
      this.batchWriterNodeId = writerNodeId;
    }
    if (writerEpoch > 0) {
      this.batchWriterEpoch = writerEpoch;
    }
  }

  /** Clears batch-level writer progress back to its unset sentinels. */
  private void resetBatchWriterProgress() {
    this.batchPhysicalTime = 0L;
    this.batchWriterNodeId = -1;
    this.batchWriterEpoch = 0L;
  }

  /** Sums the estimated byte sizes of the given tablets. */
  private long estimateTabletsBytes(final List tablets) {
    long estimatedBytes = 0L;
    for (final Tablet tablet : tablets) {
      estimatedBytes += estimateTabletSize(tablet);
    }
    return estimatedBytes;
  }

  /**
   * Buffers the prepared entry into its realtime lane, then drains releasable lane heads into the
   * batch.
   *
   * @return false if the batch became stale while draining
   */
  private boolean appendPreparedEntryViaRealtimeLane(
      final DeliveryBatchState batchState,
      final PreparedEntry preparedEntry,
      final long expectedSeekGeneration,
      final int maxTablets,
      final long maxBatchBytes) {
    bufferRealtimeEntry(preparedEntry);
    return drainRealtimeLanes(batchState, expectedSeekGeneration, maxTablets, maxBatchBytes);
  }

  /** Total number of entries currently buffered across all realtime lanes. */
  private int getRealtimeBufferedEntryCount() {
    int count = 0;
    for (final NavigableMap laneEntries : realtimeEntriesByLane.values()) {
      count += laneEntries.size();
    }
    return count;
  }

  /**
   * Repeatedly drains realtime lanes, flushing the batch between rounds, until the lanes are
   * empty, no further progress can be made, or the prefetching queue is full.
   *
   * @return false if the batch became stale
   */
  private boolean drainBufferedRealtimeLanes(
      final DeliveryBatchState batchState,
      final long expectedSeekGeneration,
      final int maxTablets,
      final long maxBatchBytes) {
    while (!realtimeEntriesByLane.isEmpty()) {
      // NOTE(review): bufferedBefore is never read — consider removing it or comparing against
      // bufferedAfter to detect a no-progress round.
      final int bufferedBefore = getRealtimeBufferedEntryCount();
      if (!drainRealtimeLanes(batchState, expectedSeekGeneration, maxTablets, maxBatchBytes)) {
        return false;
      }

      final int bufferedAfter = getRealtimeBufferedEntryCount();
      if (bufferedAfter == 0 || prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) {
        return true;
      }

      if (batchState.isEmpty()) {
        // Nothing was appended this round (heads not releasable); stop to avoid spinning.
        return true;
      }

      if (!flushBatch(batchState, expectedSeekGeneration)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Whether {@code entry} can be appended to the current batch without violating the entry,
   * tablet, or byte limits, and without mixing writers within one batch.
   */
  private boolean canAppendLaneEntry(
      final DeliveryBatchState batchState,
      final LaneBufferedEntry entry,
      final long entryEstimatedBytes,
      final int maxEntries,
      final int maxTablets,
      final long maxBatchBytes) {
    final boolean wouldExceedEntryLimit =
        maxEntries != Integer.MAX_VALUE && batchState.entryCount >= maxEntries;
    final boolean wouldExceedTabletLimit =
        !batchState.isEmpty() && batchState.tablets.size() + entry.getTablets().size() > maxTablets;
    final boolean wouldExceedByteLimit =
        !batchState.isEmpty() && batchState.estimatedBytes + entryEstimatedBytes > maxBatchBytes;
    // Keep all consensus subscription modes on a single-writer commit/delivery shape so
    // SubscriptionCommitContext and RegionProgress remain per-writer.
    final boolean writerChanged =
        !batchState.isEmpty()
            && (batchState.writerNodeId != entry.getWriterNodeId()
                || batchState.writerEpoch != entry.getWriterEpoch());
    return !(wouldExceedEntryLimit
        || wouldExceedTabletLimit
        || wouldExceedByteLimit
        || writerChanged);
  }

  /** Drains realtime-lane heads into the batch with no entry-count cap. */
  private boolean drainRealtimeLanes(
      final DeliveryBatchState batchState,
      final long expectedSeekGeneration,
      final int maxTablets,
      final long maxBatchBytes) {
    // NOTE(review): expectedSeekGeneration is not consulted here (staleness is checked at flush);
    // confirm the parameter is intentional.
    return drainLaneEntries(
        batchState,
        this::buildRealtimeLaneFrontiers,
        this::peekRealtimeEntry,
        entry -> true,
        (laneId, entry) -> removeRealtimeEntry(laneId, entry.localSeq),
        Integer.MAX_VALUE,
        maxTablets,
        maxBatchBytes,
        true);
  }

  /**
   * Generic lane-draining loop: repeatedly takes the smallest lane frontier and appends its head
   * entry to the batch until a barrier, an unreleasable head, or a batch limit stops it.
   *
   * <p>NOTE(review): every exit path returns true, so the boolean result currently carries no
   * information — and generic parameters appear stripped by extraction (e.g. what is likely
   * Supplier&lt;PriorityQueue&lt;LaneFrontier&gt;&gt;); confirm against the original source.
   */
  private boolean drainLaneEntries(
      final DeliveryBatchState batchState,
      final Supplier> frontierSupplier,
      final Function headSupplier,
      final Predicate releasePredicate,
      final BiConsumer removeHeadAction,
      final int maxEntries,
      final int maxTablets,
      final long maxBatchBytes,
      final boolean trackLingerTime) {
    while (true) {
      final PriorityQueue frontiers = frontierSupplier.get();
      if (frontiers.isEmpty()) {
        return true;
      }
      final LaneFrontier frontier = frontiers.peek();
      if (Objects.isNull(frontier) || frontier.isBarrier) {
        return true;
      }
      final T laneHead = headSupplier.apply(frontier.laneId);
      if (Objects.isNull(laneHead)) {
        return true;
      }
      if (!releasePredicate.test(laneHead)) {
        return true;
      }

      final long entryEstimatedBytes = estimateTabletsBytes(laneHead.getTablets());
      if (!canAppendLaneEntry(
          batchState, laneHead, entryEstimatedBytes, maxEntries, maxTablets, maxBatchBytes)) {
        return true;
      }

      removeHeadAction.accept(frontier.laneId, laneHead);
      batchState.append(laneHead, entryEstimatedBytes, trackLingerTime);
    }
  }

  /**
   * Enqueues the current batch as a delivery event and resets both batch and writer-progress
   * state.
   *
   * @return false if the event was stale and dropped
   */
  private boolean flushBatch(
      final DeliveryBatchState batchState, final long expectedSeekGeneration) {
    updateBatchWriterProgress(
        batchState.physicalTime, batchState.writerNodeId, batchState.writerEpoch);
    if (!createAndEnqueueEvent(
        new ArrayList<>(batchState.tablets),
        batchState.startSearchIndex,
        batchState.endSearchIndex,
        batchState.lastLocalSeq,
        expectedSeekGeneration)) {
      return false;
    }
    resetBatchWriterProgress();
    batchState.reset();
    return true;
  }

  // ======================== Commit (Ack/Nack) ========================

  /** Common precondition checks for ack/nack; logs at debug level when {@code silent}. */
  private boolean canAcceptCommitContext(
      final SubscriptionCommitContext commitContext, final String action, final boolean silent) {
    if (isClosed || closeRequested || pendingSeekRequest != null) {
      return false;
    }
    if (Objects.isNull(commitContext) || !commitContext.hasWriterProgress()) {
      if (silent) {
        LOGGER.debug(
            "ConsensusPrefetchingQueue {}: reject {} without writer progress, commitContext={}",
            this,
            action,
            commitContext);
      } else {
        LOGGER.warn(
            "ConsensusPrefetchingQueue {}: reject {} without writer progress, commitContext={}",
            this,
            action,
            commitContext);
      }
      return false;
    }
    if (!isActive) {
      if (silent) {
        LOGGER.debug(
            "ConsensusPrefetchingQueue {}: reject {} for inactive queue, commitContext={}, runtimeVersion={}",
            this,
            action,
            commitContext,
            runtimeVersion);
      } else {
        LOGGER.warn(
            "ConsensusPrefetchingQueue {}: reject {} for inactive queue, commitContext={}, runtimeVersion={}",
            this,
            action,
            commitContext,
            runtimeVersion);
      }
      return false;
    }
    return true;
  }

  /** Acknowledges a delivered event, advancing the commit frontier. */
  public boolean ack(final String consumerId, final SubscriptionCommitContext commitContext) {
    acquireReadLock();
    try {
      return canAcceptCommitContext(commitContext, "ack", false)
          && ackInternal(consumerId, commitContext);
    } finally {
      releaseReadLock();
    }
  }

  private boolean ackInternal(
      final String consumerId, final SubscriptionCommitContext commitContext) {
    final WriterId commitWriterId = extractCommitWriterId(commitContext);
    final WriterProgress commitWriterProgress = extractCommitWriterProgress(commitContext);
    final AtomicBoolean acked = new AtomicBoolean(false);
    // compute() gives an atomic read-modify-write on the in-flight map entry.
    inFlightEvents.compute(
        new Pair<>(consumerId, commitContext),
        (key, ev) -> {
          if (Objects.isNull(ev)) {
            // Event already recycled or never tracked: try a direct frontier commit.
            final boolean directCommitted =
                commitManager.commitWithoutOutstanding(
                    brokerId, topicName, consensusGroupId, commitWriterId, commitWriterProgress);
            acked.set(directCommitted);
            if (!acked.get()) {
              LOGGER.warn(
                  "ConsensusPrefetchingQueue {}: commit context {} does not exist for ack",
                  this,
                  commitContext);
            }
            return null;
          }

          if (ev.isCommitted()) {
            LOGGER.warn(
                "ConsensusPrefetchingQueue {}: event {} already committed", this, commitContext);
            ev.cleanUp(false);
            return null;
          }

          final boolean committed =
              commitManager.commit(
                  brokerId, topicName, consensusGroupId, commitWriterId, commitWriterProgress);
          if (!committed) {
            LOGGER.warn(
                "ConsensusPrefetchingQueue {}: failed to advance commit frontier for {}",
                this,
                commitContext);
            // Keep the event in-flight so it can be retried/recycled.
            return ev;
          }

          ev.ack();
          ev.recordCommittedTimestamp();
          acked.set(true);
          ev.cleanUp(false);
          return null;
        });

    return acked.get();
  }

  /** Negatively acknowledges a delivered event, making it eligible for re-delivery. */
  public boolean nack(final String consumerId, final SubscriptionCommitContext commitContext) {
    acquireReadLock();
    try {
      return canAcceptCommitContext(commitContext, "nack", false)
          && nackInternal(consumerId, commitContext);
    } finally {
      releaseReadLock();
    }
  }

  /**
   * Silent version of ack: returns false without logging if the commit context is not found. Used
   * in multi-region iteration where only one queue owns the event.
   */
  public boolean ackSilent(final String consumerId, final SubscriptionCommitContext commitContext) {
    acquireReadLock();
    try {
      if (!canAcceptCommitContext(commitContext, "ack", true)) {
        return false;
      }
      // NOTE(review): this duplicates ackInternal minus the warn logging — consider sharing a
      // parameterized helper.
      final WriterId commitWriterId = extractCommitWriterId(commitContext);
      final WriterProgress commitWriterProgress = extractCommitWriterProgress(commitContext);
      final AtomicBoolean acked = new AtomicBoolean(false);
      inFlightEvents.compute(
          new Pair<>(consumerId, commitContext),
          (key, ev) -> {
            if (Objects.isNull(ev)) {
              // Not tracked here: try a direct frontier commit, silently.
              final boolean directCommitted =
                  commitManager.commitWithoutOutstanding(
                      brokerId, topicName, consensusGroupId, commitWriterId, commitWriterProgress);
              acked.set(directCommitted);
              return null;
            }
            if (ev.isCommitted()) {
              ev.cleanUp(false);
              return null;
            }
            final boolean committed =
                commitManager.commit(
                    brokerId, topicName, consensusGroupId, commitWriterId, commitWriterProgress);
            if (!committed) {
              return ev;
            }
            ev.ack();
            ev.recordCommittedTimestamp();
            acked.set(true);
            ev.cleanUp(false);
            return null;
          });
      return acked.get();
    } finally {
      releaseReadLock();
    }
  }

  /** Writer id from the commit context, or a group-scoped placeholder when absent. */
  private WriterId extractCommitWriterId(final SubscriptionCommitContext commitContext) {
    final WriterId writerId = commitContext.getWriterId();
    return Objects.nonNull(writerId) ? writerId : new WriterId(consensusGroupId.toString(), -1, 0L);
  }

  /** Writer progress carried by the commit context. */
  private WriterProgress extractCommitWriterProgress(
      final SubscriptionCommitContext commitContext) {
    return commitContext.getWriterProgress();
  }

  /**
   * Silent version of nack: returns false without logging if the commit context is not found. Used
   * in multi-region iteration where only one queue owns the event.
   */
  public boolean nackSilent(
      final String consumerId, final SubscriptionCommitContext commitContext) {
    acquireReadLock();
    try {
      if (!canAcceptCommitContext(commitContext, "nack", true)) {
        return false;
      }
      final AtomicBoolean nacked = new AtomicBoolean(false);
      inFlightEvents.compute(
          new Pair<>(consumerId, commitContext),
          (key, ev) -> {
            if (Objects.isNull(ev)) {
              return null;
            }
            ev.nack();
            nacked.set(true);
            if (ev.isPoisoned()) {
              // Poison pill: cap re-delivery by force-acking.
              LOGGER.error(
                  "ConsensusPrefetchingQueue {}: poison message detected (nackCount={}), "
                      + "force-acking event {} to prevent infinite re-delivery",
                  this,
                  ev.getNackCount(),
                  ev);
              ev.ack();
              ev.recordCommittedTimestamp();
              ev.cleanUp(false);
              return null;
            }
            // Re-queue for another delivery attempt.
            prefetchingQueue.add(ev);
            return null;
          });
      return nacked.get();
    } finally {
      releaseReadLock();
    }
  }

  private boolean nackInternal(
      final String consumerId, final SubscriptionCommitContext commitContext) {
    final AtomicBoolean nacked = new AtomicBoolean(false);
    inFlightEvents.compute(
        new Pair<>(consumerId, commitContext),
        (key, ev) -> {
          if (Objects.isNull(ev)) {
            LOGGER.warn(
                "ConsensusPrefetchingQueue {}: commit context {} does not exist for nack",
                this,
                commitContext);
            return null;
          }

          ev.nack();
          nacked.set(true);
          if (ev.isPoisoned()) {
            LOGGER.error(
                "ConsensusPrefetchingQueue {}: poison message detected (nackCount={}), "
                    + "force-acking event {} to prevent infinite re-delivery",
                this,
                ev.getNackCount(),
                ev);
            ev.ack();
            ev.recordCommittedTimestamp();
            ev.cleanUp(false);
            return null;
          }
          prefetchingQueue.add(ev);
          return null;
        });

    return nacked.get();
  }

  // ======================== Recycle ========================

  /** Recycles in-flight events that are pollable (timed out) back to the prefetching queue.
   */
  private void recycleInFlightEvents() {
    // Snapshot the key set so compute() may remove entries while we iterate.
    for (final Pair key :
        new ArrayList<>(inFlightEvents.keySet())) {
      inFlightEvents.compute(
          key,
          (k, ev) -> {
            if (Objects.isNull(ev)) {
              return null;
            }
            if (ev.isCommitted()) {
              ev.cleanUp(false);
              return null;
            }
            if (ev.pollable()) {
              ev.nack();
              if (ev.isPoisoned()) {
                // Poison pill: cap re-delivery by force-acking.
                LOGGER.error(
                    "ConsensusPrefetchingQueue {}: poison message detected during recycle "
                        + "(nackCount={}), force-acking event {}",
                    this,
                    ev.getNackCount(),
                    ev);
                ev.ack();
                ev.recordCommittedTimestamp();
                ev.cleanUp(false);
                return null;
              }
              prefetchingQueue.add(ev);
              LOGGER.debug(
                  "ConsensusPrefetchingQueue {}: recycled timed-out event {} back to prefetching queue",
                  this,
                  ev);
              return null;
            }
            // Not yet pollable: leave it in-flight.
            return ev;
          });
    }
  }

  // ======================== Cleanup ========================

  /** Releases all queued/in-flight events and clears every piece of per-queue state. */
  public void cleanUp() {
    acquireWriteLock();
    try {
      prefetchingQueue.forEach(event -> event.cleanUp(true));
      prefetchingQueue.clear();

      inFlightEvents.values().forEach(event -> event.cleanUp(true));
      inFlightEvents.clear();

      realtimeEntriesByLane.clear();
      writerLanes.clear();
      clearRecoveryWriterProgress();
      materializedFollowerProgressByWriter.clear();
      pendingEntries.clear();
      lingerBatch.reset();
      resetBatchWriterProgress();
      pendingWalGapRetryRequested = false;
      walGapWaitStartTimeMs = 0L;
      lastWalGapWaitLogTimeMs = 0L;
      pendingSubscriptionWalResetSearchIndex = Long.MIN_VALUE;
      pendingSubscriptionWalResetGeneration = Long.MIN_VALUE;
      closeSubscriptionWALIterator();

    } finally {
      releaseWriteLock();
    }
  }

  // ======================== Seek ========================

  /**
   * Seeks to the earliest available WAL position. The actual position depends on WAL retention: if
   * old files have been reclaimed, the earliest available position may be later than 0.
+ */ + public void seekToBeginning() { + seekToResolvedPosition(0L, new RegionProgress(Collections.emptyMap()), "beginning"); + } + + /** + * Seeks to the current WAL write position. After this, only newly written data will be consumed. + */ + public void seekToEnd() { + seekToResolvedPosition( + consensusReqReader.getCurrentSearchIndex(), computeTailRegionProgress(), "end"); + } + + public void seekToRegionProgress(final RegionProgress regionProgress) { + if (!(consensusReqReader instanceof WALNode)) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: seekToRegionProgress not supported (no WAL directory)", + this); + seekToBeginning(); + return; + } + final WALNode walNode = (WALNode) consensusReqReader; + walNode.rollWALFile(); + + final ReplayLocateDecision replayTarget = + locateReplayStartForRegionProgress(regionProgress, false); + switch (replayTarget.getStatus()) { + case FOUND: + case AT_END: + LOGGER.info( + "ConsensusPrefetchingQueue {}: seekToRegionProgress writerCount={} -> {} searchIndex={}", + this, + regionProgress.getWriterPositions().size(), + replayTarget.getStatus(), + replayTarget.getStartSearchIndex()); + seekToResolvedPosition( + replayTarget.getStartSearchIndex(), + replayTarget.getRecoveryRegionProgress(), + "regionProgress"); + return; + case LOCATE_MISS: + default: + throw new IllegalStateException( + String.format( + "ConsensusPrefetchingQueue %s: cannot seekToRegionProgress %s: %s", + this, regionProgress, replayTarget.getDetail())); + } + } + + public void seekAfterRegionProgress(final RegionProgress regionProgress) { + if (!(consensusReqReader instanceof WALNode)) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: seekAfterRegionProgress not supported (no WAL directory)", + this); + seekToEnd(); + return; + } + final WALNode walNode = (WALNode) consensusReqReader; + walNode.rollWALFile(); + + final ReplayLocateDecision replayTarget = + locateReplayStartForRegionProgress(regionProgress, true); + switch (replayTarget.getStatus()) { + case 
FOUND: + case AT_END: + LOGGER.info( + "ConsensusPrefetchingQueue {}: seekAfterRegionProgress writerCount={} -> {} searchIndex={}", + this, + regionProgress.getWriterPositions().size(), + replayTarget.getStatus(), + replayTarget.getStartSearchIndex()); + seekToResolvedPosition( + replayTarget.getStartSearchIndex(), + replayTarget.getRecoveryRegionProgress(), + "regionProgressAfter"); + return; + case LOCATE_MISS: + default: + throw new IllegalStateException( + String.format( + "ConsensusPrefetchingQueue %s: cannot seekAfterRegionProgress %s: %s", + this, regionProgress, replayTarget.getDetail())); + } + } + + private synchronized void seekToResolvedPosition( + final long targetSearchIndex, + final RegionProgress committedRegionProgress, + final String seekReason) { + final PendingSeekRequest request; + + acquireWriteLock(); + try { + if (isClosed || closeRequested) { + return; + } + // Fence old commit contexts immediately. The grouped reset itself is applied later by the + // prefetch worker so WAL state and queue state still move under the queue's serial context. 
+ final boolean previousPrefetchInitialized = prefetchInitialized; + final long previousSeekGeneration = seekGeneration.get(); + final long targetSeekGeneration = seekGeneration.incrementAndGet(); + request = + new PendingSeekRequest( + targetSearchIndex, + committedRegionProgress, + seekReason, + previousPrefetchInitialized, + previousSeekGeneration, + targetSeekGeneration); + pendingSeekRequest = request; + prefetchInitialized = true; + } finally { + releaseWriteLock(); + } + + final ConsensusPrefetchSubtask subtask = ensurePrefetchSubtaskBound(); + if (Objects.isNull(subtask)) { + failPendingSeekBeforeScheduling(request); + request.awaitCompletion(); + return; + } + + subtask.requestWakeupNow(); + request.awaitCompletion(); + } + + private boolean applyPendingSeekRequestIfNecessary() { + final PendingSeekRequest request = pendingSeekRequest; + if (Objects.isNull(request)) { + return false; + } + + acquireWriteLock(); + try { + if (pendingSeekRequest != request) { + return pendingSeekRequest != null; + } + pendingSeekRequest = null; + if (isClosed || closeRequested) { + request.fail( + new IllegalStateException( + String.format( + "ConsensusPrefetchingQueue %s is closing while applying seek", this))); + return true; + } + applySeekResetUnderWriteLock(request); + request.complete(); + return true; + } catch (final RuntimeException e) { + request.fail(e); + throw e; + } finally { + releaseWriteLock(); + } + } + + public void abortPendingSeekForRuntimeStop() { + final PendingSeekRequest requestToFail; + + acquireWriteLock(); + try { + requestToFail = pendingSeekRequest; + if (Objects.isNull(requestToFail)) { + return; + } + pendingSeekRequest = null; + prefetchInitialized = requestToFail.previousPrefetchInitialized; + if (seekGeneration.get() == requestToFail.targetSeekGeneration) { + seekGeneration.set(requestToFail.previousSeekGeneration); + } + LOGGER.info( + "ConsensusPrefetchingQueue {}: aborted pending seek({}) during runtime stop, restored prefetchInitialized 
{} -> {}, seekGeneration {} -> {}", + this, + requestToFail.seekReason, + true, + requestToFail.previousPrefetchInitialized, + requestToFail.targetSeekGeneration, + requestToFail.previousSeekGeneration); + } finally { + releaseWriteLock(); + } + + requestToFail.fail( + new IllegalStateException( + String.format( + "ConsensusPrefetchingQueue %s runtime stopped before seek(%s) was applied", + this, requestToFail.seekReason))); + } + + private void failPendingSeekBeforeScheduling(final PendingSeekRequest request) { + final boolean closing; + + acquireWriteLock(); + try { + if (pendingSeekRequest != request) { + return; + } + closing = isClosed || closeRequested; + pendingSeekRequest = null; + prefetchInitialized = request.previousPrefetchInitialized; + if (seekGeneration.get() == request.targetSeekGeneration) { + seekGeneration.set(request.previousSeekGeneration); + } + LOGGER.info( + "ConsensusPrefetchingQueue {}: failed to schedule seek({}) because {}, restored prefetchInitialized {} -> {}, seekGeneration {} -> {}", + this, + request.seekReason, + closing ? "the queue is closing" : "prefetch runtime is unavailable", + true, + request.previousPrefetchInitialized, + request.targetSeekGeneration, + request.previousSeekGeneration); + } finally { + releaseWriteLock(); + } + + request.fail( + new IllegalStateException( + String.format( + closing + ? "ConsensusPrefetchingQueue %s is closing before seek(%s) can be scheduled" + : "ConsensusPrefetchingQueue %s cannot schedule seek(%s) because prefetch runtime is unavailable", + this, + request.seekReason))); + } + + private void applySeekResetUnderWriteLock(final PendingSeekRequest request) { + // 1. Clean up all queued and in-flight events + prefetchingQueue.forEach(event -> event.cleanUp(true)); + prefetchingQueue.clear(); + inFlightEvents.values().forEach(event -> event.cleanUp(true)); + inFlightEvents.clear(); + + // 2. Discard stale pending entries from in-memory queue + pendingEntries.clear(); + + // 3. 
Reset per-writer release state and source-level dedup frontiers. + realtimeEntriesByLane.clear(); + writerLanes.clear(); + clearRecoveryWriterProgress(); + materializedFollowerProgressByWriter.clear(); + if (Objects.nonNull(request.committedRegionProgress) + && !request.committedRegionProgress.getWriterPositions().isEmpty()) { + installRecoveryWriterProgress(request.committedRegionProgress); + } + + // 4. Reset WAL read position + nextExpectedSearchIndex.set(request.targetSearchIndex); + requestSubscriptionWalReset(request.targetSearchIndex, seekGeneration.get()); + lingerBatch.reset(); + resetBatchWriterProgress(); + observedSeekGeneration = seekGeneration.get(); + pendingWalGapRetryRequested = false; + walGapWaitStartTimeMs = 0L; + lastWalGapWaitLogTimeMs = 0L; + + // 5. Reset commit state to the writer progress immediately before the first re-delivered + // entry so seek/rebind resumes from the intended frontier. + commitManager.resetState( + brokerId, topicName, consensusGroupId, request.committedRegionProgress); + + LOGGER.info( + "ConsensusPrefetchingQueue {}: seek({}) applied to searchIndex={}, writerCount={}, seekGeneration={}", + this, + request.seekReason, + request.targetSearchIndex, + Objects.nonNull(request.committedRegionProgress) + ? 
request.committedRegionProgress.getWriterPositions().size() + : 0, + seekGeneration.get()); + } + + private RegionProgress computeTailRegionProgress() { + if (!(consensusReqReader instanceof WALNode)) { + return new RegionProgress(Collections.emptyMap()); + } + + final WALNode walNode = (WALNode) consensusReqReader; + final Map tailProgressByWriter = new LinkedHashMap<>(); + final File[] walFiles = WALFileUtils.listAllWALFiles(walNode.getLogDirectory()); + if (Objects.isNull(walFiles) || walFiles.length == 0) { + mergeTailProgress(tailProgressByWriter, walNode.getCurrentWALMetaDataSnapshot()); + return new RegionProgress(tailProgressByWriter); + } + + WALFileUtils.ascSortByVersionId(walFiles); + final long liveVersionId = walNode.getCurrentWALFileVersion(); + final WALMetaData liveSnapshot = walNode.getCurrentWALMetaDataSnapshot(); + for (final File walFile : walFiles) { + final long versionId = WALFileUtils.parseVersionId(walFile.getName()); + if (versionId == liveVersionId) { + mergeTailProgress(tailProgressByWriter, liveSnapshot); + continue; + } + try (final ProgressWALReader reader = new ProgressWALReader(walFile)) { + mergeTailProgress(tailProgressByWriter, reader.getMetaData()); + } catch (final IOException e) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: failed to read WAL metadata from {} while computing seekToEnd frontier", + this, + walFile, + e); + } + } + return new RegionProgress(tailProgressByWriter); + } + + private void mergeTailProgress( + final Map tailProgressByWriter, final WALMetaData metadata) { + if (Objects.isNull(metadata)) { + return; + } + final List physicalTimes = metadata.getPhysicalTimes(); + final List nodeIds = metadata.getNodeIds(); + final List writerEpochs = metadata.getWriterEpochs(); + final List localSeqs = metadata.getLocalSeqs(); + final int size = + Math.min( + Math.min(physicalTimes.size(), nodeIds.size()), + Math.min(writerEpochs.size(), localSeqs.size())); + for (int i = 0; i < size; i++) { + final int writerNodeId 
= nodeIds.get(i); + final long writerEpoch = writerEpochs.get(i); + final long physicalTime = physicalTimes.get(i); + final long localSeq = localSeqs.get(i); + if (writerNodeId < 0 || physicalTime < 0L || localSeq < 0L) { + continue; + } + + final WriterId writerId = + new WriterId(consensusGroupId.toString(), writerNodeId, writerEpoch); + final WriterProgress candidateProgress = new WriterProgress(physicalTime, localSeq); + final WriterProgress currentProgress = tailProgressByWriter.get(writerId); + if (Objects.isNull(currentProgress) + || compareWriterProgress(candidateProgress, currentProgress) > 0) { + tailProgressByWriter.put(writerId, candidateProgress); + } + } + } + + /** + * Extracts the maximum timestamp from an InsertNode. For row nodes this is the single timestamp; + * for tablet nodes, {@code times} is sorted so the last element is the max. For composite nodes, + * iterates over children. + * + * @return the maximum timestamp, or {@code Long.MIN_VALUE} if extraction fails + */ + private long extractMaxTime(final InsertNode insertNode) { + try { + if (insertNode instanceof InsertRowNode) { + return ((InsertRowNode) insertNode).getTime(); + } + if (insertNode instanceof InsertTabletNode) { + final InsertTabletNode tabletNode = (InsertTabletNode) insertNode; + final int rowCount = tabletNode.getRowCount(); + return rowCount > 0 ? 
tabletNode.getTimes()[rowCount - 1] : Long.MIN_VALUE; + } + if (insertNode instanceof InsertMultiTabletsNode) { + long max = Long.MIN_VALUE; + for (final InsertTabletNode child : + ((InsertMultiTabletsNode) insertNode).getInsertTabletNodeList()) { + final int rowCount = child.getRowCount(); + if (rowCount > 0) { + max = Math.max(max, child.getTimes()[rowCount - 1]); + } + } + return max; + } + if (insertNode instanceof InsertRowsNode) { + long max = Long.MIN_VALUE; + for (final InsertRowNode row : ((InsertRowsNode) insertNode).getInsertRowNodeList()) { + max = Math.max(max, row.getTime()); + } + return max; + } + if (insertNode instanceof InsertRowsOfOneDeviceNode) { + long max = Long.MIN_VALUE; + for (final InsertRowNode row : + ((InsertRowsOfOneDeviceNode) insertNode).getInsertRowNodeList()) { + max = Math.max(max, row.getTime()); + } + return max; + } + // Fallback: use getMinTime() which at least gets a timestamp + return insertNode.getMinTime(); + } catch (final Exception e) { + return Long.MIN_VALUE; + } + } + + /** + * Checks whether it is time to inject a watermark event and does so if the configured interval + * has elapsed. Called from prefetch rounds after processing data and during idle scheduling. + */ + private void maybeInjectWatermark() { + if (maxObservedTimestamp == Long.MIN_VALUE) { + return; // No data observed yet, nothing to report + } + final long intervalMs = + SubscriptionConfig.getInstance().getSubscriptionConsensusWatermarkIntervalMs(); + if (intervalMs <= 0) { + return; // Watermark disabled + } + final long now = System.currentTimeMillis(); + if (now - lastWatermarkEmitTimeMs >= intervalMs) { + injectWatermark(maxObservedTimestamp); + lastWatermarkEmitTimeMs = now; + } + } + + /** + * Injects a {@link SubscriptionPollResponseType#WATERMARK} event into the prefetching queue. The + * committed mapping is deliberately NOT recorded because watermark events are metadata, not user + * data. 
   *
   * @param watermarkTimestamp the maximum data timestamp observed so far
   */
  private void injectWatermark(final long watermarkTimestamp) {
    final int dataNodeId = IoTDBDescriptor.getInstance().getConfig().getDataNodeId();
    // Seek-generation-aware context: stale watermarks are fenced after a seek.
    final SubscriptionCommitContext watermarkCtx = createNonCommittableSeekContext(dataNodeId);
    final SubscriptionEvent watermarkEvent =
        new SubscriptionEvent(
            SubscriptionPollResponseType.WATERMARK.getType(),
            new WatermarkPayload(watermarkTimestamp, dataNodeId),
            watermarkCtx);
    prefetchingQueue.add(watermarkEvent);

    LOGGER.debug(
        "ConsensusPrefetchingQueue {}: injected WATERMARK, watermarkTimestamp={}",
        this,
        watermarkTimestamp);
  }

  /** Returns the maximum observed data timestamp for metrics. */
  public long getMaxObservedTimestamp() {
    return maxObservedTimestamp;
  }

  // Counter: entries accepted via the in-memory (pending) path.
  private void markAcceptedFromPending() {
    pendingPathAcceptedEntries.incrementAndGet();
  }

  // Counter: entries accepted via the WAL replay path.
  private void markAcceptedFromWal() {
    walPathAcceptedEntries.incrementAndGet();
  }

  /**
   * Closes this queue: fails any pending seek, detaches and drains the prefetch subtask, flushes
   * the lingering batch when safe, deregisters metrics and the server-side entry queue, cleans up
   * all events, and persists commit progress. Locks are deliberately released before waiting on
   * the prefetch subtask to avoid blocking other lock holders.
   */
  public void close() {
    final PendingSeekRequest seekRequestToFail;
    final Pair prefetchBinding;

    acquireWriteLock();
    try {
      if (isClosed || closeRequested) {
        return;
      }
      closeRequested = true;
      seekRequestToFail = pendingSeekRequest;
      pendingSeekRequest = null;
    } finally {
      releaseWriteLock();
    }

    // Detach outside the lock; awaitIdle() below may block.
    prefetchBinding = detachPrefetchSubtask();

    if (Objects.nonNull(seekRequestToFail)) {
      seekRequestToFail.fail(
          new IllegalStateException(
              String.format("ConsensusPrefetchingQueue %s is closing before seek applies", this)));
    }

    if (Objects.nonNull(prefetchBinding.right)) {
      prefetchBinding.right.cancelPendingExecution();
      prefetchBinding.right.awaitIdle();
    }

    try {
      acquireWriteLock();
      try {
        // Only flush when no seek intervened: a pending seek or a generation bump means the
        // lingering batch belongs to a superseded frontier.
        if (!isClosed
            && pendingSeekRequest == null
            && seekGeneration.get() == observedSeekGeneration) {
          flushLingeringBatchOnCloseUnderWriteLock();
        }
        markClosed();
      } finally {
        releaseWriteLock();
      }

      // Deregister metrics after the queue is fully closed.
      ConsensusSubscriptionPrefetchingQueueMetrics.getInstance()
          .deregister(getPrefetchingQueueId());

      if (Objects.nonNull(prefetchBinding.left) && Objects.nonNull(prefetchBinding.right)) {
        if (!prefetchBinding.left.isShutdown()) {
          prefetchBinding.left.deregister(prefetchBinding.right.getTaskId());
        } else {
          // Executor already gone; close the subtask directly.
          prefetchBinding.right.close();
        }
      }

      try {
        // Deregister from IoTConsensusServerImpl (stop receiving in-memory data).
        serverImpl.deregisterSubscriptionQueue(pendingEntries);
      } catch (final Exception e) {
        LOGGER.warn("ConsensusPrefetchingQueue {}: error during deregister", this, e);
      } finally {
        try {
          cleanUp();
        } finally {
          // Persist progress before closing
          commitManager.persistAll();
        }
      }
    } finally {
      // Re-entry after this point is blocked by isClosed (set via markClosed above), so the
      // closeRequested flag can be cleared unconditionally.
      closeRequested = false;
    }
  }

  /** Flushes the lingering tablet batch on close; caller must hold the write lock. */
  private void flushLingeringBatchOnCloseUnderWriteLock() {
    if (lingerBatch.isEmpty()) {
      return;
    }
    LOGGER.info(
        "ConsensusPrefetchingQueue {}: flushing {} lingering tablets during close",
        this,
        lingerBatch.tablets.size());
    if (!flushBatch(lingerBatch, observedSeekGeneration)) {
      // Best-effort on close: if the flush fails, drop the batch rather than block shutdown.
      LOGGER.warn(
          "ConsensusPrefetchingQueue {}: failed to flush lingering batch during close, discarding it",
          this);
      lingerBatch.reset();
      resetBatchWriterProgress();
    }
  }

  /** Builds a non-committable ERROR event carrying {@code errorMessage}. */
  private SubscriptionEvent generateErrorResponse(final String errorMessage) {
    return new SubscriptionEvent(
        SubscriptionPollResponseType.ERROR.getType(),
        new ErrorPayload(errorMessage, false),
        createNonCommittableContext(IoTDBDescriptor.getInstance().getConfig().getDataNodeId()));
  }

  /** Builds a non-committable ERROR event with the shared "outdated" payload. */
  private SubscriptionEvent generateOutdatedErrorResponse() {
    return new SubscriptionEvent(
        SubscriptionPollResponseType.ERROR.getType(),
        ErrorPayload.OUTDATED_ERROR_PAYLOAD,
        createNonCommittableContext(IoTDBDescriptor.getInstance().getConfig().getDataNodeId()));
  }

  /**
   * Shared subscription events still use {@link SubscriptionCommitContext#INVALID_COMMIT_ID} to
   * mark metadata and error payloads as non-committable. Consensus correctness never treats this
   * sentinel as a replay or commit frontier.
   */
  private SubscriptionCommitContext createNonCommittableContext(final int dataNodeId) {
    return new SubscriptionCommitContext(
        dataNodeId,
        PipeDataNodeAgent.runtime().getRebootTimes(),
        topicName,
        brokerId,
        INVALID_COMMIT_ID);
  }

  // Like createNonCommittableContext, but additionally stamped with the current seek generation
  // and consensus group so the context can be fenced after a seek.
  private SubscriptionCommitContext createNonCommittableSeekContext(final int dataNodeId) {
    return new SubscriptionCommitContext(
        dataNodeId,
        PipeDataNodeAgent.runtime().getRebootTimes(),
        topicName,
        brokerId,
        INVALID_COMMIT_ID,
        seekGeneration.get(),
        consensusGroupId.toString());
  }

  /**
   * A commit context is outdated when it predates the current reboot or carries a seek generation
   * different from the queue's current one.
   */
  public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
    return PipeDataNodeAgent.runtime().getRebootTimes() > commitContext.getRebootTimes()
        || seekGeneration.get() != commitContext.getSeekGeneration();
  }

  // ======================== Status ========================

  public boolean isClosed() {
    return isClosed;
  }

  public void markClosed() {
    isClosed = true;
  }

  // ======================== Routing Runtime Version Control ========================

  public long getWalGapSkippedEntries() {
    return walGapSkippedEntries.get();
  }

  public long getEpochChangeCount() {
    return runtimeVersionChangeCount.get();
  }

  // ======================== Leader Activation ========================

  /**
   * Activates or deactivates this queue. Only the preferred-writer (leader) node's queue should be
   * active. Inactive queues skip prefetching and return null on poll.
+ */ + public void setActive(final boolean active) { + this.isActive = active; + LOGGER.info( + "ConsensusPrefetchingQueue {}: isActive set to {} (region={})", + this, + active, + consensusGroupId); + if (active) { + requestPrefetch(); + } + } + + public boolean isActive() { + return isActive; + } + + public void setActiveWriterNodeIds(final Set activeWriterNodeIds) { + this.runtimeActiveWriterNodeIds = + Collections.unmodifiableSet( + new LinkedHashSet<>(Objects.requireNonNull(activeWriterNodeIds))); + refreshEffectiveActiveWriterNodeIds(); + LOGGER.info( + "ConsensusPrefetchingQueue {}: runtimeActiveWriterNodeIds={}, effectiveActiveWriterNodeIds={} " + + "(region={}, orderMode={}, preferredWriterNodeId={})", + this, + this.runtimeActiveWriterNodeIds, + this.activeWriterNodeIds, + consensusGroupId, + orderMode, + preferredWriterNodeId); + requestPrefetch(); + } + + private void refreshEffectiveActiveWriterNodeIds() { + final LinkedHashSet effectiveWriterNodeIds = new LinkedHashSet<>(); + switch (orderMode) { + case TopicConstant.ORDER_MODE_MULTI_WRITER_VALUE: + effectiveWriterNodeIds.addAll(runtimeActiveWriterNodeIds); + if (effectiveWriterNodeIds.isEmpty() && preferredWriterNodeId >= 0) { + effectiveWriterNodeIds.add(preferredWriterNodeId); + } + break; + case TopicConstant.ORDER_MODE_PER_WRITER_VALUE: + if (preferredWriterNodeId >= 0) { + effectiveWriterNodeIds.add(preferredWriterNodeId); + } + break; + case TopicConstant.ORDER_MODE_LEADER_ONLY_VALUE: + default: + if (preferredWriterNodeId >= 0) { + effectiveWriterNodeIds.add(preferredWriterNodeId); + } + if (previousPreferredWriterNodeId >= 0 + && previousPreferredWriterNodeId != preferredWriterNodeId + && runtimeActiveWriterNodeIds.contains(previousPreferredWriterNodeId)) { + effectiveWriterNodeIds.add(previousPreferredWriterNodeId); + } + break; + } + this.activeWriterNodeIds = Collections.unmodifiableSet(effectiveWriterNodeIds); + } + + public void setPreferredWriterNodeId(final int preferredWriterNodeId) { 
+ if (this.preferredWriterNodeId != preferredWriterNodeId) { + previousPreferredWriterNodeId = this.preferredWriterNodeId; + } else { + previousPreferredWriterNodeId = -1; + } + this.preferredWriterNodeId = preferredWriterNodeId; + refreshEffectiveActiveWriterNodeIds(); + LOGGER.info( + "ConsensusPrefetchingQueue {}: preferredWriterNodeId set to {}, effectiveActiveWriterNodeIds={} " + + "(region={}, orderMode={})", + this, + this.preferredWriterNodeId, + this.activeWriterNodeIds, + consensusGroupId, + orderMode); + requestPrefetch(); + } + + public Set getActiveWriterNodeIds() { + return activeWriterNodeIds; + } + + public void setOrderMode(final String orderMode) { + final String normalizedOrderMode = TopicConfig.normalizeOrderMode(orderMode); + if (Objects.equals(this.orderMode, normalizedOrderMode)) { + return; + } + this.orderMode = normalizedOrderMode; + refreshEffectiveActiveWriterNodeIds(); + LOGGER.info( + "ConsensusPrefetchingQueue {}: orderMode set to {}, effectiveActiveWriterNodeIds={} " + + "(region={}, preferredWriterNodeId={}, runtimeActiveWriterNodeIds={})", + this, + this.orderMode, + this.activeWriterNodeIds, + consensusGroupId, + preferredWriterNodeId, + runtimeActiveWriterNodeIds); + requestPrefetch(); + } + + public String getOrderMode() { + return orderMode; + } + + private boolean isLaneRuntimeActive(final WriterLaneId laneId) { + final Set writerNodeIds = activeWriterNodeIds; + return writerNodeIds.isEmpty() || writerNodeIds.contains(laneId.writerNodeId); + } + + public void applyRuntimeState(final ConsensusRegionRuntimeState runtimeState) { + Objects.requireNonNull(runtimeState, "runtimeState"); + this.runtimeVersion = runtimeState.getRuntimeVersion(); + runtimeVersionChangeCount.incrementAndGet(); + LOGGER.info( + "ConsensusPrefetchingQueue {}: applied runtimeVersion {}", + this, + runtimeState.getRuntimeVersion()); + setPreferredWriterNodeId(runtimeState.getPreferredWriterNodeId()); + 
setActiveWriterNodeIds(runtimeState.getActiveWriterNodeIds()); + // "active" decides whether this replica should serve subscription traffic on the current node. + // In multi-writer mode, activeWriterNodeIds may intentionally include follower replicas for + // ordering/watermark coordination, so it must not be reused as the local service-activation + // signal. + setActive(runtimeState.isActive()); + LOGGER.info( + "ConsensusPrefetchingQueue {}: applied runtimeState={}, preferredWriterNodeId={}", + this, + runtimeState, + runtimeState.getPreferredWriterNodeId()); + if (runtimeState.isActive()) { + requestPrefetch(); + } + } + + public String getPrefetchingQueueId() { + return brokerId + "_" + topicName; + } + + public long getSubscriptionUncommittedEventCount() { + return inFlightEvents.size(); + } + + /** Exposes the current seek generation for runtime tests and metrics. */ + public long getCurrentSeekGeneration() { + return seekGeneration.get(); + } + + public int getPrefetchedEventCount() { + return prefetchingQueue.size(); + } + + public long getCurrentReadSearchIndex() { + return nextExpectedSearchIndex.get(); + } + + public long getPendingPathAcceptedEntries() { + return pendingPathAcceptedEntries.get(); + } + + public long getWalPathAcceptedEntries() { + return walPathAcceptedEntries.get(); + } + + public String getBrokerId() { + return brokerId; + } + + public String getTopicName() { + return topicName; + } + + public ConsensusGroupId getConsensusGroupId() { + return consensusGroupId; + } + + /** + * Returns an approximate backlog for this queue. + * + *

The metric intentionally avoids collapsing per-writer committed progress into a single + * scalar local sequence. Instead it counts queued/in-flight work and adds one extra unit when the + * local WAL reader still has unread entries beyond its current replay cursor. + */ + public long getLag() { + long lag = + prefetchingQueue.size() + + inFlightEvents.size() + + pendingEntries.size() + + getRealtimeBufferedEntryCount(); + if (nextExpectedSearchIndex.get() < consensusReqReader.getCurrentSearchIndex()) { + lag++; + } + return lag; + } + + // ======================== Stringify ======================== + + public Map coreReportMessage() { + final Map result = new HashMap<>(); + result.put("brokerId", brokerId); + result.put("topicName", topicName); + result.put("consensusGroupId", consensusGroupId.toString()); + result.put("currentReadSearchIndex", String.valueOf(nextExpectedSearchIndex.get())); + result.put("prefetchingQueueSize", String.valueOf(prefetchingQueue.size())); + result.put("inFlightEventsSize", String.valueOf(inFlightEvents.size())); + result.put("pendingEntriesSize", String.valueOf(pendingEntries.size())); + result.put("pendingPathAcceptedEntries", String.valueOf(getPendingPathAcceptedEntries())); + result.put("walPathAcceptedEntries", String.valueOf(getWalPathAcceptedEntries())); + result.put("seekGeneration", String.valueOf(seekGeneration.get())); + result.put("walGapSkippedEntries", String.valueOf(walGapSkippedEntries.get())); + result.put("bufferedRealtimeEntryCount", String.valueOf(getRealtimeBufferedEntryCount())); + result.put("lag", String.valueOf(getLag())); + result.put("isClosed", String.valueOf(isClosed)); + result.put("isActive", String.valueOf(isActive)); + result.put("orderMode", orderMode); + result.put("preferredWriterNodeId", String.valueOf(preferredWriterNodeId)); + result.put("activeWriterCount", String.valueOf(activeWriterNodeIds.size())); + result.put("runtimeActiveWriterCount", String.valueOf(runtimeActiveWriterNodeIds.size())); + 
result.put("recoveryWriterCount", String.valueOf(recoveryWriterProgressByWriter.size())); + result.put("writerLaneCount", String.valueOf(writerLanes.size())); + result.put("realtimeLaneCount", String.valueOf(realtimeEntriesByLane.size())); + return result; + } + + @Override + public String toString() { + return "ConsensusPrefetchingQueue" + coreReportMessage(); + } + + // ======================== Inner Classes ======================== + + private interface LaneBufferedEntry { + List getTablets(); + + long getSearchIndex(); + + long getPhysicalTime(); + + int getWriterNodeId(); + + long getWriterEpoch(); + + long getLocalSeq(); + + OrderingKey getOrderingKey(); + } + + private static final class DeliveryBatchState { + + private final List tablets = new ArrayList<>(); + private long startSearchIndex; + private long endSearchIndex; + private long estimatedBytes; + private long firstTabletTimeMs; + private long physicalTime; + private long lastLocalSeq; + private int writerNodeId; + private long writerEpoch; + private int entryCount; + + private DeliveryBatchState() { + reset(); + } + + private boolean isEmpty() { + return tablets.isEmpty(); + } + + private void append( + final LaneBufferedEntry entry, + final long entryEstimatedBytes, + final boolean trackLingerTime) { + if (tablets.isEmpty()) { + if (trackLingerTime) { + firstTabletTimeMs = System.currentTimeMillis(); + } + writerNodeId = entry.getWriterNodeId(); + writerEpoch = entry.getWriterEpoch(); + } + if (entry.getSearchIndex() >= 0) { + if (startSearchIndex < 0) { + startSearchIndex = entry.getSearchIndex(); + } + endSearchIndex = entry.getSearchIndex(); + } + tablets.addAll(entry.getTablets()); + estimatedBytes += entryEstimatedBytes; + physicalTime = entry.getPhysicalTime(); + lastLocalSeq = entry.getLocalSeq(); + writerNodeId = entry.getWriterNodeId(); + writerEpoch = entry.getWriterEpoch(); + entryCount++; + } + + private void reset() { + tablets.clear(); + startSearchIndex = -1L; + endSearchIndex = -1L; 
+ estimatedBytes = 0L; + firstTabletTimeMs = 0L; + physicalTime = 0L; + lastLocalSeq = -1L; + writerNodeId = -1; + writerEpoch = 0L; + entryCount = 0; + } + } + + private static final class WriterLaneId { + private final int writerNodeId; + private final long writerEpoch; + + private WriterLaneId(final int writerNodeId, final long writerEpoch) { + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof WriterLaneId)) { + return false; + } + final WriterLaneId that = (WriterLaneId) obj; + return writerNodeId == that.writerNodeId && writerEpoch == that.writerEpoch; + } + + @Override + public int hashCode() { + return Objects.hash(writerNodeId, writerEpoch); + } + } + + private static final class WriterLaneState { + private long effectiveSafePt = 0L; + private boolean closed = false; + } + + private static final class PreparedEntry implements LaneBufferedEntry { + private final List tablets; + private final long searchIndex; + private final long physicalTime; + private final int writerNodeId; + private final long writerEpoch; + private final long localSeq; + + private PreparedEntry( + final List tablets, + final long searchIndex, + final long physicalTime, + final int writerNodeId, + final long writerEpoch, + final long localSeq) { + this.tablets = tablets; + this.searchIndex = searchIndex; + this.physicalTime = physicalTime; + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + this.localSeq = localSeq; + } + + @Override + public List getTablets() { + return tablets; + } + + @Override + public long getSearchIndex() { + return searchIndex; + } + + @Override + public long getPhysicalTime() { + return physicalTime; + } + + @Override + public int getWriterNodeId() { + return writerNodeId; + } + + @Override + public long getWriterEpoch() { + return writerEpoch; + } + + @Override + public long getLocalSeq() { + return 
localSeq; + } + + @Override + public OrderingKey getOrderingKey() { + return new OrderingKey(physicalTime, writerNodeId, writerEpoch, localSeq); + } + } + + private static final class LaneFrontier implements Comparable { + private final WriterLaneId laneId; + private final OrderingKey orderingKey; + private final boolean isBarrier; + + private LaneFrontier( + final WriterLaneId laneId, final OrderingKey orderingKey, final boolean isBarrier) { + this.laneId = laneId; + this.orderingKey = orderingKey; + this.isBarrier = isBarrier; + } + + private static LaneFrontier forHead(final WriterLaneId laneId, final LaneBufferedEntry entry) { + return new LaneFrontier(laneId, entry.getOrderingKey(), false); + } + + private static LaneFrontier forBarrier(final WriterLaneId laneId, final long effectiveSafePt) { + return new LaneFrontier( + laneId, + new OrderingKey(effectiveSafePt, Integer.MIN_VALUE, Long.MIN_VALUE, Long.MIN_VALUE), + true); + } + + @Override + public int compareTo(final LaneFrontier other) { + int cmp = orderingKey.compareTo(other.orderingKey); + if (cmp != 0) { + return cmp; + } + if (isBarrier != other.isBarrier) { + return isBarrier ? -1 : 1; + } + cmp = Integer.compare(laneId.writerNodeId, other.laneId.writerNodeId); + if (cmp != 0) { + return cmp; + } + return Long.compare(laneId.writerEpoch, other.laneId.writerEpoch); + } + } + + /** Composite ordering key (physicalTime, nodeId, writerEpoch, localSeq) for lane ordering. 
*/ + static final class OrderingKey implements Comparable { + final long physicalTime; + final int nodeId; + final long writerEpoch; + final long localSeq; + + OrderingKey( + final long physicalTime, final int nodeId, final long writerEpoch, final long localSeq) { + this.physicalTime = physicalTime; + this.nodeId = nodeId; + this.writerEpoch = writerEpoch; + this.localSeq = localSeq; + } + + @Override + public int compareTo(final OrderingKey o) { + int cmp = Long.compare(physicalTime, o.physicalTime); + if (cmp != 0) { + return cmp; + } + cmp = Integer.compare(nodeId, o.nodeId); + if (cmp != 0) { + return cmp; + } + cmp = Long.compare(writerEpoch, o.writerEpoch); + if (cmp != 0) { + return cmp; + } + return Long.compare(localSeq, o.localSeq); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (!(o instanceof OrderingKey)) { + return false; + } + final OrderingKey that = (OrderingKey) o; + return physicalTime == that.physicalTime + && nodeId == that.nodeId + && writerEpoch == that.writerEpoch + && localSeq == that.localSeq; + } + + @Override + public int hashCode() { + return Objects.hash(physicalTime, nodeId, writerEpoch, localSeq); + } + + @Override + public String toString() { + return "(" + physicalTime + "," + nodeId + "," + writerEpoch + "," + localSeq + ")"; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusRegionRuntimeState.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusRegionRuntimeState.java new file mode 100644 index 0000000000000..92e030ce93b8f --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusRegionRuntimeState.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.Objects; +import java.util.Set; + +/** Runtime control state for consensus subscription delivery on a single region replica. */ +public class ConsensusRegionRuntimeState { + + private final long runtimeVersion; + private final int preferredWriterNodeId; + private final boolean active; + private final Set activeWriterNodeIds; + + public ConsensusRegionRuntimeState( + final long runtimeVersion, + final int preferredWriterNodeId, + final boolean active, + final Set activeWriterNodeIds) { + this.runtimeVersion = runtimeVersion; + this.preferredWriterNodeId = preferredWriterNodeId; + this.active = active; + this.activeWriterNodeIds = + Collections.unmodifiableSet( + new LinkedHashSet<>(Objects.requireNonNull(activeWriterNodeIds))); + } + + public long getRuntimeVersion() { + return runtimeVersion; + } + + public int getPreferredWriterNodeId() { + return preferredWriterNodeId; + } + + public boolean isActive() { + return active; + } + + public Set getActiveWriterNodeIds() { + return activeWriterNodeIds; + } + + public static ConsensusRegionRuntimeState leaderOnly( + final long runtimeVersion, final int preferredWriterNodeId, final 
boolean active) { + return new ConsensusRegionRuntimeState( + runtimeVersion, + preferredWriterNodeId, + active, + Collections.singleton(preferredWriterNodeId)); + } + + @Override + public String toString() { + return "ConsensusRegionRuntimeState{" + + "runtimeVersion=" + + runtimeVersion + + ", preferredWriterNodeId=" + + preferredWriterNodeId + + ", active=" + + active + + ", activeWriterNodeIds=" + + activeWriterNodeIds + + '}'; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java new file mode 100644 index 0000000000000..70f45080763ff --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java @@ -0,0 +1,1298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TEndPoint; +import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; +import org.apache.iotdb.commons.client.ClientPoolFactory; +import org.apache.iotdb.commons.client.IClientManager; +import org.apache.iotdb.commons.client.exception.ClientManagerException; +import org.apache.iotdb.commons.client.sync.SyncDataNodeInternalServiceClient; +import org.apache.iotdb.commons.consensus.ConfigRegionId; +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressReq; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressResp; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.protocol.client.ConfigNodeClient; +import org.apache.iotdb.db.protocol.client.ConfigNodeClientManager; +import org.apache.iotdb.db.protocol.client.ConfigNodeInfo; +import org.apache.iotdb.db.queryengine.plan.analyze.ClusterPartitionFetcher; +import org.apache.iotdb.mpp.rpc.thrift.TSyncSubscriptionProgressReq; +import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.apache.thrift.TException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; 
+import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Manages committed progress for consensus-based subscriptions. + * + *

State is maintained per {@code (consumerGroup, topic, region)} so each DataRegion can recover + * independently. + * + *

Committed progress is represented in per-writer terms via {@link WriterId} and {@link + * WriterProgress}. Outstanding deliveries are tracked by writer-local slots, while commit + * advancement is computed with ordered progress keys derived from {@code physicalTime}, {@code + * writerNodeId}, {@code writerEpoch}, and {@code localSeq}. {@code searchIndex} is not the + * committed frontier here; it only remains an implementation aid for WAL positioning elsewhere. + * + *

Key responsibilities: + * + *

    + *
  • Track dispatched but uncommitted mappings per writer + *
  • Advance committed progress idempotently and contiguously on ack/commit + *
  • Persist, recover, and broadcast committed region progress + *
+ */ +public class ConsensusSubscriptionCommitManager { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionCommitManager.class); + + private static final String PROGRESS_FILE_PREFIX = "consensus_subscription_progress_"; + private static final String PROGRESS_FILE_SUFFIX = ".dat"; + + private static final IClientManager CONFIG_NODE_CLIENT_MANAGER = + ConfigNodeClientManager.getInstance(); + + /** Client manager for DataNode-to-DataNode RPC (progress broadcast). */ + private static final IClientManager + SYNC_DN_CLIENT_MANAGER = + new IClientManager.Factory() + .createClientManager( + new ClientPoolFactory.SyncDataNodeInternalServiceClientPoolFactory()); + + /** Minimum interval (ms) between broadcasts for the same (consumerGroup, topic, region). */ + private static final long MIN_BROADCAST_INTERVAL_MS = 5000; + + /** Rate-limiting: last broadcast timestamp per key. */ + private final Map lastBroadcastTime = new ConcurrentHashMap<>(); + + /** Single-threaded executor for fire-and-forget broadcasts. */ + private final ExecutorService broadcastExecutor = + Executors.newSingleThreadExecutor( + r -> { + final Thread t = new Thread(r, "SubscriptionProgressBroadcast"); + t.setDaemon(true); + return t; + }); + + /** Key: "consumerGroupId##topicName##regionId" -> progress tracking state */ + private final Map commitStates = + new ConcurrentHashMap<>(); + + private final String persistDir; + + private ConsensusSubscriptionCommitManager() { + this.persistDir = + IoTDBDescriptor.getInstance().getConfig().getSystemDir() + + File.separator + + "subscription" + + File.separator + + "consensus_progress"; + final File dir = new File(persistDir); + if (!dir.exists()) { + dir.mkdirs(); + } + } + + /** + * Gets or creates the commit state for a specific (consumerGroup, topic, region) triple. 
+ * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID + * @return the commit state + */ + public ConsensusSubscriptionCommitState getOrCreateState( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final String regionIdString = regionId.toString(); + return commitStates.computeIfAbsent( + key, + k -> { + // Try to recover from persisted local state + final ConsensusSubscriptionCommitState recovered = tryRecover(key, regionIdString); + if (recovered != null) { + return recovered; + } + final ConsensusSubscriptionCommitState recoveredFromConfigNode = + queryCommitProgressStateFromConfigNode(consumerGroupId, topicName, regionId); + if (Objects.nonNull(recoveredFromConfigNode)) { + return recoveredFromConfigNode; + } + return new ConsensusSubscriptionCommitState( + regionIdString, new SubscriptionConsensusProgress()); + }); + } + + public boolean hasPersistedState( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + return getProgressFile(generateKey(consumerGroupId, topicName, regionId)).exists(); + } + + public void recordMapping( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterId writerId, + final WriterProgress writerProgress) { + final ConsensusSubscriptionCommitState state = + getOrCreateState(consumerGroupId, topicName, regionId); + state.recordMapping(writerId, writerProgress); + } + + public boolean commit( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterId writerId, + final WriterProgress writerProgress) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + LOGGER.warn( + 
"ConsensusSubscriptionCommitManager: Cannot commit for unknown state, " + + "consumerGroupId={}, topicName={}, regionId={}, writerId={}, writerProgress={}", + consumerGroupId, + topicName, + regionId, + writerId, + writerProgress); + return false; + } + final CommitOperationResult result = state.commitAndGetResult(writerId, writerProgress); + if (result.isHandled()) { + // Periodically persist progress + persistProgressIfNeeded(key, state); + if (result.hasAdvancedWriter()) { + maybeBroadcast( + key, + consumerGroupId, + topicName, + regionId, + result.getAdvancedWriterProgress(), + result.getAdvancedWriterId()); + } + } + return result.isHandled(); + } + + public boolean commitWithoutOutstanding( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterId writerId, + final WriterProgress writerProgress) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: Cannot direct-commit for unknown state, " + + "consumerGroupId={}, topicName={}, regionId={}, writerId={}, writerProgress={}", + consumerGroupId, + topicName, + regionId, + writerId, + writerProgress); + return false; + } + final CommitOperationResult result = + state.commitWithoutOutstandingAndGetResult(writerId, writerProgress); + if (result.isHandled()) { + persistProgressIfNeeded(key, state); + if (result.hasAdvancedWriter()) { + maybeBroadcast( + key, + consumerGroupId, + topicName, + regionId, + result.getAdvancedWriterProgress(), + result.getAdvancedWriterId()); + } + } + return result.isHandled(); + } + + public long getCommittedPhysicalTime( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state 
!= null ? state.getCommittedPhysicalTime() : 0L; + } + + public long getCommittedLocalSeq( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? state.getCommittedLocalSeq() : -1L; + } + + public int getCommittedWriterNodeId( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? state.getCommittedWriterNodeId() : -1; + } + + public long getCommittedWriterEpoch( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? state.getCommittedWriterEpoch() : 0L; + } + + public WriterId getCommittedWriterId( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? state.getCommittedWriterId() : null; + } + + public WriterProgress getCommittedWriterProgress( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? 
state.getCommittedWriterProgress() : null; + } + + public RegionProgress getCommittedRegionProgress( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + return new RegionProgress(Collections.emptyMap()); + } + return state.getCommittedRegionProgress(); + } + + /** + * Removes state for a specific (consumerGroup, topic, region) triple. + * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID + */ + public void removeState( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + commitStates.remove(key); + // Clean up persisted file + final File file = getProgressFile(key); + if (file.exists()) { + file.delete(); + } + } + + /** + * Removes all states for a given (consumerGroup, topic) pair across all regions. Used during + * subscription teardown when the individual regionIds may not be readily available. 
+ * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + */ + public void removeAllStatesForTopic(final String consumerGroupId, final String topicName) { + final String prefix = consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR; + final Iterator> it = + commitStates.entrySet().iterator(); + while (it.hasNext()) { + final Map.Entry entry = it.next(); + if (entry.getKey().startsWith(prefix)) { + it.remove(); + final File file = getProgressFile(entry.getKey()); + if (file.exists()) { + file.delete(); + } + } + } + } + + public void resetState( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final RegionProgress regionProgress) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: Cannot reset unknown state, " + + "consumerGroupId={}, topicName={}, regionId={}", + consumerGroupId, + topicName, + regionId); + return; + } + state.resetForSeek(regionProgress); + persistProgress(key, state); + } + + /** Persists all states. Should be called during graceful shutdown. 
*/ + public void persistAll() { + for (final Map.Entry entry : + commitStates.entrySet()) { + persistProgress(entry.getKey(), entry.getValue()); + } + } + + public Map collectAllRegionProgress(final int dataNodeId) { + final Map result = new ConcurrentHashMap<>(); + final String suffix = KEY_SEPARATOR + dataNodeId; + for (final Map.Entry entry : + commitStates.entrySet()) { + final RegionProgress regionProgress = entry.getValue().getCommittedRegionProgress(); + final ByteBuffer serialized = serializeRegionProgress(regionProgress); + if (Objects.nonNull(serialized)) { + result.put(entry.getKey() + suffix, serialized); + } + } + return result; + } + + // ======================== Progress Broadcast (Leader → Follower) ======================== + + /** + * Broadcasts committed progress to followers if enough time has elapsed since the last broadcast + * for this key. The broadcast is async and fire-and-forget. + */ + private void maybeBroadcast( + final String key, + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterProgress committedWriterProgress, + final WriterId committedWriterId) { + if (Objects.isNull(committedWriterId) || Objects.isNull(committedWriterProgress)) { + return; + } + final String broadcastKey = buildBroadcastKey(key, committedWriterId); + final long now = System.currentTimeMillis(); + final Long last = lastBroadcastTime.get(broadcastKey); + if (last != null && now - last < MIN_BROADCAST_INTERVAL_MS) { + return; + } + lastBroadcastTime.put(broadcastKey, now); + broadcastExecutor.submit( + () -> + doBroadcast( + consumerGroupId, topicName, regionId, committedWriterProgress, committedWriterId)); + } + + /** + * Sends committed progress to all follower replicas of the given region. Uses the partition cache + * to discover replica endpoints and skips the local DataNode. 
+ */ + private void doBroadcast( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterProgress writerProgress, + final WriterId writerId) { + final int localDataNodeId = IoTDBDescriptor.getInstance().getConfig().getDataNodeId(); + try { + final List replicaSets = + ClusterPartitionFetcher.getInstance() + .getRegionReplicaSet( + Collections.singletonList(regionId.convertToTConsensusGroupId())); + if (replicaSets.isEmpty()) { + return; + } + final String regionIdStr = regionId.toString(); + final TSyncSubscriptionProgressReq req = + new TSyncSubscriptionProgressReq( + consumerGroupId, + topicName, + regionIdStr, + Objects.nonNull(writerProgress) ? writerProgress.getPhysicalTime() : 0L, + Objects.nonNull(writerProgress) ? writerProgress.getLocalSeq() : -1L); + if (Objects.nonNull(writerId) && writerId.getNodeId() >= 0) { + req.setWriterNodeId(writerId.getNodeId()); + } + if (Objects.nonNull(writerId) && writerId.getWriterEpoch() > 0) { + req.setWriterEpoch(writerId.getWriterEpoch()); + } + + for (final TDataNodeLocation location : replicaSets.get(0).getDataNodeLocations()) { + if (location.getDataNodeId() == localDataNodeId) { + continue; // skip self + } + final TEndPoint endpoint = location.getInternalEndPoint(); + try (final SyncDataNodeInternalServiceClient client = + SYNC_DN_CLIENT_MANAGER.borrowClient(endpoint)) { + client.syncSubscriptionProgress(req); + } catch (final ClientManagerException | TException e) { + LOGGER.debug( + "Failed to broadcast subscription progress to DataNode {} at {}: {}", + location.getDataNodeId(), + endpoint, + e.getMessage()); + } + } + } catch (final Exception e) { + LOGGER.debug( + "Failed to broadcast subscription progress for region {}: {}", regionId, e.getMessage()); + } + } + + /** + * Receives a committed progress broadcast from another DataNode (Leader). Updates local state if + * the broadcast progress is ahead of the current local progress. 
+ */ + public void receiveProgressBroadcast( + final String consumerGroupId, + final String topicName, + final String regionIdStr, + final long physicalTime, + final long localSeq, + final int writerNodeId, + final long writerEpoch) { + receiveProgressBroadcast( + consumerGroupId, + topicName, + regionIdStr, + buildWriterId(regionIdStr, writerNodeId, writerEpoch), + new WriterProgress(physicalTime, localSeq)); + } + + public void receiveProgressBroadcast( + final String consumerGroupId, + final String topicName, + final String regionIdStr, + final WriterId writerId, + final WriterProgress writerProgress) { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: ignore broadcast without writer identity, " + + "consumerGroupId={}, topicName={}, regionId={}, writerId={}, writerProgress={}", + consumerGroupId, + topicName, + regionIdStr, + writerId, + writerProgress); + return; + } + final String key = consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR + regionIdStr; + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state != null) { + // Update only if broadcast is ahead + state.updateFromBroadcast(writerId, writerProgress); + persistProgressIfNeeded(key, state); + } else { + // Create a new state from the broadcast progress + final ConsensusSubscriptionCommitState newState = + new ConsensusSubscriptionCommitState( + regionIdStr, + new SubscriptionConsensusProgress( + new RegionProgress(Collections.singletonMap(writerId, writerProgress)), 0L)); + newState.updateFromBroadcast(writerId, writerProgress); + commitStates.putIfAbsent(key, newState); + persistProgress(key, commitStates.get(key)); + } + LOGGER.debug( + "Received subscription progress broadcast: consumerGroupId={}, topicName={}, " + + "regionId={}, physicalTime={}, localSeq={}", + consumerGroupId, + topicName, + regionIdStr, + writerProgress != null ? 
writerProgress.getPhysicalTime() : 0L, + writerProgress != null ? writerProgress.getLocalSeq() : -1L); + } + + // ======================== Helper Methods ======================== + + // Use a separator that cannot appear in consumerGroupId, topicName, or regionId + // to prevent key collisions (e.g., "a_b" + "c" vs "a" + "b_c"). + private static final String KEY_SEPARATOR = "##"; + + private String generateKey( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + return consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR + regionId.toString(); + } + + private File getProgressFile(final String key) { + return new File(persistDir, PROGRESS_FILE_PREFIX + key + PROGRESS_FILE_SUFFIX); + } + + private ConsensusSubscriptionCommitState tryRecover(final String key, final String regionIdStr) { + final File file = getProgressFile(key); + if (!file.exists()) { + return null; + } + try (final FileInputStream fis = new FileInputStream(file)) { + final byte[] bytes = new byte[(int) file.length()]; + fis.read(bytes); + final ByteBuffer buffer = ByteBuffer.wrap(bytes); + return ConsensusSubscriptionCommitState.deserialize(regionIdStr, buffer); + } catch (final IOException e) { + LOGGER.warn("Failed to recover consensus subscription progress from {}", file, e); + return null; + } + } + + private static WriterId buildWriterId( + final String regionIdStr, final int writerNodeId, final long writerEpoch) { + return writerNodeId >= 0 ? new WriterId(regionIdStr, writerNodeId, writerEpoch) : null; + } + + static String buildBroadcastKey(final String key, final WriterId writerId) { + return key + + KEY_SEPARATOR + + (Objects.nonNull(writerId) ? writerId.getNodeId() : -1) + + KEY_SEPARATOR + + (Objects.nonNull(writerId) ? 
writerId.getWriterEpoch() : 0L); + } + + private ConsensusSubscriptionCommitState queryCommitProgressStateFromConfigNode( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + try (final ConfigNodeClient configNodeClient = + CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { + final TGetCommitProgressReq req = + new TGetCommitProgressReq( + consumerGroupId, + topicName, + regionId.getId(), + IoTDBDescriptor.getInstance().getConfig().getDataNodeId()); + final TGetCommitProgressResp resp = configNodeClient.getCommitProgress(req); + if (resp.status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + return null; + } + if (resp.isSetCommittedRegionProgress()) { + final RegionProgress committedRegionProgress = + deserializeRegionProgress( + ByteBuffer.wrap(resp.getCommittedRegionProgress()).asReadOnlyBuffer()); + if (Objects.nonNull(committedRegionProgress) + && !committedRegionProgress.getWriterPositions().isEmpty()) { + LOGGER.info( + "ConsensusSubscriptionCommitManager: recovered committedRegionProgress={} from " + + "ConfigNode for consumerGroupId={}, topicName={}, regionId={}", + committedRegionProgress, + consumerGroupId, + topicName, + regionId); + final ConsensusSubscriptionCommitState recoveredState = + new ConsensusSubscriptionCommitState( + regionId.toString(), new SubscriptionConsensusProgress()); + recoveredState.resetForSeek(committedRegionProgress); + return recoveredState; + } + } + } catch (final ClientManagerException | TException e) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: failed to query commit progress from ConfigNode " + + "for consumerGroupId={}, topicName={}, regionId={}, starting from 0", + consumerGroupId, + topicName, + regionId, + e); + } + return null; + } + + private static ByteBuffer serializeRegionProgress(final RegionProgress regionProgress) { + if (Objects.isNull(regionProgress)) { + return null; + } + try (final ByteArrayOutputStream baos = new 
ByteArrayOutputStream(); + final DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()); + } catch (final IOException e) { + LOGGER.warn("Failed to serialize committed region progress {}", regionProgress, e); + return null; + } + } + + private static RegionProgress deserializeRegionProgress(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return null; + } + final ByteBuffer duplicate = buffer.asReadOnlyBuffer(); + duplicate.rewind(); + return RegionProgress.deserialize(duplicate); + } + + private void persistProgressIfNeeded( + final String key, final ConsensusSubscriptionCommitState state) { + final int interval = + SubscriptionConfig.getInstance().getSubscriptionConsensusCommitPersistInterval(); + if (interval > 0 && state.getProgress().getCommitIndex() % interval == 0) { + persistProgress(key, state); + } + } + + private void persistProgress(final String key, final ConsensusSubscriptionCommitState state) { + final File file = getProgressFile(key); + try (final FileOutputStream fos = new FileOutputStream(file); + final DataOutputStream dos = new DataOutputStream(fos)) { + state.serialize(dos); + dos.flush(); + if (SubscriptionConfig.getInstance().isSubscriptionConsensusCommitFsyncEnabled()) { + fos.getFD().sync(); + } + } catch (final IOException e) { + LOGGER.warn("Failed to persist consensus subscription progress to {}", file, e); + } + } + + // ======================== Inner State Class ======================== + + /** + * Tracks commit state for a single (consumerGroup, topic, region) triple using (physicalTime, + * localSeq) pairs for cross-leader-migration consistency. Outstanding and committed positions are + * tracked as ProgressKey objects rather than raw searchIndex values. 
+ */ + public static class ConsensusSubscriptionCommitState { + + private final String regionId; + + private final SubscriptionConsensusProgress progress; + + /** LRU set of recently committed keys for idempotent re-commit detection. */ + private static final int RECENTLY_COMMITTED_CAPACITY = 1024; + + private final Set recentlyCommittedKeys = + Collections.newSetFromMap( + new LinkedHashMap() { + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return size() > RECENTLY_COMMITTED_CAPACITY; + } + }); + + /** Real committed checkpoint per writer. */ + private final Map committedWriterPositions = new LinkedHashMap<>(); + + /** Tracks dispatched but not-yet-committed events by writer-local slot. */ + private final Map outstandingKeys = new ConcurrentHashMap<>(); + + /** Tracks committed dispatched entries that cannot yet advance the frontier because of gaps. */ + private final Map committedPendingKeys = new LinkedHashMap<>(); + + public ConsensusSubscriptionCommitState( + final String regionId, final SubscriptionConsensusProgress progress) { + this.regionId = regionId; + this.progress = progress; + committedWriterPositions.putAll(progress.getCommittedRegionProgress().getWriterPositions()); + syncPersistedProgress(); + } + + public SubscriptionConsensusProgress getProgress() { + return progress; + } + + public long getCommittedPhysicalTime() { + return getDerivedCommittedFrontierKey().physicalTime; + } + + public long getCommittedLocalSeq() { + return getDerivedCommittedFrontierKey().localSeq; + } + + public int getCommittedWriterNodeId() { + final WriterId committedWriterId = getCommittedWriterId(); + return Objects.nonNull(committedWriterId) ? committedWriterId.getNodeId() : -1; + } + + public long getCommittedWriterEpoch() { + final WriterId committedWriterId = getCommittedWriterId(); + return Objects.nonNull(committedWriterId) ? 
committedWriterId.getWriterEpoch() : 0L; + } + + public WriterId getCommittedWriterId() { + return getDerivedCommittedFrontierKey().toWriterId(regionId); + } + + public WriterProgress getCommittedWriterProgress() { + return getDerivedCommittedFrontierKey().toWriterProgress(); + } + + public RegionProgress getCommittedRegionProgress() { + synchronized (this) { + return new RegionProgress(new LinkedHashMap<>(committedWriterPositions)); + } + } + + /** Threshold for warning about outstanding (uncommitted) entries accumulation. */ + private static final int OUTSTANDING_SIZE_WARN_THRESHOLD = 10000; + + public void recordMapping(final WriterId writerId, final WriterProgress writerProgress) { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: ignore mapping without writer identity, " + + "writerId={}, writerProgress={}", + writerId, + writerProgress); + return; + } + final ProgressKey key = new ProgressKey(writerId, writerProgress); + final ProgressSlot slot = ProgressSlot.from(key); + synchronized (this) { + final ProgressKey previous = outstandingKeys.put(slot, key); + if (Objects.nonNull(previous) && !previous.equals(key)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: duplicate outstanding mapping for slot={}, " + + "previous={}, current={}", + slot, + previous, + key); + } + final int size = outstandingKeys.size(); + if (size > OUTSTANDING_SIZE_WARN_THRESHOLD && size % OUTSTANDING_SIZE_WARN_THRESHOLD == 1) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: outstanding size ({}) exceeds threshold ({}), " + + "consumers may not be committing. committed=({},{}), writer=({}, {})", + size, + OUTSTANDING_SIZE_WARN_THRESHOLD, + getCommittedPhysicalTime(), + getCommittedLocalSeq(), + getCommittedWriterNodeId(), + getCommittedWriterEpoch()); + } + } + } + + /** + * Commits the specified event and advances the committed position contiguously. 
+ * + * @param writerProgress the writer progress of the event to commit + * @return true if successfully committed + */ + public boolean commit(final WriterId writerId, final WriterProgress writerProgress) { + return commitAndGetResult(writerId, writerProgress).isHandled(); + } + + CommitOperationResult commitAndGetResult( + final WriterId writerId, final WriterProgress writerProgress) { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: missing writer identity for commit, " + + "writerId={}, writerProgress={}", + writerId, + writerProgress); + return CommitOperationResult.unhandled(); + } + final ProgressKey key = new ProgressKey(writerId, writerProgress); + + synchronized (this) { + final ProgressKey recordedKey = outstandingKeys.remove(ProgressSlot.from(key)); + if (recordedKey == null) { + if (recentlyCommittedKeys.contains(key)) { + LOGGER.debug( + "ConsensusSubscriptionCommitState: idempotent re-commit for ({},{},{},{})", + key.physicalTime, + key.localSeq, + key.writerNodeId, + key.writerEpoch); + progress.incrementCommitIndex(); + return CommitOperationResult.handledWithoutAdvance(); + } + LOGGER.warn( + "ConsensusSubscriptionCommitState: unknown key ({},{},{},{}) for commit", + key.physicalTime, + key.localSeq, + key.writerNodeId, + key.writerEpoch); + return CommitOperationResult.unhandled(); + } + final ProgressKey effectiveKey = recordedKey.resolveMissingFields(writerId, writerProgress); + final WriterId effectiveWriterId = effectiveKey.toWriterId(regionId); + final WriterProgress before = getCommittedWriterProgressForWriter(effectiveWriterId); + recentlyCommittedKeys.add(effectiveKey); + stageCommittedAndAdvance(effectiveKey); + progress.incrementCommitIndex(); + syncPersistedProgress(); + return buildCommitOperationResult( + effectiveWriterId, before, getCommittedWriterProgressForWriter(effectiveWriterId)); + } + } + + public boolean commitWithoutOutstanding( + final WriterId 
writerId, final WriterProgress writerProgress) { + return commitWithoutOutstandingAndGetResult(writerId, writerProgress).isHandled(); + } + + CommitOperationResult commitWithoutOutstandingAndGetResult( + final WriterId writerId, final WriterProgress writerProgress) { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: missing writer identity for direct commit, " + + "writerId={}, writerProgress={}", + writerId, + writerProgress); + return CommitOperationResult.unhandled(); + } + final ProgressKey incomingKey = new ProgressKey(writerId, writerProgress); + + synchronized (this) { + if (recentlyCommittedKeys.contains(incomingKey)) { + LOGGER.debug( + "ConsensusSubscriptionCommitState: idempotent direct commit for ({},{},{},{})", + incomingKey.physicalTime, + incomingKey.localSeq, + incomingKey.writerNodeId, + incomingKey.writerEpoch); + progress.incrementCommitIndex(); + return CommitOperationResult.handledWithoutAdvance(); + } + + final ProgressKey outstandingKey = outstandingKeys.remove(ProgressSlot.from(incomingKey)); + if (Objects.isNull(outstandingKey)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: reject direct commit without outstanding mapping " + + "for ({},{},{},{})", + incomingKey.physicalTime, + incomingKey.localSeq, + incomingKey.writerNodeId, + incomingKey.writerEpoch); + return CommitOperationResult.unhandled(); + } + final ProgressKey effectiveKey = + outstandingKey.resolveMissingFields(writerId, writerProgress); + final WriterId effectiveWriterId = effectiveKey.toWriterId(regionId); + final WriterProgress before = getCommittedWriterProgressForWriter(effectiveWriterId); + recentlyCommittedKeys.add(effectiveKey); + stageCommittedAndAdvance(effectiveKey); + progress.incrementCommitIndex(); + syncPersistedProgress(); + return buildCommitOperationResult( + effectiveWriterId, before, getCommittedWriterProgressForWriter(effectiveWriterId)); + } + } + + public void resetForSeek(final 
RegionProgress regionProgress) { + synchronized (this) { + outstandingKeys.clear(); + committedPendingKeys.clear(); + recentlyCommittedKeys.clear(); + committedWriterPositions.clear(); + if (Objects.nonNull(regionProgress)) { + for (final Map.Entry entry : + regionProgress.getWriterPositions().entrySet()) { + if (Objects.nonNull(entry.getKey()) && Objects.nonNull(entry.getValue())) { + committedWriterPositions.put(entry.getKey(), entry.getValue()); + } + } + } + syncPersistedProgress(); + } + } + + /** + * Updates committed progress from a Leader broadcast. Only advances if the broadcast position + * is ahead of the current local position. + */ + public void updateFromBroadcast(final WriterId writerId, final WriterProgress writerProgress) { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + return; + } + synchronized (this) { + final ProgressKey incoming = new ProgressKey(writerId, writerProgress); + final WriterId incomingWriterId = incoming.toWriterId(regionId); + final WriterProgress currentWriterProgress = + getCommittedWriterProgressForWriter(incomingWriterId); + final ProgressKey current = new ProgressKey(incomingWriterId, currentWriterProgress); + if (incoming.compareTo(current) > 0) { + committedWriterPositions.put(incomingWriterId, incoming.toWriterProgress()); + syncPersistedProgress(); + } + } + } + + private void advanceCommitted(final ProgressKey key) { + final WriterId writerId = key.toWriterId(regionId); + if (Objects.isNull(writerId)) { + return; + } + committedWriterPositions.put(writerId, key.toWriterProgress()); + } + + private WriterProgress getCommittedWriterProgressForWriter(final WriterId writerId) { + return Objects.nonNull(writerId) + ? 
committedWriterPositions.getOrDefault(writerId, new WriterProgress(0L, -1L)) + : new WriterProgress(0L, -1L); + } + + private void stageCommittedAndAdvance(final ProgressKey key) { + committedPendingKeys.put(ProgressSlot.from(key), key); + final WriterId writerId = key.toWriterId(regionId); + if (Objects.isNull(writerId)) { + committedPendingKeys.remove(ProgressSlot.from(key)); + return; + } + ProgressKey current = + new ProgressKey(writerId, getCommittedWriterProgressForWriter(writerId)); + while (true) { + final ProgressKey nextCommitted = findNextCommittedKey(writerId, current); + if (Objects.isNull(nextCommitted)) { + return; + } + final ProgressKey nextOutstanding = findNextOutstandingKey(writerId, current); + if (Objects.nonNull(nextOutstanding) && nextOutstanding.compareTo(nextCommitted) < 0) { + return; + } + committedPendingKeys.remove(ProgressSlot.from(nextCommitted)); + advanceCommitted(nextCommitted); + current = nextCommitted; + } + } + + private void advanceCommittedIfAhead(final ProgressKey key) { + final WriterId writerId = key.toWriterId(regionId); + if (Objects.isNull(writerId)) { + return; + } + final WriterProgress currentWriterProgress = getCommittedWriterProgressForWriter(writerId); + final ProgressKey currentKey = new ProgressKey(writerId, currentWriterProgress); + if (key.compareTo(currentKey) > 0) { + advanceCommitted(key); + } + } + + private ProgressKey findNextCommittedKey(final WriterId writerId, final ProgressKey current) { + ProgressKey next = null; + for (final ProgressKey candidate : committedPendingKeys.values()) { + if (!sameWriter(writerId, candidate)) { + continue; + } + if (candidate.compareTo(current) <= 0) { + continue; + } + if (Objects.isNull(next) || candidate.compareTo(next) < 0) { + next = candidate; + } + } + return next; + } + + private ProgressKey findNextOutstandingKey(final WriterId writerId, final ProgressKey current) { + ProgressKey next = null; + for (final ProgressKey candidate : outstandingKeys.values()) { + if 
(!sameWriter(writerId, candidate)) { + continue; + } + if (candidate.compareTo(current) <= 0) { + continue; + } + if (Objects.isNull(next) || candidate.compareTo(next) < 0) { + next = candidate; + } + } + return next; + } + + private boolean sameWriter(final WriterId writerId, final ProgressKey key) { + return Objects.nonNull(writerId) + && writerId.getNodeId() == key.writerNodeId + && writerId.getWriterEpoch() == key.writerEpoch; + } + + private CommitOperationResult buildCommitOperationResult( + final WriterId writerId, final WriterProgress before, final WriterProgress after) { + if (Objects.isNull(writerId)) { + return CommitOperationResult.handledWithoutAdvance(); + } + final ProgressKey beforeKey = new ProgressKey(writerId, before); + final ProgressKey afterKey = new ProgressKey(writerId, after); + return afterKey.compareTo(beforeKey) > 0 + ? CommitOperationResult.handledWithAdvance(writerId, after) + : CommitOperationResult.handledWithoutAdvance(); + } + + private ProgressKey getDerivedCommittedFrontierKey() { + ProgressKey maxKey = null; + synchronized (this) { + for (final Map.Entry entry : + committedWriterPositions.entrySet()) { + final ProgressKey candidate = new ProgressKey(entry.getKey(), entry.getValue()); + if (Objects.isNull(maxKey) || candidate.compareTo(maxKey) > 0) { + maxKey = candidate; + } + } + } + return Objects.nonNull(maxKey) ? 
maxKey : new ProgressKey(0L, -1L, -1, 0L); + } + + private void syncPersistedProgress() { + progress.setCommittedRegionProgress( + new RegionProgress(new LinkedHashMap<>(committedWriterPositions))); + } + + public void serialize(final DataOutputStream stream) throws IOException { + synchronized (this) { + syncPersistedProgress(); + progress.serialize(stream); + } + } + + public static ConsensusSubscriptionCommitState deserialize( + final String regionId, final ByteBuffer buffer) { + final SubscriptionConsensusProgress progress = + SubscriptionConsensusProgress.deserialize(buffer); + return new ConsensusSubscriptionCommitState(regionId, progress); + } + } + + private static final class CommitOperationResult { + + private static final CommitOperationResult UNHANDLED = + new CommitOperationResult(false, null, null); + + private static final CommitOperationResult HANDLED_WITHOUT_ADVANCE = + new CommitOperationResult(true, null, null); + + private final boolean handled; + + private final WriterId advancedWriterId; + + private final WriterProgress advancedWriterProgress; + + private CommitOperationResult( + final boolean handled, + final WriterId advancedWriterId, + final WriterProgress advancedWriterProgress) { + this.handled = handled; + this.advancedWriterId = advancedWriterId; + this.advancedWriterProgress = advancedWriterProgress; + } + + private static CommitOperationResult unhandled() { + return UNHANDLED; + } + + private static CommitOperationResult handledWithoutAdvance() { + return HANDLED_WITHOUT_ADVANCE; + } + + private static CommitOperationResult handledWithAdvance( + final WriterId advancedWriterId, final WriterProgress advancedWriterProgress) { + return new CommitOperationResult(true, advancedWriterId, advancedWriterProgress); + } + + private boolean isHandled() { + return handled; + } + + private boolean hasAdvancedWriter() { + return Objects.nonNull(advancedWriterId) && Objects.nonNull(advancedWriterProgress); + } + + private WriterId 
getAdvancedWriterId() { + return advancedWriterId; + } + + private WriterProgress getAdvancedWriterProgress() { + return advancedWriterProgress; + } + } + + static final class ProgressSlot { + final int writerNodeId; + final long writerEpoch; + final long localSeq; + + private ProgressSlot(final int writerNodeId, final long writerEpoch, final long localSeq) { + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + this.localSeq = localSeq; + } + + static ProgressSlot of(final int writerNodeId, final long writerEpoch, final long localSeq) { + return new ProgressSlot(writerNodeId, writerEpoch, localSeq); + } + + static ProgressSlot from(final ProgressKey key) { + return new ProgressSlot(key.writerNodeId, key.writerEpoch, key.localSeq); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (!(o instanceof ProgressSlot)) { + return false; + } + final ProgressSlot that = (ProgressSlot) o; + return writerNodeId == that.writerNodeId + && writerEpoch == that.writerEpoch + && localSeq == that.localSeq; + } + + @Override + public int hashCode() { + return Objects.hash(writerNodeId, writerEpoch, localSeq); + } + + @Override + public String toString() { + return "(" + writerNodeId + "," + writerEpoch + "," + localSeq + ")"; + } + } + + // ======================== ProgressKey ======================== + + /** + * Comparable key for tracking commit progress: (physicalTime, localSeq). Physical time takes + * priority; within the same physical time, writer identity and local sequence determine order. + */ + static final class ProgressKey implements Comparable { + final long physicalTime; + final long localSeq; + final int writerNodeId; + final long writerEpoch; + + ProgressKey(final long physicalTime, final long localSeq) { + this(physicalTime, localSeq, -1, 0L); + } + + ProgressKey(final WriterId writerId, final WriterProgress writerProgress) { + this( + Objects.nonNull(writerProgress) ? 
writerProgress.getPhysicalTime() : 0L, + Objects.nonNull(writerProgress) ? writerProgress.getLocalSeq() : -1L, + Objects.nonNull(writerId) ? writerId.getNodeId() : -1, + Objects.nonNull(writerId) ? writerId.getWriterEpoch() : 0L); + } + + ProgressKey( + final long physicalTime, + final long localSeq, + final int writerNodeId, + final long writerEpoch) { + this.physicalTime = physicalTime; + this.localSeq = localSeq; + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + } + + ProgressKey resolveMissingFields(final WriterId writerId, final WriterProgress writerProgress) { + final long effectivePhysicalTime = + this.physicalTime > 0 + ? this.physicalTime + : Objects.nonNull(writerProgress) + ? writerProgress.getPhysicalTime() + : this.physicalTime; + final long effectiveLocalSeq = + this.localSeq >= 0 + ? this.localSeq + : Objects.nonNull(writerProgress) ? writerProgress.getLocalSeq() : this.localSeq; + final int effectiveWriterNodeId = + this.writerNodeId >= 0 + ? this.writerNodeId + : Objects.nonNull(writerId) ? writerId.getNodeId() : this.writerNodeId; + final long effectiveWriterEpoch = + this.writerEpoch > 0 + ? this.writerEpoch + : Objects.nonNull(writerId) ? writerId.getWriterEpoch() : this.writerEpoch; + if (effectivePhysicalTime == this.physicalTime + && effectiveLocalSeq == this.localSeq + && effectiveWriterNodeId == this.writerNodeId + && effectiveWriterEpoch == this.writerEpoch) { + return this; + } + return new ProgressKey( + effectivePhysicalTime, effectiveLocalSeq, effectiveWriterNodeId, effectiveWriterEpoch); + } + + WriterId toWriterId(final String regionId) { + return writerNodeId >= 0 ? 
new WriterId(regionId, writerNodeId, writerEpoch) : null; + } + + WriterProgress toWriterProgress() { + return new WriterProgress(physicalTime, localSeq); + } + + @Override + public int compareTo(final ProgressKey o) { + int cmp = Long.compare(physicalTime, o.physicalTime); + if (cmp != 0) { + return cmp; + } + cmp = Integer.compare(writerNodeId, o.writerNodeId); + if (cmp != 0) { + return cmp; + } + cmp = Long.compare(writerEpoch, o.writerEpoch); + if (cmp != 0) { + return cmp; + } + return Long.compare(localSeq, o.localSeq); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (!(o instanceof ProgressKey)) { + return false; + } + final ProgressKey that = (ProgressKey) o; + return physicalTime == that.physicalTime + && localSeq == that.localSeq + && writerNodeId == that.writerNodeId + && writerEpoch == that.writerEpoch; + } + + @Override + public int hashCode() { + return Objects.hash(physicalTime, localSeq, writerNodeId, writerEpoch); + } + + @Override + public String toString() { + return "(" + physicalTime + "," + writerNodeId + "," + writerEpoch + "," + localSeq + ")"; + } + } + + // ======================== Singleton ======================== + + private static class Holder { + private static final ConsensusSubscriptionCommitManager INSTANCE = + new ConsensusSubscriptionCommitManager(); + } + + public static ConsensusSubscriptionCommitManager getInstance() { + return Holder.INSTANCE; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java new file mode 100644 index 0000000000000..0d072e50f23c1 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java @@ -0,0 +1,694 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.consensus.DataRegionId; +import org.apache.iotdb.commons.pipe.datastructure.pattern.IoTDBTreePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.PrefixTreePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern; +import org.apache.iotdb.consensus.ConsensusFactory; +import org.apache.iotdb.consensus.IConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.SubscriptionWalRetentionPolicy; +import org.apache.iotdb.db.conf.IoTDBConfig; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.consensus.DataRegionConsensusImpl; +import org.apache.iotdb.db.storageengine.StorageEngine; +import 
org.apache.iotdb.db.storageengine.dataregion.DataRegion; +import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import org.apache.iotdb.rpc.subscription.config.TopicConfig; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Pattern; + +/** + * Handles setup and teardown of consensus-based subscription queues on DataNode. + * + *

For each consensus-mode topic subscribed by a consumer group, this handler discovers matching + * local IoTConsensus DataRegions, builds the appropriate log-to-tablet converter, and binds one + * queue per region to the consensus subscription broker. + */ +public class ConsensusSubscriptionSetupHandler { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionSetupHandler.class); + + private static final IoTDBConfig IOTDB_CONFIG = IoTDBDescriptor.getInstance().getConfig(); + + /** Last-known preferred writer node ID per region, used to detect routing changes. */ + private static final ConcurrentHashMap lastKnownPreferredWriter = + new ConcurrentHashMap<>(); + + /** + * Per-region routing runtime version. Uses the routing-broadcast timestamp from ConfigNode so all + * DataNodes derive the same ordering version for the same routing change without local + * persistence. + */ + private static final ConcurrentHashMap regionRuntimeVersion = + new ConcurrentHashMap<>(); + + /** Per-region active writer node IDs for subscription runtime control. */ + private static final ConcurrentHashMap> + regionActiveWriterNodeIds = new ConcurrentHashMap<>(); + + static RegionProgress resolveFallbackCommittedRegionProgress( + final ConsensusSubscriptionCommitManager commitManager, + final String consumerGroupId, + final String topicName, + final ConsensusGroupId groupId) { + commitManager.getOrCreateState(consumerGroupId, topicName, groupId); + final RegionProgress committedRegionProgress = + commitManager.getCommittedRegionProgress(consumerGroupId, topicName, groupId); + return committedRegionProgress != null + && !committedRegionProgress.getWriterPositions().isEmpty() + ? 
committedRegionProgress + : null; + } + + private ConsensusSubscriptionSetupHandler() { + // utility class + } + + /** + * Ensures that the IoTConsensus new-peer and peer-removed callbacks are set, so that when a new + * DataRegion is created, all active consensus subscriptions are automatically bound to the new + * region, and when a DataRegion is removed, all subscription queues are properly cleaned up. + */ + public static void ensureNewRegionListenerRegistered() { + if (IoTConsensus.onNewPeerCreated == null) { + IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated; + LOGGER.info( + "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding"); + } + if (IoTConsensus.onPeerRemoved == null) { + IoTConsensus.onPeerRemoved = ConsensusSubscriptionSetupHandler::onRegionRemoved; + LOGGER.info("Set IoTConsensus.onPeerRemoved callback for consensus subscription cleanup"); + } + } + + /** + * Callback invoked when a new DataRegion (IoTConsensusServerImpl) is created locally. Queries + * existing subscription metadata to find all active consensus subscriptions and binds prefetching + * queues to the new region. 
+ */ + private static void onNewRegionCreated( + final ConsensusGroupId groupId, final IoTConsensusServerImpl serverImpl) { + if (!(groupId instanceof DataRegionId)) { + return; + } + + // Query existing metadata keepers for all active subscriptions + final Map> allSubscriptions = + SubscriptionAgent.consumer().getAllSubscriptions(); + if (allSubscriptions.isEmpty()) { + return; + } + + final ConsensusSubscriptionCommitManager commitManager = + ConsensusSubscriptionCommitManager.getInstance(); + + LOGGER.info( + "New DataRegion {} created, checking {} consumer group(s) for auto-binding, " + + "currentSearchIndex={}", + groupId, + allSubscriptions.size(), + serverImpl.getSearchIndex()); + + for (final Map.Entry> groupEntry : allSubscriptions.entrySet()) { + final String consumerGroupId = groupEntry.getKey(); + for (final String topicName : groupEntry.getValue()) { + if (!isConsensusBasedTopic(topicName)) { + continue; + } + try { + final Map topicConfigs = + SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName)); + final TopicConfig topicConfig = topicConfigs.get(topicName); + if (topicConfig == null) { + continue; + } + + // Resolve the new DataRegion's actual database name + final DataRegion dataRegion = + StorageEngine.getInstance().getDataRegion((DataRegionId) groupId); + if (dataRegion == null) { + continue; + } + final String dbRaw = dataRegion.getDatabaseName(); + final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw; + + // For table topics, skip if this region's database doesn't match the topic filter. + if (!matchesTopicDatabase(topicConfig, dbTableModel)) { + continue; + } + + final String actualDbName = topicConfig.isTableTopic() ? 
dbTableModel : null; + final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); + final SubscriptionWalRetentionPolicy retentionPolicy = + buildSubscriptionWalRetentionPolicy(topicName, topicConfig, serverImpl); + + // Recover from persisted per-writer region progress when available. The queue will + // resolve a replay start from that progress on first poll via the region-level locator. + final RegionProgress committedRegionProgress = + resolveFallbackCommittedRegionProgress( + commitManager, consumerGroupId, topicName, groupId); + final boolean hasLocalPersistedState = + commitManager.hasPersistedState(consumerGroupId, topicName, groupId); + final long tailStartSearchIndex = serverImpl.getSearchIndex() + 1; + final long initialRuntimeVersion = + regionRuntimeVersion.getOrDefault(groupId.convertToTConsensusGroupId(), 0L); + final boolean initialActive = + lastKnownPreferredWriter.getOrDefault(groupId.convertToTConsensusGroupId(), -1) + == IOTDB_CONFIG.getDataNodeId(); + final Set initialActiveWriterNodeIds = + regionActiveWriterNodeIds.getOrDefault( + groupId.convertToTConsensusGroupId(), + initialActive + ? 
Collections.singleton(IOTDB_CONFIG.getDataNodeId()) + : Collections.emptySet()); + final ConsensusRegionRuntimeState initialRuntimeState = + new ConsensusRegionRuntimeState( + initialRuntimeVersion, + lastKnownPreferredWriter.getOrDefault(groupId.convertToTConsensusGroupId(), -1), + initialActive, + initialActiveWriterNodeIds); + + LOGGER.info( + "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} " + + "(database={}, tailStartSearchIndex={}, hasLocalPersistedState={}, " + + "committedRegionProgress={}, initialRuntimeVersion={}, initialActive={})", + topicName, + consumerGroupId, + groupId, + dbTableModel, + tailStartSearchIndex, + hasLocalPersistedState, + committedRegionProgress, + initialRuntimeVersion, + initialActive); + + SubscriptionAgent.broker() + .bindConsensusPrefetchingQueue( + consumerGroupId, + topicName, + topicConfig.getOrderMode(), + groupId, + serverImpl, + retentionPolicy, + converter, + commitManager, + committedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + SubscriptionAgent.broker().applyRuntimeStateForRegion(groupId, initialRuntimeState); + } catch (final Exception e) { + LOGGER.error( + "Failed to auto-bind topic [{}] in group [{}] to new region {}", + topicName, + consumerGroupId, + groupId, + e); + } + } + } + } + + /** + * Callback invoked before a DataRegion (IoTConsensusServerImpl) is deleted locally. Unbinds and + * cleans up all subscription prefetching queues associated with the removed region across all + * consumer groups. 
+ */ + private static void onRegionRemoved(final ConsensusGroupId groupId) { + if (!(groupId instanceof DataRegionId)) { + return; + } + lastKnownPreferredWriter.remove(groupId.convertToTConsensusGroupId()); + regionRuntimeVersion.remove(groupId.convertToTConsensusGroupId()); + regionActiveWriterNodeIds.remove(groupId.convertToTConsensusGroupId()); + LOGGER.info( + "DataRegion {} being removed, unbinding all consensus subscription queues", groupId); + try { + SubscriptionAgent.broker().unbindByRegion(groupId); + } catch (final Exception e) { + LOGGER.error( + "Failed to unbind consensus subscription queues for removed region {}", groupId, e); + } + } + + public static boolean isConsensusBasedTopic(final String topicName) { + try { + final String topicMode = SubscriptionAgent.topic().getTopicMode(topicName); + final boolean result = TopicConstant.MODE_CONSENSUS_VALUE.equalsIgnoreCase(topicMode); + LOGGER.debug( + "isConsensusBasedTopic check for topic [{}]: mode={}, result={}", + topicName, + topicMode, + result); + return result; + } catch (final Exception e) { + LOGGER.warn( + "Failed to check if topic [{}] is consensus-based, defaulting to false", topicName, e); + return false; + } + } + + public static void setupConsensusSubscriptions( + final String consumerGroupId, final Set topicNames) { + final IConsensus dataRegionConsensus = DataRegionConsensusImpl.getInstance(); + if (!(dataRegionConsensus instanceof IoTConsensus)) { + final String configuredProtocol = IOTDB_CONFIG.getDataRegionConsensusProtocolClass(); + final String runtimeConsensusImplementation = + Objects.nonNull(dataRegionConsensus) ? 
dataRegionConsensus.getClass().getName() : "null"; + LOGGER.warn( + "Skipping setup of consensus-based subscriptions for consumer group [{}] because " + + "mode=consensus only supports data_region_consensus_protocol_class={}, but " + + "current configured value is {} (runtime consensus implementation: {})", + consumerGroupId, + ConsensusFactory.IOT_CONSENSUS, + configuredProtocol, + runtimeConsensusImplementation); + return; + } + + // Ensure the new-region listener is registered (idempotent) + ensureNewRegionListenerRegistered(); + + final IoTConsensus ioTConsensus = (IoTConsensus) dataRegionConsensus; + final ConsensusSubscriptionCommitManager commitManager = + ConsensusSubscriptionCommitManager.getInstance(); + + LOGGER.info( + "Setting up consensus subscriptions for consumer group [{}], topics={}, " + + "total consensus groups={}", + consumerGroupId, + topicNames, + ioTConsensus.getAllConsensusGroupIds().size()); + + for (final String topicName : topicNames) { + if (!isConsensusBasedTopic(topicName)) { + continue; + } + + try { + setupConsensusQueueForTopic(consumerGroupId, topicName, ioTConsensus, commitManager); + } catch (final Exception e) { + LOGGER.error( + "Failed to set up consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId, + e); + } + } + } + + /** + * Sets up consensus queues for a single topic. + * + *

This method discovers local DataRegion consensus groups that match the topic filter and + * binds one consensus subscription queue to each matching region. + * + *

For table-model topics, only regions whose database matches the topic's {@code DATABASE_KEY} + * filter are bound. For tree-model topics, all local data regions are candidates. Additionally, + * the {@link #onNewRegionCreated} callback ensures that regions created after this method runs + * are also automatically bound. + */ + private static void setupConsensusQueueForTopic( + final String consumerGroupId, + final String topicName, + final IoTConsensus ioTConsensus, + final ConsensusSubscriptionCommitManager commitManager) { + final int myNodeId = IOTDB_CONFIG.getDataNodeId(); + + // Get topic config for building the converter + final Map topicConfigs = + SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName)); + final TopicConfig topicConfig = topicConfigs.get(topicName); + if (topicConfig == null) { + LOGGER.warn( + "Topic config not found for topic [{}], cannot set up consensus queue", topicName); + return; + } + + // Build the converter from the currently supported topic filters. 
+ LOGGER.info( + "Setting up consensus queue for topic [{}]: isTableTopic={}, orderMode={}, config={}", + topicName, + topicConfig.isTableTopic(), + topicConfig.getOrderMode(), + topicConfig.getAttribute()); + + final List allGroupIds = ioTConsensus.getAllConsensusGroupIds(); + LOGGER.info( + "Discovered {} consensus group(s) for topic [{}] in consumer group [{}]: {}", + allGroupIds.size(), + topicName, + consumerGroupId, + allGroupIds); + boolean bound = false; + + for (final ConsensusGroupId groupId : allGroupIds) { + if (!(groupId instanceof DataRegionId)) { + continue; + } + + final IoTConsensusServerImpl serverImpl = ioTConsensus.getImpl(groupId); + if (serverImpl == null) { + continue; + } + + // Resolve the DataRegion's actual database name + final DataRegion dataRegion = + StorageEngine.getInstance().getDataRegion((DataRegionId) groupId); + if (dataRegion == null) { + continue; + } + final String dbRaw = dataRegion.getDatabaseName(); + final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw; + + if (!matchesTopicDatabase(topicConfig, dbTableModel)) { + LOGGER.info( + "Skipping region {} (database={}) for table topic [{}] (DATABASE_KEY={})", + groupId, + dbTableModel, + topicName, + topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE)); + continue; + } + + final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null; + final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); + final SubscriptionWalRetentionPolicy retentionPolicy = + buildSubscriptionWalRetentionPolicy(topicName, topicConfig, serverImpl); + + // Recover from persisted per-writer region progress when available. The queue will resolve a + // replay start from that progress on first poll via the region-level locator. 
+ final RegionProgress committedRegionProgress = + resolveFallbackCommittedRegionProgress( + commitManager, consumerGroupId, topicName, groupId); + final boolean hasLocalPersistedState = + commitManager.hasPersistedState(consumerGroupId, topicName, groupId); + final long tailStartSearchIndex = serverImpl.getSearchIndex() + 1; + final long initialRuntimeVersion = + regionRuntimeVersion.getOrDefault(groupId.convertToTConsensusGroupId(), 0L); + final boolean initialActive = + lastKnownPreferredWriter.getOrDefault(groupId.convertToTConsensusGroupId(), -1) + == myNodeId; + final Set initialActiveWriterNodeIds = + regionActiveWriterNodeIds.getOrDefault( + groupId.convertToTConsensusGroupId(), + initialActive + ? Collections.singleton(IOTDB_CONFIG.getDataNodeId()) + : Collections.emptySet()); + final ConsensusRegionRuntimeState initialRuntimeState = + new ConsensusRegionRuntimeState( + initialRuntimeVersion, + lastKnownPreferredWriter.getOrDefault(groupId.convertToTConsensusGroupId(), -1), + initialActive, + initialActiveWriterNodeIds); + + LOGGER.info( + "Binding consensus prefetching queue for topic [{}] in consumer group [{}] " + + "to data region consensus group [{}] (database={}, tailStartSearchIndex={}, " + + "hasLocalPersistedState={}, committedRegionProgress={}, " + + "initialRuntimeVersion={}, initialActive={})", + topicName, + consumerGroupId, + groupId, + dbTableModel, + tailStartSearchIndex, + hasLocalPersistedState, + committedRegionProgress, + initialRuntimeVersion, + initialActive); + + SubscriptionAgent.broker() + .bindConsensusPrefetchingQueue( + consumerGroupId, + topicName, + topicConfig.getOrderMode(), + groupId, + serverImpl, + retentionPolicy, + converter, + commitManager, + committedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + + SubscriptionAgent.broker().applyRuntimeStateForRegion(groupId, initialRuntimeState); + + bound = true; + } + + if (!bound) { + LOGGER.warn( + "No local IoTConsensus data region found 
for topic [{}] in consumer group [{}]. " + + "Consensus subscription will be set up when a matching data region becomes available.", + topicName, + consumerGroupId); + } + } + + private static ConsensusLogToTabletConverter buildConverter( + final TopicConfig topicConfig, final String actualDatabaseName) { + // Determine tree or table model + final boolean isTableTopic = topicConfig.isTableTopic(); + + TreePattern treePattern = null; + TablePattern tablePattern = null; + + if (isTableTopic) { + // Table model: database + table name pattern + final String column = + topicConfig.getStringOrDefault( + TopicConstant.COLUMN_KEY, TopicConstant.COLUMN_DEFAULT_VALUE); + tablePattern = buildTablePattern(topicConfig); + final Pattern columnPattern = + TopicConstant.COLUMN_DEFAULT_VALUE.equals(column) ? null : Pattern.compile(column); + return new ConsensusLogToTabletConverter( + null, tablePattern, columnPattern, actualDatabaseName); + } else { + // Tree model: path or pattern + if (topicConfig.getAttribute().containsKey(TopicConstant.PATTERN_KEY)) { + final String pattern = topicConfig.getAttribute().get(TopicConstant.PATTERN_KEY); + treePattern = new PrefixTreePattern(pattern); + } else { + final String path = + topicConfig.getStringOrDefault( + TopicConstant.PATH_KEY, TopicConstant.PATH_DEFAULT_VALUE); + treePattern = new IoTDBTreePattern(path); + } + } + + return new ConsensusLogToTabletConverter(treePattern, tablePattern, null, actualDatabaseName); + } + + private static boolean matchesTopicDatabase( + final TopicConfig topicConfig, final String actualDatabaseName) { + return !topicConfig.isTableTopic() + || buildTablePattern(topicConfig).matchesDatabase(actualDatabaseName); + } + + private static TablePattern buildTablePattern(final TopicConfig topicConfig) { + return new TablePattern( + true, + topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE), + topicConfig.getStringOrDefault(TopicConstant.TABLE_KEY, 
TopicConstant.TABLE_DEFAULT_VALUE)); + } + + private static SubscriptionWalRetentionPolicy buildSubscriptionWalRetentionPolicy( + final String topicName, + final TopicConfig topicConfig, + final IoTConsensusServerImpl serverImpl) { + return new SubscriptionWalRetentionPolicy( + topicName, + resolveRetentionValue( + topicConfig, + TopicConstant.RETENTION_BYTES_KEY, + serverImpl.getConfig().getReplication().getSubscriptionWalRetentionSizeInBytes()), + resolveRetentionValue( + topicConfig, + TopicConstant.RETENTION_MS_KEY, + serverImpl.getConfig().getReplication().getSubscriptionWalRetentionTimeMs())); + } + + private static long resolveRetentionValue( + final TopicConfig topicConfig, final String key, final long defaultValue) { + if (!topicConfig.hasAttribute(key)) { + return normalizeRetentionValue(defaultValue); + } + final long parsedValue = Long.parseLong(topicConfig.getAttribute().get(key)); + if (parsedValue == 0 || parsedValue < SubscriptionWalRetentionPolicy.UNBOUNDED) { + throw new IllegalArgumentException( + String.format("Illegal %s=%s", key, topicConfig.getAttribute().get(key))); + } + return normalizeRetentionValue(parsedValue); + } + + private static long normalizeRetentionValue(final long retentionValue) { + return retentionValue <= 0 ? 
SubscriptionWalRetentionPolicy.UNBOUNDED : retentionValue; + } + + public static void teardownConsensusSubscriptions( + final String consumerGroupId, final Set topicNames) { + for (final String topicName : topicNames) { + try { + SubscriptionAgent.broker().unbindConsensusPrefetchingQueue(consumerGroupId, topicName); + + // Clean up commit state for all regions of this topic + ConsensusSubscriptionCommitManager.getInstance() + .removeAllStatesForTopic(consumerGroupId, topicName); + + LOGGER.info( + "Tore down consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId); + } catch (final Exception e) { + LOGGER.warn( + "Failed to tear down consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId, + e); + } + } + } + + public static void handleNewSubscriptions( + final String consumerGroupId, final Set newTopicNames) { + if (newTopicNames == null || newTopicNames.isEmpty()) { + return; + } + + LOGGER.info( + "Checking new subscriptions in consumer group [{}] for consensus-based topics: {}", + consumerGroupId, + newTopicNames); + + setupConsensusSubscriptions(consumerGroupId, newTopicNames); + } + + public static void applyRuntimeState( + final TConsensusGroupId groupId, final ConsensusRegionRuntimeState runtimeState) { + final int newPreferredNodeId = runtimeState.getPreferredWriterNodeId(); + final Integer oldPreferredBoxed = lastKnownPreferredWriter.put(groupId, newPreferredNodeId); + final int oldPreferredNodeId = (oldPreferredBoxed != null) ? 
oldPreferredBoxed : -1; + final ConsensusGroupId regionId = ConsensusGroupId.Factory.createFromTConsensusGroupId(groupId); + final long oldRuntimeVersion = regionRuntimeVersion.getOrDefault(groupId, 0L); + if (runtimeState.getRuntimeVersion() < oldRuntimeVersion) { + LOGGER.info( + "ConsensusSubscriptionSetupHandler: ignore stale runtime state for region {}, incomingRuntimeVersion={}, currentRuntimeVersion={}, runtimeState={}", + regionId, + runtimeState.getRuntimeVersion(), + oldRuntimeVersion, + runtimeState); + return; + } + regionRuntimeVersion.put(groupId, runtimeState.getRuntimeVersion()); + regionActiveWriterNodeIds.put(groupId, runtimeState.getActiveWriterNodeIds()); + LOGGER.info( + "ConsensusSubscriptionSetupHandler: applying runtime state for region {}, preferred writer {} -> {}, runtimeVersion {} -> {}, runtimeState={}", + regionId, + oldPreferredNodeId, + newPreferredNodeId, + oldRuntimeVersion, + runtimeState.getRuntimeVersion(), + runtimeState); + SubscriptionAgent.broker().applyRuntimeStateForRegion(regionId, runtimeState); + } + + public static void onRegionRouteChanged( + final Map newMap, final long routingTimestamp) { + final int myNodeId = IOTDB_CONFIG.getDataNodeId(); + + for (final Map.Entry newEntry : newMap.entrySet()) { + final TConsensusGroupId groupId = newEntry.getKey(); + final TRegionReplicaSet newReplicaSet = newEntry.getValue(); + + final int newPreferredNodeId = getPreferredNodeId(newReplicaSet); + final Integer oldPreferredBoxed = lastKnownPreferredWriter.put(groupId, newPreferredNodeId); + final int oldPreferredNodeId = (oldPreferredBoxed != null) ? 
oldPreferredBoxed : -1; + + if (oldPreferredNodeId == newPreferredNodeId) { + continue; + } + + final ConsensusGroupId regionId = + ConsensusGroupId.Factory.createFromTConsensusGroupId(groupId); + final long oldRuntimeVersion = regionRuntimeVersion.getOrDefault(groupId, 0L); + final long newRuntimeVersion = Math.max(routingTimestamp, oldRuntimeVersion); + regionRuntimeVersion.put(groupId, newRuntimeVersion); + + final LinkedHashSet activeWriterNodeIds = + new LinkedHashSet<>( + regionActiveWriterNodeIds.getOrDefault(groupId, Collections.emptySet())); + activeWriterNodeIds.add(newPreferredNodeId); + final Set runtimeActiveWriterNodeIds = + Collections.unmodifiableSet(activeWriterNodeIds); + regionActiveWriterNodeIds.put(groupId, runtimeActiveWriterNodeIds); + + final ConsensusRegionRuntimeState runtimeState = + new ConsensusRegionRuntimeState( + newRuntimeVersion, + newPreferredNodeId, + newPreferredNodeId == myNodeId, + runtimeActiveWriterNodeIds); + + LOGGER.info( + "ConsensusSubscriptionSetupHandler: region {} preferred writer changed {} -> {}, runtimeVersion {} -> {}, runtimeState={} (route hint)", + regionId, + oldPreferredNodeId, + newPreferredNodeId, + oldRuntimeVersion, + newRuntimeVersion, + runtimeState); + + SubscriptionAgent.broker().applyRuntimeStateForRegion(regionId, runtimeState); + } + } + + private static int getPreferredNodeId(final TRegionReplicaSet replicaSet) { + final List locations = replicaSet.getDataNodeLocations(); + if (locations == null || locations.isEmpty()) { + return -1; + } + return locations.get(0).getDataNodeId(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/PrefetchRoundResult.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/PrefetchRoundResult.java new file mode 100644 index 0000000000000..71066b4875e06 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/PrefetchRoundResult.java 
@@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +public final class PrefetchRoundResult { + + public enum Type { + RESCHEDULE_NOW, + RESCHEDULE_LATER, + DORMANT + } + + private static final PrefetchRoundResult RESCHEDULE_NOW = + new PrefetchRoundResult(Type.RESCHEDULE_NOW, 0L); + + private static final PrefetchRoundResult DORMANT = new PrefetchRoundResult(Type.DORMANT, 0L); + + private final Type type; + private final long delayMs; + + private PrefetchRoundResult(final Type type, final long delayMs) { + this.type = type; + this.delayMs = delayMs; + } + + public static PrefetchRoundResult rescheduleNow() { + return RESCHEDULE_NOW; + } + + public static PrefetchRoundResult rescheduleAfter(final long delayMs) { + return new PrefetchRoundResult(Type.RESCHEDULE_LATER, Math.max(1L, delayMs)); + } + + public static PrefetchRoundResult dormant() { + return DORMANT; + } + + public Type getType() { + return type; + } + + public long getDelayMs() { + return delayMs; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIterator.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIterator.java new file mode 100644 index 0000000000000..c6a83f52df15b --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIterator.java @@ -0,0 +1,522 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.consensus.common.request.IConsensusRequest; +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.consensus.common.request.IoTConsensusRequest; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntryType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALInfoEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.ProgressWALReader; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALFileVersion; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Closeable; +import java.io.EOFException; +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Set; + +/** + * Writer-based WAL iterator for the new subscription progress model. + * + *

This iterator reads writer-local ordering metadata from WAL footer arrays instead of relying + * on the entry body to carry complete subscription ordering information. + */ +public class ProgressWALIterator implements Closeable { + + private static final Logger LOGGER = LoggerFactory.getLogger(ProgressWALIterator.class); + + private static final int SEARCH_INDEX_OFFSET = + WALInfoEntry.FIXED_SERIALIZED_SIZE + PlanNodeType.BYTES; + private static final long HEADER_ONLY_WAL_FILE_BYTES = + Math.max( + WALFileVersion.V2.getVersionBytes().length, WALFileVersion.V3.getVersionBytes().length); + + private final File logDirectory; + private final long startSearchIndex; + private final WALNode liveWalNode; + private File[] walFiles; + private int currentFileIndex = -1; + private ProgressWALReader currentReader; + private long currentReaderVersionId = -1L; + private boolean currentReaderUsesLiveSnapshot = false; + private int consumedEntryCountInCurrentFile = 0; + private final Set skippedBrokenWalVersionIds = new HashSet<>(); + private IOException lastError; + private boolean incompleteScan = false; + private String incompleteScanDetail; + + private long pendingSearchIndex = Long.MIN_VALUE; + private long pendingLocalSeq = Long.MIN_VALUE; + private long pendingPhysicalTime; + private int pendingNodeId; + private long pendingWriterEpoch; + private final List pendingRequests = new ArrayList<>(); + + private IndexedConsensusRequest nextReady; + + public ProgressWALIterator(final File logDirectory) { + this(logDirectory, Long.MIN_VALUE); + } + + public ProgressWALIterator(final File logDirectory, final long startSearchIndex) { + this(logDirectory, startSearchIndex, null); + } + + public ProgressWALIterator(final WALNode liveWalNode) { + this(liveWalNode, Long.MIN_VALUE); + } + + public ProgressWALIterator(final WALNode liveWalNode, final long startSearchIndex) { + this(liveWalNode.getLogDirectory(), startSearchIndex, liveWalNode); + } + + private ProgressWALIterator( + final 
File logDirectory, final long startSearchIndex, final WALNode liveWalNode) { + this.logDirectory = logDirectory; + this.startSearchIndex = startSearchIndex; + this.liveWalNode = liveWalNode; + refreshFileList(); + } + + private void refreshFileList() { + final File[] discoveredWalFiles = WALFileUtils.listAllWALFiles(logDirectory); + if (discoveredWalFiles == null) { + walFiles = new File[0]; + return; + } + WALFileUtils.ascSortByVersionId(discoveredWalFiles); + final List filteredWalFiles = new ArrayList<>(discoveredWalFiles.length); + for (int i = 0; i < discoveredWalFiles.length; i++) { + final File walFile = discoveredWalFiles[i]; + final boolean isLastWalFile = i == discoveredWalFiles.length - 1; + if (!isLastWalFile && shouldSkipWalFile(walFile)) { + continue; + } + filteredWalFiles.add(walFile); + } + walFiles = filteredWalFiles.toArray(new File[0]); + } + + private boolean shouldSkipWalFile(final File walFile) { + final long versionId = WALFileUtils.parseVersionId(walFile.getName()); + return skippedBrokenWalVersionIds.contains(versionId) || isHeaderOnlyWalFile(walFile); + } + + private boolean isHeaderOnlyWalFile(final File walFile) { + return walFile.length() <= HEADER_ONLY_WAL_FILE_BYTES; + } + + public void refresh() { + final long currentVersionId = + (currentFileIndex >= 0 && currentFileIndex < walFiles.length) + ? 
WALFileUtils.parseVersionId(walFiles[currentFileIndex].getName()) + : -1; + + refreshFileList(); + + if (currentVersionId >= 0) { + currentFileIndex = -1; + for (int i = 0; i < walFiles.length; i++) { + if (WALFileUtils.parseVersionId(walFiles[i].getName()) >= currentVersionId) { + currentFileIndex = i; + break; + } + } + if (currentFileIndex < 0) { + currentFileIndex = walFiles.length; + } + } + } + + public boolean hasNext() { + if (nextReady != null) { + return true; + } + try { + nextReady = advance(); + if (nextReady != null) { + lastError = null; + } + } catch (IOException e) { + lastError = e; + LOGGER.warn("ProgressWALIterator: error reading WAL", e); + return false; + } + return nextReady != null; + } + + public IndexedConsensusRequest next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + final IndexedConsensusRequest result = nextReady; + nextReady = null; + return result; + } + + public boolean hasReadError() { + return lastError != null; + } + + public IOException getLastError() { + return lastError; + } + + public boolean hasSkippedBrokenWalFiles() { + return !skippedBrokenWalVersionIds.isEmpty(); + } + + public boolean hasIncompleteScan() { + return incompleteScan || hasReadError() || hasSkippedBrokenWalFiles(); + } + + public String getIncompleteScanDetail() { + if (incompleteScanDetail != null) { + return incompleteScanDetail; + } + if (lastError != null) { + return lastError.getMessage(); + } + if (!skippedBrokenWalVersionIds.isEmpty()) { + return "encountered broken retained WAL files during replay scan"; + } + return "replay scan did not complete"; + } + + @Override + public void close() throws IOException { + closeCurrentReader(); + nextReady = null; + pendingRequests.clear(); + pendingSearchIndex = Long.MIN_VALUE; + pendingLocalSeq = Long.MIN_VALUE; + lastError = null; + incompleteScan = false; + incompleteScanDetail = null; + resetCurrentFileTracking(); + } + + private IndexedConsensusRequest advance() throws IOException { + 
while (true) { + if (currentReader != null && currentReader.hasNext()) { + try { + final ByteBuffer buffer = currentReader.next(); + consumedEntryCountInCurrentFile = currentReader.getCurrentEntryIndex() + 1; + final WALEntryType type = WALEntryType.valueOf(buffer.get()); + buffer.clear(); + if (!type.needSearch()) { + continue; + } + + final long localSeq = currentReader.getCurrentEntryLocalSeq(); + final long physicalTime = currentReader.getCurrentEntryPhysicalTime(); + final int nodeId = currentReader.getCurrentEntryNodeId(); + final long writerEpoch = currentReader.getCurrentEntryWriterEpoch(); + + buffer.position(SEARCH_INDEX_OFFSET); + final long bodySearchIndex = buffer.getLong(); + buffer.clear(); + + if (isSamePendingRequest(localSeq, nodeId, writerEpoch)) { + if (pendingSearchIndex < 0 && bodySearchIndex >= 0) { + pendingSearchIndex = bodySearchIndex; + } + pendingRequests.add(new IoTConsensusRequest(buffer)); + continue; + } + + final IndexedConsensusRequest flushed = flushPending(); + startPending(bodySearchIndex, localSeq, physicalTime, nodeId, writerEpoch, buffer); + if (flushed != null && !shouldSkip(flushed)) { + return flushed; + } + continue; + } catch (final EOFException eofException) { + if (!currentReaderUsesLiveSnapshot) { + throw eofException; + } + // Live snapshot metadata may get ahead of the bytes currently visible in the file. Treat + // EOF as "this snapshot is exhausted for now" instead of terminating the iterator. 
+ final IndexedConsensusRequest flushed = flushPending(); + if (flushed != null && !shouldSkip(flushed)) { + closeCurrentReader(); + return flushed; + } + if (reopenLiveSnapshotReader()) { + continue; + } + return null; + } + } + + if (currentReaderUsesLiveSnapshot) { + final IndexedConsensusRequest flushed = flushPending(); + if (flushed != null && !shouldSkip(flushed)) { + return flushed; + } + if (reopenLiveSnapshotReader()) { + continue; + } + return null; + } + + if (currentReader != null) { + closeCurrentReader(); + final IndexedConsensusRequest flushed = flushPending(); + resetCurrentFileTracking(); + if (flushed != null && !shouldSkip(flushed)) { + return flushed; + } + continue; + } + + if (!openNextReader()) { + final IndexedConsensusRequest flushed = flushPending(); + if (flushed != null && !shouldSkip(flushed)) { + return flushed; + } + return null; + } + } + } + + private boolean openNextReader() throws IOException { + while (++currentFileIndex < walFiles.length) { + if (openReaderAtIndex(currentFileIndex, 0)) { + return true; + } + } + return false; + } + + private boolean reopenLiveSnapshotReader() throws IOException { + if (liveWalNode == null || currentReaderVersionId < 0) { + return false; + } + + closeCurrentReader(); + refresh(); + + final long currentLiveVersionId = liveWalNode.getCurrentWALFileVersion(); + if (currentLiveVersionId == currentReaderVersionId) { + final WALMetaData snapshot = liveWalNode.getCurrentWALMetaDataSnapshot(); + if (snapshot.getBuffersSize().size() <= consumedEntryCountInCurrentFile) { + return false; + } + final int fileIndex = findFileIndexByVersion(currentReaderVersionId); + if (fileIndex < 0) { + return false; + } + return openReaderAtIndex(fileIndex, consumedEntryCountInCurrentFile); + } + + final int previousFileIndex = findFileIndexByVersion(currentReaderVersionId); + if (previousFileIndex < 0) { + return openFirstReaderAfterVersion(currentReaderVersionId); + } + if (openReaderAtIndex(previousFileIndex, 
consumedEntryCountInCurrentFile)) { + return true; + } + return openFirstReaderAfterVersion(currentReaderVersionId); + } + + private boolean openReaderAtIndex(final int fileIndex, final int skipEntries) throws IOException { + return openReaderAtIndex(fileIndex, skipEntries, true); + } + + private boolean openReaderAtIndex( + final int fileIndex, final int skipEntries, final boolean allowNearLiveRetry) + throws IOException { + final File walFile = walFiles[fileIndex]; + final long versionId = WALFileUtils.parseVersionId(walFile.getName()); + final boolean useLiveSnapshot = + liveWalNode != null && versionId == liveWalNode.getCurrentWALFileVersion(); + + try { + final ProgressWALReader reader = + useLiveSnapshot + ? new ProgressWALReader(walFile, liveWalNode.getCurrentWALMetaDataSnapshot()) + : new ProgressWALReader(walFile); + if (!skipEntries(reader, skipEntries)) { + reader.close(); + markIncompleteScan( + String.format( + "failed to reopen WAL file %s at entry offset %s: iterator could not skip to the requested position", + walFile.getName(), skipEntries), + null); + resetCurrentFileTracking(); + return false; + } + currentReader = reader; + currentFileIndex = fileIndex; + currentReaderVersionId = versionId; + currentReaderUsesLiveSnapshot = useLiveSnapshot; + consumedEntryCountInCurrentFile = skipEntries; + return true; + } catch (final IOException e) { + if (isNearLiveWalVersion(versionId)) { + LOGGER.debug( + "ProgressWALIterator: failed to open near-live WAL file {}, retrying without blacklisting", + walFile.getName(), + e); + if (allowNearLiveRetry) { + refresh(); + final int refreshedIndex = findFileIndexByVersion(versionId); + if (refreshedIndex >= 0) { + if (openReaderAtIndex(refreshedIndex, skipEntries, false)) { + return true; + } + } + } + markIncompleteScan( + String.format( + "failed to open near-live WAL file %s while replay scan was still in progress", + walFile.getName()), + e); + return false; + } + skippedBrokenWalVersionIds.add(versionId); + 
LOGGER.warn( + "ProgressWALIterator: failed to open WAL file {}, skipping", walFile.getName(), e); + return false; + } + } + + private boolean skipEntries(final ProgressWALReader reader, final int skipEntries) + throws IOException { + int skipped = 0; + while (skipped < skipEntries) { + if (!reader.hasNext()) { + return false; + } + reader.next(); + skipped++; + } + return true; + } + + private int findFileIndexByVersion(final long versionId) { + for (int i = 0; i < walFiles.length; i++) { + if (WALFileUtils.parseVersionId(walFiles[i].getName()) == versionId) { + return i; + } + } + return -1; + } + + private boolean openFirstReaderAfterVersion(final long versionId) throws IOException { + for (int i = 0; i < walFiles.length; i++) { + if (WALFileUtils.parseVersionId(walFiles[i].getName()) > versionId + && openReaderAtIndex(i, 0)) { + return true; + } + } + resetCurrentFileTracking(); + return false; + } + + private boolean isNearLiveWalVersion(final long versionId) { + if (liveWalNode == null) { + return false; + } + return versionId >= Math.max(0L, liveWalNode.getCurrentWALFileVersion() - 1L); + } + + private boolean isSamePendingRequest( + final long localSeq, final int nodeId, final long writerEpoch) { + return !pendingRequests.isEmpty() + && pendingLocalSeq == localSeq + && pendingNodeId == nodeId + && pendingWriterEpoch == writerEpoch; + } + + private void startPending( + final long searchIndex, + final long localSeq, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final ByteBuffer buffer) { + pendingSearchIndex = searchIndex; + pendingLocalSeq = localSeq; + pendingPhysicalTime = physicalTime; + pendingNodeId = nodeId; + pendingWriterEpoch = writerEpoch; + pendingRequests.clear(); + pendingRequests.add(new IoTConsensusRequest(buffer)); + } + + private IndexedConsensusRequest flushPending() { + if (pendingRequests.isEmpty()) { + return null; + } + final IndexedConsensusRequest result = + new IndexedConsensusRequest( + 
pendingSearchIndex, pendingLocalSeq, new ArrayList<>(pendingRequests)); + result + .setPhysicalTime(pendingPhysicalTime) + .setNodeId(pendingNodeId) + .setWriterEpoch(pendingWriterEpoch); + pendingRequests.clear(); + pendingSearchIndex = Long.MIN_VALUE; + pendingLocalSeq = Long.MIN_VALUE; + return result; + } + + private boolean shouldSkip(final IndexedConsensusRequest request) { + return request.getSearchIndex() >= 0 && request.getSearchIndex() < startSearchIndex; + } + + private void closeCurrentReader() throws IOException { + if (currentReader != null) { + currentReader.close(); + currentReader = null; + } + } + + private void resetCurrentFileTracking() { + currentReaderVersionId = -1L; + currentReaderUsesLiveSnapshot = false; + consumedEntryCountInCurrentFile = 0; + } + + private void markIncompleteScan(final String detail, final IOException cause) { + incompleteScan = true; + if (incompleteScanDetail == null) { + incompleteScanDetail = detail; + } + if (lastError == null && cause != null) { + lastError = cause; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java new file mode 100644 index 0000000000000..c39f17b86b1db --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Persisted commit metadata for a single (consumerGroup, topic, region) combination. + * + *

This object stores the committed per-writer region frontier plus the persistence throttling + * counter. + */ +public class SubscriptionConsensusProgress { + + private volatile RegionProgress committedRegionProgress; + + private final AtomicLong commitIndex; + + public SubscriptionConsensusProgress() { + this(new RegionProgress(Collections.emptyMap()), 0L); + } + + public SubscriptionConsensusProgress( + final RegionProgress committedRegionProgress, final long commitIndex) { + this.committedRegionProgress = normalize(committedRegionProgress); + this.commitIndex = new AtomicLong(commitIndex); + } + + public RegionProgress getCommittedRegionProgress() { + return committedRegionProgress; + } + + public void setCommittedRegionProgress(final RegionProgress committedRegionProgress) { + this.committedRegionProgress = normalize(committedRegionProgress); + } + + public WriterId getCommittedWriterId() { + return getDerivedCommittedWriterState().writerId; + } + + public WriterProgress getCommittedWriterProgress() { + return getDerivedCommittedWriterState().writerProgress; + } + + public long getCommitIndex() { + return commitIndex.get(); + } + + public void incrementCommitIndex() { + commitIndex.incrementAndGet(); + } + + public void serialize(final DataOutputStream stream) throws IOException { + committedRegionProgress.serialize(stream); + ReadWriteIOUtils.write(commitIndex.get(), stream); + } + + public static SubscriptionConsensusProgress deserialize(final ByteBuffer buffer) { + final RegionProgress committedRegionProgress = RegionProgress.deserialize(buffer); + final long commitIndex = ReadWriteIOUtils.readLong(buffer); + return new SubscriptionConsensusProgress(committedRegionProgress, commitIndex); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final SubscriptionConsensusProgress that = (SubscriptionConsensusProgress) o; + return commitIndex.get() == 
that.commitIndex.get() + && Objects.equals(committedRegionProgress, that.committedRegionProgress); + } + + @Override + public int hashCode() { + return Objects.hash(committedRegionProgress, commitIndex.get()); + } + + @Override + public String toString() { + return "SubscriptionConsensusProgress{" + + "committedRegionProgress=" + + committedRegionProgress + + ", commitIndex=" + + commitIndex.get() + + '}'; + } + + private static RegionProgress normalize(final RegionProgress committedRegionProgress) { + if (Objects.isNull(committedRegionProgress) + || committedRegionProgress.getWriterPositions().isEmpty()) { + return new RegionProgress(Collections.emptyMap()); + } + final Map normalized = new LinkedHashMap<>(); + for (final Map.Entry entry : + committedRegionProgress.getWriterPositions().entrySet()) { + if (Objects.nonNull(entry.getKey()) && Objects.nonNull(entry.getValue())) { + normalized.put(entry.getKey(), entry.getValue()); + } + } + return new RegionProgress(normalized); + } + + private DerivedCommittedWriterState getDerivedCommittedWriterState() { + WriterId bestWriterId = null; + WriterProgress bestWriterProgress = null; + for (final Map.Entry entry : + committedRegionProgress.getWriterPositions().entrySet()) { + if (Objects.isNull(bestWriterProgress) + || compareWriterProgress(entry.getValue(), bestWriterProgress) > 0 + || (compareWriterProgress(entry.getValue(), bestWriterProgress) == 0 + && compareWriterId(entry.getKey(), bestWriterId) > 0)) { + bestWriterId = entry.getKey(); + bestWriterProgress = entry.getValue(); + } + } + return new DerivedCommittedWriterState( + bestWriterId, + Objects.nonNull(bestWriterProgress) ? 
bestWriterProgress : new WriterProgress(0L, -1L)); + } + + private static int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } + + private static int compareWriterId(final WriterId leftWriterId, final WriterId rightWriterId) { + if (Objects.isNull(leftWriterId) && Objects.isNull(rightWriterId)) { + return 0; + } + if (Objects.isNull(leftWriterId)) { + return -1; + } + if (Objects.isNull(rightWriterId)) { + return 1; + } + int cmp = Integer.compare(leftWriterId.getNodeId(), rightWriterId.getNodeId()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftWriterId.getWriterEpoch(), rightWriterId.getWriterEpoch()); + } + + private static final class DerivedCommittedWriterState { + + private final WriterId writerId; + + private final WriterProgress writerProgress; + + private DerivedCommittedWriterState( + final WriterId writerId, final WriterProgress writerProgress) { + this.writerId = writerId; + this.writerProgress = writerProgress; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java index dfadee5908fa5..eba81238316ed 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java @@ -47,7 +47,6 @@ import java.util.concurrent.atomic.AtomicLong; import static com.google.common.base.MoreObjects.toStringHelper; -import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; public class SubscriptionEvent implements Comparable { @@ -71,6 +70,9 @@ public 
class SubscriptionEvent implements Comparable { private volatile SubscriptionCommitContext rootCommitContext; private static final long NACK_COUNT_REPORT_THRESHOLD = 3; + + private static final long POISON_MESSAGE_NACK_THRESHOLD = 10; + private final AtomicLong nackCount = new AtomicLong(); /** @@ -159,16 +161,15 @@ public void recordCommittedTimestamp() { } public boolean isCommitted() { - if (commitContext.getCommitId() == INVALID_COMMIT_ID) { - // event with invalid commit id is committed + if (!commitContext.isCommittable()) { + // fire-and-forget events are treated as already committed return true; } return committedTimestamp.get() != INVALID_TIMESTAMP; } public boolean isCommittable() { - if (commitContext.getCommitId() == INVALID_COMMIT_ID) { - // event with invalid commit id is uncommittable + if (!commitContext.isCommittable()) { return false; } return response.isCommittable(); @@ -248,6 +249,15 @@ public void nack() { } } + /** Returns the current nack count for this event. */ + public long getNackCount() { + return nackCount.get(); + } + + public boolean isPoisoned() { + return nackCount.get() >= POISON_MESSAGE_NACK_THRESHOLD; + } + public void recordLastPolledConsumerId(final String consumerId) { lastPolledConsumerId = consumerId; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/ConsensusSubscriptionPrefetchingQueueMetrics.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/ConsensusSubscriptionPrefetchingQueueMetrics.java new file mode 100644 index 0000000000000..ecf79360237b7 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/ConsensusSubscriptionPrefetchingQueueMetrics.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.metric; + +import org.apache.iotdb.commons.service.metric.enums.Metric; +import org.apache.iotdb.commons.service.metric.enums.Tag; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue; +import org.apache.iotdb.metrics.AbstractMetricService; +import org.apache.iotdb.metrics.metricsets.IMetricSet; +import org.apache.iotdb.metrics.type.Rate; +import org.apache.iotdb.metrics.utils.MetricLevel; +import org.apache.iotdb.metrics.utils.MetricType; + +import com.google.common.collect.ImmutableSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; + +public class ConsensusSubscriptionPrefetchingQueueMetrics implements IMetricSet { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionPrefetchingQueueMetrics.class); + + private volatile AbstractMetricService metricService; + + private final Map queueMap = new ConcurrentHashMap<>(); + + private final Map rateMap = new ConcurrentHashMap<>(); + + @Override + public void bindTo(final AbstractMetricService metricService) { + this.metricService = metricService; + final ImmutableSet ids = ImmutableSet.copyOf(queueMap.keySet()); + for (final String id : ids) { + createMetrics(id); + } + } + + @Override + public void unbindFrom(final 
AbstractMetricService metricService) { + final ImmutableSet ids = ImmutableSet.copyOf(queueMap.keySet()); + for (final String id : ids) { + deregister(id); + } + if (!queueMap.isEmpty()) { + LOGGER.warn( + "Failed to unbind from consensus subscription prefetching queue metrics, queue map not empty"); + } + } + + //////////////////////////// register & deregister //////////////////////////// + + public void register(final ConsensusPrefetchingQueue queue) { + final String id = queue.getPrefetchingQueueId(); + queueMap.putIfAbsent(id, queue); + if (Objects.nonNull(metricService)) { + createMetrics(id); + } + } + + private void createMetrics(final String id) { + createAutoGauge(id); + createRate(id); + } + + private void createAutoGauge(final String id) { + final ConsensusPrefetchingQueue queue = queueMap.get(id); + if (Objects.isNull(queue)) { + return; + } + metricService.createAutoGauge( + Metric.SUBSCRIPTION_UNCOMMITTED_EVENT_COUNT.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getSubscriptionUncommittedEventCount, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + // Keep the legacy metric name for dashboard compatibility, but expose seek generation here. 
+ metricService.createAutoGauge( + Metric.SUBSCRIPTION_CURRENT_COMMIT_ID.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getCurrentSeekGeneration, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.createAutoGauge( + Metric.SUBSCRIPTION_CONSENSUS_LAG.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getLag, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.createAutoGauge( + Metric.SUBSCRIPTION_CONSENSUS_WAL_GAP.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getWalGapSkippedEntries, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.createAutoGauge( + Metric.SUBSCRIPTION_CONSENSUS_ROUTING_EPOCH_CHANGE.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getEpochChangeCount, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.createAutoGauge( + Metric.SUBSCRIPTION_CONSENSUS_WATERMARK.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getMaxObservedTimestamp, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + } + + private void createRate(final String id) { + final ConsensusPrefetchingQueue queue = queueMap.get(id); + if (Objects.isNull(queue)) { + return; + } + rateMap.put( + id, + metricService.getOrCreateRate( + Metric.SUBSCRIPTION_EVENT_TRANSFER.toString(), + MetricLevel.IMPORTANT, + Tag.NAME.toString(), + queue.getPrefetchingQueueId())); + } + + public void deregister(final String id) { + if (!queueMap.containsKey(id)) { + LOGGER.warn( + "Failed to deregister consensus subscription prefetching queue metrics, " + + "ConsensusPrefetchingQueue({}) does not exist", + id); + return; + } + if (Objects.nonNull(metricService)) { + removeMetrics(id); + } + queueMap.remove(id); + } + + private void removeMetrics(final String id) { + removeAutoGauge(id); + removeRate(id); + } + + private void removeAutoGauge(final String id) { + final 
ConsensusPrefetchingQueue queue = queueMap.get(id); + if (Objects.isNull(queue)) { + return; + } + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_UNCOMMITTED_EVENT_COUNT.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CURRENT_COMMIT_ID.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CONSENSUS_LAG.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CONSENSUS_WAL_GAP.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CONSENSUS_ROUTING_EPOCH_CHANGE.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CONSENSUS_WATERMARK.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + } + + private void removeRate(final String id) { + final ConsensusPrefetchingQueue queue = queueMap.get(id); + if (Objects.isNull(queue)) { + return; + } + metricService.remove( + MetricType.RATE, + Metric.SUBSCRIPTION_EVENT_TRANSFER.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + } + + public void mark(final String id, final long size) { + if (Objects.isNull(metricService)) { + return; + } + final Rate rate = rateMap.get(id); + if (rate == null) { + LOGGER.warn( + "Failed to mark transfer event rate, ConsensusPrefetchingQueue({}) does not exist", id); + return; + } + rate.mark(size); + } + + //////////////////////////// singleton //////////////////////////// + + private static class Holder { + + private static final ConsensusSubscriptionPrefetchingQueueMetrics INSTANCE = + new ConsensusSubscriptionPrefetchingQueueMetrics(); + + private Holder() {} + } + + public static 
ConsensusSubscriptionPrefetchingQueueMetrics getInstance() { + return Holder.INSTANCE; + } + + private ConsensusSubscriptionPrefetchingQueueMetrics() {} +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/SubscriptionMetrics.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/SubscriptionMetrics.java index 48a6dc50e6d43..29de59ddf3266 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/SubscriptionMetrics.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/SubscriptionMetrics.java @@ -29,11 +29,13 @@ public class SubscriptionMetrics implements IMetricSet { @Override public void bindTo(final AbstractMetricService metricService) { SubscriptionPrefetchingQueueMetrics.getInstance().bindTo(metricService); + ConsensusSubscriptionPrefetchingQueueMetrics.getInstance().bindTo(metricService); } @Override public void unbindFrom(final AbstractMetricService metricService) { SubscriptionPrefetchingQueueMetrics.getInstance().unbindFrom(metricService); + ConsensusSubscriptionPrefetchingQueueMetrics.getInstance().unbindFrom(metricService); } //////////////////////////// singleton //////////////////////////// diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java index bfcbbaf850f7a..8ad014ae8c0bc 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java @@ -38,7 +38,9 @@ import org.apache.iotdb.db.protocol.client.ConfigNodeInfo; import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; import org.apache.iotdb.db.subscription.broker.SubscriptionPrefetchingQueue; +import 
org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.db.subscription.metric.ConsensusSubscriptionPrefetchingQueueMetrics; import org.apache.iotdb.db.subscription.metric.SubscriptionPrefetchingQueueMetrics; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -54,6 +56,8 @@ import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollRequest; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollRequestType; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponse; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeCloseReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeCommitReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeHandshakeReq; @@ -61,6 +65,7 @@ import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribePollReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeRequestType; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeRequestVersion; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSubscribeReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeUnsubscribeReq; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeCloseResp; @@ -70,6 +75,7 @@ import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribePollResp; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeResponseType; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeResponseVersion; +import 
org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeSeekResp; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeSubscribeResp; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeUnsubscribeResp; import org.apache.iotdb.service.rpc.thrift.TPipeSubscribeReq; @@ -85,6 +91,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -103,10 +110,10 @@ public class SubscriptionReceiverV1 implements SubscriptionReceiver { private static final IClientManager CONFIG_NODE_CLIENT_MANAGER = ConfigNodeClientManager.getInstance(); - private static final TPipeSubscribeResp SUBSCRIPTION_MISSING_CUSTOMER_RESP = + private static final TPipeSubscribeResp SUBSCRIPTION_MISSING_CONSUMER_RESP = new TPipeSubscribeResp( RpcUtils.getStatus( - TSStatusCode.SUBSCRIPTION_MISSING_CUSTOMER, + TSStatusCode.SUBSCRIPTION_MISSING_CONSUMER, "Missing consumer config, please handshake first."), PipeSubscribeResponseVersion.VERSION_1.getVersion(), PipeSubscribeResponseType.ACK.getType()); @@ -145,6 +152,8 @@ public final TPipeSubscribeResp handle(final TPipeSubscribeReq req) { return handlePipeSubscribeCommit(PipeSubscribeCommitReq.fromTPipeSubscribeReq(req)); case CLOSE: return handlePipeSubscribeClose(PipeSubscribeCloseReq.fromTPipeSubscribeReq(req)); + case SEEK: + return handlePipeSubscribeSeek(PipeSubscribeSeekReq.fromTPipeSubscribeReq(req)); default: break; } @@ -321,7 +330,7 @@ private TPipeSubscribeResp handlePipeSubscribeHeartbeatInternal( if (Objects.isNull(consumerConfig)) { LOGGER.warn( "Subscription: missing consumer config when handling PipeSubscribeHeartbeatReq: {}", req); - return SUBSCRIPTION_MISSING_CUSTOMER_RESP; + return SUBSCRIPTION_MISSING_CONSUMER_RESP; } // TODO: do something @@ -400,7 +409,7 @@ private TPipeSubscribeResp handlePipeSubscribeSubscribeInternal( if (Objects.isNull(consumerConfig)) { 
LOGGER.warn( "Subscription: missing consumer config when handling PipeSubscribeSubscribeReq: {}", req); - return SUBSCRIPTION_MISSING_CUSTOMER_RESP; + return SUBSCRIPTION_MISSING_CONSUMER_RESP; } // subscribe topics @@ -442,7 +451,7 @@ private TPipeSubscribeResp handlePipeSubscribeUnsubscribeInternal( LOGGER.warn( "Subscription: missing consumer config when handling PipeSubscribeUnsubscribeReq: {}", req); - return SUBSCRIPTION_MISSING_CUSTOMER_RESP; + return SUBSCRIPTION_MISSING_CONSUMER_RESP; } // unsubscribe topics @@ -483,7 +492,7 @@ private TPipeSubscribeResp handlePipeSubscribePollInternal(final PipeSubscribePo if (Objects.isNull(consumerConfig)) { LOGGER.warn( "Subscription: missing consumer config when handling PipeSubscribePollReq: {}", req); - return SUBSCRIPTION_MISSING_CUSTOMER_RESP; + return SUBSCRIPTION_MISSING_CONSUMER_RESP; } final List events; @@ -498,7 +507,10 @@ private TPipeSubscribeResp handlePipeSubscribePollInternal(final PipeSubscribePo case POLL: events = handlePipeSubscribePollRequest( - consumerConfig, (PollPayload) request.getPayload(), maxBytes); + consumerConfig, + (PollPayload) request.getPayload(), + maxBytes, + request.getProgressByTopic()); break; case POLL_FILE: events = @@ -562,17 +574,33 @@ private TPipeSubscribeResp handlePipeSubscribePollInternal(final PipeSubscribePo } totalSize.getAndAdd(size); - SubscriptionPrefetchingQueueMetrics.getInstance() - .mark( - SubscriptionPrefetchingQueue.generatePrefetchingQueueId( - commitContext.getConsumerGroupId(), commitContext.getTopicName()), - size); + final String queueId = + SubscriptionPrefetchingQueue.generatePrefetchingQueueId( + commitContext.getConsumerGroupId(), commitContext.getTopicName()); + if (ConsensusSubscriptionSetupHandler.isConsensusBasedTopic( + commitContext.getTopicName())) { + ConsensusSubscriptionPrefetchingQueueMetrics.getInstance() + .mark(queueId, size); + } else { + SubscriptionPrefetchingQueueMetrics.getInstance().mark(queueId, size); + } 
event.invalidateCurrentResponseByteBuffer(); - LOGGER.info( - "Subscription: consumer {} poll {} successfully with request: {}", - consumerConfig, - response, - req.getRequest()); + if (response.getResponseType() + == SubscriptionPollResponseType.WATERMARK.getType() + || response.getResponseType() + == SubscriptionPollResponseType.TABLETS.getType()) { + LOGGER.debug( + "Subscription: consumer {} poll {} successfully with request: {}", + consumerConfig, + response, + req.getRequest()); + } else { + LOGGER.info( + "Subscription: consumer {} poll {} successfully with request: {}", + consumerConfig, + response, + req.getRequest()); + } return byteBuffer; } catch (final Exception e) { final boolean isOutdated = @@ -610,7 +638,10 @@ private TPipeSubscribeResp handlePipeSubscribePollInternal(final PipeSubscribePo } private List handlePipeSubscribePollRequest( - final ConsumerConfig consumerConfig, final PollPayload messagePayload, final long maxBytes) { + final ConsumerConfig consumerConfig, + final PollPayload messagePayload, + final long maxBytes, + final Map progressByTopic) { final Set subscribedTopicNames = SubscriptionAgent.consumer() .getTopicNamesSubscribedByConsumer( @@ -622,7 +653,7 @@ private List handlePipeSubscribePollRequest( // filter unsubscribed topics topicNames.removeIf((topicName) -> !subscribedTopicNames.contains(topicName)); - return SubscriptionAgent.broker().poll(consumerConfig, topicNames, maxBytes); + return SubscriptionAgent.broker().poll(consumerConfig, topicNames, maxBytes, progressByTopic); } private List handlePipeSubscribePollTsFileRequest( @@ -658,7 +689,7 @@ private TPipeSubscribeResp handlePipeSubscribeCommitInternal(final PipeSubscribe if (Objects.isNull(consumerConfig)) { LOGGER.warn( "Subscription: missing consumer config when handling PipeSubscribeCommitReq: {}", req); - return SUBSCRIPTION_MISSING_CUSTOMER_RESP; + return SUBSCRIPTION_MISSING_CONSUMER_RESP; } // commit (ack or nack) @@ -669,22 +700,90 @@ private TPipeSubscribeResp 
handlePipeSubscribeCommitInternal(final PipeSubscribe if (Objects.equals(successfulCommitContexts.size(), commitContexts.size())) { LOGGER.info( - "Subscription: consumer {} commit (nack: {}) successfully, commit contexts: {}", + "Subscription: consumer {} commit (nack: {}) successfully, summary: {}", consumerConfig, nack, - commitContexts); + summarizeCommitContexts(commitContexts)); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug( + "Subscription: consumer {} commit (nack: {}) full commit contexts: {}", + consumerConfig, + nack, + commitContexts); + } } else { LOGGER.warn( - "Subscription: consumer {} commit (nack: {}) partially successful, commit contexts: {}, successful commit contexts: {}", + "Subscription: consumer {} commit (nack: {}) partially successful, requested summary: {}, successful summary: {}", consumerConfig, nack, - commitContexts, - successfulCommitContexts); + summarizeCommitContexts(commitContexts), + summarizeCommitContexts(successfulCommitContexts)); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug( + "Subscription: consumer {} commit (nack: {}) full requested commit contexts: {}, full successful commit contexts: {}", + consumerConfig, + nack, + commitContexts, + successfulCommitContexts); + } } return PipeSubscribeCommitResp.toTPipeSubscribeResp(RpcUtils.SUCCESS_STATUS); } + private static String summarizeCommitContexts( + final List commitContexts) { + if (Objects.isNull(commitContexts) || commitContexts.isEmpty()) { + return "count=0"; + } + + long minLocalSeq = Long.MAX_VALUE; + long maxLocalSeq = Long.MIN_VALUE; + long minPhysicalTime = Long.MAX_VALUE; + long maxPhysicalTime = Long.MIN_VALUE; + final Set regionIds = new LinkedHashSet<>(); + final Set topicNames = new LinkedHashSet<>(); + + for (final SubscriptionCommitContext commitContext : commitContexts) { + if (Objects.isNull(commitContext)) { + continue; + } + topicNames.add(commitContext.getTopicName()); + regionIds.add(commitContext.getRegionId()); + + final long localSeq = 
commitContext.getLocalSeq(); + minLocalSeq = Math.min(minLocalSeq, localSeq); + maxLocalSeq = Math.max(maxLocalSeq, localSeq); + + final long physicalTime = commitContext.getPhysicalTime(); + minPhysicalTime = Math.min(minPhysicalTime, physicalTime); + maxPhysicalTime = Math.max(maxPhysicalTime, physicalTime); + } + + return String.format( + "count=%d, topics=%s, regions=%s, localSeqRange=%s, physicalTimeRange=%s", + commitContexts.size(), + summarizeStringSet(topicNames, 2), + summarizeStringSet(regionIds, 4), + minLocalSeq == Long.MAX_VALUE ? "N/A" : "[" + minLocalSeq + ", " + maxLocalSeq + "]", + minPhysicalTime == Long.MAX_VALUE + ? "N/A" + : "[" + minPhysicalTime + ", " + maxPhysicalTime + "]"); + } + + private static String summarizeStringSet(final Set values, final int maxDisplayCount) { + if (Objects.isNull(values) || values.isEmpty()) { + return "[]"; + } + + final List displayValues = + values.stream().limit(maxDisplayCount).collect(Collectors.toList()); + if (values.size() <= maxDisplayCount) { + return displayValues.toString(); + } + return displayValues + "...(" + values.size() + " total)"; + } + private TPipeSubscribeResp handlePipeSubscribeClose(final PipeSubscribeCloseReq req) { try { return handlePipeSubscribeCloseInternal(req); @@ -705,7 +804,7 @@ private TPipeSubscribeResp handlePipeSubscribeCloseInternal(final PipeSubscribeC if (Objects.isNull(consumerConfig)) { LOGGER.warn( "Subscription: missing consumer config when handling PipeSubscribeCloseReq: {}", req); - return SUBSCRIPTION_MISSING_CUSTOMER_RESP; + return SUBSCRIPTION_MISSING_CONSUMER_RESP; } closeConsumer(consumerConfig); @@ -715,6 +814,71 @@ private TPipeSubscribeResp handlePipeSubscribeCloseInternal(final PipeSubscribeC return PipeSubscribeCloseResp.toTPipeSubscribeResp(RpcUtils.SUCCESS_STATUS); } + private TPipeSubscribeResp handlePipeSubscribeSeek(final PipeSubscribeSeekReq req) { + try { + return handlePipeSubscribeSeekInternal(req); + } catch (final Exception e) { + 
LOGGER.warn("Exception occurred when seeking with request {}", req, e); + final String exceptionMessage = + String.format( + "Subscription: something unexpected happened when seeking with request %s: %s", + req, e); + return PipeSubscribeSeekResp.toTPipeSubscribeResp( + RpcUtils.getStatus(TSStatusCode.SUBSCRIPTION_SEEK_ERROR, exceptionMessage)); + } + } + + private TPipeSubscribeResp handlePipeSubscribeSeekInternal(final PipeSubscribeSeekReq req) { + // check consumer config thread local + final ConsumerConfig consumerConfig = consumerConfigThreadLocal.get(); + if (Objects.isNull(consumerConfig)) { + LOGGER.warn( + "Subscription: missing consumer config when handling PipeSubscribeSeekReq: {}", req); + return SUBSCRIPTION_MISSING_CONSUMER_RESP; + } + + final String topicName = req.getTopicName(); + final short seekType = req.getSeekType(); + + if (seekType == PipeSubscribeSeekReq.SEEK_TO_TOPIC_PROGRESS) { + SubscriptionAgent.broker() + .seekToTopicProgress(consumerConfig, topicName, req.getTopicProgress()); + LOGGER.info( + "Subscription: consumer {} seek topic {} to topicProgress(regionCount={})", + consumerConfig, + topicName, + req.getTopicProgress().getRegionProgress().size()); + } else if (seekType == PipeSubscribeSeekReq.SEEK_AFTER_TOPIC_PROGRESS) { + SubscriptionAgent.broker() + .seekAfterTopicProgress(consumerConfig, topicName, req.getTopicProgress()); + LOGGER.info( + "Subscription: consumer {} seekAfter topic {} to topicProgress(regionCount={})", + consumerConfig, + topicName, + req.getTopicProgress().getRegionProgress().size()); + } else if (seekType == PipeSubscribeSeekReq.SEEK_TO_BEGINNING + || seekType == PipeSubscribeSeekReq.SEEK_TO_END) { + SubscriptionAgent.broker().seek(consumerConfig, topicName, seekType); + LOGGER.info( + "Subscription: consumer {} seek topic {} with seekType={}", + consumerConfig, + topicName, + seekType); + } else { + final String errorMessage = + String.format( + "Subscription: unsupported seekType %s for topic %s. 
" + + "Consensus subscription only supports seekToBeginning, seekToEnd, " + + "seek(topicProgress), and seekAfter(topicProgress).", + seekType, topicName); + LOGGER.warn(errorMessage); + return PipeSubscribeSeekResp.toTPipeSubscribeResp( + RpcUtils.getStatus(TSStatusCode.SUBSCRIPTION_SEEK_ERROR, errorMessage)); + } + + return PipeSubscribeSeekResp.toTPipeSubscribeResp(RpcUtils.SUCCESS_STATUS); + } + private void closeConsumer(final ConsumerConfig consumerConfig) { // unsubscribe all subscribed topics final Set topicNames = diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutor.java new file mode 100644 index 0000000000000..660de3770cd7d --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutor.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.task.execution; + +import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; +import org.apache.iotdb.commons.concurrent.ThreadName; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.db.subscription.task.subtask.ConsensusPrefetchSubtask; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +public class ConsensusSubscriptionPrefetchExecutor { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionPrefetchExecutor.class); + + private static final AtomicInteger ID_GENERATOR = new AtomicInteger(0); + + private final String workerThreadName; + private final String schedulerThreadName; + private final int workerThreadNum; + + private final BlockingQueue readyQueue = new LinkedBlockingQueue<>(); + private final Map taskIdToSubtask = new ConcurrentHashMap<>(); + private final AtomicBoolean shutdown = new AtomicBoolean(false); + + private final ExecutorService workerPool; + private final ScheduledExecutorService delayedScheduler; + + public ConsensusSubscriptionPrefetchExecutor() { + final int executorId = ID_GENERATOR.getAndIncrement(); + this.workerThreadNum = + Math.max( + 1, + SubscriptionConfig.getInstance() + .getSubscriptionConsensusPrefetchExecutorMaxThreadNum()); + this.workerThreadName = + ThreadName.SUBSCRIPTION_CONSENSUS_PREFETCH_EXECUTOR_POOL.getName() + "-" + executorId; + this.schedulerThreadName = + ThreadName.SUBSCRIPTION_CONSENSUS_PREFETCH_SCHEDULER.getName() + "-" + executorId; + this.workerPool = 
IoTDBThreadPoolFactory.newFixedThreadPool(workerThreadNum, workerThreadName); + this.delayedScheduler = + IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor(schedulerThreadName); + + for (int i = 0; i < workerThreadNum; i++) { + workerPool.submit(this::workerLoop); + } + } + + public synchronized boolean register(final ConsensusPrefetchSubtask subtask) { + if (shutdown.get()) { + LOGGER.warn( + "Consensus prefetch executor is shutdown, skip registering {}", subtask.getTaskId()); + return false; + } + if (taskIdToSubtask.putIfAbsent(subtask.getTaskId(), subtask) != null) { + LOGGER.warn("Consensus prefetch subtask {} is already registered", subtask.getTaskId()); + return false; + } + subtask.bindExecutor(this); + return true; + } + + public synchronized void deregister(final String taskId) { + final ConsensusPrefetchSubtask subtask = taskIdToSubtask.remove(taskId); + if (subtask == null) { + return; + } + readyQueue.remove(subtask); + subtask.cancelPendingExecution(); + subtask.close(); + } + + public void enqueue(final ConsensusPrefetchSubtask subtask) { + if (shutdown.get() || subtask.isClosed()) { + return; + } + readyQueue.offer(subtask); + } + + public void schedule( + final ConsensusPrefetchSubtask subtask, final long delayMs, final long delayedToken) { + if (shutdown.get() || subtask.isClosed()) { + return; + } + delayedScheduler.schedule( + () -> { + if (!shutdown.get()) { + subtask.fireScheduledWakeup(delayedToken); + } + }, + delayMs, + TimeUnit.MILLISECONDS); + } + + public synchronized void shutdown() { + if (!shutdown.compareAndSet(false, true)) { + return; + } + + for (final ConsensusPrefetchSubtask subtask : taskIdToSubtask.values()) { + readyQueue.remove(subtask); + subtask.cancelPendingExecution(); + subtask.close(); + } + taskIdToSubtask.clear(); + readyQueue.clear(); + + delayedScheduler.shutdownNow(); + workerPool.shutdownNow(); + } + + public boolean isShutdown() { + return shutdown.get(); + } + + private void workerLoop() { + try { + while 
(!shutdown.get() && !Thread.currentThread().isInterrupted()) { + final ConsensusPrefetchSubtask subtask = readyQueue.take(); + if (subtask.isClosed()) { + continue; + } + subtask.runOneRound(); + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + } catch (final Throwable t) { + LOGGER.error("Consensus prefetch worker loop exits abnormally", t); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutorManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutorManager.java new file mode 100644 index 0000000000000..9362a38a58b7e --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutorManager.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.task.execution; + +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; + +public class ConsensusSubscriptionPrefetchExecutorManager { + + private volatile ConsensusSubscriptionPrefetchExecutor executor; + private volatile boolean started = false; + + private ConsensusSubscriptionPrefetchExecutorManager() { + // singleton + } + + public synchronized void start() { + if (!SubscriptionConfig.getInstance().getSubscriptionEnabled()) { + started = false; + return; + } + started = true; + if (executor == null || executor.isShutdown()) { + executor = new ConsensusSubscriptionPrefetchExecutor(); + } + } + + public synchronized ConsensusSubscriptionPrefetchExecutor getExecutor() { + if (!started || !SubscriptionConfig.getInstance().getSubscriptionEnabled()) { + return null; + } + if (executor == null || executor.isShutdown()) { + executor = new ConsensusSubscriptionPrefetchExecutor(); + } + return executor; + } + + public synchronized void stop() { + started = false; + if (executor != null) { + executor.shutdown(); + executor = null; + } + } + + public boolean isStarted() { + return started; + } + + private static class Holder { + private static final ConsensusSubscriptionPrefetchExecutorManager INSTANCE = + new ConsensusSubscriptionPrefetchExecutorManager(); + } + + public static ConsensusSubscriptionPrefetchExecutorManager getInstance() { + return Holder.INSTANCE; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/ConsensusPrefetchSubtask.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/ConsensusPrefetchSubtask.java new file mode 100644 index 0000000000000..79997bb7405a1 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/ConsensusPrefetchSubtask.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.task.subtask; + +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue; +import org.apache.iotdb.db.subscription.broker.consensus.PrefetchRoundResult; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutor; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ConsensusPrefetchSubtask { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusPrefetchSubtask.class); + + private final String taskId; + private final ConsensusPrefetchingQueue queue; + private final Object monitor = new Object(); + + private ConsensusSubscriptionPrefetchExecutor executor; + + private boolean scheduledOrRunning = false; + private boolean running = false; + private boolean wakeupPending = false; + private boolean closed = false; + private long delayedWakeToken = 0L; + + public ConsensusPrefetchSubtask(final ConsensusPrefetchingQueue queue) { + this.queue = queue; + this.taskId = queue.getPrefetchingQueueId() + "_" + queue.getConsensusGroupId(); + } + + public String getTaskId() { + return taskId; + } + + public void bindExecutor(final ConsensusSubscriptionPrefetchExecutor executor) { + this.executor = executor; + } + + public void 
requestWakeupNow() { + final ConsensusSubscriptionPrefetchExecutor currentExecutor = executor; + if (currentExecutor == null) { + return; + } + + boolean shouldEnqueue = false; + synchronized (monitor) { + if (closed) { + return; + } + delayedWakeToken++; + if (scheduledOrRunning) { + wakeupPending = true; + return; + } + scheduledOrRunning = true; + shouldEnqueue = true; + } + + if (shouldEnqueue) { + currentExecutor.enqueue(this); + } + } + + public void scheduleWakeupAfter(final long delayMs) { + final ConsensusSubscriptionPrefetchExecutor currentExecutor = executor; + if (currentExecutor == null) { + return; + } + + long delayedToken; + synchronized (monitor) { + if (closed || scheduledOrRunning || wakeupPending) { + return; + } + delayedToken = ++delayedWakeToken; + } + currentExecutor.schedule(this, delayMs, delayedToken); + } + + public void runOneRound() { + PrefetchRoundResult result = PrefetchRoundResult.dormant(); + + synchronized (monitor) { + if (closed) { + scheduledOrRunning = false; + monitor.notifyAll(); + return; + } + running = true; + } + + try { + result = queue.drivePrefetchOnce(); + } catch (final Throwable t) { + LOGGER.error( + "ConsensusPrefetchSubtask {}: unexpected error while driving queue {}", taskId, queue, t); + result = PrefetchRoundResult.rescheduleAfter(100L); + } + + boolean shouldEnqueue = false; + Long delayedWakeMs = null; + long delayedToken = 0L; + synchronized (monitor) { + running = false; + if (closed) { + scheduledOrRunning = false; + monitor.notifyAll(); + return; + } + + if (wakeupPending) { + wakeupPending = false; + shouldEnqueue = true; + } else { + switch (result.getType()) { + case RESCHEDULE_NOW: + shouldEnqueue = true; + break; + case RESCHEDULE_LATER: + delayedToken = ++delayedWakeToken; + delayedWakeMs = result.getDelayMs(); + scheduledOrRunning = false; + break; + case DORMANT: + default: + scheduledOrRunning = false; + break; + } + } + + if (shouldEnqueue) { + scheduledOrRunning = true; + } + 
monitor.notifyAll(); + } + + final ConsensusSubscriptionPrefetchExecutor currentExecutor = executor; + if (currentExecutor == null) { + return; + } + if (shouldEnqueue) { + currentExecutor.enqueue(this); + } else if (delayedWakeMs != null) { + currentExecutor.schedule(this, delayedWakeMs, delayedToken); + } + } + + public void fireScheduledWakeup(final long delayedToken) { + final ConsensusSubscriptionPrefetchExecutor currentExecutor = executor; + if (currentExecutor == null) { + return; + } + + boolean shouldEnqueue = false; + synchronized (monitor) { + if (closed || delayedWakeToken != delayedToken || scheduledOrRunning) { + return; + } + scheduledOrRunning = true; + shouldEnqueue = true; + } + + if (shouldEnqueue) { + currentExecutor.enqueue(this); + } + } + + public void cancelPendingExecution() { + synchronized (monitor) { + delayedWakeToken++; + wakeupPending = false; + if (scheduledOrRunning && !running) { + scheduledOrRunning = false; + } + monitor.notifyAll(); + } + } + + public void awaitIdle() { + synchronized (monitor) { + while (running || scheduledOrRunning) { + try { + monitor.wait(50L); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } + } + + public void close() { + synchronized (monitor) { + closed = true; + delayedWakeToken++; + wakeupPending = false; + if (!running) { + scheduledOrRunning = false; + monitor.notifyAll(); + return; + } + while (scheduledOrRunning) { + try { + monitor.wait(50L); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } + } + + public boolean isClosed() { + synchronized (monitor) { + return closed; + } + } + + public boolean isScheduledOrRunning() { + synchronized (monitor) { + return scheduledOrRunning; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtask.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtask.java index 2ca332263b52b..7b67f79e62291 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtask.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtask.java @@ -22,6 +22,7 @@ import org.apache.iotdb.commons.pipe.agent.task.connection.UnboundedBlockingPendingQueue; import org.apache.iotdb.db.pipe.agent.task.subtask.sink.PipeSinkSubtask; import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.pipe.api.PipeConnector; import org.apache.iotdb.pipe.api.event.Event; @@ -77,11 +78,27 @@ protected void registerCallbackHookAfterSubmit(final ListenableFuture f Futures.addCallback(future, this, subtaskCallbackListeningExecutor); } + @Override + public synchronized void onSuccess(final Boolean hasAtLeastOneEventProcessed) { + isSubmitted = false; + if (isConsensusDrivenTopic()) { + return; + } + super.onSuccess(hasAtLeastOneEventProcessed); + } + @Override public synchronized void onFailure(final Throwable throwable) { isSubmitted = false; - // just resubmit + if (isConsensusDrivenTopic()) { + LOGGER.warn( + "SubscriptionSinkSubtask for consensus topic [{}] failed unexpectedly, skip auto-resubmit", + topicName, + throwable); + return; + } + submitSelf(); } @@ -91,6 +108,14 @@ protected boolean executeOnce() { return false; } + if (isConsensusDrivenTopic()) { + return false; + } + return SubscriptionAgent.broker().executePrefetch(consumerGroupId, topicName); } + + private boolean isConsensusDrivenTopic() { + return ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName); + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtaskLifeCycle.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtaskLifeCycle.java index 98163697374da..95dcba88b8f5a 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtaskLifeCycle.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtaskLifeCycle.java @@ -24,6 +24,7 @@ import org.apache.iotdb.db.pipe.agent.task.subtask.sink.PipeSinkSubtask; import org.apache.iotdb.db.pipe.agent.task.subtask.sink.PipeSinkSubtaskLifeCycle; import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.pipe.api.event.Event; import org.slf4j.Logger; @@ -48,8 +49,10 @@ public synchronized void register() { } if (registeredTaskCount == 0) { - // bind prefetching queue - SubscriptionAgent.broker().bindPrefetchingQueue((SubscriptionSinkSubtask) subtask); + if (!ConsensusSubscriptionSetupHandler.isConsensusBasedTopic( + ((SubscriptionSinkSubtask) subtask).getTopicName())) { + SubscriptionAgent.broker().bindPrefetchingQueue((SubscriptionSinkSubtask) subtask); + } executor.register(subtask); runningTaskCount = 0; } @@ -97,6 +100,8 @@ public synchronized void close() { // when dropping the subscription. 
final String consumerGroupId = ((SubscriptionSinkSubtask) subtask).getConsumerGroupId(); final String topicName = ((SubscriptionSinkSubtask) subtask).getTopicName(); - SubscriptionAgent.broker().unbindPrefetchingQueue(consumerGroupId, topicName); + if (!ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { + SubscriptionAgent.broker().unbindPrefetchingQueue(consumerGroupId, topicName); + } } } diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReaderTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReaderTest.java new file mode 100644 index 0000000000000..09c44534b82fa --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReaderTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.storageengine.dataregion.wal.io; + +import org.junit.Test; + +import java.io.File; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class ProgressWALReaderTest { + + @Test + public void testReadWriterProgressMetadataFromV3Wal() throws Exception { + Path dir = Files.createTempDirectory("progress-wal-reader"); + File walFile = dir.resolve("test.wal").toFile(); + + try { + try (WALWriter writer = new WALWriter(walFile, WALFileVersion.V3)) { + writer.write( + entryBuffer((byte) 1, (byte) 2, (byte) 3), + singleEntryMeta(3, 10L, 1L, 10000L, 1, 2L, 10L)); + writer.write( + entryBuffer((byte) 4, (byte) 5), singleEntryMeta(2, 11L, 1L, 10010L, 1, 2L, 11L)); + writer.write( + entryBuffer((byte) 6, (byte) 7, (byte) 8, (byte) 9), + singleEntryMeta(4, 12L, 2L, 20000L, 4, 1L, 1L)); + } + + try (ProgressWALReader reader = new ProgressWALReader(walFile)) { + assertTrue(reader.hasNext()); + assertArrayEquals(new byte[] {1, 2, 3}, reader.next().array()); + assertEquals(0, reader.getCurrentEntryIndex()); + assertEquals(10000L, reader.getCurrentEntryPhysicalTime()); + assertEquals(1, reader.getCurrentEntryNodeId()); + assertEquals(2L, reader.getCurrentEntryWriterEpoch()); + assertEquals(10L, reader.getCurrentEntryLocalSeq()); + + assertTrue(reader.hasNext()); + assertArrayEquals(new byte[] {4, 5}, reader.next().array()); + assertEquals(1, reader.getCurrentEntryIndex()); + assertEquals(10010L, reader.getCurrentEntryPhysicalTime()); + assertEquals(1, reader.getCurrentEntryNodeId()); + assertEquals(2L, reader.getCurrentEntryWriterEpoch()); + assertEquals(11L, reader.getCurrentEntryLocalSeq()); + + assertTrue(reader.hasNext()); + assertArrayEquals(new byte[] {6, 7, 8, 9}, reader.next().array()); + assertEquals(2, reader.getCurrentEntryIndex()); + assertEquals(20000L, 
reader.getCurrentEntryPhysicalTime()); + assertEquals(4, reader.getCurrentEntryNodeId()); + assertEquals(1L, reader.getCurrentEntryWriterEpoch()); + assertEquals(1L, reader.getCurrentEntryLocalSeq()); + } + } finally { + Files.deleteIfExists(walFile.toPath()); + Files.deleteIfExists(dir); + } + } + + private static ByteBuffer entryBuffer(byte... bytes) { + ByteBuffer buffer = ByteBuffer.allocate(bytes.length); + buffer.put(bytes); + return buffer; + } + + private static WALMetaData singleEntryMeta( + int size, + long searchIndex, + long memTableId, + long physicalTime, + int nodeId, + long writerEpoch, + long localSeq) { + WALMetaData metaData = new WALMetaData(); + metaData.add(size, searchIndex, memTableId, physicalTime, nodeId, writerEpoch, localSeq); + return metaData; + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaDataV3CompatibilityTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaDataV3CompatibilityTest.java new file mode 100644 index 0000000000000..49c4a2cd7dc57 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaDataV3CompatibilityTest.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iotdb.db.storageengine.dataregion.wal.io; + +import org.junit.Test; + +import java.nio.ByteBuffer; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** Tests for WALMetaData V3 serialization/deserialization roundtrip and V2 file handling. */ +public class WALMetaDataV3CompatibilityTest { + + @Test + public void testV3RoundTrip() { + WALMetaData original = new WALMetaData(); + + original.add(100, 10, 1, 10000L, 1, 2L, 10L); + original.add(200, 11, 1, 10010L, 1, 2L, 11L); + original.add(150, 12, 1, 10020L, 1, 2L, 12L); + original.add(300, 13, 2, 20000L, 4, 1L, 1L); + original.add(250, 14, 2, 20010L, 1, 2L, 14L); + + original.updateTimestampRange(1600000000000L); + original.updateTimestampRange(1600000001000L); + + int size = original.serializedSize(WALFileVersion.V3); + ByteBuffer buffer = ByteBuffer.allocate(size); + original.serialize(buffer, WALFileVersion.V3); + buffer.flip(); + + WALMetaData deserialized = WALMetaData.deserialize(buffer, WALFileVersion.V3); + + assertEquals(10, deserialized.getFirstSearchIndex()); + assertEquals(5, deserialized.getBuffersSize().size()); + assertEquals(Integer.valueOf(100), deserialized.getBuffersSize().get(0)); + assertEquals(Integer.valueOf(200), deserialized.getBuffersSize().get(1)); + assertEquals(Integer.valueOf(150), deserialized.getBuffersSize().get(2)); + assertEquals(Integer.valueOf(300), deserialized.getBuffersSize().get(3)); + assertEquals(Integer.valueOf(250), deserialized.getBuffersSize().get(4)); + + assertTrue(deserialized.getMemTablesId().contains(1L)); + assertTrue(deserialized.getMemTablesId().contains(2L)); + + assertEquals(1600000000000L, deserialized.getMinDataTs()); + assertEquals(1600000001000L, deserialized.getMaxDataTs()); + + assertEquals(5, deserialized.getPhysicalTimes().size()); + assertEquals(Long.valueOf(10000L), 
deserialized.getPhysicalTimes().get(0)); + assertEquals(Long.valueOf(10010L), deserialized.getPhysicalTimes().get(1)); + assertEquals(Long.valueOf(10020L), deserialized.getPhysicalTimes().get(2)); + assertEquals(Long.valueOf(20000L), deserialized.getPhysicalTimes().get(3)); + assertEquals(Long.valueOf(20010L), deserialized.getPhysicalTimes().get(4)); + + assertEquals(5, deserialized.getNodeIds().size()); + assertEquals(Short.valueOf((short) 1), deserialized.getNodeIds().get(0)); + assertEquals(Short.valueOf((short) 1), deserialized.getNodeIds().get(1)); + assertEquals(Short.valueOf((short) 1), deserialized.getNodeIds().get(2)); + assertEquals(Short.valueOf((short) 4), deserialized.getNodeIds().get(3)); + assertEquals(Short.valueOf((short) 1), deserialized.getNodeIds().get(4)); + + assertEquals(5, deserialized.getWriterEpochs().size()); + assertEquals(Short.valueOf((short) 2), deserialized.getWriterEpochs().get(0)); + assertEquals(Short.valueOf((short) 2), deserialized.getWriterEpochs().get(1)); + assertEquals(Short.valueOf((short) 2), deserialized.getWriterEpochs().get(2)); + assertEquals(Short.valueOf((short) 1), deserialized.getWriterEpochs().get(3)); + assertEquals(Short.valueOf((short) 2), deserialized.getWriterEpochs().get(4)); + + assertEquals(5, deserialized.getLocalSeqs().size()); + assertEquals(Long.valueOf(10L), deserialized.getLocalSeqs().get(0)); + assertEquals(Long.valueOf(11L), deserialized.getLocalSeqs().get(1)); + assertEquals(Long.valueOf(12L), deserialized.getLocalSeqs().get(2)); + assertEquals(Long.valueOf(1L), deserialized.getLocalSeqs().get(3)); + assertEquals(Long.valueOf(14L), deserialized.getLocalSeqs().get(4)); + } + + @Test + public void testV2DeserializationHasEmptyV3Fields() { + WALMetaData original = new WALMetaData(); + original.add(100, 10, 1); + original.add(200, 11, 1); + + int size = original.serializedSize(WALFileVersion.V2); + ByteBuffer buffer = ByteBuffer.allocate(size); + original.serialize(buffer, WALFileVersion.V2); + 
buffer.flip(); + + WALMetaData deserialized = WALMetaData.deserialize(buffer, WALFileVersion.V2); + + assertEquals(10, deserialized.getFirstSearchIndex()); + assertEquals(2, deserialized.getBuffersSize().size()); + assertTrue(deserialized.getPhysicalTimes().isEmpty()); + assertTrue(deserialized.getNodeIds().isEmpty()); + assertTrue(deserialized.getWriterEpochs().isEmpty()); + assertTrue(deserialized.getLocalSeqs().isEmpty()); + assertEquals(Long.MAX_VALUE, deserialized.getMinDataTs()); + assertEquals(Long.MIN_VALUE, deserialized.getMaxDataTs()); + } + + @Test + public void testV2SerializedSizeSmallerThanV3() { + WALMetaData meta = new WALMetaData(); + meta.add(100, 10, 1, 10L, 1, 2L, 10L); + meta.add(200, 11, 1, 11L, 1, 2L, 11L); + meta.add(300, 12, 1, 12L, 3, 1L, 12L); + + int v2Size = meta.serializedSize(WALFileVersion.V2); + int v3Size = meta.serializedSize(WALFileVersion.V3); + + int entryCount = 3; + int overrideCount = 1; + int expectedDiff = + entryCount * Long.BYTES * 2 + + Long.BYTES * 2 + + Short.BYTES * 2 + + Integer.BYTES + + overrideCount * (Integer.BYTES + Short.BYTES + Short.BYTES); + assertEquals(expectedDiff, v3Size - v2Size); + } + + @Test + public void testV3AddAllMerge() { + WALMetaData meta1 = new WALMetaData(); + meta1.add(100, 10, 1, 100L, 1, 2L, 10L); + meta1.add(200, 11, 1, 110L, 1, 2L, 11L); + meta1.updateTimestampRange(100L); + + WALMetaData meta2 = new WALMetaData(); + meta2.add(300, 12, 2, 200L, 4, 1L, 1L); + meta2.updateTimestampRange(200L); + + meta1.addAll(meta2); + + assertEquals(3, meta1.getBuffersSize().size()); + assertEquals(Long.valueOf(100L), meta1.getPhysicalTimes().get(0)); + assertEquals(Long.valueOf(110L), meta1.getPhysicalTimes().get(1)); + assertEquals(Long.valueOf(200L), meta1.getPhysicalTimes().get(2)); + assertEquals(Short.valueOf((short) 1), meta1.getNodeIds().get(0)); + assertEquals(Short.valueOf((short) 1), meta1.getNodeIds().get(1)); + assertEquals(Short.valueOf((short) 4), meta1.getNodeIds().get(2)); + 
assertEquals(Short.valueOf((short) 2), meta1.getWriterEpochs().get(0)); + assertEquals(Short.valueOf((short) 2), meta1.getWriterEpochs().get(1)); + assertEquals(Short.valueOf((short) 1), meta1.getWriterEpochs().get(2)); + assertEquals(Long.valueOf(10L), meta1.getLocalSeqs().get(0)); + assertEquals(Long.valueOf(11L), meta1.getLocalSeqs().get(1)); + assertEquals(Long.valueOf(1L), meta1.getLocalSeqs().get(2)); + assertEquals(100L, meta1.getMinDataTs()); + assertEquals(200L, meta1.getMaxDataTs()); + } + + @Test + public void testV3EmptyMetadata() { + WALMetaData empty = new WALMetaData(); + + int size = empty.serializedSize(WALFileVersion.V3); + ByteBuffer buffer = ByteBuffer.allocate(size); + empty.serialize(buffer, WALFileVersion.V3); + buffer.flip(); + + WALMetaData deserialized = WALMetaData.deserialize(buffer, WALFileVersion.V3); + + assertEquals(0, deserialized.getBuffersSize().size()); + assertTrue(deserialized.getPhysicalTimes().isEmpty()); + assertTrue(deserialized.getNodeIds().isEmpty()); + assertTrue(deserialized.getWriterEpochs().isEmpty()); + assertTrue(deserialized.getLocalSeqs().isEmpty()); + assertEquals(Long.MAX_VALUE, deserialized.getMinDataTs()); + assertEquals(Long.MIN_VALUE, deserialized.getMaxDataTs()); + } + + @Test + public void testV2CompatibleAddDefaultsWriterProgress() { + WALMetaData meta = new WALMetaData(); + meta.add(100, 10, 1); + + assertEquals(Long.valueOf(0L), meta.getPhysicalTimes().get(0)); + assertEquals(Short.valueOf((short) -1), meta.getNodeIds().get(0)); + assertEquals(Short.valueOf((short) 0), meta.getWriterEpochs().get(0)); + assertEquals(Long.valueOf(10L), meta.getLocalSeqs().get(0)); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/ConsensusReqReaderTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/ConsensusReqReaderTest.java index 688e5df205c4e..600a003d5522d 100644 --- 
a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/ConsensusReqReaderTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/ConsensusReqReaderTest.java @@ -250,6 +250,24 @@ public void scenario01TestGetReqIterator02() throws Exception { checkThread.shutdown(); } + @Test + public void testReqIteratorCarriesWriterMetadata() throws Exception { + final InsertRowNode insertRowNode = getInsertRowNode(devicePath); + insertRowNode.setSearchIndex(1).setPhysicalTime(123456789L).setNodeId(7).setWriterEpoch(3L); + walNode.log(0, insertRowNode); + walNode.rollWALFile(); + walNode.rollWALFile(); + + final ConsensusReqReader.ReqIterator iterator = walNode.getReqIterator(1); + Assert.assertTrue(iterator.hasNext()); + final IndexedConsensusRequest request = iterator.next(); + + Assert.assertEquals(1L, request.getSearchIndex()); + Assert.assertEquals(123456789L, request.getPhysicalTime()); + Assert.assertEquals(7, request.getNodeId()); + Assert.assertEquals(3L, request.getWriterEpoch()); + } + @Test public void scenario01TestGetReqIterator03() throws Exception { simulateFileScenario01(); diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNodeTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNodeTest.java index 3706eeb388fbf..f12e6539b00e0 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNodeTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNodeTest.java @@ -52,6 +52,7 @@ import org.junit.Test; import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; @@ -63,6 +64,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; +import 
java.util.concurrent.TimeUnit; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -150,6 +152,56 @@ public void testConcurrentWrite() throws Exception { assertEquals(expectedInsertTabletNodes, actualInsertTabletNodes); } + @Test + public void testGetRetentionBoundaryByLastModifiedTime() throws Exception { + final String timeRetentionDirectory = logDirectory + File.separator + "time-retention"; + EnvironmentUtils.cleanDir(timeRetentionDirectory); + + final long now = System.currentTimeMillis(); + createWalFile( + timeRetentionDirectory, + 0, + 0, + now - TimeUnit.HOURS.toMillis(3), + WALFileStatus.CONTAINS_SEARCH_INDEX); + createWalFile( + timeRetentionDirectory, + 1, + 10, + now - TimeUnit.HOURS.toMillis(2), + WALFileStatus.CONTAINS_SEARCH_INDEX); + createWalFile( + timeRetentionDirectory, + 2, + 20, + now - TimeUnit.MINUTES.toMillis(10), + WALFileStatus.CONTAINS_SEARCH_INDEX); + + final WALNode timeRetentionNode = + new WALNode("time-retention-node", timeRetentionDirectory, 3, 30); + try { + final long cutoffTimeMs = now - TimeUnit.HOURS.toMillis(1); + assertEquals(20L, timeRetentionNode.getSearchIndexToFreeBeforeTimestamp(cutoffTimeMs)); + assertEquals(2L, timeRetentionNode.getVersionIdToFreeBeforeTimestamp(cutoffTimeMs)); + + final long futureCutoffTimeMs = now + TimeUnit.MINUTES.toMillis(1); + assertEquals( + Long.MAX_VALUE, + timeRetentionNode.getSearchIndexToFreeBeforeTimestamp(futureCutoffTimeMs)); + assertEquals( + Long.MAX_VALUE, timeRetentionNode.getVersionIdToFreeBeforeTimestamp(futureCutoffTimeMs)); + + final long earlyCutoffTimeMs = now - TimeUnit.DAYS.toMillis(1); + assertEquals( + Long.MIN_VALUE + 1, + timeRetentionNode.getSearchIndexToFreeBeforeTimestamp(earlyCutoffTimeMs)); + assertEquals(0L, timeRetentionNode.getVersionIdToFreeBeforeTimestamp(earlyCutoffTimeMs)); + } finally { + timeRetentionNode.close(); + EnvironmentUtils.cleanDir(timeRetentionDirectory); + } + } + private void writeInsertTabletNode( int 
memTableId, Set expectedInsertTabletNodes, @@ -221,6 +273,20 @@ private InsertTabletNode getInsertTabletNode(String devicePath, long[] times) times.length); } + private static void createWalFile( + final String directory, + final long versionId, + final long startSearchIndex, + final long lastModifiedTime, + final WALFileStatus status) + throws IOException { + assertTrue(new File(directory).mkdirs() || new File(directory).exists()); + final File walFile = + new File(directory, WALFileUtils.getLogFileName(versionId, startSearchIndex, status)); + assertTrue(walFile.createNewFile()); + assertTrue(walFile.setLastModified(lastModifiedTime)); + } + @Test public void testConcurrentCheckpoint() throws Exception { // start write threads to write concurrently diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtilsTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtilsTest.java index a7d8fa5662f7a..fbfdf49528d90 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtilsTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtilsTest.java @@ -18,10 +18,20 @@ */ package org.apache.iotdb.db.storageengine.dataregion.wal.utils; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntryType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALInfoEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALFileVersion; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALWriter; + import org.junit.Assert; import org.junit.Test; import java.io.File; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; public class WALFileUtilsTest 
{ @Test @@ -238,4 +248,71 @@ public void binarySearchFileBySearchIndex13() { i = WALFileUtils.binarySearchFileBySearchIndex(files, 0); Assert.assertEquals(-1, i); } + + @Test + public void testLocateByWriterProgress() throws Exception { + final Path dir = Files.createTempDirectory("wal-writer-progress-utils"); + final File wal0 = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File wal1 = + dir.resolve(WALFileUtils.getLogFileName(1, 12, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File wal2 = + dir.resolve(WALFileUtils.getLogFileName(2, 13, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (final WALWriter writer = new WALWriter(wal0, WALFileVersion.V3)) { + writer.write(entryBuffer(10L), singleEntryMeta(19, 10L, 1L, 10000L, 1, 2L, 110L)); + writer.write(entryBuffer(11L), singleEntryMeta(19, 11L, 1L, 10010L, 1, 2L, 111L)); + } + try (final WALWriter writer = new WALWriter(wal1, WALFileVersion.V3)) { + writer.write(entryBuffer(13L), singleEntryMeta(19, 13L, 1L, 10020L, 1, 2L, 113L)); + } + // Leave wal2 as the active file placeholder; helper methods only scan sealed files. 
+ try (final WALWriter writer = new WALWriter(wal2, WALFileVersion.V3)) { + writer.write(entryBuffer(20L), singleEntryMeta(19, 20L, 1L, 20000L, 4, 1L, 120L)); + } + + Assert.assertArrayEquals( + new long[] {11L, 1L}, + WALFileUtils.locateByWriterProgress(dir.toFile(), 1, 2L, 10010L, 111L)); + Assert.assertArrayEquals( + new long[] {10L, 0L}, + WALFileUtils.locateByWriterProgress(dir.toFile(), 1, 2L, 9999L, 109L)); + Assert.assertEquals( + 13L, WALFileUtils.findSearchIndexAfterWriterProgress(dir.toFile(), 1, 2L, 10010L, 111L)); + Assert.assertEquals( + -1L, WALFileUtils.findSearchIndexAfterWriterProgress(dir.toFile(), 4, 1L, 20000L, 120L)); + } finally { + Files.deleteIfExists(wal0.toPath()); + Files.deleteIfExists(wal1.toPath()); + Files.deleteIfExists(wal2.toPath()); + Files.deleteIfExists(dir); + } + } + + private static ByteBuffer entryBuffer(final long bodySearchIndex) { + final ByteBuffer buffer = + ByteBuffer.allocate(WALInfoEntry.FIXED_SERIALIZED_SIZE + PlanNodeType.BYTES + Long.BYTES); + buffer.put(WALEntryType.INSERT_ROW_NODE.getCode()); + buffer.putLong(1L); + buffer.putShort(PlanNodeType.INSERT_ROW.getNodeType()); + buffer.putLong(bodySearchIndex); + return buffer; + } + + private static WALMetaData singleEntryMeta( + final int size, + final long searchIndex, + final long memTableId, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final long localSeq) { + final WALMetaData metaData = new WALMetaData(); + metaData.add(size, searchIndex, memTableId, physicalTime, nodeId, writerEpoch, localSeq); + return metaData; + } } diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverterTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverterTest.java new file mode 100644 index 0000000000000..b8a953a6e839b --- /dev/null +++ 
b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverterTest.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern; +import org.apache.iotdb.db.queryengine.plan.statement.StatementTestUtils; + +import org.apache.tsfile.enums.ColumnCategory; +import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.write.record.Tablet; +import org.junit.Assert; +import org.junit.Test; + +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.regex.Pattern; + +public class ConsensusLogToTabletConverterTest { + + private static final String DATABASE_NAME = "db"; + + @Test + public void testConvertRelationalInsertRowNodeWithSingleMatchedColumn() { + final ConsensusLogToTabletConverter converter = createConverter("id1"); + + final List tablets = converter.convert(StatementTestUtils.genInsertRowNode(7)); + + Assert.assertEquals(1, tablets.size()); + final Tablet tablet = tablets.get(0); + Assert.assertEquals(StatementTestUtils.tableName(), tablet.getTableName()); + 
Assert.assertEquals(1, tablet.getRowSize()); + Assert.assertEquals(1, tablet.getSchemas().size()); + Assert.assertEquals("id1", tablet.getSchemas().get(0).getMeasurementName()); + Assert.assertEquals(ColumnCategory.TAG, tablet.getColumnTypes().get(0)); + Assert.assertEquals("id:7", toUtf8(((Binary[]) tablet.getValues()[0])[0])); + } + + @Test + public void testConvertRelationalInsertRowNodeWithMultipleMatchedColumns() { + final ConsensusLogToTabletConverter converter = createConverter("(id1|m1)"); + + final List tablets = converter.convert(StatementTestUtils.genInsertRowNode(9)); + + Assert.assertEquals(1, tablets.size()); + final Tablet tablet = tablets.get(0); + Assert.assertEquals(2, tablet.getSchemas().size()); + Assert.assertEquals("id1", tablet.getSchemas().get(0).getMeasurementName()); + Assert.assertEquals("m1", tablet.getSchemas().get(1).getMeasurementName()); + Assert.assertEquals(ColumnCategory.TAG, tablet.getColumnTypes().get(0)); + Assert.assertEquals(ColumnCategory.FIELD, tablet.getColumnTypes().get(1)); + Assert.assertEquals("id:9", toUtf8(((Binary[]) tablet.getValues()[0])[0])); + Assert.assertEquals(9.0, ((double[]) tablet.getValues()[1])[0], 0.0); + } + + @Test + public void testConvertRelationalInsertTabletNodeWithSingleMatchedColumn() { + final ConsensusLogToTabletConverter converter = createConverter("m1"); + + final List tablets = converter.convert(StatementTestUtils.genInsertTabletNode(3, 10)); + + Assert.assertEquals(1, tablets.size()); + final Tablet tablet = tablets.get(0); + Assert.assertEquals(StatementTestUtils.tableName(), tablet.getTableName()); + Assert.assertEquals(3, tablet.getRowSize()); + Assert.assertEquals(1, tablet.getSchemas().size()); + Assert.assertEquals("m1", tablet.getSchemas().get(0).getMeasurementName()); + Assert.assertEquals(ColumnCategory.FIELD, tablet.getColumnTypes().get(0)); + Assert.assertArrayEquals(new long[] {10L, 11L, 12L}, tablet.getTimestamps()); + Assert.assertArrayEquals( + new double[] {10.0, 11.0, 
12.0}, (double[]) tablet.getValues()[0], 0.0); + } + + @Test + public void testConvertRelationalInsertNodeReturnsEmptyWhenNoColumnsMatch() { + final ConsensusLogToTabletConverter converter = createConverter("not_exist"); + + Assert.assertTrue(converter.convert(StatementTestUtils.genInsertRowNode(0)).isEmpty()); + Assert.assertTrue(converter.convert(StatementTestUtils.genInsertTabletNode(2, 0)).isEmpty()); + } + + private static ConsensusLogToTabletConverter createConverter(final String columnPattern) { + return new ConsensusLogToTabletConverter( + null, + new TablePattern(true, DATABASE_NAME, StatementTestUtils.tableName()), + Pattern.compile(columnPattern), + DATABASE_NAME); + } + + private static String toUtf8(final Binary value) { + return new String(value.getValues(), StandardCharsets.UTF_8); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitStateTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitStateTest.java new file mode 100644 index 0000000000000..751074893a6a5 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitStateTest.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.Test; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.nio.ByteBuffer; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; + +public class ConsensusSubscriptionCommitStateTest { + + @Test + public void testCommitAdvancesContiguousWriterProgress() { + final WriterId writerId = new WriterId("1_1", 7, 2L); + final Map initialCommitted = new LinkedHashMap<>(); + initialCommitted.put(writerId, new WriterProgress(100L, 0L)); + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState state = + new ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState( + "1_1", new SubscriptionConsensusProgress(new RegionProgress(initialCommitted), 0L)); + + state.recordMapping(writerId, new WriterProgress(101L, 1L)); + state.recordMapping(writerId, new WriterProgress(102L, 2L)); + state.recordMapping(writerId, new WriterProgress(103L, 3L)); + + assertTrue(state.commit(writerId, new WriterProgress(102L, 2L))); + assertEquals(100L, state.getCommittedPhysicalTime()); + assertEquals(0L, state.getCommittedLocalSeq()); + 
assertEquals( + new WriterProgress(100L, 0L), + state.getCommittedRegionProgress().getWriterPositions().get(writerId)); + + assertTrue(state.commit(writerId, new WriterProgress(101L, 1L))); + assertEquals(102L, state.getCommittedPhysicalTime()); + assertEquals(2L, state.getCommittedLocalSeq()); + assertEquals(7, state.getCommittedWriterNodeId()); + assertEquals(2L, state.getCommittedWriterEpoch()); + assertEquals(writerId, state.getCommittedWriterId()); + assertEquals( + new WriterProgress(102L, 2L), + state.getCommittedRegionProgress().getWriterPositions().get(writerId)); + + assertTrue(state.commit(writerId, new WriterProgress(103L, 3L))); + assertEquals(103L, state.getCommittedPhysicalTime()); + assertEquals(3L, state.getCommittedLocalSeq()); + assertEquals(7, state.getCommittedWriterNodeId()); + assertEquals(2L, state.getCommittedWriterEpoch()); + } + + @Test + public void testSerializeDeserializeWriterProgress() throws Exception { + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState state = + new ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState( + "2_5", new SubscriptionConsensusProgress()); + final Map seekProgress = new LinkedHashMap<>(); + final WriterId writerA = new WriterId("2_5", 4, 9L); + final WriterId writerB = new WriterId("2_5", 5, 3L); + seekProgress.put(writerA, new WriterProgress(222L, 11L)); + seekProgress.put(writerB, new WriterProgress(230L, 4L)); + state.resetForSeek(new RegionProgress(seekProgress)); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (DataOutputStream dos = new DataOutputStream(baos)) { + state.serialize(dos); + } + + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState restored = + ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState.deserialize( + "2_5", ByteBuffer.wrap(baos.toByteArray())); + + assertEquals(new RegionProgress(seekProgress), restored.getCommittedRegionProgress()); + assertEquals(230L, 
restored.getCommittedPhysicalTime()); + assertEquals(4L, restored.getCommittedLocalSeq()); + assertEquals(5, restored.getCommittedWriterNodeId()); + assertEquals(3L, restored.getCommittedWriterEpoch()); + assertEquals(writerB, restored.getCommittedWriterId()); + assertEquals(new WriterProgress(230L, 4L), restored.getCommittedWriterProgress()); + } + + @Test + public void testDirectCommitWithoutOutstandingRequiresOutstandingMapping() { + final WriterId writerId = new WriterId("3_1", 9, 4L); + final Map initialCommitted = new LinkedHashMap<>(); + initialCommitted.put(writerId, new WriterProgress(100L, 0L)); + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState state = + new ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState( + "3_1", new SubscriptionConsensusProgress(new RegionProgress(initialCommitted), 0L)); + + assertFalse(state.commitWithoutOutstanding(writerId, new WriterProgress(103L, 3L))); + assertEquals(100L, state.getCommittedPhysicalTime()); + assertEquals(0L, state.getCommittedLocalSeq()); + } + + @Test + public void testDirectCommitWithoutOutstandingRespectsOutstandingGap() { + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState state = + new ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState( + "3_2", new SubscriptionConsensusProgress()); + + final WriterId writerId = new WriterId("3_2", 8, 1L); + state.recordMapping(writerId, new WriterProgress(101L, 1L)); + state.recordMapping(writerId, new WriterProgress(102L, 2L)); + state.recordMapping(writerId, new WriterProgress(103L, 3L)); + + assertTrue(state.commitWithoutOutstanding(writerId, new WriterProgress(103L, 3L))); + assertEquals(new WriterProgress(0L, -1L), state.getCommittedWriterProgress()); + + assertTrue(state.commitWithoutOutstanding(writerId, new WriterProgress(101L, 1L))); + assertEquals(new WriterProgress(101L, 1L), state.getCommittedWriterProgress()); + + assertTrue(state.commitWithoutOutstanding(writerId, new 
WriterProgress(102L, 2L))); + assertEquals(new WriterProgress(103L, 3L), state.getCommittedWriterProgress()); + } + + @Test + public void testBroadcastThrottleKeyIsPerWriter() { + final String baseKey = "cg##topic##1_1"; + final WriterId writerA = new WriterId("1_1", 7, 1L); + final WriterId writerB = new WriterId("1_1", 8, 1L); + + assertNotEquals( + ConsensusSubscriptionCommitManager.buildBroadcastKey(baseKey, writerA), + ConsensusSubscriptionCommitManager.buildBroadcastKey(baseKey, writerB)); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIteratorTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIteratorTest.java new file mode 100644 index 0000000000000..715e8086a8bb3 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIteratorTest.java @@ -0,0 +1,292 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntryType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALInfoEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALFileVersion; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALWriter; +import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileStatus; +import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; + +import org.junit.Test; + +import java.io.File; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ProgressWALIteratorTest { + + @Test + public void testIteratorGroupsByLocalSeqAndCarriesWriterMetadata() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File lastWal = + dir.resolve(WALFileUtils.getLogFileName(1, 12, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write(searchableEntry(5L), singleEntryMeta(19, 5L, 1L, 1000L, 7, 3L, 105L)); + writer.write(searchableEntry(5L), singleEntryMeta(19, 5L, 1L, 1000L, 7, 3L, 105L)); + writer.write(searchableEntry(12L), singleEntryMeta(19, 12L, 1L, 2000L, 7, 4L, 112L)); + } + try (WALWriter 
ignored = new WALWriter(lastWal, WALFileVersion.V3)) { + // Create a sealed successor so the first WAL becomes historical and readable. + } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), 6L)) { + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest request = iterator.next(); + assertEquals(12L, request.getSearchIndex()); + assertEquals(112L, request.getProgressLocalSeq()); + assertEquals(2000L, request.getPhysicalTime()); + assertEquals(7, request.getNodeId()); + assertEquals(4L, request.getWriterEpoch()); + assertEquals(1, request.getRequests().size()); + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(lastWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testIteratorMergesFragmentsWithSameLocalSeq() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-merge"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File lastWal = + dir.resolve(WALFileUtils.getLogFileName(1, 9, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write(searchableEntry(9L), singleEntryMeta(19, 9L, 1L, 900L, 5, 2L, 1009L)); + writer.write(searchableEntry(9L), singleEntryMeta(19, 9L, 1L, 900L, 5, 2L, 1009L)); + } + try (WALWriter ignored = new WALWriter(lastWal, WALFileVersion.V3)) { + // Create a sealed successor so the first WAL becomes historical and readable. 
+ } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), Long.MIN_VALUE)) { + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest request = iterator.next(); + assertEquals(9L, request.getSearchIndex()); + assertEquals(1009L, request.getProgressLocalSeq()); + assertEquals(900L, request.getPhysicalTime()); + assertEquals(5, request.getNodeId()); + assertEquals(2L, request.getWriterEpoch()); + assertEquals(2, request.getRequests().size()); + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(lastWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testIteratorKeepsDifferentWritersWithSameLocalSeqSeparated() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-writers"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File lastWal = + dir.resolve(WALFileUtils.getLogFileName(1, 16, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write(searchableEntry(15L), singleEntryMeta(19, 15L, 1L, 1500L, 7, 1L, 1L)); + writer.write(searchableEntry(16L), singleEntryMeta(19, 16L, 1L, 1501L, 8, 1L, 1L)); + } + try (WALWriter ignored = new WALWriter(lastWal, WALFileVersion.V3)) { + // Create a sealed successor so the first WAL becomes historical and readable. 
+ } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), Long.MIN_VALUE)) { + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest first = iterator.next(); + assertEquals(15L, first.getSearchIndex()); + assertEquals(1L, first.getProgressLocalSeq()); + assertEquals(7, first.getNodeId()); + + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest second = iterator.next(); + assertEquals(16L, second.getSearchIndex()); + assertEquals(1L, second.getProgressLocalSeq()); + assertEquals(8, second.getNodeId()); + + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(lastWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testIteratorDoesNotSkipNextWalFileAfterExhaustingCurrentOne() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-sequential-files"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File secondWal = + dir.resolve(WALFileUtils.getLogFileName(1, 1, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File thirdWal = + dir.resolve(WALFileUtils.getLogFileName(2, 2, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write(searchableEntry(1L), singleEntryMeta(19, 1L, 1L, 100L, 7, 1L, 1L)); + } + try (WALWriter writer = new WALWriter(secondWal, WALFileVersion.V3)) { + writer.write(searchableEntry(2L), singleEntryMeta(19, 2L, 1L, 200L, 7, 1L, 2L)); + } + try (WALWriter writer = new WALWriter(thirdWal, WALFileVersion.V3)) { + writer.write(searchableEntry(3L), singleEntryMeta(19, 3L, 1L, 300L, 7, 1L, 3L)); + } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), Long.MIN_VALUE)) { + assertTrue(iterator.hasNext()); + assertEquals(1L, iterator.next().getSearchIndex()); + + 
assertTrue(iterator.hasNext()); + assertEquals(2L, iterator.next().getSearchIndex()); + + assertTrue(iterator.hasNext()); + assertEquals(3L, iterator.next().getSearchIndex()); + + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(secondWal.toPath()); + Files.deleteIfExists(thirdWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testFollowerEntryDoesNotSynthesizeSearchIndexFromProgressLocalSeq() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-follower"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File lastWal = + dir.resolve(WALFileUtils.getLogFileName(1, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write(searchableEntry(-1L), singleEntryMeta(19, -1L, 1L, 900L, 5, 2L, 1009L)); + } + try (WALWriter ignored = new WALWriter(lastWal, WALFileVersion.V3)) { + // Create a readable successor for the first WAL file. 
+ } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), Long.MIN_VALUE)) { + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest request = iterator.next(); + assertEquals(-1L, request.getSearchIndex()); + assertEquals(1009L, request.getProgressLocalSeq()); + assertEquals(900L, request.getPhysicalTime()); + assertEquals(5, request.getNodeId()); + assertEquals(2L, request.getWriterEpoch()); + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(lastWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testIteratorMarksIncompleteScanWhenNearLiveWalCannotBeOpened() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-incomplete-scan"); + final File brokenLiveWal = + dir.resolve(WALFileUtils.getLogFileName(7, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + assertTrue(brokenLiveWal.mkdir()); + + final WALNode walNode = mock(WALNode.class); + when(walNode.getLogDirectory()).thenReturn(dir.toFile()); + when(walNode.getCurrentWALFileVersion()).thenReturn(7L); + when(walNode.getCurrentWALMetaDataSnapshot()).thenReturn(new WALMetaData()); + + try (ProgressWALIterator iterator = new ProgressWALIterator(walNode, Long.MIN_VALUE)) { + assertFalse(iterator.hasNext()); + assertTrue(iterator.hasIncompleteScan()); + assertTrue(iterator.hasReadError()); + assertTrue(iterator.getIncompleteScanDetail().contains("near-live WAL file")); + } + } finally { + Files.deleteIfExists(brokenLiveWal.toPath()); + Files.deleteIfExists(dir); + } + } + + private static ByteBuffer searchableEntry(final long bodySearchIndex) { + final ByteBuffer buffer = + ByteBuffer.allocate(WALInfoEntry.FIXED_SERIALIZED_SIZE + PlanNodeType.BYTES + Long.BYTES); + buffer.put(WALEntryType.INSERT_ROW_NODE.getCode()); + buffer.putLong(1L); + buffer.putShort(PlanNodeType.INSERT_ROW.getNodeType()); + buffer.putLong(bodySearchIndex); + return buffer; + 
} + + private static WALMetaData singleEntryMeta( + final int size, + final long searchIndex, + final long memTableId, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final long localSeq) { + final WALMetaData metaData = new WALMetaData(); + metaData.add(size, searchIndex, memTableId, physicalTime, nodeId, writerEpoch, localSeq); + return metaData; + } +} diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties index 4015a4b2f3e92..021eaac902401 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties @@ -54,6 +54,12 @@ dn_data_region_consensus_port=10760 schema_replication_factor=1 data_replication_factor=1 +#################### +### Subscription Consensus Configuration +#################### + +# subscription_consensus_idle_safe_hlc_interval_ms=10000 + #################### ### Directory Configuration #################### @@ -70,4 +76,3 @@ cn_metric_prometheus_reporter_port=9091 # dn_metric_reporter_list= dn_metric_prometheus_reporter_port=9092 - diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java index 81f2aa7156cf7..e7fcdca7c1b2a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java @@ -156,6 +156,8 @@ public enum ThreadName { PIPE_TERMINATE_EXECUTION_POOL("Pipe-Terminate-Execution-Pool"), LOAD_DATATYPE_CONVERT_POOL("Load-Datatype-Convert-Pool"), SUBSCRIPTION_EXECUTOR_POOL("Subscription-Executor-Pool"), + SUBSCRIPTION_CONSENSUS_PREFETCH_EXECUTOR_POOL("Subscription-Consensus-Prefetch-Executor-Pool"), + 
SUBSCRIPTION_CONSENSUS_PREFETCH_SCHEDULER("Subscription-Consensus-Prefetch-Scheduler"), SUBSCRIPTION_RUNTIME_META_SYNCER("Subscription-Runtime-Meta-Syncer"), WINDOW_EVALUATION_SERVICE("WindowEvaluationTaskPoolManager"), STATEFUL_TRIGGER_INFORMATION_UPDATER("Stateful-Trigger-Information-Updater"), @@ -318,6 +320,8 @@ public enum ThreadName { PIPE_AIR_GAP_RECEIVER, PIPE_PARALLEL_EXECUTION_POOL, SUBSCRIPTION_EXECUTOR_POOL, + SUBSCRIPTION_CONSENSUS_PREFETCH_EXECUTOR_POOL, + SUBSCRIPTION_CONSENSUS_PREFETCH_SCHEDULER, SUBSCRIPTION_RUNTIME_META_SYNCER, WINDOW_EVALUATION_SERVICE, STATEFUL_TRIGGER_INFORMATION_UPDATER)); diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java index 4703fa77f9294..beccee36a6fba 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java @@ -362,10 +362,11 @@ public class CommonConfig { private boolean pipeAutoSplitFullEnabled = true; - private boolean subscriptionEnabled = false; + private boolean subscriptionEnabled = true; private float subscriptionCacheMemoryUsagePercentage = 0.2F; private int subscriptionSubtaskExecutorMaxThreadNum = 2; + private int subscriptionConsensusPrefetchExecutorMaxThreadNum = 2; private int subscriptionPrefetchTabletBatchMaxDelayInMs = 20; private long subscriptionPrefetchTabletBatchMaxSizeInBytes = MB; @@ -394,6 +395,29 @@ public class CommonConfig { private long subscriptionMetaSyncerInitialSyncDelayMinutes = 3; private long subscriptionMetaSyncerSyncIntervalMinutes = 3; + private int subscriptionConsensusBatchMaxDelayInMs = 50; + private long subscriptionConsensusBatchMaxSizeInBytes = 8 * MB; + private int subscriptionConsensusBatchMaxTabletCount = 64; + private int subscriptionConsensusBatchMaxWalEntries = 128; + + private long 
subscriptionConsensusWalRetentionSizeInBytes = 512 * MB; + private long subscriptionConsensusWalRetentionTimeMs = -1L; + + private int subscriptionConsensusCommitPersistInterval = 100; + private boolean subscriptionConsensusCommitFsyncEnabled = false; + + private long subscriptionConsensusConsumerEvictionTimeoutMs = 60_000; + + private boolean subscriptionConsensusLagBasedPriority = true; + + private int subscriptionConsensusPrefetchingQueueCapacity = 256; + + private boolean subscriptionConsensusWatermarkEnabled = false; + + private long subscriptionConsensusWatermarkIntervalMs = 1000; + + private long subscriptionConsensusIdleSafeHlcIntervalMs = 1_000; + /** Whether to use persistent schema mode. */ private String schemaEngineMode = "Memory"; @@ -2281,6 +2305,14 @@ public void setPipeAutoSplitFullEnabled(boolean pipeAutoSplitFullEnabled) { this.pipeAutoSplitFullEnabled = pipeAutoSplitFullEnabled; } + public boolean getSubscriptionEnabled() { + return subscriptionEnabled; + } + + public void setSubscriptionEnabled(boolean subscriptionEnabled) { + this.subscriptionEnabled = subscriptionEnabled; + } + public float getSubscriptionCacheMemoryUsagePercentage() { return subscriptionCacheMemoryUsagePercentage; } @@ -2299,6 +2331,16 @@ public void setSubscriptionSubtaskExecutorMaxThreadNum( this.subscriptionSubtaskExecutorMaxThreadNum = subscriptionSubtaskExecutorMaxThreadNum; } + public int getSubscriptionConsensusPrefetchExecutorMaxThreadNum() { + return subscriptionConsensusPrefetchExecutorMaxThreadNum; + } + + public void setSubscriptionConsensusPrefetchExecutorMaxThreadNum( + int subscriptionConsensusPrefetchExecutorMaxThreadNum) { + this.subscriptionConsensusPrefetchExecutorMaxThreadNum = + subscriptionConsensusPrefetchExecutorMaxThreadNum; + } + public int getSubscriptionPrefetchTabletBatchMaxDelayInMs() { return subscriptionPrefetchTabletBatchMaxDelayInMs; } @@ -2512,6 +2554,135 @@ public long getSubscriptionMetaSyncerSyncIntervalMinutes() { return 
subscriptionMetaSyncerSyncIntervalMinutes; } + public int getSubscriptionConsensusBatchMaxDelayInMs() { + return subscriptionConsensusBatchMaxDelayInMs; + } + + public void setSubscriptionConsensusBatchMaxDelayInMs( + final int subscriptionConsensusBatchMaxDelayInMs) { + this.subscriptionConsensusBatchMaxDelayInMs = subscriptionConsensusBatchMaxDelayInMs; + } + + public long getSubscriptionConsensusBatchMaxSizeInBytes() { + return subscriptionConsensusBatchMaxSizeInBytes; + } + + public void setSubscriptionConsensusBatchMaxSizeInBytes( + final long subscriptionConsensusBatchMaxSizeInBytes) { + this.subscriptionConsensusBatchMaxSizeInBytes = subscriptionConsensusBatchMaxSizeInBytes; + } + + public int getSubscriptionConsensusBatchMaxTabletCount() { + return subscriptionConsensusBatchMaxTabletCount; + } + + public int getSubscriptionConsensusCommitPersistInterval() { + return subscriptionConsensusCommitPersistInterval; + } + + public void setSubscriptionConsensusCommitPersistInterval( + final int subscriptionConsensusCommitPersistInterval) { + this.subscriptionConsensusCommitPersistInterval = subscriptionConsensusCommitPersistInterval; + } + + public boolean isSubscriptionConsensusCommitFsyncEnabled() { + return subscriptionConsensusCommitFsyncEnabled; + } + + public void setSubscriptionConsensusCommitFsyncEnabled( + final boolean subscriptionConsensusCommitFsyncEnabled) { + this.subscriptionConsensusCommitFsyncEnabled = subscriptionConsensusCommitFsyncEnabled; + } + + public long getSubscriptionConsensusConsumerEvictionTimeoutMs() { + return subscriptionConsensusConsumerEvictionTimeoutMs; + } + + public void setSubscriptionConsensusConsumerEvictionTimeoutMs( + final long subscriptionConsensusConsumerEvictionTimeoutMs) { + this.subscriptionConsensusConsumerEvictionTimeoutMs = + subscriptionConsensusConsumerEvictionTimeoutMs; + } + + public boolean isSubscriptionConsensusLagBasedPriority() { + return subscriptionConsensusLagBasedPriority; + } + + public void 
setSubscriptionConsensusLagBasedPriority( + final boolean subscriptionConsensusLagBasedPriority) { + this.subscriptionConsensusLagBasedPriority = subscriptionConsensusLagBasedPriority; + } + + public int getSubscriptionConsensusPrefetchingQueueCapacity() { + return subscriptionConsensusPrefetchingQueueCapacity; + } + + public void setSubscriptionConsensusPrefetchingQueueCapacity( + final int subscriptionConsensusPrefetchingQueueCapacity) { + this.subscriptionConsensusPrefetchingQueueCapacity = + subscriptionConsensusPrefetchingQueueCapacity; + } + + public boolean isSubscriptionConsensusWatermarkEnabled() { + return subscriptionConsensusWatermarkEnabled; + } + + public void setSubscriptionConsensusWatermarkEnabled( + final boolean subscriptionConsensusWatermarkEnabled) { + this.subscriptionConsensusWatermarkEnabled = subscriptionConsensusWatermarkEnabled; + } + + public long getSubscriptionConsensusWatermarkIntervalMs() { + return subscriptionConsensusWatermarkIntervalMs; + } + + public void setSubscriptionConsensusWatermarkIntervalMs( + final long subscriptionConsensusWatermarkIntervalMs) { + this.subscriptionConsensusWatermarkIntervalMs = subscriptionConsensusWatermarkIntervalMs; + } + + public long getSubscriptionConsensusIdleSafeHlcIntervalMs() { + return subscriptionConsensusIdleSafeHlcIntervalMs; + } + + public void setSubscriptionConsensusIdleSafeHlcIntervalMs( + final long subscriptionConsensusIdleSafeHlcIntervalMs) { + this.subscriptionConsensusIdleSafeHlcIntervalMs = subscriptionConsensusIdleSafeHlcIntervalMs; + } + + public void setSubscriptionConsensusBatchMaxTabletCount( + final int subscriptionConsensusBatchMaxTabletCount) { + this.subscriptionConsensusBatchMaxTabletCount = subscriptionConsensusBatchMaxTabletCount; + } + + public int getSubscriptionConsensusBatchMaxWalEntries() { + return subscriptionConsensusBatchMaxWalEntries; + } + + public void setSubscriptionConsensusBatchMaxWalEntries( + final int subscriptionConsensusBatchMaxWalEntries) { + 
this.subscriptionConsensusBatchMaxWalEntries = subscriptionConsensusBatchMaxWalEntries; + } + + public long getSubscriptionConsensusWalRetentionSizeInBytes() { + return subscriptionConsensusWalRetentionSizeInBytes; + } + + public void setSubscriptionConsensusWalRetentionSizeInBytes( + final long subscriptionConsensusWalRetentionSizeInBytes) { + this.subscriptionConsensusWalRetentionSizeInBytes = + subscriptionConsensusWalRetentionSizeInBytes; + } + + public long getSubscriptionConsensusWalRetentionTimeMs() { + return subscriptionConsensusWalRetentionTimeMs; + } + + public void setSubscriptionConsensusWalRetentionTimeMs( + final long subscriptionConsensusWalRetentionTimeMs) { + this.subscriptionConsensusWalRetentionTimeMs = subscriptionConsensusWalRetentionTimeMs; + } + public void setSubscriptionMetaSyncerSyncIntervalMinutes( long subscriptionMetaSyncerSyncIntervalMinutes) { this.subscriptionMetaSyncerSyncIntervalMinutes = subscriptionMetaSyncerSyncIntervalMinutes; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java index d392a60bbbd76..62f2e4539ecb1 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java @@ -283,6 +283,10 @@ public void loadCommonProps(TrimProperties properties) throws IOException { } private void loadSubscriptionProps(TrimProperties properties) { + config.setSubscriptionEnabled( + Boolean.parseBoolean( + properties.getProperty( + "subscription_enabled", String.valueOf(config.getSubscriptionEnabled())))); config.setSubscriptionCacheMemoryUsagePercentage( Float.parseFloat( properties.getProperty( @@ -293,6 +297,11 @@ private void loadSubscriptionProps(TrimProperties properties) { properties.getProperty( "subscription_subtask_executor_max_thread_num", 
Integer.toString(config.getSubscriptionSubtaskExecutorMaxThreadNum())))); + config.setSubscriptionConsensusPrefetchExecutorMaxThreadNum( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_prefetch_executor_max_thread_num", + Integer.toString(config.getSubscriptionConsensusPrefetchExecutorMaxThreadNum())))); config.setSubscriptionPrefetchTabletBatchMaxDelayInMs( Integer.parseInt( @@ -417,6 +426,77 @@ private void loadSubscriptionProps(TrimProperties properties) { properties.getProperty( "subscription_meta_syncer_sync_interval_minutes", String.valueOf(config.getSubscriptionMetaSyncerSyncIntervalMinutes())))); + + config.setSubscriptionConsensusBatchMaxDelayInMs( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_delay_in_ms", + String.valueOf(config.getSubscriptionConsensusBatchMaxDelayInMs())))); + config.setSubscriptionConsensusBatchMaxSizeInBytes( + Long.parseLong( + properties.getProperty( + "subscription_consensus_batch_max_size_in_bytes", + String.valueOf(config.getSubscriptionConsensusBatchMaxSizeInBytes())))); + config.setSubscriptionConsensusBatchMaxTabletCount( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_tablet_count", + String.valueOf(config.getSubscriptionConsensusBatchMaxTabletCount())))); + config.setSubscriptionConsensusWalRetentionSizeInBytes( + Long.parseLong( + properties.getProperty( + "subscription_consensus_wal_retention_size_in_bytes", + String.valueOf(config.getSubscriptionConsensusWalRetentionSizeInBytes())))); + config.setSubscriptionConsensusWalRetentionTimeMs( + Long.parseLong( + properties.getProperty( + "subscription_consensus_wal_retention_time_ms", + String.valueOf(config.getSubscriptionConsensusWalRetentionTimeMs())))); + config.setSubscriptionConsensusBatchMaxWalEntries( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_wal_entries", + String.valueOf(config.getSubscriptionConsensusBatchMaxWalEntries())))); + 
config.setSubscriptionConsensusCommitPersistInterval( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_commit_persist_interval", + String.valueOf(config.getSubscriptionConsensusCommitPersistInterval())))); + config.setSubscriptionConsensusCommitFsyncEnabled( + Boolean.parseBoolean( + properties.getProperty( + "subscription_consensus_commit_fsync_enabled", + String.valueOf(config.isSubscriptionConsensusCommitFsyncEnabled())))); + config.setSubscriptionConsensusConsumerEvictionTimeoutMs( + Long.parseLong( + properties.getProperty( + "subscription_consensus_consumer_eviction_timeout_ms", + String.valueOf(config.getSubscriptionConsensusConsumerEvictionTimeoutMs())))); + config.setSubscriptionConsensusLagBasedPriority( + Boolean.parseBoolean( + properties.getProperty( + "subscription_consensus_lag_based_priority", + String.valueOf(config.isSubscriptionConsensusLagBasedPriority())))); + config.setSubscriptionConsensusPrefetchingQueueCapacity( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_prefetching_queue_capacity", + String.valueOf(config.getSubscriptionConsensusPrefetchingQueueCapacity())))); + config.setSubscriptionConsensusWatermarkEnabled( + Boolean.parseBoolean( + properties.getProperty( + "subscription_consensus_watermark_enabled", + String.valueOf(config.isSubscriptionConsensusWatermarkEnabled())))); + config.setSubscriptionConsensusWatermarkIntervalMs( + Long.parseLong( + properties.getProperty( + "subscription_consensus_watermark_interval_ms", + String.valueOf(config.getSubscriptionConsensusWatermarkIntervalMs())))); + config.setSubscriptionConsensusIdleSafeHlcIntervalMs( + Long.parseLong( + properties.getProperty( + "subscription_consensus_idle_safe_hlc_interval_ms", + String.valueOf(config.getSubscriptionConsensusIdleSafeHlcIntervalMs())))); } public void loadRetryProperties(TrimProperties properties) throws IOException { @@ -435,6 +515,48 @@ public void loadRetryProperties(TrimProperties properties) throws 
IOException { "enable_retry_for_unknown_error")))); } + /** + * Reload only the subscription consensus properties that are intended to take effect on hot + * configuration reload. + * + *

Batching related properties are read dynamically by running consensus subscription queues + * and therefore take effect immediately after this method updates {@link CommonConfig}. Retention + * defaults are only used when new consensus subscription queues are created, so hot reload + * affects future topics / bindings and does not retroactively mutate existing queue policies. + */ + public void loadHotModifiedSubscriptionConsensusProps(final TrimProperties properties) { + config.setSubscriptionConsensusBatchMaxDelayInMs( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_delay_in_ms", + String.valueOf(config.getSubscriptionConsensusBatchMaxDelayInMs())))); + config.setSubscriptionConsensusBatchMaxSizeInBytes( + Long.parseLong( + properties.getProperty( + "subscription_consensus_batch_max_size_in_bytes", + String.valueOf(config.getSubscriptionConsensusBatchMaxSizeInBytes())))); + config.setSubscriptionConsensusBatchMaxTabletCount( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_tablet_count", + String.valueOf(config.getSubscriptionConsensusBatchMaxTabletCount())))); + config.setSubscriptionConsensusBatchMaxWalEntries( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_wal_entries", + String.valueOf(config.getSubscriptionConsensusBatchMaxWalEntries())))); + config.setSubscriptionConsensusWalRetentionSizeInBytes( + Long.parseLong( + properties.getProperty( + "subscription_consensus_wal_retention_size_in_bytes", + String.valueOf(config.getSubscriptionConsensusWalRetentionSizeInBytes())))); + config.setSubscriptionConsensusWalRetentionTimeMs( + Long.parseLong( + properties.getProperty( + "subscription_consensus_wal_retention_time_ms", + String.valueOf(config.getSubscriptionConsensusWalRetentionTimeMs())))); + } + public void loadBinaryAllocatorProps(TrimProperties properties) { config.setEnableBinaryAllocator( Boolean.parseBoolean( diff --git 
a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/enums/Metric.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/enums/Metric.java index ec16c181e618b..cbeef1f2c7c7e 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/enums/Metric.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/enums/Metric.java @@ -206,6 +206,10 @@ public enum Metric { SUBSCRIPTION_UNCOMMITTED_EVENT_COUNT("subscription_uncommitted_event_count"), SUBSCRIPTION_CURRENT_COMMIT_ID("subscription_current_commit_id"), SUBSCRIPTION_EVENT_TRANSFER("subscription_event_transfer"), + SUBSCRIPTION_CONSENSUS_LAG("subscription_consensus_lag"), + SUBSCRIPTION_CONSENSUS_WAL_GAP("subscription_consensus_wal_gap"), + SUBSCRIPTION_CONSENSUS_ROUTING_EPOCH_CHANGE("subscription_consensus_routing_epoch_change"), + SUBSCRIPTION_CONSENSUS_WATERMARK("subscription_consensus_watermark"), // load related ACTIVE_LOADING_FILES_NUMBER("active_loading_files_number"), ACTIVE_LOADING_FILES_SIZE("active_loading_files_size"), diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java index c7e7fea8d12f8..97db60ca572d4 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java @@ -30,7 +30,7 @@ public class SubscriptionConfig { private static final CommonConfig COMMON_CONFIG = CommonDescriptor.getInstance().getConfig(); public boolean getSubscriptionEnabled() { - return false; + return COMMON_CONFIG.getSubscriptionEnabled(); } public float getSubscriptionCacheMemoryUsagePercentage() { @@ -41,6 +41,10 @@ public int getSubscriptionSubtaskExecutorMaxThreadNum() 
{ return COMMON_CONFIG.getSubscriptionSubtaskExecutorMaxThreadNum(); } + public int getSubscriptionConsensusPrefetchExecutorMaxThreadNum() { + return COMMON_CONFIG.getSubscriptionConsensusPrefetchExecutorMaxThreadNum(); + } + public int getSubscriptionPrefetchTabletBatchMaxDelayInMs() { return COMMON_CONFIG.getSubscriptionPrefetchTabletBatchMaxDelayInMs(); } @@ -137,16 +141,68 @@ public long getSubscriptionMetaSyncerSyncIntervalMinutes() { return COMMON_CONFIG.getSubscriptionMetaSyncerSyncIntervalMinutes(); } + // Consensus subscription batching parameters + public int getSubscriptionConsensusBatchMaxDelayInMs() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxDelayInMs(); + } + + public long getSubscriptionConsensusBatchMaxSizeInBytes() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxSizeInBytes(); + } + + public int getSubscriptionConsensusBatchMaxTabletCount() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxTabletCount(); + } + + public int getSubscriptionConsensusBatchMaxWalEntries() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxWalEntries(); + } + + public int getSubscriptionConsensusCommitPersistInterval() { + return COMMON_CONFIG.getSubscriptionConsensusCommitPersistInterval(); + } + + public boolean isSubscriptionConsensusCommitFsyncEnabled() { + return COMMON_CONFIG.isSubscriptionConsensusCommitFsyncEnabled(); + } + + public long getSubscriptionConsensusConsumerEvictionTimeoutMs() { + return COMMON_CONFIG.getSubscriptionConsensusConsumerEvictionTimeoutMs(); + } + + public boolean isSubscriptionConsensusLagBasedPriority() { + return COMMON_CONFIG.isSubscriptionConsensusLagBasedPriority(); + } + + public int getSubscriptionConsensusPrefetchingQueueCapacity() { + return COMMON_CONFIG.getSubscriptionConsensusPrefetchingQueueCapacity(); + } + + public long getSubscriptionConsensusWatermarkIntervalMs() { + if (!COMMON_CONFIG.isSubscriptionConsensusWatermarkEnabled()) { + return -1; + } + return 
COMMON_CONFIG.getSubscriptionConsensusWatermarkIntervalMs(); + } + + public long getSubscriptionConsensusIdleSafeHlcIntervalMs() { + return COMMON_CONFIG.getSubscriptionConsensusIdleSafeHlcIntervalMs(); + } + /////////////////////////////// Utils /////////////////////////////// private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionConfig.class); public void printAllConfigs() { + LOGGER.info("SubscriptionEnabled: {}", getSubscriptionEnabled()); LOGGER.info( "SubscriptionCacheMemoryUsagePercentage: {}", getSubscriptionCacheMemoryUsagePercentage()); LOGGER.info( "SubscriptionSubtaskExecutorMaxThreadNum: {}", getSubscriptionSubtaskExecutorMaxThreadNum()); + LOGGER.info( + "SubscriptionConsensusPrefetchExecutorMaxThreadNum: {}", + getSubscriptionConsensusPrefetchExecutorMaxThreadNum()); LOGGER.info( "SubscriptionPrefetchTabletBatchMaxDelayInMs: {}", @@ -207,6 +263,21 @@ public void printAllConfigs() { LOGGER.info( "SubscriptionMetaSyncerSyncIntervalMinutes: {}", getSubscriptionMetaSyncerSyncIntervalMinutes()); + + LOGGER.info( + "SubscriptionConsensusBatchMaxDelayInMs: {}", getSubscriptionConsensusBatchMaxDelayInMs()); + LOGGER.info( + "SubscriptionConsensusBatchMaxSizeInBytes: {}", + getSubscriptionConsensusBatchMaxSizeInBytes()); + LOGGER.info( + "SubscriptionConsensusBatchMaxTabletCount: {}", + getSubscriptionConsensusBatchMaxTabletCount()); + LOGGER.info( + "SubscriptionConsensusBatchMaxWalEntries: {}", + getSubscriptionConsensusBatchMaxWalEntries()); + LOGGER.info( + "SubscriptionConsensusIdleSafeHlcIntervalMs: {}", + getSubscriptionConsensusIdleSafeHlcIntervalMs()); } /////////////////////////////// Singleton /////////////////////////////// diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeper.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeper.java new file mode 100644 index 0000000000000..9e5f6e03779bd --- 
/dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeper.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.subscription.meta.consumer; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; + +public class CommitProgressKeeper { + + private static final String KEY_SEPARATOR = "##"; + + private final Map regionProgressMap = new ConcurrentHashMap<>(); + + public CommitProgressKeeper() {} + + public static String generateKey( + final String consumerGroupId, + final String topicName, + final String regionId, + final int dataNodeId) { + return consumerGroupId + + KEY_SEPARATOR + + topicName + + KEY_SEPARATOR + + regionId + + KEY_SEPARATOR + + dataNodeId; + } + + public void updateRegionProgress(final String key, final ByteBuffer committedRegionProgress) { + if (Objects.isNull(committedRegionProgress)) { + return; + } + regionProgressMap.put(key, copyBuffer(committedRegionProgress)); + } + + 
public ByteBuffer getRegionProgress(final String key) { + final ByteBuffer buffer = regionProgressMap.get(key); + return Objects.nonNull(buffer) ? copyBuffer(buffer) : null; + } + + public Map getAllRegionProgress() { + final Map result = new HashMap<>(regionProgressMap.size()); + regionProgressMap.forEach((key, value) -> result.put(key, copyBuffer(value))); + return result; + } + + public void replaceAll(final Map newRegionProgressMap) { + regionProgressMap.clear(); + if (Objects.nonNull(newRegionProgressMap)) { + for (final Map.Entry entry : newRegionProgressMap.entrySet()) { + if (Objects.nonNull(entry.getValue())) { + regionProgressMap.put(entry.getKey(), copyBuffer(entry.getValue())); + } + } + } + } + + public boolean isEmpty() { + return regionProgressMap.isEmpty(); + } + + public void processTakeSnapshot(final FileOutputStream fileOutputStream) throws IOException { + final int regionSize = regionProgressMap.size(); + fileOutputStream.write(ByteBuffer.allocate(4).putInt(regionSize).array()); + for (final Map.Entry entry : regionProgressMap.entrySet()) { + final byte[] keyBytes = entry.getKey().getBytes("UTF-8"); + final ByteBuffer progressBuffer = copyBuffer(entry.getValue()); + final byte[] progressBytes = new byte[progressBuffer.remaining()]; + progressBuffer.get(progressBytes); + final ByteBuffer buffer = ByteBuffer.allocate(4 + keyBytes.length + 4 + progressBytes.length); + buffer.putInt(keyBytes.length); + buffer.put(keyBytes); + buffer.putInt(progressBytes.length); + buffer.put(progressBytes); + fileOutputStream.write(buffer.array()); + } + } + + public void processLoadSnapshot(final FileInputStream fileInputStream) throws IOException { + regionProgressMap.clear(); + final byte[] sizeBytes = new byte[4]; + if (fileInputStream.read(sizeBytes) != 4) { + return; + } + final int regionSize = ByteBuffer.wrap(sizeBytes).getInt(); + for (int i = 0; i < regionSize; i++) { + final byte[] keyLenBytes = new byte[4]; + if (fileInputStream.read(keyLenBytes) != 4) { 
+ throw new IOException("Unexpected EOF reading region progress key length"); + } + final int keyLen = ByteBuffer.wrap(keyLenBytes).getInt(); + final byte[] keyBytes = new byte[keyLen]; + if (fileInputStream.read(keyBytes) != keyLen) { + throw new IOException("Unexpected EOF reading region progress key"); + } + final String key = new String(keyBytes, "UTF-8"); + final byte[] valueLenBytes = new byte[4]; + if (fileInputStream.read(valueLenBytes) != 4) { + throw new IOException("Unexpected EOF reading region progress value length"); + } + final int valueLen = ByteBuffer.wrap(valueLenBytes).getInt(); + final byte[] valueBytes = new byte[valueLen]; + if (fileInputStream.read(valueBytes) != valueLen) { + throw new IOException("Unexpected EOF reading region progress value"); + } + regionProgressMap.put(key, ByteBuffer.wrap(valueBytes).asReadOnlyBuffer()); + } + } + + public void serializeToStream(final java.io.DataOutputStream stream) throws IOException { + stream.writeInt(regionProgressMap.size()); + for (final Map.Entry entry : regionProgressMap.entrySet()) { + final byte[] keyBytes = entry.getKey().getBytes("UTF-8"); + final ByteBuffer progressBuffer = copyBuffer(entry.getValue()); + final byte[] progressBytes = new byte[progressBuffer.remaining()]; + progressBuffer.get(progressBytes); + stream.writeInt(keyBytes.length); + stream.write(keyBytes); + stream.writeInt(progressBytes.length); + stream.write(progressBytes); + } + } + + public static Map deserializeRegionProgressFromBuffer( + final ByteBuffer buffer) { + if (!buffer.hasRemaining()) { + return new HashMap<>(); + } + final int size = buffer.getInt(); + final Map result = new HashMap<>(size); + for (int i = 0; i < size; i++) { + final int keyLen = buffer.getInt(); + final byte[] keyBytes = new byte[keyLen]; + buffer.get(keyBytes); + final String key = new String(keyBytes, java.nio.charset.StandardCharsets.UTF_8); + final int valueLen = buffer.getInt(); + final byte[] valueBytes = new byte[valueLen]; + 
buffer.get(valueBytes); + result.put(key, ByteBuffer.wrap(valueBytes).asReadOnlyBuffer()); + } + return result; + } + + private static ByteBuffer copyBuffer(final ByteBuffer buffer) { + final ByteBuffer duplicate = buffer.asReadOnlyBuffer(); + duplicate.rewind(); + final byte[] bytes = new byte[duplicate.remaining()]; + duplicate.get(bytes); + return ByteBuffer.wrap(bytes).asReadOnlyBuffer(); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final CommitProgressKeeper that = (CommitProgressKeeper) o; + return Objects.equals(this.regionProgressMap, that.regionProgressMap); + } + + @Override + public int hashCode() { + return Objects.hash(regionProgressMap); + } +} diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java index f89bfbc683379..6d22e7a177a1c 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java @@ -116,6 +116,26 @@ private boolean shouldRecordSubscriptionCreationTime() { return unsubscribedTopicNames; } + public static Set getTopicsNewlySubByGroup( + final ConsumerGroupMeta currentMeta, final ConsumerGroupMeta updatedMeta) { + if (!Objects.equals(currentMeta.consumerGroupId, updatedMeta.consumerGroupId) + || !Objects.equals(currentMeta.creationTime, updatedMeta.creationTime)) { + return Collections.emptySet(); + } + + final Set newlySubscribedTopicNames = new HashSet<>(); + updatedMeta + .topicNameToSubscribedConsumerIdSet + .keySet() + .forEach( + topicName -> { + if (!currentMeta.topicNameToSubscribedConsumerIdSet.containsKey(topicName)) { + 
newlySubscribedTopicNames.add(topicName); + } + }); + return newlySubscribedTopicNames; + } + /////////////////////////////// consumer /////////////////////////////// public void checkAuthorityBeforeJoinConsumerGroup(final ConsumerMeta consumerMeta) @@ -174,6 +194,11 @@ public ConsumerMeta getConsumerMeta(final String consumerId) { ////////////////////////// subscription ////////////////////////// + /** Get all topic names subscribed by this consumer group. */ + public Set getSubscribedTopicNames() { + return Collections.unmodifiableSet(topicNameToSubscribedConsumerIdSet.keySet()); + } + /** * Get the consumers subscribing the given topic in this group. * diff --git a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeperTest.java b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeperTest.java new file mode 100644 index 0000000000000..2cdec776683f1 --- /dev/null +++ b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeperTest.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.commons.subscription.meta.consumer; + +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.Test; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class CommitProgressKeeperTest { + + @Test + public void testUpdateAndReplaceAllUseDefensiveCopies() throws Exception { + final CommitProgressKeeper keeper = new CommitProgressKeeper(); + final String key = CommitProgressKeeper.generateKey("cg", "topic", "1_1", 3); + final RegionProgress regionProgress = createRegionProgress("1_1", 7, 2L, 100L, 10L); + + final ByteBuffer source = serialize(regionProgress); + keeper.updateRegionProgress(key, source); + source.position(source.limit()); + + final ByteBuffer firstRead = keeper.getRegionProgress(key); + assertTrue(firstRead.isReadOnly()); + firstRead.get(); + assertEquals(regionProgress, RegionProgress.deserialize(keeper.getRegionProgress(key))); + + final Map replacement = new LinkedHashMap<>(); + final RegionProgress replacementProgress = createRegionProgress("1_1", 8, 3L, 120L, 12L); + final ByteBuffer replacementBuffer = serialize(replacementProgress); + replacement.put(key, replacementBuffer); + + keeper.replaceAll(replacement); + replacementBuffer.position(replacementBuffer.limit()); + + assertEquals(replacementProgress, RegionProgress.deserialize(keeper.getRegionProgress(key))); + } + + @Test + public void testSnapshotRoundTripPreservesRegionProgress() throws Exception { + final CommitProgressKeeper keeper = new CommitProgressKeeper(); 
+ final String firstKey = CommitProgressKeeper.generateKey("cg", "topicA", "1_1", 3); + final String secondKey = CommitProgressKeeper.generateKey("cg", "topicB", "1_2", 5); + final RegionProgress firstProgress = + createRegionProgress( + "1_1", + new WriterId("1_1", 7, 2L), + new WriterProgress(100L, 10L), + new WriterId("1_1", 8, 2L), + new WriterProgress(110L, 11L)); + final RegionProgress secondProgress = createRegionProgress("1_2", 9, 4L, 200L, 20L); + + keeper.updateRegionProgress(firstKey, serialize(firstProgress)); + keeper.updateRegionProgress(secondKey, serialize(secondProgress)); + + final Path snapshot = Files.createTempFile("commit-progress-keeper", ".snapshot"); + try { + try (FileOutputStream fos = new FileOutputStream(snapshot.toFile())) { + keeper.processTakeSnapshot(fos); + } + + final CommitProgressKeeper restored = new CommitProgressKeeper(); + try (FileInputStream fis = new FileInputStream(snapshot.toFile())) { + restored.processLoadSnapshot(fis); + } + + assertEquals(firstProgress, RegionProgress.deserialize(restored.getRegionProgress(firstKey))); + assertEquals( + secondProgress, RegionProgress.deserialize(restored.getRegionProgress(secondKey))); + assertEquals(2, restored.getAllRegionProgress().size()); + } finally { + Files.deleteIfExists(snapshot); + } + } + + private static RegionProgress createRegionProgress( + final String regionId, + final int nodeId, + final long writerEpoch, + final long physicalTime, + final long localSeq) { + return createRegionProgress( + regionId, + new WriterId(regionId, nodeId, writerEpoch), + new WriterProgress(physicalTime, localSeq)); + } + + private static RegionProgress createRegionProgress( + final String regionId, + final WriterId firstWriterId, + final WriterProgress firstWriterProgress) { + return createRegionProgress(regionId, firstWriterId, firstWriterProgress, null, null); + } + + private static RegionProgress createRegionProgress( + final String regionId, + final WriterId firstWriterId, + final 
WriterProgress firstWriterProgress, + final WriterId secondWriterId, + final WriterProgress secondWriterProgress) { + final Map writerPositions = new LinkedHashMap<>(); + writerPositions.put(firstWriterId, firstWriterProgress); + if (secondWriterId != null && secondWriterProgress != null) { + writerPositions.put(secondWriterId, secondWriterProgress); + } + return new RegionProgress(writerPositions); + } + + private static ByteBuffer serialize(final RegionProgress regionProgress) throws Exception { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()).asReadOnlyBuffer(); + } + } +} diff --git a/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift b/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift index 92312ee81a307..b17ccd6b1d974 100644 --- a/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift +++ b/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift @@ -1061,6 +1061,18 @@ struct TGetAllSubscriptionInfoResp { 2: required list allSubscriptionInfo } +struct TGetCommitProgressReq { + 1: required string consumerGroupId + 2: required string topicName + 3: required i32 regionId + 4: required i32 dataNodeId +} + +struct TGetCommitProgressResp { + 1: required common.TSStatus status + 2: optional binary committedRegionProgress +} + // ==================================================== // CQ // ==================================================== @@ -1956,6 +1968,9 @@ service IConfigNodeRPCService { /** Get all subscription information. 
It is used for DataNode registration and restart */ TGetAllSubscriptionInfoResp getAllSubscriptionInfo() + /** Get committed search index from ConfigNode for recovery */ + TGetCommitProgressResp getCommitProgress(TGetCommitProgressReq req) + // ====================================================== // TestTools // ====================================================== @@ -2055,4 +2070,3 @@ service IConfigNodeRPCService { common.TSStatus createTableView(TCreateTableViewReq req) } - diff --git a/iotdb-protocol/thrift-consensus/src/main/thrift/iotconsensus.thrift b/iotdb-protocol/thrift-consensus/src/main/thrift/iotconsensus.thrift index 829443f955282..9150215857f1e 100644 --- a/iotdb-protocol/thrift-consensus/src/main/thrift/iotconsensus.thrift +++ b/iotdb-protocol/thrift-consensus/src/main/thrift/iotconsensus.thrift @@ -27,6 +27,9 @@ struct TLogEntry { 2: required i64 searchIndex 3: required bool fromWAL 4: required i64 memorySize + 5: optional i64 routingEpoch + 6: optional i64 physicalTime + 7: optional i16 writerEpoch } struct TSyncLogEntriesReq { @@ -41,6 +44,18 @@ struct TSyncLogEntriesRes { 2: optional i64 receiverMemSize } +struct TSyncSafeHlcReq { + 1: required common.TConsensusGroupId consensusGroupId + 2: required i32 writerNodeId + 3: required i64 writerEpoch + 4: required i64 safePhysicalTime + 5: required i64 barrierLocalSeq +} + +struct TSyncSafeHlcRes { + 1: required common.TSStatus status +} + struct TInactivatePeerReq { 1: required common.TConsensusGroupId consensusGroupId 2: optional bool forDeletionPurpose @@ -129,6 +144,7 @@ struct TCleanupTransferredSnapshotRes { service IoTConsensusIService { TSyncLogEntriesRes syncLogEntries(TSyncLogEntriesReq req) + TSyncSafeHlcRes syncSafeHlc(TSyncSafeHlcReq req) TInactivatePeerRes inactivatePeer(TInactivatePeerReq req) TActivatePeerRes activatePeer(TActivatePeerReq req) TBuildSyncLogChannelRes buildSyncLogChannel(TBuildSyncLogChannelReq req) @@ -138,4 +154,4 @@ service IoTConsensusIService { 
TSendSnapshotFragmentRes sendSnapshotFragment(TSendSnapshotFragmentReq req) TTriggerSnapshotLoadRes triggerSnapshotLoad(TTriggerSnapshotLoadReq req) TCleanupTransferredSnapshotRes cleanupTransferredSnapshot(TCleanupTransferredSnapshotReq req) -} \ No newline at end of file +} diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift index 4323e956a8e99..29ca37b81898c 100644 --- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift +++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift @@ -597,6 +597,34 @@ struct TPushConsumerGroupMetaRespExceptionMessage { 3: required i64 timeStamp } +struct TPullCommitProgressReq { +} + +struct TPullCommitProgressResp { + 1: required common.TSStatus status + 2: optional map commitRegionProgress +} + +struct TSyncSubscriptionProgressReq { + 1: required string consumerGroupId + 2: required string topicName + 3: required string regionId + 4: required i64 physicalTime + 5: required i64 localSeq + 6: optional i32 writerNodeId + 7: optional i64 writerEpoch +} +struct TSubscriptionRuntimeStateEntry { + 1: required common.TConsensusGroupId regionId + 2: required i64 runtimeVersion + 3: required i32 preferredWriterNodeId + 4: required bool active + 5: required list activeWriterNodeIds +} +struct TPushSubscriptionRuntimeReq { + 1: required list runtimeStates +} + struct TConstructViewSchemaBlackListReq { 1: required list schemaRegionIdList 2: required binary pathPatternTree @@ -1211,6 +1239,20 @@ service IDataNodeRPCService { */ TPushConsumerGroupMetaResp pushSingleConsumerGroupMeta(TPushSingleConsumerGroupMetaReq req) + /** + * Pull commit progress from DataNode for subscription consensus persistence + */ + TPullCommitProgressResp pullCommitProgress(TPullCommitProgressReq req) + + /** + * Sync subscription committed progress from Leader to Follower (fire-and-forget) + */ + common.TSStatus 
syncSubscriptionProgress(TSyncSubscriptionProgressReq req) + /** + * Push subscription runtime state to DataNodes. + */ + common.TSStatus pushSubscriptionRuntime(TPushSubscriptionRuntimeReq req) + /** * ConfigNode will ask DataNode for pipe meta in every few seconds **/ @@ -1350,4 +1392,4 @@ service MPPDataExchangeService { /** Empty rpc, only for connection test */ common.TSStatus testConnectionEmptyRPC() -} \ No newline at end of file +}