From 560eb1e920dbfb024d5197cd7e0427e940e815ea Mon Sep 17 00:00:00 2001 From: xuzifu666 <1206332514@qq.com> Date: Thu, 30 Apr 2026 15:51:14 +0800 Subject: [PATCH 1/3] [core] Introduce PrefixFileIndex for prefix query optimization --- .../fileindex/prefix/PrefixFileIndex.java | 341 ++++++++++++++++++ .../prefix/PrefixFileIndexFactory.java | 40 ++ ...apache.paimon.fileindex.FileIndexerFactory | 1 + 3 files changed, 382 insertions(+) create mode 100644 paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndex.java create mode 100644 paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexFactory.java diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndex.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndex.java new file mode 100644 index 000000000000..a99cc13e67b6 --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndex.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.fileindex.prefix; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.fileindex.FileIndexReader; +import org.apache.paimon.fileindex.FileIndexResult; +import org.apache.paimon.fileindex.FileIndexWriter; +import org.apache.paimon.fileindex.FileIndexer; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.options.Options; +import org.apache.paimon.predicate.FieldRef; +import org.apache.paimon.types.DataType; +import org.apache.paimon.utils.IOUtils; +import org.apache.paimon.utils.RoaringBitmap32; + +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Prefix file index for accelerating prefix match queries (e.g. LIKE 'prefix%', STARTS_WITH). + * + *

For each text value, extracts a fixed-length prefix (default 3 characters) and builds an + * inverted index: prefix -> RoaringBitmap of row numbers. Queries with a prefix literal can + * quickly determine whether the data file needs to be read. + */ +public class PrefixFileIndex implements FileIndexer { + + public static final int VERSION_1 = 1; + public static final String VERSION = "version"; + public static final String PREFIX_LENGTH = "prefix-length"; + + private static final int DEFAULT_PREFIX_LENGTH = 3; + + private final DataType dataType; + private final Options options; + + public PrefixFileIndex(DataType dataType, Options options) { + this.dataType = dataType; + this.options = options; + } + + @Override + public FileIndexWriter createWriter() { + return new Writer(options); + } + + @Override + public FileIndexReader createReader(SeekableInputStream inputStream, int start, int length) { + try { + inputStream.seek(start); + byte[] serializedBytes = new byte[length]; + IOUtils.readFully(inputStream, serializedBytes); + return new Reader(serializedBytes); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static String keyToString(Object key) { + if (key instanceof BinaryString) { + return key.toString(); + } else if (key instanceof String) { + return (String) key; + } + throw new IllegalArgumentException( + "Prefix index only supports string types, but got: " + + (key == null ? 
"null" : key.getClass().getName())); + } + + private static String extractPrefix(String text, int prefixLength) { + if (text.length() <= prefixLength) { + return text; + } + return text.substring(0, prefixLength); + } + + // ==================== Writer ==================== + + private static class Writer extends FileIndexWriter { + + private final int prefixLength; + private final Map prefix2bitmap = new HashMap<>(); + private final RoaringBitmap32 nullBitmap = new RoaringBitmap32(); + private int rowNumber; + + Writer(Options options) { + this.prefixLength = options.getInteger(PREFIX_LENGTH, DEFAULT_PREFIX_LENGTH); + if (prefixLength <= 0) { + throw new IllegalArgumentException( + "prefix-length must be positive, but got: " + prefixLength); + } + } + + @Override + public void write(Object key) { + if (key == null) { + nullBitmap.add(rowNumber++); + } else { + String prefix = extractPrefix(keyToString(key), prefixLength); + prefix2bitmap.computeIfAbsent(prefix, k -> new RoaringBitmap32()).add(rowNumber++); + } + } + + @Override + public byte[] serializedBytes() { + try { + ByteArrayOutputStream output = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(output); + + // Compute body layout first + byte[] nullBitmapBytes = null; + int nullOffset; + int bodyOffset = 0; + + if (nullBitmap.isEmpty()) { + nullOffset = 0; + } else if (nullBitmap.getCardinality() == 1) { + nullOffset = -1 - nullBitmap.first(); + } else { + nullBitmapBytes = serializeBitmap(nullBitmap); + nullOffset = 0; // null bitmap at start of body + bodyOffset = nullBitmapBytes.length; + } + + // Sort entries by prefix for deterministic serialization + LinkedHashMap offsets = new LinkedHashMap<>(); + LinkedHashMap bitmapBytes = new LinkedHashMap<>(); + + for (String prefix : sortedPrefixes()) { + RoaringBitmap32 bitmap = prefix2bitmap.get(prefix); + byte[] bytes = serializeBitmap(bitmap); + offsets.put(prefix, bodyOffset); + bitmapBytes.put(prefix, bytes); + bodyOffset += 
bytes.length; + } + + // Write header + dos.writeByte(VERSION_1); + dos.writeInt(prefixLength); + dos.writeInt(rowNumber); + dos.writeInt(prefix2bitmap.size()); + dos.writeBoolean(!nullBitmap.isEmpty()); + dos.writeInt(nullOffset); + + // Write entries (prefix + offset) + for (Map.Entry entry : offsets.entrySet()) { + byte[] prefixBytes = entry.getKey().getBytes(StandardCharsets.UTF_8); + dos.writeInt(prefixBytes.length); + dos.write(prefixBytes); + dos.writeInt(entry.getValue()); + } + + // Write bitmap body + if (nullBitmapBytes != null) { + dos.write(nullBitmapBytes); + } + for (byte[] bytes : bitmapBytes.values()) { + dos.write(bytes); + } + + dos.flush(); + return output.toByteArray(); + } catch (Exception e) { + throw new RuntimeException("Failed to serialize prefix file index", e); + } + } + + private byte[] serializeBitmap(RoaringBitmap32 bitmap) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(baos); + bitmap.serialize(dos); + dos.flush(); + return baos.toByteArray(); + } + + private java.util.List sortedPrefixes() { + java.util.List list = new java.util.ArrayList<>(prefix2bitmap.keySet()); + java.util.Collections.sort(list); + return list; + } + } + + // ==================== Reader ==================== + + private static class Reader extends FileIndexReader { + + private final byte[] data; + private int prefixLength; + + // Lazy loaded + private int rowCount; + private boolean hasNull; + private int nullOffset; + private Map prefixOffsets; + private int bodyStart; + + Reader(byte[] data) { + this.data = data; + // prefixLength is not stored in serialized data; we use a reasonable default + // In practice, query prefix extraction should match writer's prefixLength. + // For simplicity, we use the default here; the query literal's prefix is extracted + // with the same logic as the writer (min of literal length and prefix length). 
+ this.prefixLength = DEFAULT_PREFIX_LENGTH; + } + + private void ensureLoaded() { + if (prefixOffsets != null) { + return; + } + try { + java.io.ByteArrayInputStream bais = new java.io.ByteArrayInputStream(data); + DataInputStream dis = new DataInputStream(bais); + + int version = dis.readByte(); + if (version != VERSION_1) { + throw new RuntimeException("Unsupported prefix file index version: " + version); + } + prefixLength = dis.readInt(); + rowCount = dis.readInt(); + int entryCount = dis.readInt(); + hasNull = dis.readBoolean(); + nullOffset = dis.readInt(); + + prefixOffsets = new HashMap<>(entryCount); + for (int i = 0; i < entryCount; i++) { + int prefixLen = dis.readInt(); + byte[] prefixBytes = new byte[prefixLen]; + dis.readFully(prefixBytes); + String prefix = new String(prefixBytes, StandardCharsets.UTF_8); + int offset = dis.readInt(); + prefixOffsets.put(prefix, offset); + } + bodyStart = data.length - bais.available(); + } catch (IOException e) { + throw new RuntimeException("Failed to deserialize prefix file index", e); + } + } + + private RoaringBitmap32 readBitmap(int offset) { + try { + java.io.ByteArrayInputStream bais = + new java.io.ByteArrayInputStream( + data, bodyStart + offset, data.length - bodyStart - offset); + DataInputStream dis = new DataInputStream(bais); + RoaringBitmap32 bitmap = new RoaringBitmap32(); + bitmap.deserialize(dis); + return bitmap; + } catch (IOException e) { + throw new RuntimeException("Failed to read bitmap from prefix file index", e); + } + } + + private boolean hasPrefix(String prefix) { + ensureLoaded(); + Integer offset = prefixOffsets.get(prefix); + if (offset != null) { + if (offset < 0) { + // single value shortcut + return true; + } + RoaringBitmap32 bitmap = readBitmap(offset); + return !bitmap.isEmpty(); + } + // If exact prefix not found, check if any stored prefix starts with the query prefix. + // This handles the case where query prefix is shorter than prefixLength. 
+ for (Map.Entry entry : prefixOffsets.entrySet()) { + if (entry.getKey().startsWith(prefix)) { + return true; + } + } + return false; + } + + @Override + public FileIndexResult visitStartsWith(FieldRef fieldRef, Object literal) { + if (literal == null) { + return FileIndexResult.REMAIN; + } + String text = keyToString(literal); + String prefix = extractPrefix(text, prefixLength); + return hasPrefix(prefix) ? FileIndexResult.REMAIN : FileIndexResult.SKIP; + } + + @Override + public FileIndexResult visitEqual(FieldRef fieldRef, Object literal) { + if (literal == null) { + return hasNull ? FileIndexResult.REMAIN : FileIndexResult.SKIP; + } + String text = keyToString(literal); + String prefix = extractPrefix(text, prefixLength); + return hasPrefix(prefix) ? FileIndexResult.REMAIN : FileIndexResult.SKIP; + } + + @Override + public FileIndexResult visitLike(FieldRef fieldRef, Object literal) { + if (literal == null) { + return FileIndexResult.REMAIN; + } + String pattern = keyToString(literal); + // Optimize for "prefix%" patterns (no leading wildcard, single trailing %) + if (pattern.endsWith("%") + && !pattern.startsWith("%") + && pattern.indexOf('%') == pattern.length() - 1 + && pattern.indexOf('_') == -1) { + String prefixText = pattern.substring(0, pattern.length() - 1); + String prefix = extractPrefix(prefixText, prefixLength); + return hasPrefix(prefix) ? FileIndexResult.REMAIN : FileIndexResult.SKIP; + } + return FileIndexResult.REMAIN; + } + + @Override + public FileIndexResult visitIsNull(FieldRef fieldRef) { + ensureLoaded(); + return hasNull ? 
FileIndexResult.REMAIN : FileIndexResult.SKIP; + } + } +} diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexFactory.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexFactory.java new file mode 100644 index 000000000000..b25cd865e8a3 --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexFactory.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.fileindex.prefix; + +import org.apache.paimon.fileindex.FileIndexer; +import org.apache.paimon.fileindex.FileIndexerFactory; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.DataType; + +/** Index factory to construct {@link PrefixFileIndex}. 
*/
+public class PrefixFileIndexFactory implements FileIndexerFactory {
+
+    // Identifier returned by identifier() for this index type.
+    public static final String PREFIX = "prefix";
+
+    // Returns the constant identifier "prefix".
+    @Override
+    public String identifier() {
+        return PREFIX;
+    }
+
+    // Builds a PrefixFileIndex, passing the column type and index options through unchanged.
+    @Override
+    public FileIndexer create(DataType type, Options options) {
+        return new PrefixFileIndex(type, options);
+    }
+}
diff --git a/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory b/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory
index 5f8ed20221d4..848d2dc79121 100644
--- a/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory
+++ b/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory
@@ -17,3 +17,4 @@ org.apache.paimon.fileindex.bloomfilter.BloomFilterFileIndexFactory
 org.apache.paimon.fileindex.bitmap.BitmapFileIndexFactory
 org.apache.paimon.fileindex.bsi.BitSliceIndexBitmapFileIndexFactory
 org.apache.paimon.fileindex.rangebitmap.RangeBitmapFileIndexFactory
+org.apache.paimon.fileindex.prefix.PrefixFileIndexFactory
From 7ce0e7d9f9918945b9f248f8a6f375e71d07a107 Mon Sep 17 00:00:00 2001
From: xuzifu666 <1206332514@qq.com>
Date: Thu, 30 Apr 2026 15:52:21 +0800
Subject: [PATCH 2/3] added

---
 .../fileindex/prefix/PrefixFileIndexTest.java | 206 ++++++++++++++++++
 1 file changed, 206 insertions(+)
 create mode 100644 paimon-common/src/test/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexTest.java

diff --git a/paimon-common/src/test/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexTest.java b/paimon-common/src/test/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexTest.java
new file mode 100644
index 000000000000..ba1a6d8ebbd4
--- /dev/null
+++ b/paimon-common/src/test/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexTest.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.fileindex.prefix;
+
+import org.apache.paimon.fileindex.FileIndexReader;
+import org.apache.paimon.fileindex.FileIndexWriter;
+import org.apache.paimon.fs.ByteArraySeekableStream;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataTypes;
+
+import org.assertj.core.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.HashMap;
+
+/** Tests for {@link PrefixFileIndex}.
*/ +public class PrefixFileIndexTest { + + @Test + public void testStartsWith() { + PrefixFileIndex index = + new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap())); + FileIndexWriter writer = index.createWriter(); + + writer.write("hello"); + writer.write("world"); + writer.write("help"); + writer.write("helm"); + writer.write("helium"); + + byte[] serialized = writer.serializedBytes(); + FileIndexReader reader = + index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length); + + // Existing prefixes should return REMAIN + Assertions.assertThat(reader.visitStartsWith(null, "hel").remain()).isTrue(); + Assertions.assertThat(reader.visitStartsWith(null, "hello").remain()).isTrue(); + Assertions.assertThat(reader.visitStartsWith(null, "wor").remain()).isTrue(); + Assertions.assertThat(reader.visitStartsWith(null, "world").remain()).isTrue(); + Assertions.assertThat(reader.visitStartsWith(null, "he").remain()).isTrue(); + + // Non-existing prefixes should return SKIP + Assertions.assertThat(reader.visitStartsWith(null, "abc").remain()).isFalse(); + Assertions.assertThat(reader.visitStartsWith(null, "xyz").remain()).isFalse(); + // "helz" truncates to "hel" which exists in index, so it's a false positive (REMAIN) + Assertions.assertThat(reader.visitStartsWith(null, "helz").remain()).isTrue(); + } + + @Test + public void testEqual() { + PrefixFileIndex index = + new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap())); + FileIndexWriter writer = index.createWriter(); + + writer.write("apple"); + writer.write("apply"); + writer.write("banana"); + + byte[] serialized = writer.serializedBytes(); + FileIndexReader reader = + index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length); + + // Equal to existing value: prefix matches, so REMAIN + Assertions.assertThat(reader.visitEqual(null, "apple").remain()).isTrue(); + Assertions.assertThat(reader.visitEqual(null, "apply").remain()).isTrue(); + + // Equal to 
non-existing value with matching prefix: REMAIN (false positive) + // "applx" has prefix "app" which exists in index + Assertions.assertThat(reader.visitEqual(null, "applx").remain()).isTrue(); + + // Equal to non-existing value with non-matching prefix: SKIP + Assertions.assertThat(reader.visitEqual(null, "apricot").remain()).isFalse(); + Assertions.assertThat(reader.visitEqual(null, "ban").remain()).isTrue(); + Assertions.assertThat(reader.visitEqual(null, "cherry").remain()).isFalse(); + } + + @Test + public void testLikePrefixPattern() { + PrefixFileIndex index = + new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap())); + FileIndexWriter writer = index.createWriter(); + + writer.write("database"); + writer.write("dataflow"); + writer.write("datamine"); + + byte[] serialized = writer.serializedBytes(); + FileIndexReader reader = + index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length); + + // "prefix%" pattern + Assertions.assertThat(reader.visitLike(null, "dat%").remain()).isTrue(); + Assertions.assertThat(reader.visitLike(null, "data%").remain()).isTrue(); + Assertions.assertThat(reader.visitLike(null, "xyz%").remain()).isFalse(); + + // Patterns with leading wildcard cannot use prefix index + Assertions.assertThat(reader.visitLike(null, "%base").remain()).isTrue(); + Assertions.assertThat(reader.visitLike(null, "%ata%").remain()).isTrue(); + } + + @Test + public void testNullValues() { + PrefixFileIndex index = + new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap())); + FileIndexWriter writer = index.createWriter(); + + writer.write("test"); + writer.write(null); + writer.write("testing"); + + byte[] serialized = writer.serializedBytes(); + FileIndexReader reader = + index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length); + + // IS NULL should return REMAIN when nulls exist + Assertions.assertThat(reader.visitIsNull(null).remain()).isTrue(); + + // STARTS_WITH with null literal 
should return REMAIN + Assertions.assertThat(reader.visitStartsWith(null, null).remain()).isTrue(); + } + + @Test + public void testNoNullValues() { + PrefixFileIndex index = + new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap())); + FileIndexWriter writer = index.createWriter(); + + writer.write("only"); + writer.write("values"); + + byte[] serialized = writer.serializedBytes(); + FileIndexReader reader = + index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length); + + // IS NULL should return SKIP when no nulls exist + Assertions.assertThat(reader.visitIsNull(null).remain()).isFalse(); + } + + @Test + public void testCustomPrefixLength() { + PrefixFileIndex index = + new PrefixFileIndex( + DataTypes.STRING(), + new Options( + new HashMap() { + { + put("prefix-length", "2"); + } + })); + FileIndexWriter writer = index.createWriter(); + + writer.write("abcde"); + writer.write("abxyz"); + writer.write("bcdef"); + + byte[] serialized = writer.serializedBytes(); + FileIndexReader reader = + index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length); + + // With prefix-length=2, "ab" matches both "abcde" and "abxyz" + Assertions.assertThat(reader.visitStartsWith(null, "ab").remain()).isTrue(); + Assertions.assertThat(reader.visitStartsWith(null, "abc").remain()).isTrue(); + + // "bc" matches "bcdef" + Assertions.assertThat(reader.visitStartsWith(null, "bc").remain()).isTrue(); + + // "xy" does not match any prefix + Assertions.assertThat(reader.visitStartsWith(null, "xy").remain()).isFalse(); + } + + @Test + public void testShortValues() { + PrefixFileIndex index = + new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap())); + FileIndexWriter writer = index.createWriter(); + + writer.write("ab"); + writer.write("a"); + writer.write("abc"); + + byte[] serialized = writer.serializedBytes(); + FileIndexReader reader = + index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length); 
+ + // "ab" is both a prefix and a full value + Assertions.assertThat(reader.visitStartsWith(null, "ab").remain()).isTrue(); + // "a" matches "a", "ab", "abc" + Assertions.assertThat(reader.visitStartsWith(null, "a").remain()).isTrue(); + } +} From c346975bdc54ac34a845f8b85d7c1e63a4e4b212 Mon Sep 17 00:00:00 2001 From: xuzifu666 <1206332514@qq.com> Date: Thu, 30 Apr 2026 15:52:58 +0800 Subject: [PATCH 3/3] added --- .../prefix/PrefixIndexBenchmark.java | 356 ++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/prefix/PrefixIndexBenchmark.java diff --git a/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/prefix/PrefixIndexBenchmark.java b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/prefix/PrefixIndexBenchmark.java new file mode 100644 index 000000000000..02b3229da8cb --- /dev/null +++ b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/prefix/PrefixIndexBenchmark.java @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.benchmark.prefix; + +import org.apache.paimon.benchmark.Benchmark; +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.fileindex.FileIndexReader; +import org.apache.paimon.fileindex.FileIndexWriter; +import org.apache.paimon.fileindex.bitmap.BitmapFileIndex; +import org.apache.paimon.fileindex.bitmap.BitmapIndexResult; +import org.apache.paimon.fileindex.prefix.PrefixFileIndex; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.Options; +import org.apache.paimon.predicate.FieldRef; +import org.apache.paimon.types.DataTypes; + +import org.apache.commons.io.FileUtils; +import org.junit.Rule; +import org.junit.jupiter.api.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Random; + +/** Benchmark for {@link PrefixFileIndex}. */ +public class PrefixIndexBenchmark { + + public static final int ROW_COUNT = 1000000; + public static final int[] CARDINALITIES = new int[] {100, 1000, 10000}; + public static final int[] PREFIX_LENGTHS = new int[] {2, 3, 4}; + public static final String[] CATEGORIES = + new String[] {"electronics", "clothing", "books", "food", "sports"}; + + @Rule public TemporaryFolder folder = new TemporaryFolder(); + + @Test + public void testQueryHit() throws Exception { + for (int cardinality : CARDINALITIES) { + IndexFiles files = createIndexes(cardinality); + String existsPrefix = CATEGORIES[0] + "_" + (cardinality / 2); + + Benchmark benchmark = + new Benchmark( + String.format( + "prefix-index-query-hit-cardinality-%s", cardinality), + 100) + .setNumWarmupIters(1) + .setOutputPerIteration(false); + + benchmark.addCase( + "prefix-index-prefix2", + 10, + () -> queryPrefix(files.prefixFile2, existsPrefix, 2)); + benchmark.addCase( + "prefix-index-prefix3", + 10, + () -> queryPrefix(files.prefixFile3, existsPrefix, 3)); + benchmark.addCase( + "prefix-index-prefix4", + 10, + () -> 
queryPrefix(files.prefixFile4, existsPrefix, 4)); + benchmark.addCase( + "bitmap-index-equal", 10, () -> queryBitmap(files.bitmapFile, existsPrefix)); + benchmark.addCase( + "no-index-full-scan", 10, () -> queryNoIndexScan(files.data, existsPrefix)); + + benchmark.run(); + } + } + + @Test + public void testQuerySkip() throws Exception { + for (int cardinality : CARDINALITIES) { + IndexFiles files = createIndexes(cardinality); + // A prefix that does not exist in the index + String notExistsPrefix = "unknown_not_exists"; + + Benchmark benchmark = + new Benchmark( + String.format( + "prefix-index-query-skip-cardinality-%s", cardinality), + 100) + .setNumWarmupIters(1) + .setOutputPerIteration(false); + + benchmark.addCase( + "prefix-index-prefix2", + 10, + () -> queryPrefix(files.prefixFile2, notExistsPrefix, 2)); + benchmark.addCase( + "prefix-index-prefix3", + 10, + () -> queryPrefix(files.prefixFile3, notExistsPrefix, 3)); + benchmark.addCase( + "prefix-index-prefix4", + 10, + () -> queryPrefix(files.prefixFile4, notExistsPrefix, 4)); + benchmark.addCase( + "bitmap-index-equal", 10, () -> queryBitmap(files.bitmapFile, notExistsPrefix)); + benchmark.addCase( + "no-index-full-scan", 10, () -> queryNoIndexScan(files.data, notExistsPrefix)); + + benchmark.run(); + } + } + + @Test + public void testIndexSize() throws Exception { + System.out.println("\n========== Prefix Index Size Comparison =========="); + System.out.printf( + "%-15s %-15s %-15s %-15s %-15s %-15s%n", + "Cardinality", + "PrefixLen=2", + "PrefixLen=3", + "PrefixLen=4", + "BitmapIndex", + "RawData"); + System.out.println( + "---------------------------------------------------------------------------------------------"); + + for (int cardinality : CARDINALITIES) { + IndexFiles files = createIndexes(cardinality); + long size2 = files.prefixFile2.length(); + long size3 = files.prefixFile3.length(); + long size4 = files.prefixFile4.length(); + long bitmapSize = files.bitmapFile.length(); + long rawDataSize = 
estimateRawDataSize(files.data); + System.out.printf( + "%-15d %-15d %-15d %-15d %-15d %-15d%n", + cardinality, size2, size3, size4, bitmapSize, rawDataSize); + } + System.out.println(); + } + + @Test + public void testBuildTime() throws Exception { + for (int cardinality : CARDINALITIES) { + Benchmark benchmark = + new Benchmark( + String.format("prefix-index-build-cardinality-%s", cardinality), + ROW_COUNT) + .setNumWarmupIters(0) + .setOutputPerIteration(false); + + benchmark.addCase("prefix-index-prefix2", 5, () -> buildPrefixIndex(cardinality, 2)); + benchmark.addCase("prefix-index-prefix3", 5, () -> buildPrefixIndex(cardinality, 3)); + benchmark.addCase("prefix-index-prefix4", 5, () -> buildPrefixIndex(cardinality, 4)); + benchmark.addCase("bitmap-index", 5, () -> buildBitmapIndex(cardinality)); + + benchmark.run(); + } + } + + private IndexFiles createIndexes(int cardinality) throws IOException { + folder.create(); + + File prefixFile2 = folder.newFile("prefix-index-2-" + cardinality); + File prefixFile3 = folder.newFile("prefix-index-3-" + cardinality); + File prefixFile4 = folder.newFile("prefix-index-4-" + cardinality); + File bitmapFile = folder.newFile("bitmap-index-" + cardinality); + + FileIndexWriter writer2 = + new PrefixFileIndex( + DataTypes.STRING(), + new Options(new HashMap()) { + { + setString("prefix-length", "2"); + } + }) + .createWriter(); + FileIndexWriter writer3 = + new PrefixFileIndex( + DataTypes.STRING(), + new Options(new HashMap()) { + { + setString("prefix-length", "3"); + } + }) + .createWriter(); + FileIndexWriter writer4 = + new PrefixFileIndex( + DataTypes.STRING(), + new Options(new HashMap()) { + { + setString("prefix-length", "4"); + } + }) + .createWriter(); + FileIndexWriter bitmapWriter = + new BitmapFileIndex(DataTypes.STRING(), new Options()).createWriter(); + + String[] data = new String[ROW_COUNT]; + Random random = new Random(42); + for (int i = 0; i < ROW_COUNT; i++) { + String value = + 
CATEGORIES[random.nextInt(CATEGORIES.length)] + + "_" + + random.nextInt(cardinality); + data[i] = value; + writer2.write(BinaryString.fromString(value)); + writer3.write(BinaryString.fromString(value)); + writer4.write(BinaryString.fromString(value)); + bitmapWriter.write(BinaryString.fromString(value)); + } + + FileUtils.writeByteArrayToFile(prefixFile2, writer2.serializedBytes()); + FileUtils.writeByteArrayToFile(prefixFile3, writer3.serializedBytes()); + FileUtils.writeByteArrayToFile(prefixFile4, writer4.serializedBytes()); + FileUtils.writeByteArrayToFile(bitmapFile, bitmapWriter.serializedBytes()); + + return new IndexFiles(prefixFile2, prefixFile3, prefixFile4, bitmapFile, data); + } + + private void buildPrefixIndex(int cardinality, int prefixLength) { + try { + FileIndexWriter writer = + new PrefixFileIndex( + DataTypes.STRING(), + new Options(new HashMap()) { + { + setString( + "prefix-length", String.valueOf(prefixLength)); + } + }) + .createWriter(); + Random random = new Random(42); + for (int i = 0; i < ROW_COUNT; i++) { + writer.write( + BinaryString.fromString( + CATEGORIES[random.nextInt(CATEGORIES.length)] + + "_" + + random.nextInt(cardinality))); + } + writer.serializedBytes(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private void buildBitmapIndex(int cardinality) { + try { + FileIndexWriter writer = + new BitmapFileIndex(DataTypes.STRING(), new Options()).createWriter(); + Random random = new Random(42); + for (int i = 0; i < ROW_COUNT; i++) { + writer.write( + BinaryString.fromString( + CATEGORIES[random.nextInt(CATEGORIES.length)] + + "_" + + random.nextInt(cardinality))); + } + writer.serializedBytes(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private static void queryPrefix(File indexFile, String prefix, int prefixLength) { + try { + FieldRef fieldRef = new FieldRef(0, "", DataTypes.STRING()); + Options options = + new Options(new HashMap()) { + { + setString("prefix-length", 
String.valueOf(prefixLength)); + } + }; + LocalFileIO.LocalSeekableInputStream stream = + new LocalFileIO.LocalSeekableInputStream(indexFile); + FileIndexReader reader = + new PrefixFileIndex(DataTypes.STRING(), options) + .createReader(stream, 0, (int) indexFile.length()); + reader.visitStartsWith(fieldRef, BinaryString.fromString(prefix)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static void queryBitmap(File indexFile, String value) { + try { + FieldRef fieldRef = new FieldRef(0, "", DataTypes.STRING()); + Options options = new Options(); + LocalFileIO.LocalSeekableInputStream stream = + new LocalFileIO.LocalSeekableInputStream(indexFile); + FileIndexReader reader = + new BitmapFileIndex(DataTypes.STRING(), options) + .createReader(stream, 0, (int) indexFile.length()); + ((BitmapIndexResult) reader.visitEqual(fieldRef, BinaryString.fromString(value))).get(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static void queryNoIndexScan(String[] data, String prefix) { + boolean found = false; + for (String value : data) { + if (value.startsWith(prefix)) { + found = true; + break; + } + } + // Return the result for file pruning decision (REMAIN if found, SKIP if not) + // We don't throw here because skip scenarios intentionally query non-existing prefixes + boolean remain = found; + // Prevent the JVM from optimizing away the result + if (remain && System.nanoTime() == 0) { + throw new RuntimeException("Unreachable"); + } + } + + private static long estimateRawDataSize(String[] data) { + long size = 0; + for (String s : data) { + size += s.getBytes().length + 4; // 4 bytes for length prefix + } + return size; + } + + private static class IndexFiles { + final File prefixFile2; + final File prefixFile3; + final File prefixFile4; + final File bitmapFile; + final String[] data; + + IndexFiles( + File prefixFile2, + File prefixFile3, + File prefixFile4, + File bitmapFile, + String[] data) { + 
this.prefixFile2 = prefixFile2; + this.prefixFile3 = prefixFile3; + this.prefixFile4 = prefixFile4; + this.bitmapFile = bitmapFile; + this.data = data; + } + } +}