apache · JingsongLi · Jun 23, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/paimon-python/pypaimon/globalindex/batch_vector_search.py b/paimon-python/pypaimon/globalindex/batch_vector_search.py
@@ -0,0 +1,95 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""BatchVectorSearch for performing batch vector similarity search."""
+
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from pypaimon.globalindex.vector_search import VectorSearch
+
+
+@dataclass
+class BatchVectorSearch:
+    """Batch vector search over multiple query vectors; result ``i`` maps to ``vectors[i]``."""
+
+    vectors: List[Union[List[float], np.ndarray]]
+    limit: int
+    field_name: str
+    include_row_ids: Optional['RoaringBitmap64'] = field(default=None)
+    options: Optional[Dict[str, str]] = field(default=None)
+
+    def __post_init__(self):
+        if not self.vectors:
+            raise ValueError("Search vectors cannot be empty")
+        if self.limit <= 0:
+            raise ValueError(f"Limit must be positive, got: {self.limit}")
+        if not self.field_name:
+            raise ValueError("Field name cannot be null or empty")
+        # Match VectorSearch: list vectors -> float32.
+        self.vectors = [
+            np.array(v, dtype=np.float32) if isinstance(v, list) else v
+            for v in self.vectors
+        ]
+        self.options = {} if self.options is None else dict(self.options)
+
+    @property
+    def vector_count(self) -> int:
+        return len(self.vectors)
+
+    def for_index(self, i: int) -> VectorSearch:
+        """Return the single VectorSearch for query vector ``i``."""
+        return VectorSearch(
+            vector=self.vectors[i],
+            limit=self.limit,
+            field_name=self.field_name,
+            include_row_ids=self.include_row_ids,
+            options=self.options,
+        )
+
+    def with_include_row_ids(self, include_row_ids: 'RoaringBitmap64') -> 'BatchVectorSearch':
+        return BatchVectorSearch(
+            vectors=self.vectors,
+            limit=self.limit,
+            field_name=self.field_name,
+            include_row_ids=include_row_ids,
+            options=self.options,
+        )
+
+    def offset_range(self, from_: int, to: int) -> 'BatchVectorSearch':
+        """Offset include_row_ids into the given range; vectors are shared by all queries."""
+        if self.include_row_ids is None:
+            return self
+        from pypaimon.utils.roaring_bitmap import RoaringBitmap64
+
+        range_bitmap = RoaringBitmap64()
+        range_bitmap.add_range(from_, to)
+        and_result = RoaringBitmap64.and_(range_bitmap, self.include_row_ids)
+        offset_bitmap = RoaringBitmap64()
+        # Per-element shift (RoaringBitmap64 has no bulk translate yet).
+        for row_id in and_result:
+            offset_bitmap.add(row_id - from_)
+        return self.with_include_row_ids(offset_bitmap)
+
+    def visit(self, visitor: 'GlobalIndexReader') -> 'Future[List[Optional[GlobalIndexResult]]]':
+        return visitor.visit_batch_vector_search(self)
+
+    def __repr__(self) -> str:
+        return (f"BatchVectorSearch(field_name={self.field_name}, "
+                f"limit={self.limit}, vector_count={self.vector_count})")
diff --git a/paimon-python/pypaimon/globalindex/global_index_reader.py b/paimon-python/pypaimon/globalindex/global_index_reader.py
@@ -58,6 +58,18 @@ class GlobalIndexReader(ABC):
     def visit_vector_search(self, vector_search: 'VectorSearch') -> 'Future[Optional[GlobalIndexResult]]':
         raise NotImplementedError("Vector search not supported by this reader")
 
+    def visit_batch_vector_search(
+            self, batch_vector_search: 'BatchVectorSearch'
+    ) -> 'Future[List[Optional[GlobalIndexResult]]]':
+        """Default: fan out to single-vector search; result ``i`` maps to ``vectors[i]``.
+
+        Blocks per future (fine while readers return completed futures); an
+        async reader should override.
+        """
+        singles = [self.visit_vector_search(batch_vector_search.for_index(i))
+                   for i in range(batch_vector_search.vector_count)]
+        return _completed_future([f.result() for f in singles])
+
     def visit_full_text_search(self, full_text_search: 'FullTextSearch') -> 'Future[Optional[GlobalIndexResult]]':
         raise NotImplementedError("Full-text search not supported by this reader")
 

diff --git a/paimon-python/pypaimon/globalindex/lumina/lumina_vector_global_index_reader.py b/paimon-python/pypaimon/globalindex/lumina/lumina_vector_global_index_reader.py
@@ -50,6 +50,21 @@ def _merge_options(base_options, index_options, query_options):
     return options
 
 
+def _collect_scored_result(distances, labels, base, k, index_metric):
+    """Convert one query's [base, base+k) slice of distances/labels into a result."""
+    from lumina_data import MetricType
+
+    SENTINEL = 0xFFFFFFFFFFFFFFFF
+    id_to_scores = {}
+    for i in range(k):
+        row_id = labels[base + i]
+        if row_id == SENTINEL:
+            continue
+        id_to_scores[int(row_id)] = MetricType.convert_distance_to_score(
+            float(distances[base + i]), index_metric)
+    return DictBasedScoredIndexResult(id_to_scores)
+
+
 class LuminaVectorGlobalIndexReader(GlobalIndexReader):
     """Vector global index reader using Lumina."""
 
@@ -66,57 +81,69 @@ def __init__(self, file_io, index_path, io_metas, options=None):
         self._load_lock = threading.Lock()
 
     def visit_vector_search(self, vector_search):
+        # Single-vector search is just the n == 1 case of the batch path.
+        results = self._run_search(
+            [vector_search.vector],
+            vector_search.limit,
+            vector_search.include_row_ids,
+            vector_search.options,
+        )
+        return _completed_future(results[0])
+
+    def visit_batch_vector_search(self, batch_vector_search):
+        results = self._run_search(
+            batch_vector_search.vectors,
+            batch_vector_search.limit,
+            batch_vector_search.include_row_ids,
+            batch_vector_search.options,
+        )
+        return _completed_future(results)
+
+    def _run_search(self, vectors, limit, include_row_ids, query_options):
+        """Run one native batch search; entry ``i`` of the returned list maps to ``vectors[i]``.
+        Every entry is ``None`` when ``min(limit, count)`` <= 0 or the include filter is empty.
+        Raises ``ValueError`` on a dimension mismatch. Single search is the n == 1 case."""
         self._ensure_loaded()
 
-        from lumina_data import MetricType
-        query_flat = [float(v) for v in np.asarray(vector_search.vector).tolist()]
+        n = len(vectors)  # number of query vectors in this call
         expected_dim = self._index_meta.dim
-        if len(query_flat) != expected_dim:
-            raise ValueError(
-                "Query vector dimension mismatch: expected %d, got %d"
-                % (expected_dim, len(query_flat)))
+        query_flat = []
+        for vector in vectors:
+            flat = [float(v) for v in np.asarray(vector).tolist()]
+            if len(flat) != expected_dim:
+                raise ValueError(
+                    "Query vector dimension mismatch: expected %d, got %d"
+                    % (expected_dim, len(flat)))
+            query_flat.extend(flat)  # all queries are concatenated into one flat list
 
-        limit = vector_search.limit
         index_metric = self._index_meta.metric
-
         count = self._searcher.get_count()
         effective_k = min(limit, count)
         if effective_k <= 0:
-            return _completed_future(None)
-
-        include_row_ids = vector_search.include_row_ids
-        query_options = vector_search.options
+            return [None] * n  # one ``None`` per query, no search is run
 
         if include_row_ids is not None:
             filter_id_list = list(include_row_ids)
             if len(filter_id_list) == 0:
-                return _completed_future(None)
+                return [None] * n  # empty include filter: one ``None`` per query
             effective_k = min(effective_k, len(filter_id_list))
-            search_opts = _merge_options(
-                self._options, {}, query_options)
+            search_opts = _merge_options(self._options, {}, query_options)
             search_opts["search.thread_safe_filter"] = "true"
             _ensure_search_list_size(search_opts, effective_k)
             distances, labels = self._searcher.search_with_filter_list(
-                query_flat, 1, effective_k, filter_id_list, search_opts)
+                query_flat, n, effective_k, filter_id_list, search_opts)
         else:
-            search_opts = _merge_options(
-                self._options, {}, query_options)
+            search_opts = _merge_options(self._options, {}, query_options)
             _ensure_search_list_size(search_opts, effective_k)
             distances, labels = self._searcher.search_list(
-                query_flat, 1, effective_k, search_opts)
-
-        # Collect results with score conversion (same as Java collectResults)
-        SENTINEL = 0xFFFFFFFFFFFFFFFF
-        id_to_scores = {}
-        for i in range(effective_k):
-            row_id = labels[i]
-            if row_id == SENTINEL:
-                continue
-            score = MetricType.convert_distance_to_score(
-                float(distances[i]), index_metric)
-            id_to_scores[int(row_id)] = score
-
-        return _completed_future(DictBasedScoredIndexResult(id_to_scores))
+                query_flat, n, effective_k, search_opts)
+
+        # Each query's results occupy a contiguous [q * k, q * k + k) slice.
+        return [
+            _collect_scored_result(
+                distances, labels, q * effective_k, effective_k, index_metric)
+            for q in range(n)
+        ]
 
     def _ensure_loaded(self):
         if self._searcher is not None:

diff --git a/paimon-python/pypaimon/globalindex/offset_global_index_reader.py b/paimon-python/pypaimon/globalindex/offset_global_index_reader.py
@@ -48,6 +48,17 @@ def visit_vector_search(self, vector_search) -> 'Future[Optional[GlobalIndexResu
             self._wrapped.visit_vector_search(
                 vector_search.offset_range(self._offset, self._to)))
 
+    def visit_batch_vector_search(
+            self, batch_vector_search) -> 'Future[List[Optional[GlobalIndexResult]]]':
+        source = self._wrapped.visit_batch_vector_search(
+            batch_vector_search.offset_range(self._offset, self._to))
+
+        def transform(results):
+            return [r.offset(self._offset) if r is not None else None
+                    for r in results]
+
+        return _map_future(source, transform)
+
     def visit_full_text_search(self, full_text_search) -> 'Future[Optional[GlobalIndexResult]]':
         return self._apply_offset_future(
             self._wrapped.visit_full_text_search(full_text_search))