From 43416da3447226c5642ed1577083748ce588332f Mon Sep 17 00:00:00 2001 From: QuakeWang Date: Mon, 22 Jun 2026 23:31:09 +0800 Subject: [PATCH] [core][spark][python] Reject full-text hybrid route options Signed-off-by: QuakeWang --- .../global-index/hybrid-search.mdx | 5 ++-- .../paimon/predicate/HybridSearchRoute.java | 9 +++++++ .../paimon/predicate/FullTextQueryTest.java | 27 +++++++++++++++++++ .../table/source/HybridSearchBuilder.java | 2 +- .../table/source/hybrid_search_builder.py | 9 +++++++ .../tests/vector_search_filter_test.py | 17 ++++++++++++ .../logical/PaimonTableValuedFunctions.scala | 2 +- .../plans/logical/VectorSearchQueryTest.scala | 23 ++++++++++++++++ 8 files changed, 90 insertions(+), 4 deletions(-) diff --git a/docs/docs/multimodal-table/global-index/hybrid-search.mdx b/docs/docs/multimodal-table/global-index/hybrid-search.mdx index 52068d2fedfd..e4923bdec841 100644 --- a/docs/docs/multimodal-table/global-index/hybrid-search.mdx +++ b/docs/docs/multimodal-table/global-index/hybrid-search.mdx @@ -87,10 +87,11 @@ The third argument is an array of full-text route configs created by `named_stru | `query` | Yes | N/A | LanceDB-style full-text query JSON for this route. | | `limit` | No | Final limit | Top K results to retrieve from this text column before ranking. | | `weight` | No | `1.0` | Weight for this route when ranking results. | -| `options` | No | Empty map | Route-specific full-text search options. | +| `options` | No | Empty map | Reserved for future full-text search options. Only an empty map is accepted. | Within each route array, every `named_struct` should use the same fields because -Spark requires array elements to have the same struct type. +Spark requires array elements to have the same struct type. Full-text route `options` is +currently a reserved field; pass `map()` when the field is needed for struct-type consistency. Use route `limit` values larger than the final limit when each route should contribute enough candidates for ranking. For example, with a final limit of `10`, route limits such as `50` or `100` diff --git a/paimon-common/src/main/java/org/apache/paimon/predicate/HybridSearchRoute.java b/paimon-common/src/main/java/org/apache/paimon/predicate/HybridSearchRoute.java index beddab745aa4..57e5369bd45f 100644 --- a/paimon-common/src/main/java/org/apache/paimon/predicate/HybridSearchRoute.java +++ b/paimon-common/src/main/java/org/apache/paimon/predicate/HybridSearchRoute.java @@ -68,11 +68,19 @@ public static HybridSearchRoute vector( public static HybridSearchRoute fullText( String queryJson, int limit, float weight, Map options) { + checkFullTextOptions(options); FullTextQuery query = FullTextQuery.fromJson(queryJson); return new HybridSearchRoute( RouteType.FULL_TEXT, query.columns().get(0), null, query, limit, weight, options); } + private static void checkFullTextOptions(Map options) { + if (options != null && !options.isEmpty()) { + throw new IllegalArgumentException( + "Full-text hybrid route options are not supported yet."); + } + } + private HybridSearchRoute( RouteType routeType, String fieldName, @@ -239,6 +247,7 @@ public Builder options(Map options) { public HybridSearchRoute build() { if (fullTextQuery != null) { + checkFullTextOptions(options); return new HybridSearchRoute( RouteType.FULL_TEXT, fullTextQuery.columns().get(0), diff --git a/paimon-common/src/test/java/org/apache/paimon/predicate/FullTextQueryTest.java b/paimon-common/src/test/java/org/apache/paimon/predicate/FullTextQueryTest.java index 2cd4e2a491fd..5fc34ecd34e2 100644 --- a/paimon-common/src/test/java/org/apache/paimon/predicate/FullTextQueryTest.java +++ b/paimon-common/src/test/java/org/apache/paimon/predicate/FullTextQueryTest.java @@ -24,6 +24,7 @@ import java.util.Collections; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; /** Tests for {@link FullTextQuery}. */ public class FullTextQueryTest { @@ -172,4 +173,30 @@ public void testFullTextSearchKeepsStructuredQueryJson() { "{\"match_phrase\":{\"column\":\"content\",\"terms\":\"paimon lake\"," + "\"slop\":1}}"); } + + @Test + public void testHybridFullTextRouteRejectsOptions() { + assertThatThrownBy( + () -> + HybridSearchRoute.fullText( + "{\"match\":{\"column\":\"content\"," + + "\"terms\":\"paimon lake\"}}", + 10, + 1.0f, + Collections.singletonMap("some.option", "x"))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Full-text hybrid route options are not supported yet"); + + assertThatThrownBy( + () -> + HybridSearchRoute.builder() + .query( + "{\"match\":{\"column\":\"content\"," + + "\"terms\":\"paimon lake\"}}") + .limit(10) + .option("some.option", "x") + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Full-text hybrid route options are not supported yet"); + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/HybridSearchBuilder.java b/paimon-core/src/main/java/org/apache/paimon/table/source/HybridSearchBuilder.java index 01304e131531..c74abe0052b9 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/HybridSearchBuilder.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/HybridSearchBuilder.java @@ -67,7 +67,7 @@ default HybridSearchBuilder addFullTextRoute(String queryJson, int limit, float return addFullTextRoute(queryJson, limit, weight, null); } - /** Add a full-text-search route. */ + /** Add a full-text-search route. Non-empty options are not supported yet. */ default HybridSearchBuilder addFullTextRoute( String queryJson, int limit, float weight, Map options) { return addRoute(HybridSearchRoute.fullText(queryJson, limit, weight, options)); diff --git a/paimon-python/pypaimon/table/source/hybrid_search_builder.py b/paimon-python/pypaimon/table/source/hybrid_search_builder.py index 868ea1f44b0a..3415d3bf9fa2 100644 --- a/paimon-python/pypaimon/table/source/hybrid_search_builder.py +++ b/paimon-python/pypaimon/table/source/hybrid_search_builder.py @@ -35,6 +35,12 @@ _RRF_K = 60.0 +def _check_full_text_options(options: Dict[str, str]): + if options: + raise ValueError( + "Full-text hybrid route options are not supported yet.") + + def _normalize_ranker(ranker: Optional[str]) -> str: if ranker is None or not ranker.strip(): return RRF_RANKER @@ -74,6 +80,8 @@ def __post_init__(self): if self.weight <= 0: raise ValueError("Weight must be positive, got: %s" % self.weight) self.options = dict(self.options or {}) + if self.route_type == self.FULL_TEXT: + _check_full_text_options(self.options) @classmethod def vector_route( @@ -99,6 +107,7 @@ def full_text_route( limit: int, weight: float = 1.0, options: Optional[Dict[str, str]] = None) -> 'HybridSearchRoute': + _check_full_text_options(options or {}) query = FullTextQuery.from_json(query_json) return cls( route_type=cls.FULL_TEXT, diff --git a/paimon-python/pypaimon/tests/vector_search_filter_test.py b/paimon-python/pypaimon/tests/vector_search_filter_test.py index 0e29e5c1ad04..9b1aae326117 100644 --- a/paimon-python/pypaimon/tests/vector_search_filter_test.py +++ b/paimon-python/pypaimon/tests/vector_search_filter_test.py @@ -1968,6 +1968,23 @@ def test_hybrid_search_rejects_data_filter_with_full_text_route(self): builder.route_builders() self.assertIn("full-text routes", str(ctx.exception)) + def test_hybrid_search_rejects_full_text_route_options(self): + from pypaimon.table.source.hybrid_search_builder import ( + HybridSearchBuilderImpl, + ) + + id_field = _field(0, "id") + content = _field(1, "content", "STRING") + table = _StubTable(fields=[id_field, content], entries=[]) + + with self.assertRaises(ValueError) as ctx: + HybridSearchBuilderImpl(table).add_full_text_route( + '{"match":{"column":"content","terms":"paimon search"}}', + 10, + options={"some.option": "x"}) + self.assertIn("Full-text hybrid route options are not supported yet", + str(ctx.exception)) + def test_hybrid_search_partition_filter_prunes_full_text_route(self): from pypaimon.table.source.hybrid_search_builder import ( HybridSearchBuilderImpl, diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/plans/logical/PaimonTableValuedFunctions.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/plans/logical/PaimonTableValuedFunctions.scala index 4cafdc0b20ec..bf3e53bf2f1d 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/plans/logical/PaimonTableValuedFunctions.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalyst/plans/logical/PaimonTableValuedFunctions.scala @@ -455,7 +455,7 @@ case class VectorSearchQuery(override val args: Seq[Expression]) * Usage: hybrid_search(table_name, vector_routes, full_text_routes, limit[, ranker]) * - table_name: the Paimon table to search * - vector_routes: route config array with field, query_vector, limit, weight, and options fields - * - full_text_routes: route config array with query, limit, weight, and options fields + * - full_text_routes: route config array with query, limit, weight, and empty options fields * - limit: the final number of ranked top results to return * - ranker: optional ranker for combining results from multiple routes */ diff --git a/paimon-spark/paimon-spark-common/src/test/scala/org/apache/paimon/spark/catalyst/plans/logical/VectorSearchQueryTest.scala b/paimon-spark/paimon-spark-common/src/test/scala/org/apache/paimon/spark/catalyst/plans/logical/VectorSearchQueryTest.scala index 4a13fbc56b0a..82a91fc9eecf 100644 --- a/paimon-spark/paimon-spark-common/src/test/scala/org/apache/paimon/spark/catalyst/plans/logical/VectorSearchQueryTest.scala +++ b/paimon-spark/paimon-spark-common/src/test/scala/org/apache/paimon/spark/catalyst/plans/logical/VectorSearchQueryTest.scala @@ -133,6 +133,29 @@ class VectorSearchQueryTest extends AnyFunSuite { assert(search.routes().get(0).weight() == 1.5f) } + test("reject hybrid full-text route with non-empty options") { + val exception = intercept[IllegalArgumentException] { + HybridSearchQuery(Seq.empty).createHybridSearch( + innerTable, + Seq( + CreateArray(Seq.empty), + CreateArray( + Seq( + CreateNamedStruct(Seq( + Literal("query"), + Literal("""{"match":{"column":"content","terms":"paimon lake"}}"""), + Literal("options"), + CreateMap(Seq(Literal("some.option"), Literal("x"))) + )) + )), + Literal(5) + ) + ) + } + + assert(exception.getMessage.contains("Full-text hybrid route options are not supported yet")) + } + test("create full-text search") { val search = FullTextSearchQuery(Seq.empty).createFullTextSearch( innerTable,