From a27c7ace85aa91c3aa5c3b403541e6cde84cfdc4 Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Mon, 13 Apr 2026 17:51:10 +0800 Subject: [PATCH 01/11] [Enhancement](pyudf) Support empty arg pyudf && udtf --- be/src/exprs/function/function_python_udf.cpp | 11 ++- .../table_function/python_udtf_function.cpp | 11 ++- be/src/udf/python/python_udf_meta.cpp | 6 +- be/test/udf/python/python_udf_meta_test.cpp | 63 ++++++++++++++++- .../test_pythonudf_no_input.groovy | 66 +++++++++++++++++ .../test_pythonudtf_no_input.groovy | 70 +++++++++++++++++++ 6 files changed, 218 insertions(+), 9 deletions(-) create mode 100644 regression-test/suites/pythonudf_p0/test_pythonudf_no_input.groovy create mode 100644 regression-test/suites/pythonudtf_p0/test_pythonudtf_no_input.groovy diff --git a/be/src/exprs/function/function_python_udf.cpp b/be/src/exprs/function/function_python_udf.cpp index b874d3ce14a59d..3d999cdac5e718 100644 --- a/be/src/exprs/function/function_python_udf.cpp +++ b/be/src/exprs/function/function_python_udf.cpp @@ -112,7 +112,7 @@ Status PythonFunctionCall::execute_impl(FunctionContext* context, Block& block, return Status::InternalError("Python UDF client is null"); } - int64_t input_rows = block.rows(); + int64_t input_rows = num_rows; uint32_t input_columns = block.columns(); DCHECK(input_columns > 0 && result < input_columns && _argument_types.size() == arguments.size()); @@ -141,8 +141,13 @@ Status PythonFunctionCall::execute_impl(FunctionContext* context, Block& block, std::shared_ptr<arrow::RecordBatch> input_batch; std::shared_ptr<arrow::RecordBatch> output_batch; cctz::time_zone _timezone_obj; // default UTC - RETURN_IF_ERROR(convert_to_arrow_batch(input_block, schema, arrow::default_memory_pool(), - &input_batch, _timezone_obj)); + if (arguments.empty()) { + input_batch = arrow::RecordBatch::Make(schema, input_rows, + std::vector<std::shared_ptr<arrow::Array>> {}); + } else { + RETURN_IF_ERROR(convert_to_arrow_batch(input_block, schema, arrow::default_memory_pool(), + &input_batch, _timezone_obj)); + } 
RETURN_IF_ERROR(client->evaluate(*input_batch, &output_batch)); int64_t output_rows = output_batch->num_rows(); diff --git a/be/src/exprs/table_function/python_udtf_function.cpp b/be/src/exprs/table_function/python_udtf_function.cpp index a116a3d6785297..eae1b71f638f4e 100644 --- a/be/src/exprs/table_function/python_udtf_function.cpp +++ b/be/src/exprs/table_function/python_udtf_function.cpp @@ -132,12 +132,19 @@ Status PythonUDTFFunction::process_init(Block* block, RuntimeState* state) { for (uint32_t i = 0; i < child_column_idxs.size(); ++i) { input_block.insert(block->get_by_position(child_column_idxs[i])); } + int64_t input_rows = block->rows(); std::shared_ptr<arrow::Schema> input_schema; std::shared_ptr<arrow::RecordBatch> input_batch; RETURN_IF_ERROR(get_arrow_schema_from_block(input_block, &input_schema, TimezoneUtils::default_time_zone)); - RETURN_IF_ERROR(convert_to_arrow_batch(input_block, input_schema, arrow::default_memory_pool(), - &input_batch, _timezone_obj)); + if (child_column_idxs.empty()) { + input_batch = arrow::RecordBatch::Make(input_schema, input_rows, + std::vector<std::shared_ptr<arrow::Array>> {}); + } else { + RETURN_IF_ERROR(convert_to_arrow_batch(input_block, input_schema, + arrow::default_memory_pool(), &input_batch, + _timezone_obj)); + } // Step 3: Call Python UDTF to evaluate all rows at once (similar to Java UDTF's JNI call) // Python returns a ListArray where each element contains outputs for one input row diff --git a/be/src/udf/python/python_udf_meta.cpp b/be/src/udf/python/python_udf_meta.cpp index 88af0c9ff64128..f0978dc926bf11 100644 --- a/be/src/udf/python/python_udf_meta.cpp +++ b/be/src/udf/python/python_udf_meta.cpp @@ -32,7 +32,6 @@ namespace doris { Status PythonUDFMeta::convert_types_to_schema(const DataTypes& types, const std::string& timezone, std::shared_ptr<arrow::Schema>* schema) { - assert(!types.empty()); arrow::SchemaBuilder builder; for (size_t i = 0; i < types.size(); ++i) { std::shared_ptr<arrow::DataType> arrow_type; @@ -152,8 +151,9 @@ Status PythonUDFMeta::check() const { return 
Status::InvalidArgument("Python UDF runtime version is empty"); } - if (input_types.empty()) { - return Status::InvalidArgument("Python UDF input types is empty"); + if (input_types.empty() && + (client_type == PythonClientType::UDAF || type == PythonUDFLoadType::UNKNOWN)) { + return Status::InvalidArgument("Python UDAF input types is empty"); } if (!return_type) { diff --git a/be/test/udf/python/python_udf_meta_test.cpp b/be/test/udf/python/python_udf_meta_test.cpp index b913f49d19b5f1..fd651ae07d042a 100644 --- a/be/test/udf/python/python_udf_meta_test.cpp +++ b/be/test/udf/python/python_udf_meta_test.cpp @@ -109,7 +109,7 @@ TEST_F(PythonUDFMetaTest, CheckEmptyRuntimeVersion) { EXPECT_TRUE(status.to_string().find("runtime version is empty") != std::string::npos); } -TEST_F(PythonUDFMetaTest, CheckEmptyInputTypes) { +TEST_F(PythonUDFMetaTest, CheckEmptyInputTypesAllowedForUdf) { PythonUDFMeta meta; meta.name = "test_udf"; meta.symbol = "test_func"; @@ -117,6 +117,35 @@ TEST_F(PythonUDFMetaTest, CheckEmptyInputTypes) { meta.input_types = {}; meta.return_type = nullable_int32_; meta.type = PythonUDFLoadType::INLINE; + meta.client_type = PythonClientType::UDF; + + Status status = meta.check(); + EXPECT_TRUE(status.ok()) << status.to_string(); +} + +TEST_F(PythonUDFMetaTest, CheckEmptyInputTypesAllowedForUdtf) { + PythonUDFMeta meta; + meta.name = "test_udtf"; + meta.symbol = "test_func"; + meta.runtime_version = "3.9.16"; + meta.input_types = {}; + meta.return_type = nullable_string_; + meta.type = PythonUDFLoadType::INLINE; + meta.client_type = PythonClientType::UDTF; + + Status status = meta.check(); + EXPECT_TRUE(status.ok()) << status.to_string(); +} + +TEST_F(PythonUDFMetaTest, CheckEmptyInputTypesRejectedForUdaf) { + PythonUDFMeta meta; + meta.name = "test_udaf"; + meta.symbol = "test_func"; + meta.runtime_version = "3.9.16"; + meta.input_types = {}; + meta.return_type = nullable_int32_; + meta.type = PythonUDFLoadType::INLINE; + meta.client_type = 
PythonClientType::UDAF; Status status = meta.check(); EXPECT_FALSE(status.ok()); @@ -401,6 +430,27 @@ TEST_F(PythonUDFMetaTest, SerializeToJsonMultipleInputTypes) { EXPECT_TRUE(doc.HasMember("input_types")); } +TEST_F(PythonUDFMetaTest, SerializeToJsonEmptyInputTypesForUdf) { + PythonUDFMeta meta; + meta.name = "zero_arg_udf"; + meta.symbol = "func"; + meta.runtime_version = "3.9.16"; + meta.input_types = {}; + meta.return_type = nullable_int32_; + meta.type = PythonUDFLoadType::INLINE; + meta.client_type = PythonClientType::UDF; + + std::string json_str; + Status status = meta.serialize_to_json(&json_str); + EXPECT_TRUE(status.ok()) << status.to_string(); + + rapidjson::Document doc; + doc.Parse(json_str.c_str()); + EXPECT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc.HasMember("input_types")); + EXPECT_FALSE(std::string(doc["input_types"].GetString()).empty()); +} + // ============================================================================ // PythonUDFMeta convert_types_to_schema() tests // ============================================================================ @@ -429,6 +479,17 @@ TEST_F(PythonUDFMetaTest, ConvertTypesToSchemaSingleType) { EXPECT_EQ(schema->num_fields(), 1); } +TEST_F(PythonUDFMetaTest, ConvertTypesToSchemaEmpty) { + DataTypes types = {}; + std::shared_ptr schema; + + Status status = PythonUDFMeta::convert_types_to_schema(types, TimezoneUtils::default_time_zone, + &schema); + EXPECT_TRUE(status.ok()) << status.to_string(); + EXPECT_NE(schema, nullptr); + EXPECT_EQ(schema->num_fields(), 0); +} + // ============================================================================ // PythonUDFMeta serialize_arrow_schema() tests // ============================================================================ diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_no_input.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_no_input.groovy new file mode 100644 index 00000000000000..c81ed89e0628bd --- /dev/null +++ 
b/regression-test/suites/pythonudf_p0/test_pythonudf_no_input.groovy @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_no_input") { + def runtime_version = getPythonUdfRuntimeVersion() + def table_name = "test_pythonudf_no_input_tbl" + + try { + sql """ DROP FUNCTION IF EXISTS py_const_no_input(); """ + sql """ DROP TABLE IF EXISTS ${table_name}; """ + + sql """ + CREATE FUNCTION py_const_no_input() + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(): + return 7 +\$\$; + """ + + assert sql(""" SELECT py_const_no_input(); """)[0][0] == 7 + + sql """ + CREATE TABLE ${table_name} ( + id INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ INSERT INTO ${table_name} VALUES (1), (2), (3); """ + + def rows = sql(""" + SELECT id, py_const_no_input() AS v + FROM ${table_name} + ORDER BY id + """) + + assert rows.size() == 3 : "Expected 3 rows, got ${rows.size()}" + assert rows.collect { it[0] as int } == [1, 2, 3] + assert rows.every { (it[1] as int) == 7 } + } finally { + try_sql(""" DROP FUNCTION IF EXISTS 
py_const_no_input(); """) + try_sql(""" DROP TABLE IF EXISTS ${table_name}; """) + } +} diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_no_input.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_no_input.groovy new file mode 100644 index 00000000000000..0136aabca5e9ba --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_no_input.groovy @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_pythonudtf_no_input") { + def runtime_version = getPythonUdfRuntimeVersion() + def table_name = "test_pythonudtf_no_input_tbl" + + try { + sql """ DROP FUNCTION IF EXISTS py_emit_no_input(); """ + sql """ DROP TABLE IF EXISTS ${table_name}; """ + + sql """ + CREATE TABLES FUNCTION py_emit_no_input() + RETURNS ARRAY<STRING> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "emit_values", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def emit_values(): + yield ('left',) + yield ('right',) +\$\$; + """ + + sql """ + CREATE TABLE ${table_name} ( + id INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ INSERT INTO ${table_name} VALUES (1), (2); """ + + def rows = sql(""" + SELECT id, value + FROM ${table_name} + LATERAL VIEW py_emit_no_input() tmp AS value + ORDER BY id, value + """) + + assert rows.size() == 4 : "Expected 4 rows, got ${rows.size()}" + assert rows.collect { [(it[0] as int), it[1].toString()] } == [ + [1, "left"], + [1, "right"], + [2, "left"], + [2, "right"] + ] + } finally { + try_sql(""" DROP FUNCTION IF EXISTS py_emit_no_input(); """) + try_sql(""" DROP TABLE IF EXISTS ${table_name}; """) + } +} From a9458f8d99909faff0d59b80facd1e17e2541232 Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Thu, 23 Apr 2026 21:15:37 +0800 Subject: [PATCH 02/11] [Enhancement](udf) support deterministic property for udf --- .../org/apache/doris/catalog/Function.java | 16 +- .../doris/catalog/FunctionToSqlConverter.java | 14 + .../expressions/functions/udf/JavaUdaf.java | 15 +- .../expressions/functions/udf/JavaUdf.java | 14 +- .../expressions/functions/udf/JavaUdtf.java | 13 +- .../expressions/functions/udf/PythonUdaf.java | 13 +- .../expressions/functions/udf/PythonUdf.java | 13 +- .../expressions/functions/udf/PythonUdtf.java | 13 +- .../plans/commands/CreateFunctionCommand.java | 11 + .../doris/catalog/CreateFunctionTest.java | 50 ++++ 
.../catalog/FunctionToSqlConverterTest.java | 2 + .../test_pythonudaf_deterministic.out | 20 ++ .../test_pythonudf_deterministic.out | 20 ++ .../test_pythonudtf_deterministic.out | 20 ++ .../javaudf_p0/test_javaudf_float.groovy | 17 +- .../mtmv_p0/test_expand_star_mtmv.groovy | 3 +- .../test_pythonudaf_deterministic.groovy | 255 ++++++++++++++++++ .../test_pythonudf_deterministic.groovy | 203 ++++++++++++++ .../pythonudf_p0/test_pythonudf_float.groovy | 17 +- .../test_pythonudtf_deterministic.groovy | 176 ++++++++++++ .../javaudf/test_javaudf_deterministic.groovy | 176 ++++++++++++ 21 files changed, 1061 insertions(+), 20 deletions(-) create mode 100644 regression-test/data/pythonudaf_p0/test_pythonudaf_deterministic.out create mode 100644 regression-test/data/pythonudf_p0/test_pythonudf_deterministic.out create mode 100644 regression-test/data/pythonudtf_p0/test_pythonudtf_deterministic.out create mode 100644 regression-test/suites/pythonudaf_p0/test_pythonudaf_deterministic.groovy create mode 100644 regression-test/suites/pythonudf_p0/test_pythonudf_deterministic.groovy create mode 100644 regression-test/suites/pythonudtf_p0/test_pythonudtf_deterministic.groovy create mode 100644 regression-test/suites/query_p0/javaudf/test_javaudf_deterministic.groovy diff --git a/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Function.java b/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Function.java index 5c7c80f4535ef1..4c672c3c6a4a09 100644 --- a/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Function.java +++ b/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Function.java @@ -114,6 +114,8 @@ public enum BinaryType { protected String runtimeVersion; @SerializedName("fc") protected String functionCode; + @SerializedName("det") + protected boolean deterministic = false; // Only used for serialization protected Function() { @@ -174,6 +176,7 @@ public Function(Function other) { this.expirationTime = other.expirationTime; this.runtimeVersion = 
other.runtimeVersion; this.functionCode = other.functionCode; + this.deterministic = other.deterministic; } public Function clone() { @@ -301,6 +304,14 @@ public void setFunctionCode(String functionCode) { this.functionCode = functionCode; } + public boolean isDeterministic() { + return deterministic; + } + + public void setDeterministic(boolean deterministic) { + this.deterministic = deterministic; + } + // TODO(cmy): Currently we judge whether it is UDF by wheter the 'location' is set. // Maybe we should use a separate variable to identify, // but additional variables need to modify the persistence information. @@ -401,7 +412,8 @@ public boolean equals(Object o) { } Function function = (Function) o; return id == function.id && hasVarArgs == function.hasVarArgs && userVisible == function.userVisible - && vectorized == function.vectorized && Objects.equals(name, function.name) + && vectorized == function.vectorized && deterministic == function.deterministic + && Objects.equals(name, function.name) && Objects.equals(retType, function.retType) && Arrays.equals(argTypes, function.argTypes) && Objects.equals(location, function.location) && binaryType == function.binaryType && nullableMode == function.nullableMode && Objects.equals( @@ -411,7 +423,7 @@ public boolean equals(Object o) { @Override public int hashCode() { int result = Objects.hash(id, name, retType, hasVarArgs, userVisible, location, binaryType, nullableMode, - vectorized, checksum); + vectorized, checksum, deterministic); result = 31 * result + Arrays.hashCode(argTypes); return result; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToSqlConverter.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToSqlConverter.java index 8709eb5b6de8ed..e79e39fa9d14df 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToSqlConverter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToSqlConverter.java @@ -75,6 +75,13 @@ public static String 
toSql(ScalarFunction fn, boolean ifNotExists) { .append("\"" + (fn.getLocation() == null ? "" : fn.getLocation().toString()) + "\""); boolean isReturnNull = fn.getNullableMode() == NullableMode.ALWAYS_NULLABLE; sb.append(",\n \"ALWAYS_NULLABLE\"=").append("\"" + isReturnNull + "\""); + sb.append(",\n \"DETERMINISTIC\"=").append("\"" + fn.isDeterministic() + "\""); + } else if (fn.getBinaryType() == Function.BinaryType.PYTHON_UDF) { + sb.append(",\n \"FILE\"=") + .append("\"" + (fn.getLocation() == null ? "" : fn.getLocation().toString()) + "\""); + boolean isReturnNull = fn.getNullableMode() == NullableMode.ALWAYS_NULLABLE; + sb.append(",\n \"ALWAYS_NULLABLE\"=").append("\"" + isReturnNull + "\""); + sb.append(",\n \"DETERMINISTIC\"=").append("\"" + fn.isDeterministic() + "\""); } else { sb.append(",\n \"OBJECT_FILE\"=") .append("\"" + (fn.getLocation() == null ? "" : fn.getLocation().toString()) + "\""); @@ -125,6 +132,13 @@ public static String toSql(AggregateFunction fn, boolean ifNotExists) { .append("\"" + (fn.getLocation() == null ? "" : fn.getLocation().toString()) + "\","); boolean isReturnNull = fn.getNullableMode() == NullableMode.ALWAYS_NULLABLE; sb.append("\n \"ALWAYS_NULLABLE\"=").append("\"" + isReturnNull + "\","); + sb.append("\n \"DETERMINISTIC\"=").append("\"" + fn.isDeterministic() + "\","); + } else if (fn.getBinaryType() == Function.BinaryType.PYTHON_UDF) { + sb.append("\n \"FILE\"=") + .append("\"" + (fn.getLocation() == null ? "" : fn.getLocation().toString()) + "\","); + boolean isReturnNull = fn.getNullableMode() == NullableMode.ALWAYS_NULLABLE; + sb.append("\n \"ALWAYS_NULLABLE\"=").append("\"" + isReturnNull + "\","); + sb.append("\n \"DETERMINISTIC\"=").append("\"" + fn.isDeterministic() + "\","); } else { sb.append("\n \"OBJECT_FILE\"=") .append("\"" + (fn.getLocation() == null ? 
"" : fn.getLocation().toString()) + "\","); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdaf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdaf.java index c3eebfc283fd0c..2a428cd5fd0746 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdaf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdaf.java @@ -62,6 +62,7 @@ public class JavaUdaf extends AggregateFunction implements ExplicitlyCastableSig private final String checkSum; private final boolean isStaticLoad; private final long expirationTime; + private final boolean deterministic; /** * Constructor of UDAF @@ -72,7 +73,8 @@ public JavaUdaf(String name, long functionId, String dbName, Function.BinaryType String objectFile, String symbol, String initFn, String updateFn, String mergeFn, String serializeFn, String finalizeFn, String getValueFn, String removeFn, - boolean isDistinct, String checkSum, boolean isStaticLoad, long expirationTime, Expression... args) { + boolean isDistinct, String checkSum, boolean isStaticLoad, long expirationTime, + boolean deterministic, Expression... args) { super(name, isDistinct, args); this.dbName = dbName; this.functionId = functionId; @@ -92,6 +94,7 @@ public JavaUdaf(String name, long functionId, String dbName, Function.BinaryType this.checkSum = checkSum; this.isStaticLoad = isStaticLoad; this.expirationTime = expirationTime; + this.deterministic = deterministic; } @Override @@ -114,6 +117,11 @@ public NullableMode getNullableMode() { return nullableMode; } + @Override + public boolean isDeterministic() { + return deterministic; + } + /** * withChildren. 
*/ @@ -122,7 +130,8 @@ public JavaUdaf withDistinctAndChildren(boolean isDistinct, List chi Preconditions.checkArgument(children.size() == this.children.size()); return new JavaUdaf(getName(), functionId, dbName, binaryType, signature, intermediateType, nullableMode, objectFile, symbol, initFn, updateFn, mergeFn, serializeFn, finalizeFn, getValueFn, removeFn, - isDistinct, checkSum, isStaticLoad, expirationTime, children.toArray(new Expression[0])); + isDistinct, checkSum, isStaticLoad, expirationTime, deterministic, + children.toArray(new Expression[0])); } /** @@ -165,6 +174,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca aggregate.getChecksum(), aggregate.isStaticLoad(), aggregate.getExpirationTime(), + aggregate.isDeterministic(), arguments); JavaUdafBuilder builder = new JavaUdafBuilder(udaf); @@ -201,6 +211,7 @@ public Function getCatalogFunction() { expr.setId(functionId); expr.setStaticLoad(isStaticLoad); expr.setExpirationTime(expirationTime); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdf.java index 07cd4556324f21..974c13b2160f58 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdf.java @@ -56,6 +56,7 @@ public class JavaUdf extends ScalarFunction implements ExplicitlyCastableSignatu private final String checkSum; private final boolean isStaticLoad; private final long expirationTime; + private final boolean deterministic; /** * Constructor of UDF @@ -63,7 +64,7 @@ public class JavaUdf extends ScalarFunction implements ExplicitlyCastableSignatu public JavaUdf(String name, long 
functionId, String dbName, Function.BinaryType binaryType, FunctionSignature signature, NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, - String checkSum, boolean isStaticLoad, long expirationTime, Expression... args) { + String checkSum, boolean isStaticLoad, long expirationTime, boolean deterministic, Expression... args) { super(name, args); this.dbName = dbName; this.functionId = functionId; @@ -77,6 +78,7 @@ public JavaUdf(String name, long functionId, String dbName, Function.BinaryType this.checkSum = checkSum; this.isStaticLoad = isStaticLoad; this.expirationTime = expirationTime; + this.deterministic = deterministic; } @Override @@ -99,6 +101,11 @@ public NullableMode getNullableMode() { return nullableMode; } + @Override + public boolean isDeterministic() { + return deterministic; + } + /** * withChildren. */ @@ -106,7 +113,7 @@ public NullableMode getNullableMode() { public JavaUdf withChildren(List children) { Preconditions.checkArgument(children.size() == this.children.size()); return new JavaUdf(getName(), functionId, dbName, binaryType, signature, nullableMode, - objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, + objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, deterministic, children.toArray(new Expression[0])); } @@ -135,7 +142,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca scalar.getSymbolName(), scalar.getPrepareFnSymbol(), scalar.getCloseFnSymbol(), - scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(), + scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(), scalar.isDeterministic(), arguments); JavaUdfBuilder builder = new JavaUdfBuilder(udf); @@ -166,6 +173,7 @@ public Function getCatalogFunction() { expr.setId(functionId); expr.setStaticLoad(isStaticLoad); expr.setExpirationTime(expirationTime); + expr.setDeterministic(deterministic); return expr; } catch 
(Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdtf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdtf.java index 2e04dec1d68163..7935c67f6af7ac 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdtf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdtf.java @@ -56,6 +56,7 @@ public class JavaUdtf extends TableGeneratingFunction implements ExplicitlyCasta private final String checkSum; private final boolean isStaticLoad; private final long expirationTime; + private final boolean deterministic; /** * Constructor of UDTF @@ -63,7 +64,7 @@ public class JavaUdtf extends TableGeneratingFunction implements ExplicitlyCasta public JavaUdtf(String name, long functionId, String dbName, Function.BinaryType binaryType, FunctionSignature signature, NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, - String checkSum, boolean isStaticLoad, long expirationTime, Expression... args) { + String checkSum, boolean isStaticLoad, long expirationTime, boolean deterministic, Expression... 
args) { super(name, args); this.dbName = dbName; this.functionId = functionId; @@ -77,6 +78,7 @@ public JavaUdtf(String name, long functionId, String dbName, Function.BinaryType this.checkSum = checkSum; this.isStaticLoad = isStaticLoad; this.expirationTime = expirationTime; + this.deterministic = deterministic; } /** @@ -86,7 +88,7 @@ public JavaUdtf(String name, long functionId, String dbName, Function.BinaryType public JavaUdtf withChildren(List children) { Preconditions.checkArgument(children.size() == this.children.size()); return new JavaUdtf(getName(), functionId, dbName, binaryType, signature, nullableMode, - objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, + objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, deterministic, children.toArray(new Expression[0])); } @@ -95,6 +97,11 @@ public List getSignatures() { return ImmutableList.of(signature); } + @Override + public boolean isDeterministic() { + return deterministic; + } + @Override public boolean hasVarArguments() { return signature.hasVarArgs; @@ -125,6 +132,7 @@ public Function getCatalogFunction() { expr.setStaticLoad(isStaticLoad); expr.setExpirationTime(expirationTime); expr.setUDTFunction(true); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); @@ -159,6 +167,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(), + scalar.isDeterministic(), arguments); JavaUdtfBuilder builder = new JavaUdtfBuilder(udf); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java index 456e0f1a6eac42..ee03571b731aab 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java @@ -64,6 +64,7 @@ public class PythonUdaf extends AggregateFunction implements ExplicitlyCastableS private final long expirationTime; private final String runtimeVersion; private final String functionCode; + private final boolean deterministic; /** * Constructor of UDAF @@ -75,7 +76,7 @@ public PythonUdaf(String name, long functionId, String dbName, Function.BinaryTy String initFn, String updateFn, String mergeFn, String serializeFn, String finalizeFn, String getValueFn, String removeFn, boolean isDistinct, String checkSum, boolean isStaticLoad, long expirationTime, - String runtimeVersion, String functionCode, Expression... args) { + String runtimeVersion, String functionCode, boolean deterministic, Expression... args) { super(name, isDistinct, args); this.dbName = dbName; this.functionId = functionId; @@ -97,6 +98,7 @@ public PythonUdaf(String name, long functionId, String dbName, Function.BinaryTy this.expirationTime = expirationTime; this.runtimeVersion = runtimeVersion; this.functionCode = functionCode; + this.deterministic = deterministic; } @Override @@ -119,6 +121,11 @@ public NullableMode getNullableMode() { return nullableMode; } + @Override + public boolean isDeterministic() { + return deterministic; + } + /** * withChildren. 
*/ @@ -127,7 +134,7 @@ public PythonUdaf withDistinctAndChildren(boolean isDistinct, List c Preconditions.checkArgument(children.size() == this.children.size()); return new PythonUdaf(getName(), functionId, dbName, binaryType, signature, intermediateType, nullableMode, objectFile, symbol, initFn, updateFn, mergeFn, serializeFn, finalizeFn, getValueFn, removeFn, - isDistinct, checkSum, isStaticLoad, expirationTime, runtimeVersion, functionCode, + isDistinct, checkSum, isStaticLoad, expirationTime, runtimeVersion, functionCode, deterministic, children.toArray(new Expression[0])); } @@ -173,6 +180,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca aggregate.getExpirationTime(), aggregate.getRuntimeVersion(), aggregate.getFunctionCode(), + aggregate.isDeterministic(), arguments); PythonUdafBuilder builder = new PythonUdafBuilder(udaf); @@ -211,6 +219,7 @@ public Function getCatalogFunction() { expr.setExpirationTime(expirationTime); expr.setRuntimeVersion(runtimeVersion); expr.setFunctionCode(functionCode); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java index 98a9e161308417..8ace16ccc08c48 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java @@ -58,6 +58,7 @@ public class PythonUdf extends ScalarFunction implements ExplicitlyCastableSigna private final long expirationTime; private final String runtimeVersion; private final String functionCode; + private final boolean deterministic; /** * Constructor of UDF @@ -66,7 +67,7 @@ public PythonUdf(String name, long functionId, String dbName, 
Function.BinaryTyp FunctionSignature signature, NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, String checkSum, boolean isStaticLoad, long expirationTime, - String runtimeVersion, String functionCode, Expression... args) { + String runtimeVersion, String functionCode, boolean deterministic, Expression... args) { super(name, args); this.dbName = dbName; this.functionId = functionId; @@ -82,6 +83,7 @@ public PythonUdf(String name, long functionId, String dbName, Function.BinaryTyp this.expirationTime = expirationTime; this.runtimeVersion = runtimeVersion; this.functionCode = functionCode; + this.deterministic = deterministic; } @Override @@ -104,6 +106,11 @@ public NullableMode getNullableMode() { return nullableMode; } + @Override + public boolean isDeterministic() { + return deterministic; + } + /** * withChildren. */ @@ -112,7 +119,7 @@ public PythonUdf withChildren(List children) { Preconditions.checkArgument(children.size() == this.children.size()); return new PythonUdf(getName(), functionId, dbName, binaryType, signature, nullableMode, objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, - runtimeVersion, functionCode, children.toArray(new Expression[0])); + runtimeVersion, functionCode, deterministic, children.toArray(new Expression[0])); } /** @@ -143,6 +150,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(), scalar.getRuntimeVersion(), scalar.getFunctionCode(), + scalar.isDeterministic(), arguments); PythonUdfBuilder builder = new PythonUdfBuilder(udf); @@ -175,6 +183,7 @@ public Function getCatalogFunction() { expr.setExpirationTime(expirationTime); expr.setRuntimeVersion(runtimeVersion); expr.setFunctionCode(functionCode); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java index 74e662aee7297e..9ee167304ec11f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java @@ -58,6 +58,7 @@ public class PythonUdtf extends TableGeneratingFunction implements ExplicitlyCas private final long expirationTime; private final String runtimeVersion; private final String functionCode; + private final boolean deterministic; /** * Constructor of Python UDTF @@ -66,7 +67,7 @@ public PythonUdtf(String name, long functionId, String dbName, Function.BinaryTy FunctionSignature signature, NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, String checkSum, boolean isStaticLoad, long expirationTime, - String runtimeVersion, String functionCode, Expression... args) { + String runtimeVersion, String functionCode, boolean deterministic, Expression... 
args) { super(name, args); this.dbName = dbName; this.functionId = functionId; @@ -82,6 +83,7 @@ public PythonUdtf(String name, long functionId, String dbName, Function.BinaryTy this.expirationTime = expirationTime; this.runtimeVersion = runtimeVersion; this.functionCode = functionCode; + this.deterministic = deterministic; } /** @@ -92,7 +94,7 @@ public PythonUdtf withChildren(List children) { Preconditions.checkArgument(children.size() == this.children.size()); return new PythonUdtf(getName(), functionId, dbName, binaryType, signature, nullableMode, objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, - runtimeVersion, functionCode, children.toArray(new Expression[0])); + runtimeVersion, functionCode, deterministic, children.toArray(new Expression[0])); } @Override @@ -100,6 +102,11 @@ public List getSignatures() { return ImmutableList.of(signature); } + @Override + public boolean isDeterministic() { + return deterministic; + } + @Override public boolean hasVarArguments() { return signature.hasVarArgs; @@ -132,6 +139,7 @@ public Function getCatalogFunction() { expr.setUDTFunction(true); expr.setRuntimeVersion(runtimeVersion); expr.setFunctionCode(functionCode); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); @@ -168,6 +176,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca scalar.getExpirationTime(), scalar.getRuntimeVersion(), scalar.getFunctionCode(), + scalar.isDeterministic(), arguments); PythonUdtfBuilder builder = new PythonUdtfBuilder(udtf); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java index eaacae8aaa62e3..d35000b4923516 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java @@ -147,6 +147,7 @@ public class CreateFunctionCommand extends Command implements ForwardWithSync { public static final String IS_STATIC_LOAD = "static_load"; public static final String EXPIRATION_TIME = "expiration_time"; public static final String RUNTIME_VERSION = "runtime_version"; + public static final String IS_DETERMINISTIC = "deterministic"; private static final Pattern PYTHON_VERSION_PATTERN = Pattern.compile("^3\\.\\d{1,2}(?:\\.\\d{1,2})?$"); private static final Logger LOG = LogManager.getLogger(CreateFunctionCommand.class); @@ -179,6 +180,7 @@ public class CreateFunctionCommand extends Command implements ForwardWithSync { private NullableMode returnNullMode = NullableMode.ALWAYS_NULLABLE; private String runtimeVersion; private String functionCode; + private boolean deterministic = false; /** * CreateFunctionCommand @@ -366,6 +368,12 @@ private void analyzeCommon(ConnectContext ctx) throws AnalysisException { } runtimeVersion = runtimeVersionString; } + if (binaryType == Function.BinaryType.JAVA_UDF || binaryType == Function.BinaryType.PYTHON_UDF) { + Boolean deterministicProperty = parseBooleanFromProperties(IS_DETERMINISTIC); + if (deterministicProperty != null) { + deterministic = deterministicProperty; + } + } } private void extractExpirationTime() throws AnalysisException { @@ -476,6 +484,7 @@ private void analyzeUdtf() throws AnalysisException { function.setUDTFunction(true); function.setRuntimeVersion(runtimeVersion); function.setFunctionCode(functionCode); + function.setDeterministic(deterministic); // Todo: maybe in create tables function, need register two function, one is // normal and one is outer as those have different result when result is NULL. 
} @@ -550,6 +559,7 @@ private void analyzeUdaf() throws AnalysisException { function.setExpirationTime(expirationTime); function.setRuntimeVersion(runtimeVersion); function.setFunctionCode(functionCode); + function.setDeterministic(deterministic); } private void analyzeUdf() throws AnalysisException { @@ -587,6 +597,7 @@ private void analyzeUdf() throws AnalysisException { function.setExpirationTime(expirationTime); function.setRuntimeVersion(runtimeVersion); function.setFunctionCode(functionCode); + function.setDeterministic(deterministic); } private void analyzeJavaUdaf(String clazz) throws AnalysisException { diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateFunctionTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateFunctionTest.java index fc309de75b0c90..8ebc8ea6aada66 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateFunctionTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateFunctionTest.java @@ -24,6 +24,8 @@ import org.apache.doris.common.jmockit.Deencapsulation; import org.apache.doris.nereids.StatementContext; import org.apache.doris.nereids.parser.NereidsParser; +import org.apache.doris.nereids.trees.expressions.functions.FunctionBuilder; +import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdf; import org.apache.doris.nereids.trees.plans.commands.CreateDatabaseCommand; import org.apache.doris.nereids.trees.plans.commands.CreateFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.CreateTableCommand; @@ -43,6 +45,7 @@ import org.junit.Test; import java.io.File; +import java.util.Collections; import java.util.List; import java.util.UUID; @@ -53,6 +56,12 @@ public class CreateFunctionTest { + public static class TestConstantUdf { + public Integer evaluate() { + return 1; + } + } + private static String runningDir = "fe/mocked/CreateFunctionTest/" + UUID.randomUUID().toString() + "/"; private static ConnectContext connectContext; private static 
DorisAssert dorisAssert; @@ -150,6 +159,47 @@ public void testCreateGlobalFunction() throws Exception { + " right(CAST(CAST(k1 AS BIGINT) AS VARCHAR(65533)), 4))")); } + @Test + public void testCreateJavaUdfDeterministicProperty() throws Exception { + ConnectContext ctx = UtFrameUtils.createDefaultCtx(); + createDatabase(ctx, "create database db_det;"); + + createFunction("CREATE FUNCTION db_det.default_det() RETURNS int PROPERTIES (\n" + + " \"symbol\"=\"" + TestConstantUdf.class.getName() + "\",\n" + + " \"type\"=\"JAVA_UDF\"\n" + + ");", ctx); + + createFunction("CREATE FUNCTION db_det.explicit_det() RETURNS int PROPERTIES (\n" + + " \"symbol\"=\"" + TestConstantUdf.class.getName() + "\",\n" + + " \"type\"=\"JAVA_UDF\",\n" + + " \"deterministic\"=\"true\"\n" + + ");", ctx); + + Database db = Env.getCurrentInternalCatalog().getDbNullable("db_det"); + Assert.assertNotNull(db); + + Function defaultFn = db.getFunction( + new FunctionSearchDesc(new FunctionName("db_det", "default_det"), new Type[] {}, false)); + Function explicitFn = db.getFunction( + new FunctionSearchDesc(new FunctionName("db_det", "explicit_det"), new Type[] {}, false)); + + Assert.assertNotNull(defaultFn); + Assert.assertNotNull(explicitFn); + Assert.assertFalse(defaultFn.isDeterministic()); + Assert.assertTrue(explicitFn.isDeterministic()); + + FunctionRegistry functionRegistry = Env.getCurrentEnv().getFunctionRegistry(); + FunctionBuilder defaultBuilder = functionRegistry.findFunctionBuilder( + "db_det", "default_det", Collections.emptyList()); + FunctionBuilder explicitBuilder = functionRegistry.findFunctionBuilder( + "db_det", "explicit_det", Collections.emptyList()); + + JavaUdf defaultNereidsFn = (JavaUdf) defaultBuilder.build("default_det", Collections.emptyList()).first; + JavaUdf explicitNereidsFn = (JavaUdf) explicitBuilder.build("explicit_det", Collections.emptyList()).first; + Assert.assertFalse(defaultNereidsFn.isDeterministic()); + 
Assert.assertTrue(explicitNereidsFn.isDeterministic()); + } + private void testFunctionQuery(ConnectContext ctx, String queryStr, Boolean isStringLiteral) throws Exception { ctx.getState().reset(); StmtExecutor stmtExecutor = new StmtExecutor(ctx, queryStr); diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/FunctionToSqlConverterTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/FunctionToSqlConverterTest.java index 26b22fa1baed1b..979874df53d95d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/FunctionToSqlConverterTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/FunctionToSqlConverterTest.java @@ -45,6 +45,7 @@ void testScalarFunction_javaUdf_basicSql() { Assertions.assertTrue(sql.contains("\"FILE\"=\"\"")); Assertions.assertTrue(sql.contains("\"TYPE\"=\"JAVA_UDF\"")); Assertions.assertTrue(sql.contains("\"ALWAYS_NULLABLE\"=")); + Assertions.assertTrue(sql.contains("\"DETERMINISTIC\"=\"false\"")); Assertions.assertFalse(sql.contains("OBJECT_FILE")); Assertions.assertFalse(sql.contains("IF NOT EXISTS")); Assertions.assertFalse(sql.contains("GLOBAL")); @@ -186,6 +187,7 @@ void testAggregateFunction_javaUdf_basicSql() { Assertions.assertTrue(sql.contains("\"FILE\"=\"\"")); Assertions.assertTrue(sql.contains("\"TYPE\"=\"JAVA_UDF\"")); Assertions.assertTrue(sql.contains("\"ALWAYS_NULLABLE\"=")); + Assertions.assertTrue(sql.contains("\"DETERMINISTIC\"=\"false\"")); Assertions.assertFalse(sql.contains("INIT_FN")); Assertions.assertFalse(sql.contains("UPDATE_FN")); Assertions.assertFalse(sql.contains("MERGE_FN")); diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_deterministic.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_deterministic.out new file mode 100644 index 00000000000000..144af0dc4d1a9c --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_deterministic.out @@ -0,0 +1,20 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !materialized -- +1 1 +2 1 +3 1 + +-- !inlined -- +1 1 +2 1 +3 1 + +-- !materialized_det -- +1 1 +2 1 +3 1 + +-- !inlined_det -- +1 2 +2 2 +3 2 diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_deterministic.out b/regression-test/data/pythonudf_p0/test_pythonudf_deterministic.out new file mode 100644 index 00000000000000..144af0dc4d1a9c --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_deterministic.out @@ -0,0 +1,20 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !materialized -- +1 1 +2 1 +3 1 + +-- !inlined -- +1 1 +2 1 +3 1 + +-- !materialized_det -- +1 1 +2 1 +3 1 + +-- !inlined_det -- +1 2 +2 2 +3 2 diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_deterministic.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_deterministic.out new file mode 100644 index 00000000000000..144af0dc4d1a9c --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_deterministic.out @@ -0,0 +1,20 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !materialized -- +1 1 +2 1 +3 1 + +-- !inlined -- +1 1 +2 1 +3 1 + +-- !materialized_det -- +1 1 +2 1 +3 1 + +-- !inlined_det -- +1 2 +2 2 +3 2 diff --git a/regression-test/suites/javaudf_p0/test_javaudf_float.groovy b/regression-test/suites/javaudf_p0/test_javaudf_float.groovy index 5372bda71c4f3e..e57e130bda9ded 100644 --- a/regression-test/suites/javaudf_p0/test_javaudf_float.groovy +++ b/regression-test/suites/javaudf_p0/test_javaudf_float.groovy @@ -64,12 +64,24 @@ suite("test_javaudf_float") { qt_select """ SELECT java_udf_float_test(2.83645,null) as result ; """ qt_select """ SELECT java_udf_float_test(cast(2.83645 as float),null) as result ; """ qt_select """ SELECT user_id,java_udf_float_test(float_1, float_2) as sum FROM ${tableName} order by user_id; """ - createMV("create materialized view udf_mv as SELECT user_id as a1,java_udf_float_test(float_1, float_2) as sum FROM test_javaudf_float order by user_id;") + sql """ DROP MATERIALIZED VIEW IF EXISTS udf_mv; """ + sql """ + CREATE MATERIALIZED VIEW udf_mv + BUILD DEFERRED REFRESH AUTO ON MANUAL + DISTRIBUTED BY RANDOM BUCKETS 2 + PROPERTIES ( + 'replication_num' = '1', + 'version_info'='3', + 'enable_nondeterministic_function' = 'true' + ) + AS + SELECT user_id as a1, java_udf_float_test(float_1, float_2) as sum FROM ${tableName}; + """ qt_select """ SELECT user_id,java_udf_float_test(float_1, float_2) as sum FROM ${tableName} order by user_id; """ explain { sql("SELECT user_id,java_udf_float_test(float_1, float_2) as sum FROM ${tableName} order by user_id; ") - contains "(udf_mv)" + notContains "(udf_mv)" } @@ -90,6 +102,7 @@ suite("test_javaudf_float") { } finally { + try_sql("DROP MATERIALIZED VIEW IF EXISTS udf_mv;") try_sql("DROP FUNCTION IF EXISTS java_udf_double_test(DOUBLE,DOUBLE);") try_sql("DROP FUNCTION IF EXISTS java_udf_float_test(FLOAT,FLOAT);") try_sql("DROP TABLE IF EXISTS ${tableName}") diff --git 
a/regression-test/suites/mtmv_p0/test_expand_star_mtmv.groovy b/regression-test/suites/mtmv_p0/test_expand_star_mtmv.groovy index f550dc78c3db33..d32041b3531b8b 100644 --- a/regression-test/suites/mtmv_p0/test_expand_star_mtmv.groovy +++ b/regression-test/suites/mtmv_p0/test_expand_star_mtmv.groovy @@ -71,7 +71,8 @@ suite("test_expand_star_mtmv","mtmv") { DISTRIBUTED BY RANDOM BUCKETS 2 PROPERTIES ( 'replication_num' = '1', - 'version_info'='3' + 'version_info'='3', + 'enable_nondeterministic_function' = 'true' ) AS SELECT ${functionName} ('2011-01-01','2011-01-03') as k1 from ${tableName}; diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_deterministic.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_deterministic.groovy new file mode 100644 index 00000000000000..ecfcb3ba3aa5a9 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_deterministic.groovy @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.junit.Assert; + +suite("test_pythonudaf_deterministic") { + def runtime_version = getPythonUdfRuntimeVersion() + + try { + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP TABLE IF EXISTS mtmv_uuid_seed; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS py_uuid_agg_mtmv; """ + sql """ DROP FUNCTION IF EXISTS py_uuid_agg_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_agg_det(INT); """ + sql """ + CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO cte_uuid_seed VALUES (1), (2), (3); """ + sql """ sync; """ + + sql """ SET enable_nereids_planner = true; """ + sql """ SET enable_fallback_to_original_planner = false; """ + + sql """ DROP FUNCTION IF EXISTS py_uuid_agg(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_uuid_agg(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "PyUuidAgg", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import uuid + +class PyUuidAgg: + def __init__(self): + self.last = None + + def accumulate(self, value): + if value is not None: + self.last = value + + def merge(self, other_state): + if other_state is not None: + self.last = other_state + + def finish(self): + return f"{self.last}-{uuid.uuid4()}" + + @property + def aggregate_state(self): + return self.last +\$\$; + """ + def showDefault = sql """ SHOW CREATE FUNCTION py_uuid_agg(INT); """ + assertTrue(showDefault.size() == 1) + assertTrue(showDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE AGGREGATE FUNCTION py_uuid_agg_false(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "PyUuidAggFalse", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}", + "deterministic" = "false" + ) + AS \$\$ +import uuid + +class PyUuidAggFalse: + def __init__(self): + self.last = None + + def accumulate(self, 
value): + if value is not None: + self.last = value + + def merge(self, other_state): + if other_state is not None: + self.last = other_state + + def finish(self): + return f"{self.last}-{uuid.uuid4()}" + + @property + def aggregate_state(self): + return self.last +\$\$; + """ + def showExplicitFalse = sql """ SHOW CREATE FUNCTION py_uuid_agg_false(INT); """ + assertTrue(showExplicitFalse.size() == 1) + assertTrue(showExplicitFalse[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE AGGREGATE FUNCTION py_uuid_agg_det(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "PyUuidAggDet", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}", + "deterministic" = "true" + ) + AS \$\$ +import uuid + +class PyUuidAggDet: + def __init__(self): + self.last = None + + def accumulate(self, value): + if value is not None: + self.last = value + + def merge(self, other_state): + if other_state is not None: + self.last = other_state + + def finish(self): + return f"{self.last}-{uuid.uuid4()}" + + @property + def aggregate_state(self): + return self.last +\$\$; + """ + def showDet = sql """ SHOW CREATE FUNCTION py_uuid_agg_det(INT); """ + assertTrue(showDet.size() == 1) + assertTrue(showDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized """ + WITH cte AS ( + SELECT id, py_uuid_agg(id) AS token + FROM cte_uuid_seed + GROUP BY id + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined """ + WITH cte AS ( + SELECT id, py_uuid_agg(id) AS token + FROM cte_uuid_seed + GROUP BY id + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte 
+ UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized_det """ + WITH cte AS ( + SELECT id, py_uuid_agg_det(id) AS token + FROM cte_uuid_seed + GROUP BY id + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined_det """ + WITH cte AS ( + SELECT id, py_uuid_agg_det(id) AS token + FROM cte_uuid_seed + GROUP BY id + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ + CREATE TABLE mtmv_uuid_seed (id INT, v INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO mtmv_uuid_seed VALUES (1, 10), (1, 11), (2, 20), (2, 21), (3, 30), (3, 31); """ + sql """ sync; """ + + try { + sql """ + CREATE MATERIALIZED VIEW py_uuid_agg_mtmv + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, py_uuid_agg(v) AS token + FROM mtmv_uuid_seed + GROUP BY id; + """ + Assert.fail() + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("can not contain nonDeterministic expression")) + } + } finally { + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS py_uuid_agg_mtmv; """) + sql """ DROP FUNCTION IF EXISTS py_uuid_agg(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_agg_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_agg_det(INT); """ + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP TABLE IF EXISTS mtmv_uuid_seed; """ + } +} diff --git 
a/regression-test/suites/pythonudf_p0/test_pythonudf_deterministic.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_deterministic.groovy new file mode 100644 index 00000000000000..69e15a924d1740 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_deterministic.groovy @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.junit.Assert; + +suite("test_pythonudf_deterministic") { + def runtime_version = getPythonUdfRuntimeVersion() + + try { + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP TABLE IF EXISTS mtmv_uuid_seed; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS py_uuid_token_mtmv; """ + sql """ DROP FUNCTION IF EXISTS py_uuid_token_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_token_det(INT); """ + sql """ + CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO cte_uuid_seed VALUES (1), (2), (3); """ + sql """ sync; """ + + sql """ SET enable_nereids_planner = true; """ + sql """ SET enable_fallback_to_original_planner = false; """ + + sql """ DROP FUNCTION IF EXISTS py_uuid_token(INT); """ + sql """ + CREATE FUNCTION py_uuid_token(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_token_impl", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import uuid + +def py_uuid_token_impl(x): + return f"{x}-{uuid.uuid4()}" +\$\$; + """ + def showDefault = sql """ SHOW CREATE FUNCTION py_uuid_token(INT); """ + assertTrue(showDefault.size() == 1) + assertTrue(showDefault[0][1].contains("DETERMINISTIC")) + assertTrue(showDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE FUNCTION py_uuid_token_false(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_token_impl", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}", + "deterministic" = "false" + ) + AS \$\$ +import uuid + +def py_uuid_token_impl(x): + return f"{x}-{uuid.uuid4()}" +\$\$; + """ + def showExplicitFalse = sql """ SHOW CREATE FUNCTION py_uuid_token_false(INT); """ + assertTrue(showExplicitFalse.size() == 1) + assertTrue(showExplicitFalse[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE FUNCTION 
py_uuid_token_det(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_token_det_impl", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}", + "deterministic" = "true" + ) + AS \$\$ +import uuid + +def py_uuid_token_det_impl(x): + return f"{x}-{uuid.uuid4()}" +\$\$; + """ + def showDet = sql """ SHOW CREATE FUNCTION py_uuid_token_det(INT); """ + assertTrue(showDet.size() == 1) + assertTrue(showDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized """ + WITH cte AS ( + SELECT id, py_uuid_token(id) AS token + FROM cte_uuid_seed + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined """ + WITH cte AS ( + SELECT id, py_uuid_token(id) AS token + FROM cte_uuid_seed + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized_det """ + WITH cte AS ( + SELECT id, py_uuid_token_det(id) AS token + FROM cte_uuid_seed + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined_det """ + WITH cte AS ( + SELECT id, py_uuid_token_det(id) AS token + FROM cte_uuid_seed + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + 
GROUP BY id + ORDER BY id; + """ + + sql """ + CREATE TABLE mtmv_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO mtmv_uuid_seed VALUES (1), (2), (3); """ + sql """ sync; """ + + try { + sql """ + CREATE MATERIALIZED VIEW py_uuid_token_mtmv + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, py_uuid_token(id) AS token + FROM mtmv_uuid_seed; + """ + Assert.fail() + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("can not contain nonDeterministic expression")) + } + } finally { + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS py_uuid_token_mtmv; """) + sql """ DROP FUNCTION IF EXISTS py_uuid_token(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_token_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_token_det(INT); """ + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP TABLE IF EXISTS mtmv_uuid_seed; """ + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy index 7a26136ed2d41c..cab7d580c61c22 100644 --- a/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy @@ -59,12 +59,24 @@ suite("test_pythonudf_float") { qt_select """ SELECT python_udf_float_test(cast(2.83645 as float),cast(111.1111111 as float)) as result; """ qt_select """ SELECT python_udf_float_test(2.83645,111.1111111) as result ; """ qt_select """ SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; """ - createMV("create materialized view udf_mv as SELECT user_id as a1,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id;") + sql """ DROP MATERIALIZED VIEW IF EXISTS udf_mv; """ + sql """ + 
CREATE MATERIALIZED VIEW udf_mv + BUILD DEFERRED REFRESH AUTO ON MANUAL + DISTRIBUTED BY RANDOM BUCKETS 2 + PROPERTIES ( + 'replication_num' = '1', + 'version_info'='3', + 'enable_nondeterministic_function' = 'true' + ) + AS + SELECT user_id as a1, python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float; + """ qt_select """ SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; """ explain { sql("SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; ") - contains "(udf_mv)" + notContains "(udf_mv)" } sql """ CREATE FUNCTION python_udf_double_test(DOUBLE,DOUBLE) RETURNS DOUBLE PROPERTIES ( @@ -80,6 +92,7 @@ suite("test_pythonudf_float") { qt_select """ SELECT user_id,python_udf_double_test(double_1, double_1) as sum FROM test_pythonudf_float order by user_id; """ } finally { + try_sql("DROP MATERIALIZED VIEW IF EXISTS udf_mv;") try_sql("DROP FUNCTION IF EXISTS python_udf_double_test(DOUBLE,DOUBLE);") try_sql("DROP FUNCTION IF EXISTS python_udf_float_test(FLOAT,FLOAT);") try_sql("DROP TABLE IF EXISTS test_pythonudf_float") diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_deterministic.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_deterministic.groovy new file mode 100644 index 00000000000000..d1dbdb63f7e437 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_deterministic.groovy @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_deterministic") { + def runtime_version = getPythonUdfRuntimeVersion() + + try { + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP FUNCTION IF EXISTS py_uuid_expand_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_expand_det(INT); """ + sql """ + CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO cte_uuid_seed VALUES (1), (2), (3); """ + sql """ sync; """ + + sql """ SET enable_nereids_planner = true; """ + sql """ SET enable_fallback_to_original_planner = false; """ + + sql """ DROP FUNCTION IF EXISTS py_uuid_expand(INT); """ + sql """ + CREATE TABLES FUNCTION py_uuid_expand(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_expand_impl", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import uuid + +def py_uuid_expand_impl(x): + if x is not None: + yield (f"{x}-{uuid.uuid4()}",) +\$\$; + """ + def showDefault = sql """ SHOW CREATE FUNCTION py_uuid_expand(INT); """ + assertTrue(showDefault.size() == 1) + assertTrue(showDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE TABLES FUNCTION py_uuid_expand_false(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_expand_false_impl", + "runtime_version" = "${runtime_version}", + "deterministic" = "false" + ) + AS \$\$ +import uuid + +def py_uuid_expand_false_impl(x): + if x is not None: + yield (f"{x}-{uuid.uuid4()}",) +\$\$; + """ + def 
showExplicitFalse = sql """ SHOW CREATE FUNCTION py_uuid_expand_false(INT); """ + assertTrue(showExplicitFalse.size() == 1) + assertTrue(showExplicitFalse[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE TABLES FUNCTION py_uuid_expand_det(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_expand_det_impl", + "runtime_version" = "${runtime_version}", + "deterministic" = "true" + ) + AS \$\$ +import uuid + +def py_uuid_expand_det_impl(x): + if x is not None: + yield (f"{x}-{uuid.uuid4()}",) +\$\$; + """ + def showDet = sql """ SHOW CREATE FUNCTION py_uuid_expand_det(INT); """ + assertTrue(showDet.size() == 1) + assertTrue(showDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized """ + WITH cte AS ( + SELECT id, token + FROM cte_uuid_seed + LATERAL VIEW py_uuid_expand(id) tmp AS token + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined """ + WITH cte AS ( + SELECT id, token + FROM cte_uuid_seed + LATERAL VIEW py_uuid_expand(id) tmp AS token + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized_det """ + WITH cte AS ( + SELECT id, token + FROM cte_uuid_seed + LATERAL VIEW py_uuid_expand_det(id) tmp AS token + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = 
true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined_det """ + WITH cte AS ( + SELECT id, token + FROM cte_uuid_seed + LATERAL VIEW py_uuid_expand_det(id) tmp AS token + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + } finally { + sql """ DROP FUNCTION IF EXISTS py_uuid_expand(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_expand_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_expand_det(INT); """ + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + } +} diff --git a/regression-test/suites/query_p0/javaudf/test_javaudf_deterministic.groovy b/regression-test/suites/query_p0/javaudf/test_javaudf_deterministic.groovy new file mode 100644 index 00000000000000..54364f88ed72cb --- /dev/null +++ b/regression-test/suites/query_p0/javaudf/test_javaudf_deterministic.groovy @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.junit.Assert + +suite("test_javaudf_deterministic") { + def tableName = "test_javaudf_deterministic_seed" + def udfMvName = "java_udf_deterministic_false_mtmv" + def udafMvName = "java_udaf_deterministic_false_mtmv" + def udfDetMvName = "java_udf_deterministic_true_mtmv" + def udafDetMvName = "java_udaf_deterministic_true_mtmv" + def jarPath = """${context.file.parent}/../../javaudf_p0/jars/java-udf-case-jar-with-dependencies.jar""" + scp_udf_file_to_all_be(jarPath) + + try { + sql """ SET enable_nereids_planner = true; """ + sql """ SET enable_fallback_to_original_planner = false; """ + + sql """ DROP TABLE IF EXISTS ${tableName}; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS ${udfMvName}; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS ${udafMvName}; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS ${udfDetMvName}; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS ${udafDetMvName}; """ + sql """ DROP FUNCTION IF EXISTS java_udf_deterministic_test(INT); """ + sql """ DROP FUNCTION IF EXISTS java_udaf_deterministic_test(INT); """ + sql """ DROP FUNCTION IF EXISTS java_udf_deterministic_true_test(INT); """ + sql """ DROP FUNCTION IF EXISTS java_udaf_deterministic_true_test(INT); """ + + sql """ + CREATE TABLE ${tableName} ( + id INT, + v INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO ${tableName} VALUES (1, 10), (2, 20), (3, 30); """ + sql """ sync; """ + + File jarFile = new File(jarPath) + if (!jarFile.exists()) { + throw new IllegalStateException("""${jarPath} doesn't exist! 
""") + } + + sql """ + CREATE FUNCTION java_udf_deterministic_test(INT) RETURNS INT PROPERTIES ( + "file"="file://${jarPath}", + "symbol"="org.apache.doris.udf.IntTest", + "type"="JAVA_UDF" + ); + """ + def udfShowDefault = sql """ SHOW CREATE FUNCTION java_udf_deterministic_test(INT); """ + assertTrue(udfShowDefault.size() == 1) + assertTrue(udfShowDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE AGGREGATE FUNCTION java_udaf_deterministic_test(INT) RETURNS BIGINT PROPERTIES ( + "file"="file://${jarPath}", + "symbol"="org.apache.doris.udf.MySumInt", + "always_nullable"="false", + "type"="JAVA_UDF" + ); + """ + def udafShowDefault = sql """ SHOW CREATE FUNCTION java_udaf_deterministic_test(INT); """ + assertTrue(udafShowDefault.size() == 1) + assertTrue(udafShowDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE FUNCTION java_udf_deterministic_true_test(INT) RETURNS INT PROPERTIES ( + "file"="file://${jarPath}", + "symbol"="org.apache.doris.udf.IntTest", + "type"="JAVA_UDF", + "deterministic"="true" + ); + """ + def udfShowDet = sql """ SHOW CREATE FUNCTION java_udf_deterministic_true_test(INT); """ + assertTrue(udfShowDet.size() == 1) + assertTrue(udfShowDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + sql """ + CREATE AGGREGATE FUNCTION java_udaf_deterministic_true_test(INT) RETURNS BIGINT PROPERTIES ( + "file"="file://${jarPath}", + "symbol"="org.apache.doris.udf.MySumInt", + "always_nullable"="false", + "type"="JAVA_UDF", + "deterministic"="true" + ); + """ + def udafShowDet = sql """ SHOW CREATE FUNCTION java_udaf_deterministic_true_test(INT); """ + assertTrue(udafShowDet.size() == 1) + assertTrue(udafShowDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + try { + sql """ + CREATE MATERIALIZED VIEW ${udfMvName} + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, java_udf_deterministic_test(v) AS result + FROM 
${tableName}; + """ + Assert.fail() + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("can not contain nonDeterministic expression")) + } + + try { + sql """ + CREATE MATERIALIZED VIEW ${udafMvName} + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, java_udaf_deterministic_test(v) AS result + FROM ${tableName} + GROUP BY id; + """ + Assert.fail() + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("can not contain nonDeterministic expression")) + } + + sql """ + CREATE MATERIALIZED VIEW ${udfDetMvName} + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, java_udf_deterministic_true_test(v) AS result + FROM ${tableName}; + """ + def udfDetShow = sql """ SHOW CREATE MATERIALIZED VIEW ${udfDetMvName}; """ + assertTrue(udfDetShow.size() == 1) + + sql """ + CREATE MATERIALIZED VIEW ${udafDetMvName} + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, java_udaf_deterministic_true_test(v) AS result + FROM ${tableName} + GROUP BY id; + """ + def udafDetShow = sql """ SHOW CREATE MATERIALIZED VIEW ${udafDetMvName}; """ + assertTrue(udafDetShow.size() == 1) + } finally { + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS ${udfMvName}; """) + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS ${udafMvName}; """) + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS ${udfDetMvName}; """) + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS ${udafDetMvName}; """) + try_sql(""" DROP FUNCTION IF EXISTS java_udf_deterministic_test(INT); """) + try_sql(""" DROP FUNCTION IF EXISTS java_udaf_deterministic_test(INT); """) + try_sql(""" DROP FUNCTION IF EXISTS java_udf_deterministic_true_test(INT); """) + try_sql(""" DROP FUNCTION IF EXISTS 
java_udaf_deterministic_true_test(INT); """) + try_sql(""" DROP TABLE IF EXISTS ${tableName}; """) + } +} From ab7280a09c75dbf4770553c1d0ed59b32d948a1c Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Thu, 9 Apr 2026 18:01:06 +0800 Subject: [PATCH 03/11] [Fix](pyudf) Fix python udf error propagation fix p0 fix p0 fix p0 --- be/src/udf/python/python_server.py | 32 +-- ...test_python_raise_error_propagation.groovy | 196 ++++++++++++++++++ .../pythonudaf_p0/udaf_scripts/pyudaf.zip | Bin 7858 -> 9153 bytes .../pythonudaf_p0/udaf_scripts/udaf_errors.py | 42 ++++ .../test_pythonudf_file_protocol.groovy | 11 +- .../udf_scripts/array_int_test.py | 2 + .../array_return_array_int_test.py | 2 + .../array_return_array_string_test.py | 2 + .../udf_scripts/array_string_test.py | 2 + .../pythonudf_p0/udf_scripts/float_test.py | 2 + .../pythonudf_p0/udf_scripts/int_test.py | 2 + .../suites/pythonudf_p0/udf_scripts/pyudf.zip | Bin 6086 -> 15967 bytes .../pythonudf_p0/udf_scripts/udf_errors.py | 22 ++ .../pythonudtf_p0/udtf_scripts/pyudtf.zip | Bin 10216 -> 10924 bytes .../pyudtf_module/exceptions_udtf.py | 7 + 15 files changed, 304 insertions(+), 18 deletions(-) create mode 100644 regression-test/suites/pythonudaf_p0/test_python_raise_error_propagation.groovy create mode 100644 regression-test/suites/pythonudaf_p0/udaf_scripts/udaf_errors.py create mode 100644 regression-test/suites/pythonudf_p0/udf_scripts/udf_errors.py diff --git a/be/src/udf/python/python_server.py b/be/src/udf/python/python_server.py index d16fc352178942..2b9d260b47fddd 100644 --- a/be/src/udf/python/python_server.py +++ b/be/src/udf/python/python_server.py @@ -628,11 +628,9 @@ def _scalar_call(self, record_batch: pa.RecordBatch) -> pa.Array: converted_args, traceback.format_exc(), ) - # Return None for failed rows if always_nullable is True - if self.python_udf_meta.always_nullable: - result.append(None) - else: - raise + raise RuntimeError( + f"Error in scalar UDF execution at row {i}: {e}" + ) from e return 
pa.array(result, type=self._get_output_type()) @@ -1731,7 +1729,9 @@ def _handle_udaf_create( place_id, e, ) - success = False + raise RuntimeError( + f"CREATE operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([success], type=pa.bool_())], ["success"] @@ -1879,7 +1879,9 @@ def _handle_udaf_serialize( place_id, e, ) - serialized = b"" + raise RuntimeError( + f"SERIALIZE operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([serialized], type=pa.binary())], ["serialized_state"] @@ -1908,7 +1910,9 @@ def _handle_udaf_merge( place_id, e, ) - success = False + raise RuntimeError( + f"MERGE operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([success], type=pa.bool_())], ["success"] @@ -1932,7 +1936,9 @@ def _handle_udaf_finalize( place_id, e, ) - result = None + raise RuntimeError( + f"FINALIZE operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([result], type=output_type)], ["result"] @@ -1954,7 +1960,9 @@ def _handle_udaf_reset( place_id, e, ) - success = False + raise RuntimeError( + f"RESET operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([success], type=pa.bool_())], ["success"] @@ -2262,9 +2270,7 @@ def _handle_exchange_udaf( e, traceback.format_exc(), ) - result_batch = self._create_unified_response( - success=False, rows_processed=0, data=b"" - ) + raise # Begin stream with unified schema on first call if not started: diff --git a/regression-test/suites/pythonudaf_p0/test_python_raise_error_propagation.groovy b/regression-test/suites/pythonudaf_p0/test_python_raise_error_propagation.groovy new file mode 100644 index 00000000000000..669bca070f9e47 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_python_raise_error_propagation.groovy @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation 
(ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_python_raise_error_propagation") { + // Keep using the existing type-wide archives under regression-test/suites. + // This avoids introducing extra zip names and preserves the same loader/cache path shape as p0. + def suitePath = context.file.parent + "/.." 
+ def udfPath = """${suitePath}/pythonudf_p0/udf_scripts/pyudf.zip""" + def udafPath = """${suitePath}/pythonudaf_p0/udaf_scripts/pyudaf.zip""" + def udtfPath = """${suitePath}/pythonudtf_p0/udtf_scripts/pyudtf.zip""" + scp_udf_file_to_all_be(udfPath) + scp_udf_file_to_all_be(udafPath) + scp_udf_file_to_all_be(udtfPath) + def runtime_version = getPythonUdfRuntimeVersion() + log.info("Python UDF zip path: ${udfPath}".toString()) + log.info("Python UDAF zip path: ${udafPath}".toString()) + log.info("Python UDTF zip path: ${udtfPath}".toString()) + + try { + sql """ DROP TABLE IF EXISTS python_raise_error_test; """ + sql """ + CREATE TABLE python_raise_error_test ( + id INT, + val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO python_raise_error_test VALUES + (1, 1), + (2, 2); + """ + + sql """ DROP FUNCTION IF EXISTS py_inline_raise_udf(INT); """ + sql """ + CREATE FUNCTION py_inline_raise_udf(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) AS \$\$ +def evaluate(x): + raise TypeError("inline_udf_error_42") +\$\$; + """ + + test { + sql """ SELECT py_inline_raise_udf(1); """ + exception "inline_udf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_module_raise_udf(INT); """ + sql """ + CREATE FUNCTION py_module_raise_udf(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${udfPath}", + "symbol" = "udf_errors.raise_in_module", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + test { + sql """ SELECT py_module_raise_udf(1); """ + exception "module_udf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_inline_raise_udaf(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_inline_raise_udaf(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "InlineFinishErrorUDAF", + 
"runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) AS \$\$ +class InlineFinishErrorUDAF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + raise TypeError("inline_udaf_error_42") +\$\$; + """ + + test { + sql """ SELECT py_inline_raise_udaf(val) FROM python_raise_error_test; """ + exception "inline_udaf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_module_raise_udaf(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_module_raise_udaf(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${udafPath}", + "symbol" = "udaf_errors.ModuleFinishErrorUDAF", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + test { + sql """ SELECT py_module_raise_udaf(val) FROM python_raise_error_test; """ + exception "module_udaf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_inline_raise_udtf(INT); """ + sql """ + CREATE TABLES FUNCTION py_inline_raise_udtf(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "inline_raise_udtf", + "runtime_version" = "${runtime_version}" + ) AS \$\$ +def inline_raise_udtf(x): + if False: + yield x + raise TypeError("inline_udtf_error_42") +\$\$; + """ + + test { + sql """ + SELECT tmp.col + FROM python_raise_error_test + LATERAL VIEW py_inline_raise_udtf(val) tmp AS col; + """ + exception "inline_udtf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_module_raise_udtf(INT); """ + sql """ + CREATE TABLES FUNCTION py_module_raise_udtf(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${udtfPath}", + "symbol" = "pyudtf_module.exceptions_udtf.raise_in_module_udtf", + "runtime_version" = "${runtime_version}" + ); + """ + + test { + sql """ + SELECT tmp.col + FROM python_raise_error_test + LATERAL 
VIEW py_module_raise_udtf(val) tmp AS col; + """ + exception "module_udtf_error_42" + } + } finally { + try_sql("DROP FUNCTION IF EXISTS py_inline_raise_udf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_module_raise_udf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_inline_raise_udaf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_module_raise_udaf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_inline_raise_udtf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_module_raise_udtf(INT);") + try_sql("DROP TABLE IF EXISTS python_raise_error_test;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/pyudaf.zip b/regression-test/suites/pythonudaf_p0/udaf_scripts/pyudaf.zip index 1dc76099d43326a3eb48e6b6e93b3eb0ae163100..835aa5af0ad6dc149f0b1fbe3f22bb0c46c59f04 100644 GIT binary patch delta 2434 zcma);c{J2}AIE>T!B`^uHWHN?yO0@6nzE%d7)yv|EFopd+Q`UWE`!NfqtRHBB`VjT zGNwsgPqM_=LJf&7V=axPr%r#|=XtvKJe}|Pe9!rw^Ln4p=X=ig^ZCA;%xKo4C_5fr zNdN!@fDh!QuoG30YhyUBc0ARtDeTrozw!QcS zE02bfRme66oY$)7A;&VowkdS-fS{!Uj#V4P=0q{SEKf%JUyv}lDh)2-yxp*vUT;4X z;$O=An8fB#`?d?SuW09IU-%Pg%LcMU3472v*~`%BsQ5LBx+zWtUVv&~%6+#(+jv7x zX!!u@QX962ZZV^ebx{|*a9B11(4m6%!4vaR(HK-UWe`j9@U#IV&o;g{o#`)vSaeW* zlrs;|cgg6=2+~wBVA)D>G4(4%j7JYu|8 z@9S}&GQV-ldw1S@mA!0s%`?)ZLK5C!4GKug9l8Ouka%6;)=-m@E%V2P9d-CC3xxgj z4B#1a31f;-wohe=dx>-?UqMbO)B9p&jHAF3%aN6LxNktK38pphYDWtYG}T8Ra-AyB zXdpMMiGMG4cONr%D<|wG_$bSuS{G?e=|m{V9a|h6Ty1x`JM#uCcSlM%Tn@pSEZO8? 
z>65(RuI{q02qXjSN`1z;53e(&$h6W_+eP7`ntj{4lF5IZkRicnTi5Ot*E=PH^$hDn zS>ZUhrhe%Kj7;fliXh$I6E2OHBj!5oGUJlPE1;rdVkNfr@;yk>c*0VXP)cdVi@kx<`2d9I32m3sK8LQb&5y&0#zj>d)rp#gQ6aUv zhrH-Hx92+Es+1Uitex{)Mn86xUhpwn#aoU=<()nM#?Jpw>qNsiySsxY{QQ_kGlUrI z@|hR8*@{PsV`gJZPo{H%$7=F%B-zPljTJ639Q`VDEeDY$$=??Z$yDVtf+MmpuMC4z z(X*WP)zr?NjQxo_L zZ#TIm3`~6zaL@?n*hS7=#)!GJI;DgXMGcQBj@M^y78 z#qy2>qu?g|HsWTurio|KcItjFmyn}eRtvbfPw2f`Yol9g~eWUrsv|hYmWE~qS zpB;`vm`BV{DiqzX@#k%>8t)n@ITzV)Me5=+(4x_j9l&AoolN!20Xb8~SjfjNEmORm`W_+O2%ISfsU^QyglfmHkH$OI96};7LnU4TSajU zt8G_EG<1$EM^>*1$ulPotg6xUX1Kcuca$%#(wExOgq6P z;TfvUb4feQ$KAr^xyQTS{!|H=Kw`VJ8ZM!Mk{k8JcaPIJucR7k#LA{ex^^{8_+J%$ zA`4=w`<+Bn*1GX4KwV#5b^8RFk0|MeBsny8H(g`2mWWPovASaIU598|o+9hs1$>z% z?PN>mWvTFj9B>-jR=BmNTAWF3c)nUcH!l$Y(kWjIP0pJMeX7uTNR7OU@nyl}n)1cB ztirY4cw+VdH(|0vTQ;vIPTVI*L3~WSiiD* zdf*a2%8rX0!tQ0eN5JoX(a~j(ZRutL0nBDzDXL{Ms#RQaGl`wBPa*`&VoGQ=zayjB8ER4Vk8-^LvcH z*%Y?y(q~{BKfo<=3t}P$YEBmE(i_^2d4ej80C>tm7_Z{A@Y>&Y9=?+%~=t zvnsimte>pp%Bx&4?`m9pnAS$gS!cuQn_VX!>X4k1o%}vmvsEp0)5;gi4m2tB1_xH& z`m3V?-GBM_a&uVn)XbhPiuOQrr+_%m(SKe zbk@7vI$K)e*^4!A53(J9H8u01<+aQEr+i$^V7e=QrkVZ8vk#}at)6v?D}4T=b`759 zhR%kL4Gyp0WuD=hBzHBs)Nb>V3fZL7F~yfZd)Vp)9Ik&T5mjz>(6)bBjFs!w^EQ5u z%VPbt)k_Ve&Gw9Kli7z>MgyWr7LdvZhKjubn1CPXxP$4uYPa) zxlh_6;V$o)O&4c1%O@tEco}`;$B#eS`T4z<7b>j3k&*pdp4+*pQ7h2q?fc{v{ikNn zxwZIpOI!IiTdCfJ4dz*w-$#D??R-0|cFw`-jQ;a0W=nlzixw1-`tJ3vqG~^5fHxzP z95W*KO+Kios|?D1Xek_79XJmPDj6z+@*o~{;Cz^>WT1?Z2N@WU%>(Df9ZD9u82J!Y g9a?^rRaV!;k{1KKS=m4(1}hLQV`O0XCIjLD08qTRWB>pF diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/udaf_errors.py b/regression-test/suites/pythonudaf_p0/udaf_scripts/udaf_errors.py new file mode 100644 index 00000000000000..e7ea9ed3c3db60 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/udaf_scripts/udaf_errors.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module-based UDAF error cases for regression tests.""" + + +class ModuleFinishErrorUDAF: + """Raise a stable error from finish() to verify propagation.""" + + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def reset(self): + self.count = 0 + + def finish(self): + raise TypeError("module_udaf_error_42") diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy index 151de115abafa1..be11b64d339a41 100644 --- a/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy @@ -55,19 +55,20 @@ suite("test_pythonudf_file_protocol") { qt_select_file_string """ SELECT py_file_string_mask('1234567890', 3, 3) AS result; """ // Test 3: Load float_test.py from zip package using file:// protocol - sql """ DROP FUNCTION IF EXISTS py_file_float_process(FLOAT); """ + sql """ DROP FUNCTION IF EXISTS py_file_float_process(FLOAT, FLOAT); """ sql """ - CREATE FUNCTION py_file_float_process(FLOAT) + CREATE FUNCTION 
py_file_float_process(FLOAT, FLOAT) RETURNS FLOAT PROPERTIES ( "type" = "PYTHON_UDF", "file" = "file://${zipPath}", "symbol" = "float_test.evaluate", - "runtime_version" = "${runtime_version}" + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" ); """ - qt_select_file_float """ SELECT py_file_float_process(3.14) AS result; """ + qt_select_file_float """ SELECT py_file_float_process(3.14, null) AS result; """ // Test 4: Load boolean_test.py from zip package using file:// protocol sql """ DROP FUNCTION IF EXISTS py_file_bool_not(BOOLEAN); """ @@ -120,7 +121,7 @@ suite("test_pythonudf_file_protocol") { } finally { try_sql("DROP FUNCTION IF EXISTS py_file_int_add(INT);") try_sql("DROP FUNCTION IF EXISTS py_file_string_mask(STRING, INT, INT);") - try_sql("DROP FUNCTION IF EXISTS py_file_float_process(FLOAT);") + try_sql("DROP FUNCTION IF EXISTS py_file_float_process(FLOAT, FLOAT);") try_sql("DROP FUNCTION IF EXISTS py_file_bool_not(BOOLEAN);") try_sql("DROP TABLE IF EXISTS file_protocol_test_table;") } diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py index ef3020985d4a4d..2e751c80ffa512 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py @@ -17,6 +17,8 @@ def evaluate(res): + if res is None: + return None value = 0 for data in res: if data is not None: diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py index 7781d788f0794c..45292ab5499c0d 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py @@ -17,6 +17,8 @@ def evaluate(res): + if res is None: + return None value = 0 for data in res: if data is not None: diff 
--git a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py index 92864bc800cb1f..4991fb395957ff 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py @@ -17,6 +17,8 @@ def evaluate(res): + if res is None: + return None value = "" for data in res: if data is not None: diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py index ede02c1201e713..4539a27cb6aba1 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py @@ -17,6 +17,8 @@ def evaluate(res): + if res is None: + return None value = "" for data in res: if data is not None: diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py index 3b2d726ff406f7..1ce8ca82010bb7 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py @@ -17,4 +17,6 @@ def evaluate(arg1, arg2): + if arg1 is None or arg2 is None: + return None return arg1 - arg2 \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py index b96f6b0d4029ef..23df4b06ce616a 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py @@ -17,4 +17,6 @@ def evaluate(arg): + if arg is None: + return None return int(arg + 1) \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip b/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip 
index b4ed70a402bc025e976a5ef0775f453ee9071506..7a33bc3e20d22a932909d9567e87b59bcc390356 100644 GIT binary patch literal 15967 zcmeIZbySt<+xNZc?rso}M!LJZq@}xK(~WdUr+}cOA|NdxQX<_d-5?;i0Z9Szxwbkp zFf+e<=Kj6!U(b5}*eqC!b)1`Pe>mcMd`}H!ICy*z2!sq84vsaN`1aKs0~Q2oh690c zK;$5EFxcGB)ZWF@)YIC-likfvM;iqM$9Zqc9{Td~MFYXWUje`S?6yM! z+6#;4RTRhB%t$%Y%nlLsYdiz)7;q*8-rEq|)%LwPgemP~1kRy=!^PZ#ynO4C0kEW6 znlN`yW#2)c=eECjws*_jNH;AO!#6W7ioVv;hVDEJ&QHhf1{%O}`9jfBqS&cV(LK5c zW_UH5y8bfZMqfrIiam$W8u{LS-`9Tw$2YZgd`8>S#^OO{e3C41N z{oNXa_lsUWQ|@OMuZ?q8%Z?jcns zzprn=XFgpXpIJh|J`)0O72(+tbs}wpXXUzNJvw65MjBq`sxF;6;?!3)JmVws$d#wd zv6)L6BLj_=+c@G(?5UB`1&E~C(J&Cn$5hDfO%XQYWhQ=YlN>2GgmJnPPyqRu^In#W zRh=w3b3guJ4+;IugV%2iQY*LU1p^nvskZ2Y{o^L30nEP3oL(H!2-02PzW^{}30+D( zGXH_u(!D=0i~lQT{Qo0n{J$`Jc>}YM-!W?`%H99Me6WWjX5I0EV+_NK40GFbgk?0k zb&L=r%}GyK} zsV5j*q1;Ajca?N3-F&!e#e^opEi992yX)sK`n-vlyw^psB1h98qX2xPk`-Gd0DPOr z42F+kvS+>mIZhe$a#aS%acri|u}f$Wav;ae@oGKo3LhQ0QVZ>v;*J&*xbKYe{Pxu#ng3XVi@U4NJdLt{WU>wUKvcuXtKj7;v z&G?byV&m;f{>X8uEUWzEM?Z2LKNLQxpE=HQ7=Ul_XO3$rnPC-aQ@IR^NFq67!3?xq zh*Cb!I#N1X;j5QjW1Qp1oEHyPTWpF=)M-ZkHfUF9e<^wncauI6-e>nQ0s2}1`tX0J z?|%o}wUTQ5Ka|wo>m}9d|A&$~{nwJ3`(Kw-Q)o%8Y9E8e=oosNRJZXG(63aV(TT-2Gmek~5B~|1(w4{az{TC&b5?WHT*3T@z z2Nq}jQBtFW&a;0jsVt%5X-DpV$#s8IQvY49^SP1h{wS#}Wiza={2!iOAX9Pg^OK)N zwP0i&-Pumqnhdgwzg^T7y>;ggwzjnWj1cpKg>D0E&HToehle%T)70AC%iQTkGJB`g zZBKgZFYQ|7-A-^W2egu!?Dyoic&8N!i;ca>=Tbd*;GWa38HM18ikl%5^Zhl;j9T{g z6LJvSh4)fG>w^PE53Nj80Sq6yHDuPTUAo{Sqvs2~RLchZUiP zRC-fDqX11xJL)OT#x8kZjMTAbt2$(g@kb%GC4RYAA`FhJ-)i`pkjet|Yp-jb(hwl! 
zNQApbF`J`4V|JO(e^8N#*`jB)tJyg3yys<=%R&%P6<|Lgq~6lnN6h$|n6vv%)!N45 zaJyL-%7a*Ts-o>FfG!jzVb4r4hww$P1BHjRq3S*5Ct$4Ci~)Nc@TTm%N1Z5YM3P9J zf-2!|MV#IDi#$GqYql1WA7X?u=%h6`O%$k+dY`z3**#Qa z$5a72HmZBsYQ#dPdl(lLW2K zH&>=J!3G0)CC|$72%b#5uu5M_%2c7Pq$;zHLU5I#HP)vc>g;*udTEL8iwo=O^^nJaRAlDoMkmg zT7!&zAi3h2|^bq zHdrIQLLx5#V;UnL!JDAq_XrKAU$s0z8#qvMxQ*pRET)>BBk340>eR?Z!Yxi zwVNz4Q5);4L$@-vE~n48kECs{5(R9HD@mNnnn!I9bKVyxu#KV#u&p5B$uYgLM>Z|u zQJdPwwp~g*B@vM+yPIxxAQriX;*!|!oY&JEp^g-jDvubPls3mp&CnxrHlOU%!BsMf z6F-Bfih3D_-d%(7u{K@s-lPr-bHsoMjeR>6h87!?x=4UJHsf}=%#`nEv+$A{W+-QL z+AQSk&TQ#YhTR6b1QQNK`yvvV9KrjNbeDv$K0L8%Az}WO6k*j?rYmf)mfttVrD9mp zk+u)cyo&5}L85tuG~#*7_<%Fw&Yh{Ei1&1zcb2sWtwWeDsa&ntw7zHK$$gjD^jqru zk}&`6Gc!5e<64ibEyD@dtplV#z)kF`idq5H%^#+e?VmesZmOJawoNoE9b{Fk!aCx7yHqQwE*8|!CXW-bc$M)kB9 z%N6gOluI1ZAtSo)8PsjYEEW{KvNMq=W*|yE#^-9%PY#{V72hVT_Qw!>lPjP;?L8M} z(*VYabHRI8vOP8Jv6@>I=vqaUD~Dl_tZ(pK!G>*`A*!k zR9nFXVsw7B>h80ed9Pv4(u=WJa0MbIzl_g32Jz_ZYYyaMv~VG#fl+$Bw4N#vB$w{w zJ3g6x760B`T&SiJOrwSrMbB5L6Kq6_v`%Ea!x*RZOkuHkduA}qx$RYaGR(6$H{>pl zxxo!~5#2yn-p@_$TqdYuuP-sPx2XOA$NoXAQChFG=tlq?82}ti030hcC&ho9Pi|_wRcW{PLFQrKwCKWS|F=x$ z_wc^Y=1G7_;EQ}ZqkMQx`!gv9c4Y!Y4{4>fLJE73ZHdi2W@BIys4W-s=L#p>Ni6TK z7uf!>f?*-*X9LkN`Ehsl|f8~b$bO*|fTlo6UHE!g8akF_PST=RG z&Gd5+h=rVPOG5{NvL9kcJ9G+#E#puS6C%S8EkzYm@@tSJLNu2gS!q&3)p-Ux;kn?! 
zrk(3kBeW~e(5S=yX1k;Mja>aelH+>lr8-pX&|*&Yaq&N?HM_S|fxI-gVI0A@T)tO{ z$4>PAQpicDsJudu=3{M|5bqnabdt`G-spnfvOcEd0pb zo?~syk!gnG%p;hNQhibNt9FXB8qZU-?KU2W9AhF4H&DmUa=fg_Tfns*dgAXtlol=e zb8cio|BSRfqL7k_=p1}EQmK%&#Fdpa6&1<(F-+cyVYXH|X+mp!%JEDJB|cd|CPmAJ ziM)6A(`mkRG~@TxXw;}@Vmg!9R2a}Hz9GRC=(NF1t9cZRejI!rOGY<} zw+u8|25$h-A}kmXEu6R5`a>v9!T#|D1#3L&CnOXF+bh{(xBb8>I24R(OE8xi#Ytoq z0+Di;GbArSJSC?ogFwkczYH^qwiIT!qDND>>vhMW)G(x4Ggw@Rud+Tkj}TqiNrnYz zw9wW=S{gO+GmLOTVoHe;^}61@Ba7@Q2g7gzHqAS3BVC7mqS_)fRYQ z;|^MaS2-fO2@vZ(@!|mhkPQG38UWD0l~%9UC?MB-{*`Nz(G}CzND=jaDytz&FGIph-)#C0UgsLGktLdy&_pd()zEe^tYkHYK@%&GyZG} zqFWRTv{7e;RLSW!(26qhHD*#YTARrGw`kT(&ieuIc!>|{Gv+5uk8^+YZC*MtwF^fy z(>?5ce{zBB(vty9f%mSmRWI{BHaiIFXRA`AHYi0{E?&aGn{JYvufosvJPr43cA{Mg z<6V|%@K{(_;Xv;;d0=!ps~w%&)J;pCFu7*CPXVJ|ZtiG4QD8zU5YU{uCBZ;MlC=}dZ3(%i@u)KFc<6 z*zgEhZ(IwvBi9l*-k-2bAkry! zb)zM@U)+W2s*6u+h*=l41hGohkG#k(5&obCI_hD2)tiBUw%ZY)h7q8~+1%}iDF3Ti z0XnMbFHPZZq=o-5{#k3OCQSkvX;bgPgfffL;H@UsRc!BjI5kQow+9^}1)BQl8WeWT zgmWu(FMaYSBP!qY*NQ^9dI0%NUEq?f>~k+}@oeKzxwr64O5hp6aFJ&#wb4wnF8PoG znFCK8xH2y#dSvB^7(bmR$;YUSI+nt8zpeb;+A|yJk_ypR$tURDXC^JQO2s86xu)8M zX^g0Vnw3-BCCxns8gZjnq0x!#{M_I7X-nZWE~1SsO7@ zCg@N2936m{UM!B2BXmTp**2Z=gp#@6DxvVoG!4KL2jm4egvwv?!oZa(sI#{Qs%C*X zx2TOxGE>|-(cKDx0r1#=;q;plx{=6{n4tkWbqyTbJ`5OnqInAS zkJIAe6h0K5)?jR-Kd}sUU9Kz71kEv>4mz2%N6 z<-HMWCJ%+XIt{m2@Ss@r`iBV_WbNH^GWMt3ndU zq4&;J?u3*MChdRvB(30z%r~^L${^OA^x{sB1tZU{(Aq*x2+SXZLE9iByhv1dfUszQ zFuZGo{W(AV$6n~CmT*WQeA44$oVU&Sh}_OX{b8x^+hiNCQ8#Q(XtC0*Hz7+I=n1y| zu!?)1U!Qts?K3K+W-_~>8Yw5+v3^Z^KAT{NeBzKyAoPT;A~mobQ7jdPp2;wbYm5ak zLrI8MEqpB=qJ>?}DI=)#%3=$UVV67aR4k2aY3=3~|4E{4Bz$sYZ-!ydLu%WB;_S9Yoic>T7VO#T~D*(%r*~sQ$MZYfLd1 zmmh@|Aya3Fy69#j7XC&R?Fir`+vI^xv+W|ju>i%{8r{tIJ8F+vMg71hFMyHmp~KTO z=_t#6nDW8J_SC&pgN)8c4(gWl0_y`mITXPt>KOa7l?=C|V5g`GGr(E%# zqum7RO@yr~3IJ3L02JvjprCmQ3e?X)X-qQMfF7#PrKq3*$;xi0d_JkCAEm)Syg ziL9GOak&ICEc4xKR8$Q(h$72Owic!IzuWWUxfam3>2xKhPuDW9m72F?fEgOZjnIx?d$1n4KC^abPuG|~5%XD4r2GDw%St+a7j%Yu&d!2QdIl*h-jjak85!vi9wN|w-fHRM0mlgm!JoW 
zK-J-Ia6x4#Xg&N5u791eu1)>_kF5V2$Wm;6l~@f|%@$O;6w=ilFMjAQOWFHc3S_z6 zR2PpT?j8)1+uQ1XuU@Tg_Z=0x+lAw`AlbKToffnX`WOaQU;%0C0g$#Ru95Yh+pGUv z5q~p71lH&f@#_RAzK2tFBH-;xrAa6(kLE@9CHNsM=^QUz5U#z|Qn!mfwOI_{=59$t zU>T$rRE2cl`xJICOlFWpn9pvRWOON3+ELo(BCPZSL!X%(F!aGN*MsFC0nt((4hn^Q zLu@Rzw1p4+cJo~h*KJfYoJ88vPh`5!BwOI9Cng>bK zDk}CH-rRP<<{W`d=kcp94&${kP#)8QD+`e1ie}~9T(1l1E~po zLnkVb1G6L+*B32*adchH3KNEIs=X8N%lFs&XWD^HwZQRM00W4q2y5a!{euWJo5y$* zc{^2l&&`Q%aXKW_Khv}IcrFZVs&`(F3FAU^x-ZZM}guq=!`q8~|hQp8fc@(L)JlzuhWW1nBa zJqy{g$z8ndSF3t3tC1%bcE+aYn)OBjHTXdfpM$8P%gZJBL&`_1Aw zo>9)n3s80lP)72Lvi~oHU0_$VG=1XK!Q`pYd55nY#S>RfNl0u)HoX90r@6WqVsDcC zghKqW&2MkGVkJL|5nb1eewvU*x1w17wxcjtf?IO<)9ei`31Zu5j4=j>WY}LnbCa8!*y>g-aYbHt2SZdXDM@;hE4Z1KiNKwjnyIW z2{%A8(s**pgw;i`Ey@L)Ork)#4zc&Cp`#}sY`eI?ypEkI9kN z??zr>B_G#fWMh1QN3yHbnQC1SNW?ibq zj##Ro){lxiQ!`#U?Am+h(Z{45FyU}!$H&^!4VT5b7CYL(K}4na=AMdG^i(GV(>2!q z#{KN|r9>=2k+)@c^FH>Ag#$Wz(zbCbLvQ+POzbjs_{;oapL-HYafmC=Cq`iU*V*MO z2#Gv3mocpj6E*d}WCYM>cz8Xjgab*I1uJLUMx+vzT=ZtMhNGdhkv~#Bf92?hMB}Z`;dB-FaU2v1T9cT zo)Q;cOeGs3C0qb?Dpp4TE)ySNAzVr`vdT-23+xZ4r#DX3FKZW<@WP=!+*_Helk5); zp>w5u9vWsSz@aGu2t*8U=;r5X=jvkWWo7frl;i2=X8oJ%>p#qq4QZV&fF~kqFE|c| z9P~c2yjNSa3FWD7N(zTPzATR6Zq3XGs3V6{A?R$R3PXTiG!t#-_>y}`oSU*Mm4T6` z)Qm1$R7#^J;>zOT!ziYR$O_p3^eou7lCQEsqJ^+bV`O348cL{2#T;>a4$?%jx!C$u z*vj~F>M#~ES)zu{U2}p~ud2VK@94^Hkhz@Tck%LQeT#U&Rl~E<$iWhd*1|}ExQ+i& zrNn4myGe*Se-EQi?(sx`iDpB>G0q(|#+KnZ0{Q}Z10O#B#8$bp{YPK)TdB&Z%v4~C zNj&c$rKzhK)zTu{KmOp`kKwCO&Gkk}-&)7*HZBV!?-2|3vSpSu1ii@ZV_@jexp~I} zemcR9#q3Wue2zIb1LQl}<$>N@{P(ovi+r#0eIG5G;6LT%-PRi>@bdP7*Tqrm@`g94 zew#R~+Kch%6Mx(}E4bjzInU{p6MLIbP0B_v$SZ!P6FrQ$%NJxCCh#E07n$Ew@083J z`NIzNY{Vh4u49w2tk24ABiW<+P-=g=m6seXhcM6XY9nK1)M6s1&52pe&OOGvZ=CEc z(9#uQD?;OV>xDsL@*t6R7eqd?e0?_YE}pp(nI?GcgF@PufvIj#gtvXgIi$wO7;i^ zhR~`c!DnSt=5vcHMcTy3JA?--Db5caJ(2=eCx(-?xga^1GRLJZ$o5`$%* z(JA=^ryAx!K*DqzDBV)gZG2O+K9B{$sE{SW5RfL-JxVhoZyQ^0BGWOgw-u}m>paaA zldteO#cn-nF(fXHtkS*(7=7dp?Kp2211>X4qhvm9JNzDC?s%EeP2{JnaEp5gfyK5= 
z2FIhDvg$+UFhvR1>AuV?Vbn>3r*OK0asyvGBSM-2?lxI`UU3(ZEG^(>e8>^2O+uQ5 zL|gZW)|iFQn?c$0UI5j2VV1?c#VV#iO3(K*0v_`i7E((cYoAS2O(&5NI?!9GqSulz zny`_yW;Lz9zEORZ)a*HiY9RYi`H2M%hGOD=YM0yUB7*K!z!b^dUh*D9WJ zrFt*r3nG0{5I@>?u3hG$Ev`U)_B)ED^`}Tws}JS#)=p;6@ia}voAa*TX(c2U+X=a_ z6G&bd^?g3O=3|3`4JRj`5($7^3T)m9Z-Nb-e*Oj5zu9vE=j2J}W3Bm*&{_Hob9sEx zGEZD#pJ8|C%lpPHt;F~2uiEN}oyyO`yieO6o-RIMs9r`vO<*EMv5CjlW__a0%00;u z$&uBd@Nh*d6z`1((N+vt`6o_`P8r;7I*Kj#;D_oVicqU;>i` ziMMFV>dj&kPlLLLGpUWFY2Tw+X~&i@y-YBGF&dMd8KBPZ%6pa`pGT^P)N@z~GlQ({ zKgjZ_u~ux=?~W|hj<;(6rsRi53HR67k+Fg`DI8C`zQbd}#Bf6>dm&kwMzwH1vUC+j zlX^8*afVffPUk}f)@BdyPZF?Ht)yT};!cAdI4iyg815pJahS%p+F5l5lRRRM4d+@L zP^u+kDf$F6#PXHRb*dwv07ocUF(xgV0=;Fge40T0C&BHj8oruS%yJ6(fMJy`5Z}Dnxf63WEZk~UF>n+Z# zO+V$GmxEtgk3-M@nbh9m_2EX6ToFEZj%ouR_|ewFEVJ9zjdwQj{6fkTT(Fka%%9 zva^FP*6}X)yq%sXF*8oL%YMAd(R8uYwq117H9D9yr!3tV>#DuktzaSKX8#dsEsp;k zfI1cNXCHhmTLbN|mnE=T`{xEE1IX9F{yy;Zg}xwGsM4eIq-S52KT0={nllL`+#0N9 zuU>XsNnCssu9@?Fv5lLURX*brl4at|%)^-;Ten_3eA2rdxI>8MhBXoFHA5=H_+6Fr zap@%N2xpULe4rLt1g+&Q^xxRNdd2Ihv5Rtl1GPrR{0N>}&U1G>kh8Vr{sW>$kqv)S zxzCO+eDEuz4$p!m|)9U4`svQ&$Lk6vl%}33L!b&`G zu41Vsdi&I1xdyC~|WX$%Kwy5Nxo1Nsja4KV5M;uMl#! 
z@|a%h)>B2= zJcsuk?_PL>Boh8em9;Pnv6}iI;8i$P%n90S8>O1JWqU9bIAfCX;{^rvLO9h&%W%F= z_`fRFehb%isYW-7ILR`GnH!VRTWl?#mC(v3HjS=_=odS7D!uFFI{qG0kC>0C(*Y0`vCkf*5$(ODDu=EUL(J~FZOJa&h< z%$~T`OyA7LleK;A2EeDo^5KqCl%0BXfZ8iavFFOGp$%3q4= z&gsrM_fI2C9$S0DvTmM&y?on7(&C#v1Ru;FMi?gTmCm%<5Il)}>rYQS7&O@BD2ku& zaN)9N!>M_CR?WHab#qko^m$dPwl&Ah&3BK}EmU^+z>AbeRYod?_}ec#1>b_MsKI|M z884wr#%R!-GGq$6WL!b)pF(g`J~Ok?RB{(8+S4R0qY>?$N$R$g8y)YK23%R}CEaif zl&P%(@mMG>0dc8Bppduy#tP_V95GA6YXqTy7ZCiz&=3q58m1G`4EkXwv=`!Yo_zqe zj7z_78DkTGE#uUWEu$AUJiXq}En_6daaOX7!dEaEHEPo*f4BzU?+pYmj$&>2xm%vd zb~{v1)-beGX}J5{yocl z+ao#;-AuF}Mh6Gk!~*TCDxxU+GGQHLhK54FbO>RDJQO4Di73EWYQ$o@{>;NyGYL&m zJF3}yV>gb`nzE7Mf;ITJOF-t20aK}^jNzvSU{}ot?5gwi-_a46@Pwl;qw96yYw1wC z+>z6@m2c5=+BIl21KfR{J{cChZFo$9CZ7B=P3|HHHN`}`2i|*nrgn)iW?kEsO=o<- zN|6S-QWW>W3t7MBSXLjo;sh=M)dNO{jwa&3WOmnHg(fpZf$^E<6Z1C6MKC>LC5vz*rd)U6~nF6iE!E0*H__F8n(X8MD3u*o~?_UiOB#EQiJ;&b%`ruxoH|~po26_8pbo?^lT!w+Ag!@nT%!q;eXCM&Z ziU1Yghko}r7lIIe-a7jeY<~`W^P181u=)VYKg0g#OGp3ijGGrquAeahwB~Og8u6F1cLsJ*Yi3s&~kax{dYa+7i`d9ynsN=zwrTHkNH=R*!7sy zc!2Tereh2U`8PKy{~YqBYv_7Na{{2&U%#ID&jF#I)J;Rl^?+GKfQj~B+e?7c{(%eV zQ@&{~xPIDKl0Q!S^Plk#j~W{I=92k(U?uXOfq&Zy{(0s6&mSc;{>?q(_4s9!H{<^o zZ~o@0>3VP&nm>a7`XB$BfAxRyd~WU;t{-7b`}2tZIy{u3oAUAX@O$*2>(blLtoZvZ z_s`Toqu-SBu1D`=yb=ATu=nqR-;`6X2hU--5&VX@^6z5boNBMfc4fa28+iCPX54=c z4~6n(i+??QCns<-<4>0vftddQ%3s^P>oHMyK!0xjfRKOF_-QC308>0voFW9R0VT?Y I5Bk&p0SI$~F#rGn literal 6086 zcmb_g2{@E%8~$b(#1z@Xu{FwAq7!YTA!KP{>`}>NY}sOTOtMXe64G#@v|(tFlRC6W zWGD(H6%|q1oK8EDgZke|>idTJ=lK7NXD)MHuKT&4`+eT$yPwbARzOf1f*=v-0L{e% zb+M+N1inRsm<=gGUSVNgG*ACvswb5aPF)a6b9R~m3B2F=(T^Jukzx=^5d16j<-^vR z=xC*D+l(jbSl`jK(QPm9b%4<6R>kyEIx0JBzb7l5>BeNIA2<#<36Rj}1OE2!$c(B{ zNleXXl~V$aqZH&(_G{T1Qh>ETN!c6?*(126xC>$_AlGpa%HOz!QK%7N!Jhw$C^5xU zG)^BY_@Uiw+j zb|h$MDA8Z;xRz6QXN(?s`2|BUFxJws-0Xeya>JVLLC>R_g0h`*v2224<~0>0dCrSa z5nn)@4MJ^d^5N7l|6pH!gog2d#}Y9i^vR9JMT!_;OxqkBdqPSXm1iFus6Q>xL!IZK ztnT@^P3`#xd7)^};{MtDGebpA4=f9r|;Pkf2)9h(m(4Cret;wqCEsj0+ZWA^{4GYtk*-)0Jkd> 
z{KCU2VN_4b)(Edaene%Yo)WFo$C?ktT)vz^ynxHEU6qvfs&%ElIcgPoCvIru@lx9} z-Jwk7F8@Nzz@9CsHfLJi#XrmFUt&=3^qeI&_m-4$Z+61l(sj2i^`3FsjV*Q={+?n- zsun7f0a$K3m z%Lt@|hg1E$f+tlnGWi~+?p-*GqmQ-tZSA|Ow?)?4pVHTFi&qr9Q>tFD+uP_+4L(Q$ zYxQCOQN5byM}%6N7%wvw4y+fc_cb0{D%m+0_iN0hL6lyf0}j`%Wwn`bT~M#z$yinFi&UC_+yj379gMxViq9b z9TGtfr0}COFNYns%IMqbwBi*>hdAFHudB%+9D#`R?^fUC_c|6sf>&17N zgG6p1G(oZ;uh1`#kgvLeiS7AyZ2Vw){5K|zw@FqvzZ`dc6&d5fc;PNT-~DQ9tGTtQ zNA5Y6)m+C#vYJKS$r9E#1+{f-bn&{^1d4c1JwlJ8?+6%%gOGx;oQ^b&Ryx<;5_MX? zaO!GP!gyc=^o?|DtkjGu99E-P2F_}W>`=undVk^!&$s7_1Hmgd`twb2R2ur$rMz9xSn2Dx8_?|m zG3P7ge@&N|wqjPDD%Nu75XU3MeQTV}3bW&B&cbt*7@r6U)~xR8rCygZJ`6|M4H~0D z2G$knU&r;Q%7)K5mbUa9=P~UiKFcUeK}61=PI)c9E+^L4{)n2nA6ii~7L#-Qu`X?| zO#H1{s=9?!E^+W~+{Q;bx?-#Mx(DGbidLrPM;Azt4&A%eOlcwQjr~6S>(0A)nBkeL zZh^Yipdm;e)D}vk`h^60MtJ*twR2QjC}qNPRu6pi;|7oCtk_2s-Kj6Z7&=1UX3ZXe z7~O=f2n~&d>E!PAqXAim?`(edv}Fmd0iC(9IOC7~OGYxC3^nGz^6p8k zUA{;pwcYD=#Zrq_*Y1_&#e{nvkw%N7=TOD-;|sEIUIa`zLAK3(?k4})La}9brWQe3 zGA_kwH;i_W9*&qdkF^xfBwTq2>G(865f9p(cv2=AGdB{$imra!uH9L1(C)Kh*Nd1o zg%+#6r+W34b9^6R?@51>k;>dP)bRL5a6?JBT#xdO4nJD`mU?lV`gVcuB8^>i@krRP z!K4d_>zNUbB;0K2KW$tUl=eGLe3tCso`*b+Z zNy%s|N+!#6_V@L;Zr#{ina?;cY}?USX(4R7VBT*3-g|C7tH#8$6kCfe?iz4Bte(VP zJwBU!GDu7}uQ2##;HsoW6uk!W@g|qGhL2gN)GDIxKX};GKhxy(M%VoILYj+nUQFJz zrbN==AUCx_Lbc*R;r1pWt+`cozKKyMX&ug=&%V}=ZhoGf^NCtMEA>of2Yyd3y3p~h zy^_h`2)eOWMw4S|j1DuGA92k;AM0VfLu@!~;hojmp^}LSh9{M%AB4Y)C49lf!|@RT z^#F*K0nC#uQY`LFf@`Hb^wCcR1dmpfbki;9h0+Oc#$)4QOB^P9Sbo;9hcsm3L9M?KP?;`1zmZTXUU7751IQgF9`^-7&lya8z z@bH2&QW487#LukPlf||_#M-(Eoy^_Kde`t8s0RqLwKjR%fSi!UV6>^&F!wd759 zQueUTY=6#BhiGzHCF|%z()|qz%#3Xbc7e@(_s=ff<+J!uz|EBVNssK0C4LCaxPJNa zV8P$NdtqJ5u9~>rz}K^EyKkD6=OoUIRyopnxZ61EMvac>sB0lvI|lV#EHn^HN}=k6 z9S|y3V0^T|Itxsgh|R!|!0w+E(25&8dj0ND8&Iz*adAjH@gm#lJ&N(E_Vn2!W)jv0 zZjw^BH%OitdE?xHOVnX(x1ZBfMe9FgX;pT8{g#HVO94@vi&cz_MK=o*y_7PyJ$XEH zGyt7xTWhRrhnL?!`*nc9v8ImU6oPd&Vcj{bdtBd%xcMd=SM_DtM~tqdK3X&wW%+!@ zj!n6Ddy^c5Q5y8$2+}e=8y9b^(3-zpQp@aId|UicK|jXEVw*&R;%#>w(ru_SZ&k@{ 
z)4TO)^^?ZbKi|%vR3y?_R>}qx=RxX{?qLIua?T?W^$X7%J>G7WuDrK>&^kd0%Q4zX z_8ExxHZiU-N&i@^hh1lxZ*Hoc7Z<50-eeT)<3w5cYDKaXbpW5}xUZS5uczooE*dH* z$(BhaNL~9U=QDJDS?jy@c`t*}i+034!Jl`aafUQsn7#>LE_Y3MSx~B8;%8h*Q;T!u zLYF6U-%p6UjmtfYJ!=En*%<`am`>9!0dowDsTKHb`|?yB_ivc=$Z}`$31Hxu`7@rUpC~D1kd(2EZa27KyEj1;61|QWlSs( z(>qG!N&?4UEP}+J26^Nj0SE7ffq%W@)4S?O-w#i*Z^k(PX~R#p-f;LJQ3&oa$H&z- zBLKQ)3KI?==94`u9GxkKKu0=Q06v9#g@aG_cyRC@8^GT&OqQ`W!*DT-#*$_1OW8uQm7Oq*HBELYjW9$QDl{b7qNyym!VncL z2q9}4vP0A=%L!LBz@^+j$!vTPp8DIeT{@MXy+ZZb_Ye&G6QxNL#S7wcPY1B_7^)-T-h3mjs z5;pwA<>X{m0Eia6%p*%ow%bm-9Y3YbJ9;)~sFimMSKzt#tysx*pTQTdp>u;Vmg#Oq{H7WbzeAI zG^1P!C*Qjz_d6RtZm+fTt2h|I@zE=K;jp#CUEAd7rPYQ9)~f?7s2EG>#Nm2`vSVee-oePS9tTxF>S304l;`REOrh?lxZDeoC4!e#-tL+* z&4XTD$`jlF)Oq3kBPw+yP~%wh>ktJ;-_ZxuR7B6<<}{2P%d17S*oxrlBOs&e@-1gT zo7%3M@0UD4dbqJMTo->|j@Ix<T57mFE~Dwa<*LZQOsUEu!DJu>w|vw3yzV*}%h!uD1v8XNh?t1H^H62@t@ zw1xEz4arpISfbiT%AT2t#>v+RFP;DvF`s*hU7;mP*%2cPYo7Tro^0G+AEZ5Xy+Lw^ zmLskjBA}3y+Ms<(E6U~LHbqD`&R&GG{Q4CV!-&bewl3auEr~%tcUuX;3F;^p1|!Z4 zPJ|*EiS3b!d!5ddp0NcIYE9IzsIQ&w1nm8rhI6G!M3hC!?)j%~&y!_}@w18;K}1mF z*8I0C++U6hCkPI$C~hK?t11lr4;2`;xm})qG$CQm;r4u4C+}KQ(8-HoK z|Dji(^*t9aKpSKQL);6hJI-TddsShB_q#`1jJuDFx6I*gQ&~)!1c|$D3v4l#;PyOG z`T1F4Tf|z1@2oUtVk0HR@!lxb!Yyi@K12%|jodi>@fIq#b8kbRVOtH_=L@=%Lv9i= zcLzsINHHwhU7l{RtP>^3Vjz-`tbgKL@!JktTz}Nl;Efr*R~Ab~H>_=&Pi6_=vBy@O z6xF}v={5+zaVqj^r%ZY{_+Pgq>4G)K9`UvNo)gdZrf zF8-7lp`ENdT`OL;KhAsbw=4Z+Eq0E+@2tQX!=GeR zcA@0Yaz$&bCDCa{83*6Uiqjj4gt}YKsWK~mcxLFfXqDZo`didPrGOh?@ulBYWn(gY zJBxjn<|A8!gIAKKc|)Gb+(EPI9*!yRJ}&$0_K~UBTl2?NOT$k~qK6VYsv5)@tmT3- z)LT_T_2NbnFPngND=<7pB9C@s49$lp<=ACtxrObg`#Oq_WBi$vD4af8lbQ9A!%Xd6TH4-OGfvju-r- zjnMNX$#>pPL*9nGM>}JhuFomVhGesF;4RL{p+qE`N)j~B^&gPw5LoA4CLe$cAnPV0 zG?ndB-}KenlFA&t@9qxeF8Jb>iM_kvv>&<}Lk@@Qfg;E?a2+Bhq>xzDFv5C`xmmgl zuX>`FY{_}TYXcMz)m}RHYc5Ut{!Xd+YdV@P5}&x^!31F73VyQ(vc{3KnP zZWv$W&lYpg#t`3aS9Qp*-?-}}GuqH9%PFu*SwjaV0N*OTQUCCpx9M9Y3?FeRGuN8%zMj-iYnPP^1 z;j-0gzSC@*9GJU2g$2tz;SbXUbCI_0OSPa}WGiLRm^gHIb%w7~ic@5j)7-WuIL(&c 
z05U-KQIffbc`hK6Q8_oDewMrB?>Gsl5$aP81l4XkH&J4G+}SBopkhJw4Q}g>D+UVLh~$} zTNx(wC8C+YwbPNor$USvW;MfJHLgAe0!mwPN%s_7A*|IS0*ee=u`c>Pw2s) zv|h%1kRa)|uZtLzIwO!AVWjmd-|PcC-sP|ML0-G?m1~*SrBO6>SbMtI9~pRH$UbQs z=*IjHW$jlkl^JA1BH-8_7sYH+XZb=ev9VVD98vedZZJMo>$=ohL=sN%>@e4fa6Z27 zsEP@T$?l}JtB-XyA|&7Wb8zl0G+15XOEB)#Sb1~30kNF#ZZmHA+26?A-ZX)mv@*c3 zCAf|qgsMo+Y?v;rp+*YW&mV)XK4h*6S7gA*=zpcGT_x(5I!wy61v)KYBbJJ-d*?C|)Q2^|p_+PjTko z1puLN01yL&fndTZZ}K%(@xL*EZdTMT<{e8&cToH_Qjt^7{GA|US@!)Jp_Z}GnEqf! zMb!R=krrKr5dY^TI|8#pam;Q60RZ2&wg~>W(y(K=fFSQv{yrES4j+gQ!{HQy3HEku z0C-1rcH$=|@v%7^l%vm6zu|ac(H!-lcD_QVr^&%uA`Obqcq;@s3O`-*$egqlHRfrO z7GXeqT1UW9Z%8>+hGNrcQ!cj#lK4`wv}o9sIm#}3HcWNVBTc=L-EM=I9qIx)4OZU@^yt2v8Qyy;N+p_|>QevV@019Wwc3W}|q8ZE@Q zxnnJKO=t^Be9=^JZt>b^q>gZN$b}|~TJL==xoIv-6V;cg1=quww978#8JZ9w_;ljp zJF^D;BQgp+gCwO_>eVWlT#O!+Z~r3bwE6Nvpl+pe)w7Qg0G?#kq24L;qUMuMCppp* zWBNhjtimi>x8QA+$t5ix--y=U7K%wFEJ!N3&k-O>&b`e?n8qfJUEt&rvrQ0w;OyF6 zBqrQ?N7%PEqgu}6uv*;lt3uzh?yyU*dkoNf&+ij~In*Y(eVw}?oVN8PSv_a_41<_N z!GFpvSq817_>1lSgZtBMDcC6P&&&WY0R;eYaxxl8EA`--ZUYP%H%pWIp91VQ`q5MsoZm0j{p0Pl77MB%( OvI2-x`a3Y delta 2606 zcmZvec{r5q7snsQ3|VF}U$N7{xOxi$gtw4N&0QV8j z4sc0)@`n{W(#*&0dYYMZmtRD!|BXIB{D zLIcU@!_P>_5V_j!v6t!R<$III@gzdE;1w_#B3Iiz{m{Mq&|{o9ixWrZ3?qfXR)Mk#yyMDmitu# zW{+N2wLKrNTYonbqh^{PS>w!M6|KBzrY_=#y~aMTCRhtiAAuP*Fbj^V?i(s$ zv4(wh+?)FB4&o;Msb#H7bq#K;2-7tA^~mk5?~o5zyVzr%hB^H6tiXljbt-t{3Tv$4 z-9>dqPQxaD=w+kJI_IqCyx7xFUr(moXQQIZ5FY;<;`_&KT#n1;Wm3jlK1)+xg84+V zkk$XD9K&8MCzMdibW0a44r;t^PH#)JO*OmNXpE5A-wdR?WP51nx+&7j=524$tzEog5XoqKwvo?3uuAGk&x z`)OO4xZB-eP27GZRdYl{*7?l*{z$*65Bnmse5_;Yn?{K{n)Ez}9yIaq#Cv&&y0*rF z)=LQouQ;bX$Zx5MT`|)%GaaDRyvsU%un&=1rKga98~Lm*8a89?c$MtV*mruo zQbHBZbSBoUwj<<%3U z2$L94-V+w;;47=i6X?CCYCIgH>5%-fhjA*t1m{t{iv+vNcD;4BeA?MS`0u;4{X#}O zp|lG^N66@Bokn(JzI-Qm@oBx9NbRZCB1m<`W>ifbp?Asc?`8X3#PagPrFDX&I)XfS z-G&rdw&WUipUJQ$HpBBO0^~rJ35x@GIwMTCK1_PR@?&rH=(o0dFPEvVpo_K5`B4@0 
zSATf~E}AeqTL-fu51_j4v`Q>Iwkt4GjDvSrk!$-={at6OyI4q*R?!cHbeXdD6RAptJJD-vcWIpNtE~1DX4HCEPQ)4V;yuI+gCxR7>Z_VarSJGQ$LaHYKQFCnwWnyyJ?Lcpj)su#ESZD<=o$b0#8A(&7s}uHizCrlaYX#f&tJUL;?V|R=>aHk_oJ5x~h#A03Lz?00l^K83*9us4)5@n3k%n zs_+M~Y>)AaTz8zu!KyKTWlQ4kx46MLS{Xekg|5N9dyYOrT zjLrG&F<}6Z-xeoAinfJTQuw0}+jIHVkTGRx-p<{$O=Yz5A3fpP!4Pgm_&KYVQzZn7 GyZaxH<2Y~t diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py index b663c7aa878dc7..837426742406c7 100644 --- a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py +++ b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py @@ -211,3 +211,10 @@ def validate_date(dt): status = 'normal' yield (dt, year, is_leap, status) + + +def raise_in_module_udtf(value): + """Raise a stable error to verify UDTF exception propagation.""" + if False: + yield value + raise TypeError("module_udtf_error_42") From 11ad4a90e3755445b945465ffc870a450c340d1d Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Mon, 20 Apr 2026 14:47:46 +0800 Subject: [PATCH 04/11] [Fix](pyudf) only select alive python servers in get_process --- be/src/udf/python/python_server.cpp | 53 +++++++++++++--- be/test/udf/python/python_server_test.cpp | 63 +++++++++++++++++++ .../pythonudaf_p0/test_pythonudaf_drop.out | 6 ++ .../data/pythonudf_p0/test_pythonudf_drop.out | 6 ++ .../pythonudtf_p0/test_pythonudtf_drop.out | 6 +- .../pythonudaf_p0/test_pythonudaf_drop.groovy | 39 +++++++++++- .../pythonudf_p0/test_pythonudf_drop.groovy | 26 +++++++- .../pythonudtf_p0/test_pythonudtf_drop.groovy | 51 ++++++++++++++- 8 files changed, 239 insertions(+), 11 deletions(-) diff --git a/be/src/udf/python/python_server.cpp b/be/src/udf/python/python_server.cpp index 646e1e79039b5b..6bf247f0417f11 100644 --- a/be/src/udf/python/python_server.cpp +++ b/be/src/udf/python/python_server.cpp @@ -125,14 +125,53 @@ Status 
PythonServerManager::get_process(const PythonVersion& version, ProcessPtr version.to_string()); } - // Find process with minimum load (use_count - 1 gives active client count) - auto min_iter = std::min_element( - pool.begin(), pool.end(), - [](const ProcessPtr& a, const ProcessPtr& b) { return a.use_count() < b.use_count(); }); + // Prefer an already-alive process and only use load balancing inside that alive subset. + // keep dead entries stay in the pool for the background health checker + // unless there is no alive process left for the current request. + auto min_alive_iter = std::min_element(pool.begin(), pool.end(), + [](const ProcessPtr& a, const ProcessPtr& b) { + const bool a_alive = a && a->is_alive(); + const bool b_alive = b && b->is_alive(); + if (a_alive != b_alive) { + return a_alive > b_alive; + } + if (!a_alive) { + return false; + } + return a.use_count() < b.use_count(); + }); + + if (min_alive_iter != pool.end() && *min_alive_iter && (*min_alive_iter)->is_alive()) { + *process = *min_alive_iter; + return Status::OK(); + } - // Return process with minimum load - *process = *min_iter; - return Status::OK(); + // Only reach here when the pool has no alive process at all. In that fallback path we + // rebuild one process so the caller can still make progress instead of waiting + // for the next health-check round. 
+ for (size_t i = 0; i < pool.size(); ++i) { + auto& candidate = pool[i]; + ProcessPtr replacement; + Status status = fork(version, &replacement); + if (!status.ok()) { + if (candidate) { + LOG(WARNING) << "Failed to recreate unavailable Python process (pid=" + << candidate->get_child_pid() << ", version=" << version.to_string() + << "): " << status.to_string(); + } else { + LOG(WARNING) << "Failed to create Python process for empty slot, version=" + << version.to_string() << ": " << status.to_string(); + } + continue; + } + + pool[i] = replacement; + *process = std::move(replacement); + return Status::OK(); + } + + return Status::InternalError("Python process pool has no available process for version {}", + version.to_string()); } Status PythonServerManager::fork(const PythonVersion& version, ProcessPtr* process) { diff --git a/be/test/udf/python/python_server_test.cpp b/be/test/udf/python/python_server_test.cpp index 40e4ab3a11a24d..a1668806ec7b97 100644 --- a/be/test/udf/python/python_server_test.cpp +++ b/be/test/udf/python/python_server_test.cpp @@ -417,6 +417,69 @@ TEST_F(PythonServerTest, GetProcessFromInitializedPool) { mgr.shutdown(); } +TEST_F(PythonServerTest, GetProcessRecreatesDeadProcessWhenNoAliveProcess) { + setup_doris_home(); + std::string python_path = create_fake_python_with_socket_creation("3.9.16"); + + config::max_python_process_num = 1; + + PythonServerManager mgr; + PythonVersion version("3.9.16", test_dir_, python_path); + + ASSERT_TRUE(mgr.ensure_pool_initialized(version).ok()); + + ProcessPtr first_process; + ASSERT_TRUE(mgr.get_process(version, &first_process).ok()); + ASSERT_NE(first_process, nullptr); + ASSERT_TRUE(first_process->is_alive()); + pid_t first_pid = first_process->get_child_pid(); + + first_process->shutdown(); + ASSERT_FALSE(first_process->is_alive()); + + ProcessPtr replacement; + Status status = mgr.get_process(version, &replacement); + + EXPECT_TRUE(status.ok()) << status.to_string(); + ASSERT_NE(replacement, 
nullptr); + EXPECT_TRUE(replacement->is_alive()); + EXPECT_NE(replacement->get_child_pid(), first_pid); + + mgr.shutdown(); +} + +TEST_F(PythonServerTest, GetProcessSkipsDeadProcessWhenAliveProcessExists) { + setup_doris_home(); + std::string python_path = create_fake_python_with_socket_creation("3.9.16"); + + PythonServerManager mgr; + PythonVersion version("3.9.16", test_dir_, python_path); + + ProcessPtr alive_process; + ASSERT_TRUE(mgr.fork(version, &alive_process).ok()); + ASSERT_NE(alive_process, nullptr); + ASSERT_TRUE(alive_process->is_alive()); + + ProcessPtr dead_process; + ASSERT_TRUE(mgr.fork(version, &dead_process).ok()); + ASSERT_NE(dead_process, nullptr); + pid_t dead_pid = dead_process->get_child_pid(); + dead_process->shutdown(); + ASSERT_FALSE(dead_process->is_alive()); + + mgr.process_pools_for_test()[version] = {alive_process, dead_process}; + + ProcessPtr selected; + Status status = mgr.get_process(version, &selected); + + EXPECT_TRUE(status.ok()) << status.to_string(); + EXPECT_EQ(selected, alive_process); + EXPECT_FALSE(mgr.process_pools_for_test()[version][1]->is_alive()); + EXPECT_EQ(mgr.process_pools_for_test()[version][1]->get_child_pid(), dead_pid); + + mgr.shutdown(); +} + TEST_F(PythonServerTest, GetProcessLoadBalancing) { setup_doris_home(); std::string python_path = create_fake_python_with_socket_creation("3.9.16"); diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_drop.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_drop.out index 79e35e30ee5fcd..8c1eb081162e9b 100644 --- a/regression-test/data/pythonudaf_p0/test_pythonudaf_drop.out +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_drop.out @@ -8,3 +8,9 @@ -- !py_udaf_drop_3 -- 6 +-- !py_udaf_drop_4 -- +6 + +-- !py_udaf_drop_5 -- +6 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_drop.out b/regression-test/data/pythonudf_p0/test_pythonudf_drop.out index 254ebe44809dfc..903817f7e7364a 100644 --- 
a/regression-test/data/pythonudf_p0/test_pythonudf_drop.out +++ b/regression-test/data/pythonudf_p0/test_pythonudf_drop.out @@ -8,3 +8,9 @@ -- !py_udf_drop_3 -- 8 +-- !py_udf_drop_4-- +32 + +-- !py_udf_drop_5 -- +33 + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_drop.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_drop.out index 6f1159a95d1d4b..faa6e95841cdcf 100644 --- a/regression-test/data/pythonudtf_p0/test_pythonudtf_drop.out +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_drop.out @@ -7,7 +7,11 @@ 1 2 2 3 --- !py_udtf_drop_3 -- +-- !py_udtf_drop_4 -- +1 +2 + +-- !py_udtf_drop_5 -- 1 2 diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy index 964413828ab7d3..4b64921676fd0b 100644 --- a/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy @@ -15,10 +15,23 @@ // specific language governing permissions and limitations // under the License. -suite('test_pythonudaf_drop') { +suite('test_pythonudaf_drop', "nonConcurrent") { def runtime_version = getPythonUdfRuntimeVersion() def zipA = """${context.file.parent}/udaf_scripts/python_udaf_drop_a/python_udaf_drop_test.zip""" def zipB = """${context.file.parent}/udaf_scripts/python_udaf_drop_b/python_udaf_drop_test.zip""" + def localDorisHome = System.getenv("DORIS_HOME") + def localUdfRoot = localDorisHome != null ? 
"${localDorisHome}/lib/udf" : "/tmp" + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort) + + def execOnBackend = { be_ip, localCmd, remoteCmd -> + if (be_ip == "127.0.0.1" || be_ip == "localhost") { + cmd(localCmd) + } else { + sshExec("root", be_ip, remoteCmd, false) + } + } scp_udf_file_to_all_be(zipA) scp_udf_file_to_all_be(zipB) @@ -88,9 +101,33 @@ suite('test_pythonudaf_drop') { sql '''SELECT py_drop_sum_a(v) FROM py_udaf_drop_tbl;''' exception 'Can not found function' } + + // Case 3: kill Python servers between two aggregate queries, next CREATE handshake should recover + sql '''DROP FUNCTION IF EXISTS py_drop_sum_reconnect(INT)''' + sql """ + CREATE AGGREGATE FUNCTION py_drop_sum_reconnect(INT) RETURNS BIGINT PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipA}", + "symbol" = "drop_udaf.SumAgg", + "runtime_version" = "${runtime_version}" + ) + """ + + qt_py_udaf_drop_4 '''SELECT py_drop_sum_reconnect(v) FROM py_udaf_drop_tbl;''' + + backendId_to_backendIP.values().each { be_ip -> + execOnBackend( + be_ip, + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true", + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true") + } + + qt_py_udaf_drop_5 '''SELECT py_drop_sum_reconnect(v) FROM py_udaf_drop_tbl;''' + try_sql('DROP FUNCTION IF EXISTS py_drop_sum_reconnect(INT);') } finally { try_sql('DROP FUNCTION IF EXISTS py_drop_sum_once(INT);') try_sql('DROP FUNCTION IF EXISTS py_drop_sum_a(INT);') try_sql('DROP FUNCTION IF EXISTS py_drop_sum_b(INT);') + try_sql('DROP FUNCTION IF EXISTS py_drop_sum_reconnect(INT);') } } diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy index ab103c21f25111..2672cadbee886a 100644 --- a/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy +++ 
b/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -suite("test_pythonudf_drop") { +suite("test_pythonudf_drop", "nonConcurrent") { def runtime_version = getPythonUdfRuntimeVersion() def zipA = """${context.file.parent}/udf_scripts/python_udf_drop_a/python_udf_drop_test.zip""" def zipB = """${context.file.parent}/udf_scripts/python_udf_drop_b/python_udf_drop_test.zip""" @@ -88,9 +88,33 @@ suite("test_pythonudf_drop") { sql """SELECT py_drop_a(1);""" exception "Can not found function" } + + // Case 3: kill Python servers between two queries, next client handshake should recover + sql """DROP FUNCTION IF EXISTS py_drop_reconnect(INT)""" + sql """ + CREATE FUNCTION py_drop_reconnect(INT) RETURNS INT PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipA}", + "symbol" = "drop_udf.evaluate", + "runtime_version" = "${runtime_version}" + ) + """ + + qt_py_udf_drop_4 """SELECT py_drop_reconnect(31);""" + + backendId_to_backendIP.values().each { be_ip -> + execOnBackend( + be_ip, + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true", + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true") + } + + qt_py_udf_drop_5 """SELECT py_drop_reconnect(32);""" + try_sql("DROP FUNCTION IF EXISTS py_drop_reconnect(INT);") } finally { try_sql("DROP FUNCTION IF EXISTS py_drop_once(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_a(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_b(INT);") + try_sql("DROP FUNCTION IF EXISTS py_drop_reconnect(INT);") } } diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_drop.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_drop.groovy index 1f454243fb051c..04abde6c146b50 100644 --- a/regression-test/suites/pythonudtf_p0/test_pythonudtf_drop.groovy +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_drop.groovy @@ -15,10 +15,23 @@ // specific language governing 
permissions and limitations // under the License. -suite("test_pythonudtf_drop") { +suite("test_pythonudtf_drop", "nonConcurrent") { def runtime_version = getPythonUdfRuntimeVersion() def zipA = """${context.file.parent}/udtf_scripts/python_udtf_drop_a/python_udtf_drop_test.zip""" def zipB = """${context.file.parent}/udtf_scripts/python_udtf_drop_b/python_udtf_drop_test.zip""" + def localDorisHome = System.getenv("DORIS_HOME") + def localUdfRoot = localDorisHome != null ? "${localDorisHome}/lib/udf" : "/tmp" + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort) + + def execOnBackend = { be_ip, localCmd, remoteCmd -> + if (be_ip == "127.0.0.1" || be_ip == "localhost") { + cmd(localCmd) + } else { + sshExec("root", be_ip, remoteCmd, false) + } + } scp_udf_file_to_all_be(zipA) scp_udf_file_to_all_be(zipB) @@ -122,9 +135,45 @@ suite("test_pythonudtf_drop") { """ exception "Can not found function" } + + // Case 4: kill Python servers between two table-function queries, next handshake should recover + sql """DROP FUNCTION IF EXISTS py_drop_t_reconnect(INT)""" + sql """ + CREATE TABLES FUNCTION py_drop_t_reconnect(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipA}", + "symbol" = "drop_udtf.process", + "runtime_version" = "${runtime_version}" + ) + """ + + qt_py_udtf_drop_4 """ + SELECT c + FROM py_udtf_drop_tbl + LATERAL VIEW py_drop_t_reconnect(v) tmp AS c + ORDER BY c; + """ + + backendId_to_backendIP.values().each { be_ip -> + execOnBackend( + be_ip, + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true", + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true") + } + + qt_py_udtf_drop_5 """ + SELECT c + FROM py_udtf_drop_tbl + LATERAL VIEW py_drop_t_reconnect(v) tmp AS c + ORDER BY c; + """ + try_sql("DROP FUNCTION IF EXISTS py_drop_t_reconnect(INT);") } finally { try_sql("DROP FUNCTION IF EXISTS 
py_drop_t_once(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_t_a(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_t_b(INT);") + try_sql("DROP FUNCTION IF EXISTS py_drop_t_reconnect(INT);") } } From 4ea33a8241c210f97435b4aff9fd6b48407ff82c Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Thu, 23 Apr 2026 21:50:10 +0800 Subject: [PATCH 05/11] [Enhancement](udf) Refine PythonServerManager locking to per-version process pools --- be/src/udf/python/python_server.cpp | 137 +++++++++++++++------- be/src/udf/python/python_server.h | 31 +++-- be/test/udf/python/python_server_test.cpp | 81 +++++++++++-- 3 files changed, 191 insertions(+), 58 deletions(-) diff --git a/be/src/udf/python/python_server.cpp b/be/src/udf/python/python_server.cpp index 6bf247f0417f11..2b001a927f05f0 100644 --- a/be/src/udf/python/python_server.cpp +++ b/be/src/udf/python/python_server.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include "arrow/flight/client.h" #include "common/config.h" @@ -37,6 +38,50 @@ namespace doris { +std::shared_ptr +PythonServerManager::_get_or_create_process_pool(const PythonVersion& version) { + std::lock_guard lock(_pools_mutex); + auto& pool = _process_pools[version]; + if (!pool) { + pool = std::make_shared(); + } + return pool; +} + +std::shared_ptr PythonServerManager::_get_process_pool( + const PythonVersion& version) { + std::lock_guard lock(_pools_mutex); + auto it = _process_pools.find(version); + return it == _process_pools.end() ? 
nullptr : it->second; +} + +std::vector>> +PythonServerManager::_snapshot_process_pools() { + std::lock_guard lock(_pools_mutex); + std::vector>> snapshot; + snapshot.reserve(_process_pools.size()); + for (const auto& [version, pool] : _process_pools) { + snapshot.emplace_back(version, pool); + } + return snapshot; +} + +#ifdef BE_TEST +void PythonServerManager::set_process_pool_for_test(const PythonVersion& version, + std::vector processes, + bool initialized) { + auto versioned_pool = _get_or_create_process_pool(version); + std::lock_guard lock(versioned_pool->mutex); + versioned_pool->processes = std::move(processes); + versioned_pool->initialized = initialized; +} + +std::vector& PythonServerManager::process_pool_for_test(const PythonVersion& version) { + auto versioned_pool = _get_or_create_process_pool(version); + return versioned_pool->processes; +} +#endif + template Status PythonServerManager::get_client(const PythonUDFMeta& func_meta, const PythonVersion& version, std::shared_ptr* client, @@ -57,12 +102,12 @@ Status PythonServerManager::get_client(const PythonUDFMeta& func_meta, const Pyt } Status PythonServerManager::ensure_pool_initialized(const PythonVersion& version) { - std::lock_guard lock(_pools_mutex); + auto versioned_pool = _get_or_create_process_pool(version); + std::lock_guard lock(versioned_pool->mutex); // Check if already initialized - if (_initialized_versions.count(version)) return Status::OK(); + if (versioned_pool->initialized) return Status::OK(); - std::vector& pool = _process_pools[version]; // 0 means use CPU core count as default, otherwise use the specified value int max_pool_size = config::max_python_process_num > 0 ? 
config::max_python_process_num : CpuInfo::num_cores(); @@ -91,7 +136,7 @@ Status PythonServerManager::ensure_pool_initialized(const PythonVersion& version for (int i = 0; i < max_pool_size; i++) { Status s = futures[i].get(); if (s.ok() && temp_processes[i]) { - pool.push_back(std::move(temp_processes[i])); + versioned_pool->processes.push_back(std::move(temp_processes[i])); success_count++; } else { failure_count++; @@ -100,7 +145,7 @@ Status PythonServerManager::ensure_pool_initialized(const PythonVersion& version } } - if (pool.empty()) { + if (versioned_pool->processes.empty()) { return Status::InternalError( "Failed to initialize Python process pool: all {} process creation attempts failed", max_pool_size); @@ -110,15 +155,20 @@ Status PythonServerManager::ensure_pool_initialized(const PythonVersion& version << ": created " << success_count << " processes" << (failure_count > 0 ? fmt::format(" ({} failed)", failure_count) : ""); - _initialized_versions.insert(version); + versioned_pool->initialized = true; _start_health_check_thread(); return Status::OK(); } Status PythonServerManager::get_process(const PythonVersion& version, ProcessPtr* process) { - std::lock_guard lock(_pools_mutex); - std::vector& pool = _process_pools[version]; + auto versioned_pool = _get_process_pool(version); + if (!versioned_pool) { + return Status::InternalError("Python process pool is empty for version {}", + version.to_string()); + } + std::lock_guard lock(versioned_pool->mutex); + std::vector& pool = versioned_pool->processes; if (UNLIKELY(pool.empty())) { return Status::InternalError("Python process pool is empty for version {}", @@ -230,39 +280,39 @@ Status PythonServerManager::fork(const PythonVersion& version, ProcessPtr* proce } void PythonServerManager::_start_health_check_thread() { - if (_health_check_thread) return; - - LOG(INFO) << "Starting Python process health check thread (interval: 30 seconds)"; - - _health_check_thread = std::make_unique([this]() { - // Health 
check loop - while (!_shutdown_flag.load(std::memory_order_acquire)) { - // Wait for interval or shutdown signal - { - std::unique_lock lock(_health_check_mutex); - _health_check_cv.wait_for(lock, std::chrono::seconds(30), [this]() { - return _shutdown_flag.load(std::memory_order_acquire); - }); - } + std::call_once(_health_check_once, [this]() { + LOG(INFO) << "Starting Python process health check thread (interval: 30 seconds)"; + + _health_check_thread = std::make_unique([this]() { + // Health check loop + while (!_shutdown_flag.load(std::memory_order_acquire)) { + // Wait for interval or shutdown signal + { + std::unique_lock lock(_health_check_mutex); + _health_check_cv.wait_for(lock, std::chrono::seconds(30), [this]() { + return _shutdown_flag.load(std::memory_order_acquire); + }); + } - if (_shutdown_flag.load(std::memory_order_acquire)) break; + if (_shutdown_flag.load(std::memory_order_acquire)) break; - _check_and_recreate_processes(); - _refresh_memory_stats(); - } + _check_and_recreate_processes(); + _refresh_memory_stats(); + } - LOG(INFO) << "Python process health check thread exiting"; + LOG(INFO) << "Python process health check thread exiting"; + }); }); } void PythonServerManager::_check_and_recreate_processes() { - std::lock_guard lock(_pools_mutex); - int total_checked = 0; int total_dead = 0; int total_recreated = 0; - for (auto& [version, pool] : _process_pools) { + for (auto& [version, versioned_pool] : _snapshot_process_pools()) { + std::lock_guard lock(versioned_pool->mutex); + auto& pool = versioned_pool->processes; for (size_t i = 0; i < pool.size(); ++i) { auto& process = pool[i]; if (!process) continue; @@ -307,15 +357,22 @@ void PythonServerManager::shutdown() { } // Shutdown all processes - std::lock_guard lock(_pools_mutex); - for (auto& [version, pool] : _process_pools) { + for (auto& [version, versioned_pool] : _snapshot_process_pools()) { + std::lock_guard lock(versioned_pool->mutex); + auto& pool = versioned_pool->processes; for 
(auto& process : pool) { if (process) { process->shutdown(); } } + pool.clear(); + versioned_pool->initialized = false; + } + + { + std::lock_guard lock(_pools_mutex); + _process_pools.clear(); } - _process_pools.clear(); } Status PythonServerManager::_read_process_memory(pid_t pid, size_t* rss_bytes) { @@ -344,11 +401,11 @@ Status PythonServerManager::_read_process_memory(pid_t pid, size_t* rss_bytes) { } void PythonServerManager::_refresh_memory_stats() { - std::lock_guard lock(_pools_mutex); - int64_t total_rss = 0; - for (const auto& [version, pool] : _process_pools) { + for (const auto& [version, versioned_pool] : _snapshot_process_pools()) { + std::lock_guard lock(versioned_pool->mutex); + const auto& pool = versioned_pool->processes; for (const auto& process : pool) { if (!process || !process->is_alive()) continue; @@ -378,15 +435,15 @@ Status PythonServerManager::clear_module_cache(const std::string& location) { return Status::InvalidArgument("Empty location for clear_module_cache"); } - std::lock_guard lock(_pools_mutex); - std::string body = fmt::format(R"({{"location": "{}"}})", location); int success_count = 0; int fail_count = 0; bool has_active_process = false; - for (auto& [version, pool] : _process_pools) { + for (auto& [version, versioned_pool] : _snapshot_process_pools()) { + std::lock_guard lock(versioned_pool->mutex); + auto& pool = versioned_pool->processes; for (auto& process : pool) { if (!process || !process->is_alive()) { continue; @@ -461,4 +518,4 @@ template Status PythonServerManager::get_client( std::shared_ptr* client, const std::shared_ptr& data_schema); -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/udf/python/python_server.h b/be/src/udf/python/python_server.h index 6427cb7e63c38e..1e0b978f3bb49e 100644 --- a/be/src/udf/python/python_server.h +++ b/be/src/udf/python/python_server.h @@ -20,7 +20,10 @@ #include #include #include +#include #include +#include +#include #include 
"common/status.h" #include "runtime/memory/mem_tracker.h" @@ -59,12 +62,19 @@ class PythonServerManager { // For unit testing only. void check_and_recreate_processes_for_test() { _check_and_recreate_processes(); } - std::unordered_map>& process_pools_for_test() { - return _process_pools; - } + void set_process_pool_for_test(const PythonVersion& version, std::vector processes, + bool initialized = true); + + std::vector& process_pool_for_test(const PythonVersion& version); #endif private: + struct VersionedProcessPool { + std::mutex mutex; + std::vector processes; + bool initialized = false; + }; + /** * Start health check background thread (called once by ensure_pool_initialized) * Thread periodically checks process health and refreshes memory stats @@ -86,17 +96,22 @@ class PythonServerManager { */ void _refresh_memory_stats(); - std::unordered_map> _process_pools; - // Protects _process_pools access + std::shared_ptr _get_or_create_process_pool(const PythonVersion& version); + std::shared_ptr _get_process_pool(const PythonVersion& version); + std::vector>> + _snapshot_process_pools(); + + std::unordered_map> _process_pools; + // Protects the version -> pool handle map only. Per-version process operations are guarded + // by VersionedProcessPool::mutex. 
std::mutex _pools_mutex; - // Track which versions have been initialized - std::unordered_set _initialized_versions; // Health check background thread std::unique_ptr _health_check_thread; + std::once_flag _health_check_once; std::atomic _shutdown_flag {false}; std::condition_variable _health_check_cv; std::mutex _health_check_mutex; MemTracker _mem_tracker {"PythonUDFProcesses"}; }; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/udf/python/python_server_test.cpp b/be/test/udf/python/python_server_test.cpp index a1668806ec7b97..675c72988b64c4 100644 --- a/be/test/udf/python/python_server_test.cpp +++ b/be/test/udf/python/python_server_test.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include "common/config.h" @@ -99,6 +100,32 @@ class PythonServerTest : public ::testing::Test { return python_path; } + std::string create_fake_python_with_delay_and_socket_creation(const std::string& binary_name, + const std::string& version, + int delay_ms) { + std::string bin_dir = test_dir_ + "/bin"; + std::string python_path = bin_dir + "/" + binary_name; + fs::create_directories(bin_dir); + + std::ofstream ofs(python_path); + ofs << "#!/bin/bash\n"; + ofs << "if [ \"$1\" = \"--version\" ]; then\n"; + ofs << " echo 'Python " << version << "'\n"; + ofs << " exit 0\n"; + ofs << "fi\n"; + ofs << "sleep " << (delay_ms / 1000.0) << "\n"; + ofs << "SOCKET_PREFIX=\"$3\"\n"; + ofs << "SOCKET_BASE=\"${SOCKET_PREFIX#grpc+unix://}\"\n"; + ofs << "SOCKET_FILE=\"${SOCKET_BASE}_$$.sock\"\n"; + ofs << "touch \"$SOCKET_FILE\"\n"; + ofs << "trap 'rm -f \"$SOCKET_FILE\"; exit 0' TERM INT\n"; + ofs << "while true; do sleep 1; done\n"; + ofs.close(); + fs::permissions(python_path, fs::perms::owner_all); + + return python_path; + } + // Set DORIS_HOME and create flight server script directory void setup_doris_home() { setenv("DORIS_HOME", test_dir_.c_str(), 1); @@ -467,15 +494,15 @@ TEST_F(PythonServerTest, 
GetProcessSkipsDeadProcessWhenAliveProcessExists) { dead_process->shutdown(); ASSERT_FALSE(dead_process->is_alive()); - mgr.process_pools_for_test()[version] = {alive_process, dead_process}; + mgr.set_process_pool_for_test(version, {alive_process, dead_process}); ProcessPtr selected; Status status = mgr.get_process(version, &selected); EXPECT_TRUE(status.ok()) << status.to_string(); EXPECT_EQ(selected, alive_process); - EXPECT_FALSE(mgr.process_pools_for_test()[version][1]->is_alive()); - EXPECT_EQ(mgr.process_pools_for_test()[version][1]->get_child_pid(), dead_pid); + EXPECT_FALSE(mgr.process_pool_for_test(version)[1]->is_alive()); + EXPECT_EQ(mgr.process_pool_for_test(version)[1]->get_child_pid(), dead_pid); mgr.shutdown(); } @@ -586,6 +613,40 @@ TEST_F(PythonServerTest, MultipleVersionPools) { mgr.shutdown(); } +TEST_F(PythonServerTest, EnsurePoolInitializedForDifferentVersionsDoesNotShareVersionLock) { + setup_doris_home(); + + config::max_python_process_num = 1; + + std::string python39_path = + create_fake_python_with_delay_and_socket_creation("python3.9", "3.9.16", 1200); + std::string python310_path = + create_fake_python_with_delay_and_socket_creation("python3.10", "3.10.0", 1200); + + PythonServerManager mgr; + PythonVersion version39("3.9.16", test_dir_, python39_path); + PythonVersion version310("3.10.0", test_dir_, python310_path); + + auto start = std::chrono::steady_clock::now(); + auto future39 = std::async(std::launch::async, + [&]() { return mgr.ensure_pool_initialized(version39); }); + auto future310 = std::async(std::launch::async, + [&]() { return mgr.ensure_pool_initialized(version310); }); + + Status status39 = future39.get(); + Status status310 = future310.get(); + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + EXPECT_TRUE(status39.ok()) << status39.to_string(); + EXPECT_TRUE(status310.ok()) << status310.to_string(); + // If both versions still contended on one manager-wide lock, the elapsed time 
would + // be close to two serialized 1.2s startups instead of a single startup window. + EXPECT_LT(elapsed.count(), 2200); + + mgr.shutdown(); +} + // ============================================================================ // PythonServerManager::_check_and_recreate_processes() - health-check recreation test // ============================================================================ @@ -609,15 +670,15 @@ TEST_F(PythonServerTest, CheckAndRecreateProcessesRecreatesDeadProcess) { dead_process->shutdown(); ASSERT_FALSE(dead_process->is_alive()); - mgr.process_pools_for_test()[version] = {alive_process, dead_process, nullptr}; + mgr.set_process_pool_for_test(version, {alive_process, dead_process, nullptr}); mgr.check_and_recreate_processes_for_test(); - ASSERT_EQ(mgr.process_pools_for_test()[version].size(), 3); - EXPECT_EQ(mgr.process_pools_for_test()[version][0], alive_process); - EXPECT_EQ(mgr.process_pools_for_test()[version][2], nullptr); + ASSERT_EQ(mgr.process_pool_for_test(version).size(), 3); + EXPECT_EQ(mgr.process_pool_for_test(version)[0], alive_process); + EXPECT_EQ(mgr.process_pool_for_test(version)[2], nullptr); - ProcessPtr recreated = mgr.process_pools_for_test()[version][1]; + ProcessPtr recreated = mgr.process_pool_for_test(version)[1]; ASSERT_NE(recreated, nullptr); EXPECT_TRUE(recreated->is_alive()); EXPECT_NE(recreated->get_child_pid(), dead_pid_before); @@ -645,11 +706,11 @@ TEST_F(PythonServerTest, CheckAndRecreateProcessesErasesDeadProcessWhenRecreateF ASSERT_FALSE(dead_process_2->is_alive()); PythonVersion invalid_version("3.9.16", test_dir_, test_dir_ + "/bin/nonexistent_python"); - mgr.process_pools_for_test()[invalid_version] = {dead_process_1, dead_process_2}; + mgr.set_process_pool_for_test(invalid_version, {dead_process_1, dead_process_2}); mgr.check_and_recreate_processes_for_test(); - EXPECT_TRUE(mgr.process_pools_for_test()[invalid_version].empty()); + EXPECT_TRUE(mgr.process_pool_for_test(invalid_version).empty()); 
mgr.shutdown(); } From fe66e93750c53d31913b6441d96e5655e5d4750d Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Fri, 24 Apr 2026 10:51:41 +0800 Subject: [PATCH 06/11] add log --- be/src/exprs/function/function_python_udf.cpp | 24 ++++++++++++++- be/src/udf/python/python_udf_meta.cpp | 29 ++++++++++++++++++- .../doris/analysis/FunctionCallExpr.java | 12 ++++++++ .../doris/analysis/ExprToThriftVisitor.java | 10 +++++++ .../catalog/FunctionToThriftConverter.java | 21 ++++++++++++++ .../glue/translator/ExpressionTranslator.java | 14 ++++++++- .../expressions/functions/udf/PythonUdf.java | 15 ++++++++++ .../functions/udf/PythonUdfBuilder.java | 15 +++++++++- .../plans/commands/CreateFunctionCommand.java | 29 +++++++++++++++++++ 9 files changed, 165 insertions(+), 4 deletions(-) diff --git a/be/src/exprs/function/function_python_udf.cpp b/be/src/exprs/function/function_python_udf.cpp index 3d999cdac5e718..072930852cab1c 100644 --- a/be/src/exprs/function/function_python_udf.cpp +++ b/be/src/exprs/function/function_python_udf.cpp @@ -54,18 +54,37 @@ Status PythonFunctionCall::open(FunctionContext* context, func_meta.id = _fn.id; func_meta.name = _fn.name.function_name; func_meta.symbol = _fn.scalar_fn.symbol; + LOG(INFO) << fmt::format( + "[pyudf-test] be open raw tfunction name={}, symbol={}, has_hdfs_location={}, " + "hdfs_location={}, has_function_code={}, function_code_empty={}, " + "has_runtime_version={}, " + "runtime_version={}, checksum={}", + _fn.name.function_name, _fn.scalar_fn.symbol, + _fn.__isset.hdfs_location ? "true" : "false", _fn.hdfs_location, + _fn.__isset.function_code ? "true" : "false", + _fn.function_code.empty() ? "true" : "false", + _fn.__isset.runtime_version ? 
"true" : "false", _fn.runtime_version, _fn.checksum); if (!_fn.function_code.empty()) { func_meta.type = PythonUDFLoadType::INLINE; func_meta.location = "inline"; func_meta.inline_code = _fn.function_code; + LOG(INFO) << fmt::format("[pyudf-test] be open inline mode code_length={}", + _fn.function_code.size()); } else if (!_fn.hdfs_location.empty()) { func_meta.type = PythonUDFLoadType::MODULE; func_meta.location = _fn.hdfs_location; func_meta.checksum = _fn.checksum; + LOG(INFO) << fmt::format("[pyudf-test] be open module mode url={}, checksum={}", + _fn.hdfs_location, _fn.checksum); } else { func_meta.type = PythonUDFLoadType::UNKNOWN; func_meta.location = "unknown"; + LOG(INFO) << "[pyudf-test] be open unknown mode because both function_code and " + "hdfs_location are empty"; } + LOG(INFO) << fmt::format( + "[pyudf-test] be open classified load_type={}, location={}, checksum={}", + static_cast(func_meta.type), func_meta.location, func_meta.checksum); func_meta.input_types = _argument_types; func_meta.return_type = _return_type; @@ -81,12 +100,15 @@ Status PythonFunctionCall::open(FunctionContext* context, func_meta.runtime_version = version.full_version; RETURN_IF_ERROR(func_meta.check()); func_meta.always_nullable = _return_type->is_nullable(); - LOG(INFO) << fmt::format("runtime_version: {}, func_meta: {}", version.to_string(), + LOG(INFO) << fmt::format("[pyudf-test] runtime_version: {}, func_meta: {}", version.to_string(), func_meta.to_string()); if (func_meta.type == PythonUDFLoadType::MODULE) { RETURN_IF_ERROR(UserFunctionCache::instance()->get_pypath( func_meta.id, func_meta.location, func_meta.checksum, &func_meta.location)); + LOG(INFO) << fmt::format( + "[pyudf-test] be open resolved module path id={}, resolved_location={}", + func_meta.id, func_meta.location); } PythonUDFClientPtr client = nullptr; diff --git a/be/src/udf/python/python_udf_meta.cpp b/be/src/udf/python/python_udf_meta.cpp index f0978dc926bf11..49806f82ca944e 100644 --- 
a/be/src/udf/python/python_udf_meta.cpp +++ b/be/src/udf/python/python_udf_meta.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -139,42 +140,68 @@ std::string PythonUDFMeta::to_string() const { } Status PythonUDFMeta::check() const { + LOG(INFO) << fmt::format( + "[pyudf-test] PythonUDFMeta::check name={}, symbol={}, location={}, " + "runtime_version={}, " + "load_type={}, client_type={}, inline_code_empty={}, checksum_empty={}, " + "input_types_size={}, " + "has_return_type={}, always_nullable={}", + name, symbol, location, runtime_version, static_cast(type), + static_cast(client_type), inline_code.empty() ? "true" : "false", + checksum.empty() ? "true" : "false", input_types.size(), return_type ? "true" : "false", + always_nullable ? "true" : "false"); if (trim(name).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: empty name"; return Status::InvalidArgument("Python UDF name is empty"); } if (trim(symbol).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: empty symbol"; return Status::InvalidArgument("Python UDF symbol is empty"); } if (trim(runtime_version).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: empty runtime_version"; return Status::InvalidArgument("Python UDF runtime version is empty"); } if (input_types.empty() && (client_type == PythonClientType::UDAF || type == PythonUDFLoadType::UNKNOWN)) { + LOG(WARNING) << fmt::format( + "[pyudf-test] PythonUDFMeta::check failed: empty input_types, client_type={}, " + "load_type={}", + static_cast(client_type), static_cast(type)); return Status::InvalidArgument("Python UDAF input types is empty"); } if (!return_type) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: empty return_type"; return Status::InvalidArgument("Python UDF return type is empty"); } if (type == PythonUDFLoadType::UNKNOWN) { + LOG(WARNING) << fmt::format( + "[pyudf-test] PythonUDFMeta::check failed: unknown load_type, " + 
"inline_code_empty={}, " + "location_empty={}", + inline_code.empty() ? "true" : "false", trim(location).empty() ? "true" : "false"); return Status::InvalidArgument( "Python UDF load type is invalid, please check inline code or file path"); } if (type == PythonUDFLoadType::MODULE) { if (trim(location).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: module location empty"; return Status::InvalidArgument("Non-inline Python UDF location is empty"); } if (trim(checksum).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: module checksum empty"; return Status::InvalidArgument("Non-inline Python UDF checksum is empty"); } } + LOG(INFO) << "[pyudf-test] PythonUDFMeta::check passed"; return Status::OK(); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java index 5a934cd6ca4e26..07efe49fa84873 100644 --- a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java +++ b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java @@ -29,12 +29,15 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.gson.annotations.SerializedName; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.text.StringCharacterIterator; import java.util.List; // TODO: for aggregations, we need to unify the code paths for builtins and UDAs. 
public class FunctionCallExpr extends Expr { + private static final Logger LOG = LogManager.getLogger(FunctionCallExpr.class); @SerializedName("fnn") private FunctionName fnName; @@ -127,6 +130,15 @@ public FunctionCallExpr(Function function, FunctionParams functionParams, Functi this.originChildSize = children.size(); this.isMergeAggFn = isMergeAggFn; this.nullable = nullable; + if (function.getBinaryType() == Function.BinaryType.PYTHON_UDF) { + LOG.info("[pyudf-test] FunctionCallExpr ctor signature={}, location={}, runtimeVersion={}, " + + "functionCodeEmpty={}, childCount={}, nullable={}", + function.signatureString(), + function.getLocation() == null ? "null" : function.getLocation().getLocation(), + function.getRuntimeVersion(), + function.getFunctionCode() == null || function.getFunctionCode().isEmpty(), + children.size(), nullable); + } } protected FunctionCallExpr(FunctionCallExpr other) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java index d52a0eb8bf0890..6169d231a1d6a5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java @@ -19,6 +19,7 @@ import org.apache.doris.analysis.ArithmeticExpr.Operator; import org.apache.doris.catalog.ArrayType; +import org.apache.doris.catalog.Function.BinaryType; import org.apache.doris.catalog.FunctionToThriftConverter; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.StructType; @@ -114,6 +115,15 @@ public static void treeToThriftHelper(Expr expr, TExpr container, msg.type = expr.getType().toThrift(); msg.num_children = expr.getChildren().size(); if (expr.getFn() != null) { + if (expr.getFn().getBinaryType() == BinaryType.PYTHON_UDF) { + LOG.info("[pyudf-test] ExprToThriftVisitor exprFn signature={}, location={}, runtimeVersion={}, " + + "functionCodeEmpty={}, 
childCount={}", + expr.getFn().signatureString(), + expr.getFn().getLocation() == null ? "null" : expr.getFn().getLocation().getLocation(), + expr.getFn().getRuntimeVersion(), + expr.getFn().getFunctionCode() == null || expr.getFn().getFunctionCode().isEmpty(), + expr.getChildren().size()); + } msg.setFn(FunctionToThriftConverter.toThrift(expr.getFn(), expr.getType(), expr.collectChildReturnTypes(), expr.collectChildReturnNullables())); if (expr.getFn().hasVarArgs()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToThriftConverter.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToThriftConverter.java index 5c4ca6f8be9994..463f498e89b706 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToThriftConverter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToThriftConverter.java @@ -25,11 +25,14 @@ import com.google.common.base.Strings; import com.google.common.collect.Lists; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; /** * Converts {@link Function} and its subclasses to their Thrift representations. */ public class FunctionToThriftConverter { + private static final Logger LOG = LogManager.getLogger(FunctionToThriftConverter.class); /** * Converts a {@link Function.BinaryType} to its Thrift representation. @@ -97,6 +100,14 @@ public static TFunction toThrift(ScalarFunction fn, Type realReturnType, Type[] tfn.setFunctionCode(fn.getFunctionCode()); } tfn.setRuntimeVersion(fn.getRuntimeVersion()); + LOG.info("[pyudf-test] scalar toThrift python udf signature={}, location={}, hdfsLocationIsSet={}, " + + "runtimeVersion={}, functionCodeEmpty={}, checksum={}", + fn.signatureString(), + fn.getLocation() == null ? 
"null" : fn.getLocation().getLocation(), + tfn.isSetHdfsLocation(), + fn.getRuntimeVersion(), + Strings.isNullOrEmpty(fn.getFunctionCode()), + fn.getChecksum()); } if (fn.getDictFunction() != null) { tfn.setDictFunction(fn.getDictFunction()); @@ -161,6 +172,16 @@ private static TFunction toThriftBase(Function fn, Type realReturnType, Type[] r if (fn.getLocation() != null) { tfn.setHdfsLocation(fn.getLocation().getLocation()); } + if (fn.getBinaryType() == Function.BinaryType.PYTHON_UDF) { + LOG.info("[pyudf-test] toThriftBase python udf signature={}, location={}, hdfsLocationIsSet={}, " + + "runtimeVersion={}, functionCodeEmpty={}, checksum={}", + fn.signatureString(), + fn.getLocation() == null ? "null" : fn.getLocation().getLocation(), + tfn.isSetHdfsLocation(), + fn.getRuntimeVersion(), + Strings.isNullOrEmpty(fn.getFunctionCode()), + fn.getChecksum()); + } // `realArgTypes.length != argTypes.length` is true iff this is an aggregation // function. // For aggregation functions, `argTypes` here is already its real type with true diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java index acc9e59c6809c7..a02659065e7ab0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java @@ -945,7 +945,19 @@ public Expr visitPythonUdf(PythonUdf udf, PlanTranslatorContext context) { FunctionParams exprs = new FunctionParams(udf.children().stream() .map(expression -> expression.accept(this, context)) .collect(Collectors.toList())); - return new FunctionCallExpr(udf.getCatalogFunction(), exprs, udf.nullable()); + org.apache.doris.catalog.Function catalogFunction = udf.getCatalogFunction(); + if (catalogFunction instanceof org.apache.doris.catalog.ScalarFunction) { + 
org.apache.doris.catalog.ScalarFunction scalarFunction = + (org.apache.doris.catalog.ScalarFunction) catalogFunction; + LOG.info("[pyudf-test] ExpressionTranslator.visitPythonUdf name={}, location={}, runtimeVersion={}, " + + "functionCodeEmpty={}, childCount={}, nullable={}", + udf.getName(), + scalarFunction.getLocation() == null ? "null" : scalarFunction.getLocation().getLocation(), + scalarFunction.getRuntimeVersion(), + scalarFunction.getFunctionCode() == null || scalarFunction.getFunctionCode().isEmpty(), + udf.children().size(), udf.nullable()); + } + return new FunctionCallExpr(catalogFunction, exprs, udf.nullable()); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java index 8ace16ccc08c48..7d6e409b7d5ef9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java @@ -35,6 +35,8 @@ import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.util.Arrays; import java.util.List; @@ -44,6 +46,8 @@ * Python UDF for Nereids */ public class PythonUdf extends ScalarFunction implements ExplicitlyCastableSignature, Udf { + private static final Logger LOG = LogManager.getLogger(PythonUdf.class); + private final String dbName; private final long functionId; private final Function.BinaryType binaryType; @@ -153,6 +157,13 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca scalar.isDeterministic(), arguments); + LOG.info("[pyudf-test] translateToNereidsFunction name={}, dbName={}, location={}, checksum={}, " + + "runtimeVersion={}, functionCodeEmpty={}, deterministic={}", + fnName, dbName, 
scalar.getLocation() == null ? "null" : scalar.getLocation().getLocation(), + scalar.getChecksum(), scalar.getRuntimeVersion(), + scalar.getFunctionCode() == null || scalar.getFunctionCode().isEmpty(), + scalar.isDeterministic()); + PythonUdfBuilder builder = new PythonUdfBuilder(udf); Env.getCurrentEnv().getFunctionRegistry().addUdf(dbName, fnName, builder); } @@ -184,6 +195,10 @@ public Function getCatalogFunction() { expr.setRuntimeVersion(runtimeVersion); expr.setFunctionCode(functionCode); expr.setDeterministic(deterministic); + LOG.info("[pyudf-test] getCatalogFunction name={}, dbName={}, objectFile={}, checksum={}, " + + "runtimeVersion={}, functionCodeEmpty={}, deterministic={}", + getName(), dbName, objectFile, checkSum, + runtimeVersion, functionCode == null || functionCode.isEmpty(), deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java index 7185594099b87c..2b794a95e7ae58 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java @@ -27,6 +27,8 @@ import com.google.common.base.Suppliers; import com.google.common.collect.Lists; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.util.List; import java.util.Optional; @@ -36,6 +38,8 @@ * function builder for python udf */ public class PythonUdfBuilder extends UdfBuilder { + private static final Logger LOG = LogManager.getLogger(PythonUdfBuilder.class); + private final PythonUdf udf; private final int arity; private final boolean isVarArgs; @@ -88,7 +92,16 @@ public Pair build(String name, List arguments) { for (int i = 
0; i < exprs.size(); ++i) { processedExprs.add(TypeCoercionUtils.castIfNotSameType(exprs.get(i), argTypes.get(i))); } - return Pair.ofSame(udf.withChildren(processedExprs)); + PythonUdf built = udf.withChildren(processedExprs); + org.apache.doris.catalog.Function catalogFn = built.getCatalogFunction(); + LOG.info("[pyudf-test] PythonUdfBuilder.build name={}, argCount={}, location={}, runtimeVersion={}, " + + "functionCodeEmpty={}, nullableMode={}", + name, arguments.size(), + catalogFn.getLocation() == null ? "null" : catalogFn.getLocation().getLocation(), + catalogFn.getRuntimeVersion(), + catalogFn.getFunctionCode() == null || catalogFn.getFunctionCode().isEmpty(), + catalogFn.getNullableMode()); + return Pair.ofSame(built); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java index d35000b4923516..b9209797910357 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java @@ -367,6 +367,10 @@ private void analyzeCommon(ConnectContext ctx) throws AnalysisException { + "'3.X.X' or '3.XX.XX' (e.g. 
'3.10.2').", runtimeVersionString)); } runtimeVersion = runtimeVersionString; + LOG.info("[pyudf-test] analyzeCommon python udf functionName={}, userFile={}, " + + "originalUserFile={}, runtimeVersion={}, functionCodeEmpty={}, properties={}", + functionName, userFile, originalUserFile, runtimeVersion, + Strings.isNullOrEmpty(functionCode), properties); } if (binaryType == Function.BinaryType.JAVA_UDF || binaryType == Function.BinaryType.PYTHON_UDF) { Boolean deterministicProperty = parseBooleanFromProperties(IS_DETERMINISTIC); @@ -560,6 +564,10 @@ private void analyzeUdaf() throws AnalysisException { function.setRuntimeVersion(runtimeVersion); function.setFunctionCode(functionCode); function.setDeterministic(deterministic); + LOG.info("[pyudf-test] analyzeUdf created function signature={}, binaryType={}, location={}, " + + "checksum={}, runtimeVersion={}, functionCodeEmpty={}, deterministic={}", + function.signatureString(), binaryType, location == null ? "null" : location.getLocation(), + checksum, runtimeVersion, Strings.isNullOrEmpty(functionCode), deterministic); } private void analyzeUdf() throws AnalysisException { @@ -633,9 +641,13 @@ private void analyzePythonUdaf(String clazz) throws AnalysisException { } if (Strings.isNullOrEmpty(this.functionCode)) { + LOG.info("[pyudf-test] analyzePythonUdaf module mode symbol={}, userFile={}, runtimeVersion={}", + clazz, originalUserFile, runtimeVersion); return; } + LOG.info("[pyudf-test] analyzePythonUdaf inline raw symbol={}, rawCodeLength={}, runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); this.functionCode = this.functionCode.trim(); if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) { throw new AnalysisException("Inline Python UDAF code must be start with $$ and end with $$"); @@ -645,6 +657,9 @@ private void analyzePythonUdaf(String clazz) throws AnalysisException { if (this.functionCode.isEmpty()) { throw new AnalysisException("Inline Python UDAF is empty"); 
} + LOG.info("[pyudf-test] analyzePythonUdaf inline normalized symbol={}, normalizedCodeLength={}, " + + "runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); } private void checkUdafClass(String clazz, ClassLoader cl, HashMap allMethods) @@ -804,9 +819,13 @@ private void analyzePythonUdf(String clazz) throws AnalysisException { } if (Strings.isNullOrEmpty(this.functionCode)) { + LOG.info("[pyudf-test] analyzePythonUdf module mode symbol={}, userFile={}, runtimeVersion={}", + clazz, originalUserFile, runtimeVersion); return; } + LOG.info("[pyudf-test] analyzePythonUdf inline raw symbol={}, rawCodeLength={}, runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); this.functionCode = this.functionCode.trim(); if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) { throw new AnalysisException("Inline Python UDF code must be start with $$ and end with $$"); @@ -816,6 +835,9 @@ private void analyzePythonUdf(String clazz) throws AnalysisException { if (this.functionCode.isEmpty()) { throw new AnalysisException("Inline Python UDF is empty"); } + LOG.info("[pyudf-test] analyzePythonUdf inline normalized symbol={}, normalizedCodeLength={}, " + + "runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); } private void checkUdfClass(String clazz, ClassLoader cl) throws ClassNotFoundException, AnalysisException { @@ -916,9 +938,13 @@ private void analyzePythonUdtf(String clazz) throws AnalysisException { } if (Strings.isNullOrEmpty(this.functionCode)) { + LOG.info("[pyudf-test] analyzePythonUdtf module mode symbol={}, userFile={}, runtimeVersion={}", + clazz, originalUserFile, runtimeVersion); return; } + LOG.info("[pyudf-test] analyzePythonUdtf inline raw symbol={}, rawCodeLength={}, runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); this.functionCode = this.functionCode.trim(); if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) { throw new 
AnalysisException("Inline Python UDTF code must be start with $$ and end with $$"); @@ -928,6 +954,9 @@ private void analyzePythonUdtf(String clazz) throws AnalysisException { if (this.functionCode.isEmpty()) { throw new AnalysisException("Inline Python UDTF is empty"); } + LOG.info("[pyudf-test] analyzePythonUdtf inline normalized symbol={}, normalizedCodeLength={}, " + + "runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); } private void checkRPCUdf(String symbol) throws AnalysisException { From 7834b5142f9eee6a5ccb971f608457a1b1a62d4e Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Fri, 24 Apr 2026 18:24:53 +0800 Subject: [PATCH 07/11] fix empty-arg --- be/src/exprs/table_function/python_udtf_function.cpp | 4 ++++ be/src/udf/python/python_udf_meta.h | 10 +++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/be/src/exprs/table_function/python_udtf_function.cpp b/be/src/exprs/table_function/python_udtf_function.cpp index eae1b71f638f4e..d12fb278b10c6b 100644 --- a/be/src/exprs/table_function/python_udtf_function.cpp +++ b/be/src/exprs/table_function/python_udtf_function.cpp @@ -150,6 +150,10 @@ Status PythonUDTFFunction::process_init(Block* block, RuntimeState* state) { // Python returns a ListArray where each element contains outputs for one input row std::shared_ptr list_array; RETURN_IF_ERROR(_udtf_client->evaluate(*input_batch, &list_array)); + if (list_array->length() != input_rows) [[unlikely]] { + return Status::InternalError("Python UDTF output rows {} not equal to input rows {}", + list_array->length(), input_rows); + } // Step 4: Convert Python server output (ListArray) to Doris array column RETURN_IF_ERROR(_convert_list_array_to_array_column(list_array)); diff --git a/be/src/udf/python/python_udf_meta.h b/be/src/udf/python/python_udf_meta.h index 7993faf3bb7014..55c49abb30ad07 100644 --- a/be/src/udf/python/python_udf_meta.h +++ b/be/src/udf/python/python_udf_meta.h @@ -33,18 +33,18 @@ enum class PythonUDFLoadType 
: uint8_t { INLINE = 0, MODULE = 1, UNKNOWN = 2 }; enum class PythonClientType : uint8_t { UDF = 0, UDAF = 1, UDTF = 2, UNKNOWN = 3 }; struct PythonUDFMeta { - int64_t id; + int64_t id = 0; std::string name; std::string symbol; std::string location; std::string checksum; std::string runtime_version; std::string inline_code; - bool always_nullable; + bool always_nullable = false; DataTypes input_types; DataTypePtr return_type; - PythonUDFLoadType type; - PythonClientType client_type; + PythonUDFLoadType type = PythonUDFLoadType::UNKNOWN; + PythonClientType client_type = PythonClientType::UNKNOWN; static Status convert_types_to_schema(const DataTypes& types, const std::string& timezone, std::shared_ptr* schema); @@ -70,4 +70,4 @@ struct hash { return std::hash()(meta.id); } }; -} // namespace std \ No newline at end of file +} // namespace std From e21893fca353b6478503f19bc4a16fc74ac9b7cb Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Tue, 28 Apr 2026 11:23:41 +0800 Subject: [PATCH 08/11] fix err propagation --- be/src/udf/python/python_server.py | 9 +- be/src/udf/python/python_udaf_client.cpp | 47 ++++++-- be/src/udf/python/python_udaf_client.h | 10 ++ .../udf/python/python_udaf_client_test.cpp | 109 ++++++++++++++++++ 4 files changed, 167 insertions(+), 8 deletions(-) create mode 100644 be/test/udf/python/python_udaf_client_test.cpp diff --git a/be/src/udf/python/python_server.py b/be/src/udf/python/python_server.py index 2b9d260b47fddd..665fc46e515799 100644 --- a/be/src/udf/python/python_server.py +++ b/be/src/udf/python/python_server.py @@ -2098,6 +2098,7 @@ def _handle_exchange_udaf( * ACCUMULATE: use success + rows_processed (number of rows processed) * SERIALIZE: use success + serialized_data (serialized_state) * FINALIZE: use success + serialized_data (serialized result) + * Any failed operation: use success=false + serialized_data (UTF-8 error message) """ # Get or create state manager for this specific UDAF function @@ -2270,7 +2271,13 @@ def 
_handle_exchange_udaf( e, traceback.format_exc(), ) - raise + # Keep the UDAF Flight stream alive so C++ can still send DESTROY. + # On failure, serialized_data carries the user-visible Python error text. + result_batch = self._create_unified_response( + success=False, + rows_processed=0, + data=str(e).encode("utf-8", errors="replace"), + ) # Begin stream with unified schema on first call if not started: diff --git a/be/src/udf/python/python_udaf_client.cpp b/be/src/udf/python/python_udaf_client.cpp index 6a6f6035ea5ca2..07c5b98027e5f4 100644 --- a/be/src/udf/python/python_udaf_client.cpp +++ b/be/src/udf/python/python_udaf_client.cpp @@ -42,6 +42,7 @@ namespace doris { // - ACCUMULATE: use success + rows_processed (number of rows processed) // - SERIALIZE: use success + data (serialized_state) // - FINALIZE: use success + data (serialized result, may be null) +// - Any failed operation: use success=false + data (UTF-8 error message) // // This unified schema allows all operations to return consistent format, // solving Arrow Flight's limitation that all responses must have the same schema. 
@@ -51,6 +52,38 @@ static const std::shared_ptr kUnifiedUDAFResponseSchema = arrow:: arrow::field("serialized_data", arrow::binary()), }); +Status PythonUDAFClient::make_udaf_failure_status( + const std::shared_ptr& response, const char* operation, + int64_t place_id) { + if (response == nullptr || response->num_rows() != 1 || + response->num_columns() != kUnifiedUDAFResponseSchema->num_fields()) [[unlikely]] { + return Status::InternalError("Invalid {} failure response for place_id={}", operation, + place_id); + } + + auto data_array = std::static_pointer_cast(response->column(2)); + if (data_array->IsNull(0)) { + return Status::InternalError("{} operation failed for place_id={}", operation, place_id); + } + + const uint8_t* data = data_array->value_data()->data() + data_array->value_offset(0); + int32_t length = data_array->value_length(0); + if (length <= 0) { + return Status::InternalError("{} operation failed for place_id={}", operation, place_id); + } + std::string error_message(reinterpret_cast(data), length); + return Status::InternalError("{} operation failed for place_id={}: {}", operation, place_id, + error_message); +} + +#ifdef BE_TEST +Status PythonUDAFClient::make_udaf_failure_status_for_test( + const std::shared_ptr& response, const char* operation, + int64_t place_id) { + return make_udaf_failure_status(response, operation, place_id); +} +#endif + Status PythonUDAFClient::create(const PythonUDFMeta& func_meta, ProcessPtr process, const std::shared_ptr& data_schema, PythonUDAFClientPtr* client) { @@ -89,7 +122,7 @@ Status PythonUDAFClient::create(int64_t place_id) { auto success_array = std::static_pointer_cast(response_batch->column(0)); if (!success_array->Value(0)) { - return Status::InternalError("CREATE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response_batch, "CREATE", place_id); } _created_place_id = place_id; @@ -142,7 +175,7 @@ Status PythonUDAFClient::accumulate(int64_t place_id, bool is_single_place, 
auto rows_processed_array = std::static_pointer_cast(response->column(1)); if (!success_array->Value(0)) { - return Status::InternalError("ACCUMULATE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "ACCUMULATE", place_id); } // Cast to uint8_t* first to avoid UBSAN misaligned pointer errors @@ -185,7 +218,7 @@ Status PythonUDAFClient::serialize(int64_t place_id, auto data_array = std::static_pointer_cast(response->column(2)); if (!success_array->Value(0)) { - return Status::InternalError("SERIALIZE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "SERIALIZE", place_id); } // Cast to uint8_t* first to avoid UBSAN misaligned pointer errors @@ -233,7 +266,7 @@ Status PythonUDAFClient::merge(int64_t place_id, auto success_array = std::static_pointer_cast(response->column(0)); if (!success_array->Value(0)) { - return Status::InternalError("MERGE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "MERGE", place_id); } return Status::OK(); @@ -260,7 +293,7 @@ Status PythonUDAFClient::finalize(int64_t place_id, std::shared_ptr(response_batch->column(2)); if (!success_array->Value(0)) { - return Status::InternalError("FINALIZE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response_batch, "FINALIZE", place_id); } // Cast to uint8_t* first to avoid UBSAN misaligned pointer errors @@ -324,7 +357,7 @@ Status PythonUDAFClient::reset(int64_t place_id) { auto success_array = std::static_pointer_cast(response->column(0)); if (!success_array->Value(0)) { - return Status::InternalError("RESET operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "RESET", place_id); } return Status::OK(); @@ -363,7 +396,7 @@ Status PythonUDAFClient::destroy(int64_t place_id) { if (!success_array->Value(0)) { LOG(WARNING) << "DESTROY operation failed for place_id=" << place_id; - return 
Status::InternalError("DESTROY operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "DESTROY", place_id); } return Status::OK(); diff --git a/be/src/udf/python/python_udaf_client.h b/be/src/udf/python/python_udaf_client.h index 078c34a39ea967..471716651a4d9a 100644 --- a/be/src/udf/python/python_udaf_client.h +++ b/be/src/udf/python/python_udaf_client.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "udf/python/python_client.h" @@ -173,9 +174,18 @@ class PythonUDAFClient : public PythonClient { */ Status close(); +#ifdef BE_TEST + static Status make_udaf_failure_status_for_test( + const std::shared_ptr& response, const char* operation, + int64_t place_id); +#endif + private: DISALLOW_COPY_AND_ASSIGN(PythonUDAFClient); + static Status make_udaf_failure_status(const std::shared_ptr& response, + const char* operation, int64_t place_id); + /** * Send RecordBatch request to Python server with app_metadata * @param metadata UDAFMetadata structure (will be sent as app_metadata) diff --git a/be/test/udf/python/python_udaf_client_test.cpp b/be/test/udf/python/python_udaf_client_test.cpp new file mode 100644 index 00000000000000..0867749f947471 --- /dev/null +++ b/be/test/udf/python/python_udaf_client_test.cpp @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "udf/python/python_udaf_client.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace doris { + +std::shared_ptr make_udaf_response(const std::optional& error) { + arrow::BooleanBuilder success_builder; + std::shared_ptr success_array; + EXPECT_TRUE(success_builder.Append(false).ok()); + EXPECT_TRUE(success_builder.Finish(&success_array).ok()); + + arrow::Int64Builder rows_processed_builder; + std::shared_ptr rows_processed_array; + EXPECT_TRUE(rows_processed_builder.Append(0).ok()); + EXPECT_TRUE(rows_processed_builder.Finish(&rows_processed_array).ok()); + + arrow::BinaryBuilder data_builder; + std::shared_ptr data_array; + if (error.has_value()) { + EXPECT_TRUE(data_builder.Append(error->data(), static_cast(error->size())).ok()); + } else { + EXPECT_TRUE(data_builder.AppendNull().ok()); + } + EXPECT_TRUE(data_builder.Finish(&data_array).ok()); + + auto schema = arrow::schema({ + arrow::field("success", arrow::boolean()), + arrow::field("rows_processed", arrow::int64()), + arrow::field("serialized_data", arrow::binary()), + }); + return arrow::RecordBatch::Make(schema, 1, {success_array, rows_processed_array, data_array}); +} + +TEST(PythonUDAFClientTest, FailureStatusIncludesPythonErrorMessage) { + auto response = make_udaf_response("finish failed"); + Status status = PythonUDAFClient::make_udaf_failure_status_for_test(response, "FINALIZE", 7); + + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("FINALIZE operation failed for place_id=7: finish failed"), + std::string::npos); +} + +TEST(PythonUDAFClientTest, FailureStatusFallsBackWhenErrorMessageIsNullOrEmpty) { + Status null_status = PythonUDAFClient::make_udaf_failure_status_for_test( + make_udaf_response(std::nullopt), "RESET", 8); + EXPECT_FALSE(null_status.ok()); + EXPECT_NE(null_status.to_string().find("RESET operation failed for 
place_id=8"), + std::string::npos); + + Status empty_status = + PythonUDAFClient::make_udaf_failure_status_for_test(make_udaf_response(""), "MERGE", 9); + EXPECT_FALSE(empty_status.ok()); + EXPECT_NE(empty_status.to_string().find("MERGE operation failed for place_id=9"), + std::string::npos); +} + +TEST(PythonUDAFClientTest, FailureStatusRejectsInvalidResponseShape) { + Status null_status = + PythonUDAFClient::make_udaf_failure_status_for_test(nullptr, "ACCUMULATE", 10); + EXPECT_FALSE(null_status.ok()); + EXPECT_NE(null_status.to_string().find("Invalid ACCUMULATE failure response for place_id=10"), + std::string::npos); + + auto zero_row_response = make_udaf_response("accumulate failed")->Slice(0, 0); + Status zero_row_status = PythonUDAFClient::make_udaf_failure_status_for_test(zero_row_response, + "ACCUMULATE", 11); + EXPECT_FALSE(zero_row_status.ok()); + EXPECT_NE( + zero_row_status.to_string().find("Invalid ACCUMULATE failure response for place_id=11"), + std::string::npos); + + auto response = make_udaf_response("reset failed"); + auto two_column_response = arrow::RecordBatch::Make( + arrow::schema({response->schema()->field(0), response->schema()->field(1)}), 1, + {response->column(0), response->column(1)}); + Status two_column_status = + PythonUDAFClient::make_udaf_failure_status_for_test(two_column_response, "RESET", 12); + EXPECT_FALSE(two_column_status.ok()); + EXPECT_NE(two_column_status.to_string().find("Invalid RESET failure response for place_id=12"), + std::string::npos); +} + +} // namespace doris From bce6c6c7f2da64443362abd8777d064f3b5b7526 Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Wed, 29 Apr 2026 17:08:44 +0800 Subject: [PATCH 09/11] fix misalign --- be/src/udf/python/python_udaf_client.cpp | 39 +++++++++------- .../udf/python/python_udaf_client_test.cpp | 46 +++++++++++++++++++ 2 files changed, 69 insertions(+), 16 deletions(-) diff --git a/be/src/udf/python/python_udaf_client.cpp b/be/src/udf/python/python_udaf_client.cpp index 
07c5b98027e5f4..90b983c8671a5b 100644 --- a/be/src/udf/python/python_udaf_client.cpp +++ b/be/src/udf/python/python_udaf_client.cpp @@ -30,6 +30,7 @@ #include "common/compiler_util.h" #include "common/status.h" #include "format/arrow/arrow_utils.h" +#include "util/unaligned.h" #include "udf/python/python_udf_meta.h" #include "udf/python/python_udf_runtime.h" @@ -66,11 +67,20 @@ Status PythonUDAFClient::make_udaf_failure_status( return Status::InternalError("{} operation failed for place_id={}", operation, place_id); } - const uint8_t* data = data_array->value_data()->data() + data_array->value_offset(0); - int32_t length = data_array->value_length(0); + const auto* offsets = data_array->raw_value_offsets(); + if (offsets == nullptr) [[unlikely]] { + return Status::InternalError("Invalid {} failure response for place_id={}: null offsets", + operation, place_id); + } + // Arrow Flight buffers may be unaligned after IPC deserialization + int32_t offset_start = unaligned_load(offsets); + int32_t offset_end = unaligned_load(offsets + 1); + + int32_t length = offset_end - offset_start; if (length <= 0) { return Status::InternalError("{} operation failed for place_id={}", operation, place_id); } + const uint8_t* data = data_array->value_data()->data() + offset_start; std::string error_message(reinterpret_cast(data), length); return Status::InternalError("{} operation failed for place_id={}: {}", operation, place_id, error_message); @@ -178,13 +188,12 @@ Status PythonUDAFClient::accumulate(int64_t place_id, bool is_single_place, return make_udaf_failure_status(response, "ACCUMULATE", place_id); } - // Cast to uint8_t* first to avoid UBSAN misaligned pointer errors - const uint8_t* raw_ptr = reinterpret_cast(rows_processed_array->raw_values()); + // Arrow Flight buffers may be unaligned after IPC deserialization. 
+ const auto* raw_ptr = rows_processed_array->raw_values(); if (raw_ptr == nullptr) { return Status::InternalError("ACCUMULATE response has null rows_processed array"); } - int64_t rows_processed; - memcpy(&rows_processed, raw_ptr, sizeof(int64_t)); + int64_t rows_processed = unaligned_load(raw_ptr); int64_t expected_rows = row_end - row_start; @@ -221,14 +230,13 @@ Status PythonUDAFClient::serialize(int64_t place_id, return make_udaf_failure_status(response, "SERIALIZE", place_id); } - // Cast to uint8_t* first to avoid UBSAN misaligned pointer errors - const uint8_t* offsets = reinterpret_cast(data_array->raw_value_offsets()); + // Arrow Flight buffers may be unaligned after IPC deserialization. + const auto* offsets = data_array->raw_value_offsets(); if (offsets == nullptr) { return Status::InternalError("SERIALIZE response has null offsets"); } - int32_t offset_start, offset_end; - memcpy(&offset_start, offsets, sizeof(int32_t)); - memcpy(&offset_end, offsets + sizeof(int32_t), sizeof(int32_t)); + int32_t offset_start = unaligned_load(offsets); + int32_t offset_end = unaligned_load(offsets + 1); int32_t length = offset_end - offset_start; @@ -296,14 +304,13 @@ Status PythonUDAFClient::finalize(int64_t place_id, std::shared_ptr(data_array->raw_value_offsets()); + // Arrow Flight buffers may be unaligned after IPC deserialization. 
+ const auto* offsets = data_array->raw_value_offsets(); if (offsets == nullptr) { return Status::InternalError("FINALIZE response has null offsets"); } - int32_t offset_start, offset_end; - memcpy(&offset_start, offsets, sizeof(int32_t)); - memcpy(&offset_end, offsets + sizeof(int32_t), sizeof(int32_t)); + int32_t offset_start = unaligned_load(offsets); + int32_t offset_end = unaligned_load(offsets + 1); int32_t length = offset_end - offset_start; diff --git a/be/test/udf/python/python_udaf_client_test.cpp b/be/test/udf/python/python_udaf_client_test.cpp index 0867749f947471..eb1ab5242b8cf0 100644 --- a/be/test/udf/python/python_udaf_client_test.cpp +++ b/be/test/udf/python/python_udaf_client_test.cpp @@ -23,9 +23,12 @@ #include #include +#include +#include #include #include #include +#include namespace doris { @@ -57,6 +60,26 @@ std::shared_ptr make_udaf_response(const std::optional make_udaf_response_with_data_array( + const std::shared_ptr& data_array) { + arrow::BooleanBuilder success_builder; + std::shared_ptr success_array; + EXPECT_TRUE(success_builder.Append(false).ok()); + EXPECT_TRUE(success_builder.Finish(&success_array).ok()); + + arrow::Int64Builder rows_processed_builder; + std::shared_ptr rows_processed_array; + EXPECT_TRUE(rows_processed_builder.Append(0).ok()); + EXPECT_TRUE(rows_processed_builder.Finish(&rows_processed_array).ok()); + + auto schema = arrow::schema({ + arrow::field("success", arrow::boolean()), + arrow::field("rows_processed", arrow::int64()), + arrow::field("serialized_data", arrow::binary()), + }); + return arrow::RecordBatch::Make(schema, 1, {success_array, rows_processed_array, data_array}); +} + TEST(PythonUDAFClientTest, FailureStatusIncludesPythonErrorMessage) { auto response = make_udaf_response("finish failed"); Status status = PythonUDAFClient::make_udaf_failure_status_for_test(response, "FINALIZE", 7); @@ -66,6 +89,29 @@ TEST(PythonUDAFClientTest, FailureStatusIncludesPythonErrorMessage) { std::string::npos); } 
+TEST(PythonUDAFClientTest, FailureStatusHandlesUnalignedBinaryOffsets) { + std::string error = "finalize failed"; + std::vector offset_storage(1 + 2 * sizeof(int32_t)); + uint8_t* unaligned_offsets = offset_storage.data() + 1; + int32_t offset_start = 0; + int32_t offset_end = static_cast(error.size()); + memcpy(unaligned_offsets, &offset_start, sizeof(int32_t)); + memcpy(unaligned_offsets + sizeof(int32_t), &offset_end, sizeof(int32_t)); + + auto offset_buffer = arrow::Buffer::Wrap(unaligned_offsets, 2 * sizeof(int32_t)); + auto value_buffer = + arrow::Buffer::Wrap(reinterpret_cast(error.data()), error.size()); + auto data_array = std::make_shared(1, offset_buffer, value_buffer); + ASSERT_EQ(reinterpret_cast(data_array->raw_value_offsets()) % alignof(int32_t), 1); + + Status status = PythonUDAFClient::make_udaf_failure_status_for_test( + make_udaf_response_with_data_array(data_array), "FINALIZE", 13); + + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("FINALIZE operation failed for place_id=13: finalize failed"), + std::string::npos); +} + TEST(PythonUDAFClientTest, FailureStatusFallsBackWhenErrorMessageIsNullOrEmpty) { Status null_status = PythonUDAFClient::make_udaf_failure_status_for_test( make_udaf_response(std::nullopt), "RESET", 8); From a59cb215f95a0c725a1265390518e3b65e6e639b Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Wed, 29 Apr 2026 21:50:19 +0800 Subject: [PATCH 10/11] [Fix](pyudf) clear Nereids UDF registry on drop database --- .../doris/catalog/FunctionRegistry.java | 9 ++ .../doris/datasource/InternalCatalog.java | 13 +++ .../pythonudf_p0/test_pythonudf_drop.groovy | 105 ++++++++++++++++++ 3 files changed, 127 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionRegistry.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionRegistry.java index 619de51e50d8cb..1a8402520ef01b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionRegistry.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionRegistry.java @@ -322,6 +322,15 @@ public void dropUdf(String dbName, String name, List argTypes) { } } + public void dropUdfByDb(String dbName) { + if (dbName == null) { + dbName = GLOBAL_FUNCTION; + } + synchronized (name2UdfBuilders) { + name2UdfBuilders.remove(dbName); + } + } + /** * use for search appropriate signature for UDFs if candidate more than one. */ diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java index 7b48cbb10efce0..17fc587963e8f7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java @@ -42,6 +42,8 @@ import org.apache.doris.catalog.DynamicPartitionProperty; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.EnvFactory; +import org.apache.doris.catalog.Function; +import org.apache.doris.catalog.FunctionUtil; import org.apache.doris.catalog.HashDistributionInfo; import org.apache.doris.catalog.Index; import org.apache.doris.catalog.InfoSchemaDb; @@ -537,6 +539,7 @@ public void dropDb(String dbName, boolean ifExists, boolean force) throws DdlExc // 3. 
remove db from catalog idToDb.remove(db.getId()); fullNameToDb.remove(db.getFullName()); + Env.getCurrentEnv().getFunctionRegistry().dropUdfByDb(db.getFullName()); DropDbInfo info = new DropDbInfo(dbName, force, recycleTime); Env.getCurrentEnv().getQueryStats().clear(Env.getCurrentEnv().getCurrentCatalog().getId(), db.getId()); Env.getCurrentEnv().getDictionaryManager().dropDbDictionaries(dbName); @@ -595,6 +598,7 @@ public void replayDropDb(String dbName, boolean isForceDrop, Long recycleTime) t fullNameToDb.remove(dbName); idToDb.remove(db.getId()); + Env.getCurrentEnv().getFunctionRegistry().dropUdfByDb(dbName); } finally { unlock(); } @@ -644,6 +648,7 @@ public void recoverDatabase(String dbName, long dbId, String newDbName) throws D RecoverInfo recoverInfo = new RecoverInfo(db.getId(), -1L, -1L, newDbName, "", "", "", ""); Env.getCurrentEnv().getEditLog().logRecoverDb(recoverInfo); db.unmarkDropped(); + registerDbFunctionsToNereids(db); } finally { MetaLockUtils.writeUnlockTables(tableList); db.writeUnlock(); @@ -726,9 +731,17 @@ public void replayRecoverDatabase(RecoverInfo info) { // add db to catalog replayCreateDb(db, newDbName); db.unmarkDropped(); + registerDbFunctionsToNereids(db); LOG.info("replay recover db[{}]", dbId); } + private void registerDbFunctionsToNereids(Database db) { + // A recovered database reuses catalog Function objects, so rebuild their Nereids builders. 
+ for (Function function : db.getFunctions()) { + FunctionUtil.translateToNereids(db.getFullName(), function); + } + } + public void alterDatabaseQuota(String dbName, QuotaType quotaType, long quotaValue) throws DdlException { Database db = getDbOrDdlException(dbName); db.writeLockOrDdlException(); diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy index 2672cadbee886a..b2ab740f6ea0ec 100644 --- a/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy @@ -111,10 +111,115 @@ suite("test_pythonudf_drop", "nonConcurrent") { qt_py_udf_drop_5 """SELECT py_drop_reconnect(32);""" try_sql("DROP FUNCTION IF EXISTS py_drop_reconnect(INT);") + + // Case 4: recreating the same signature must use the new inline function body. + sql """DROP FUNCTION IF EXISTS py_drop_recreate(INT)""" + sql """ + CREATE FUNCTION py_drop_recreate(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 1 +\$\$ + """ + def recreateOldResult = sql """SELECT py_drop_recreate(10);""" + assert recreateOldResult[0][0] == 11 + + sql """DROP FUNCTION IF EXISTS py_drop_recreate(INT)""" + sql """ + CREATE FUNCTION py_drop_recreate(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 999 +\$\$ + """ + def recreateNewResult = sql """SELECT py_drop_recreate(10);""" + assert recreateNewResult[0][0] == 1009 + sql """DROP FUNCTION IF EXISTS py_drop_recreate(INT)""" + + // Case 5: dropping a database must also clear Nereids UDF registry. 
+ // SHOW FUNCTIONS reads catalog metadata, while SELECT resolves from FunctionRegistry. + // Without registry cleanup, SELECT could still bind the stale x + 1 inline UDF + // after the database had been dropped and recreated. + def originalDb = sql("SELECT DATABASE()")[0][0] + def registryDb = "${originalDb}_registry_cleanup" + try { + sql """DROP DATABASE IF EXISTS ${registryDb} FORCE""" + sql """CREATE DATABASE ${registryDb}""" + sql """USE ${registryDb}""" + sql """ + CREATE FUNCTION py_drop_db_registry(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 1 +\$\$ + """ + def oldResult = sql """SELECT py_drop_db_registry(10);""" + assert oldResult[0][0] == 11 + + sql """DROP DATABASE ${registryDb} FORCE""" + sql """CREATE DATABASE ${registryDb}""" + sql """USE ${registryDb}""" + def functions = sql """SHOW FUNCTIONS LIKE 'py_drop_db_registry'""" + assert functions.isEmpty() + test { + sql """SELECT py_drop_db_registry(10);""" + exception "Can not found function" + } + + sql """ + CREATE FUNCTION py_drop_db_registry(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 999 +\$\$ + """ + def rebuiltResult = sql """SELECT py_drop_db_registry(10);""" + assert rebuiltResult[0][0] == 1009 + } finally { + sql """USE ${originalDb}""" + try_sql("DROP DATABASE IF EXISTS ${registryDb} FORCE") + } } finally { try_sql("DROP FUNCTION IF EXISTS py_drop_once(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_a(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_b(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_reconnect(INT);") + try_sql("DROP FUNCTION IF EXISTS py_drop_recreate(INT);") } } From 
4354e5b52cd3fdcf59444ebe8c319d1c9cfc68e1 Mon Sep 17 00:00:00 2001 From: linzhenqi Date: Wed, 29 Apr 2026 23:49:19 +0800 Subject: [PATCH 11/11] [Fix](pyudf) clear stale UDAF state cache on drop --- be/src/agent/task_worker_pool.cpp | 2 + be/src/udf/python/python_server.cpp | 24 +++++-- be/src/udf/python/python_server.h | 5 ++ be/src/udf/python/python_server.py | 70 +++++++++++++++++-- be/src/udf/python/python_udf_meta.cpp | 2 + be/test/udf/python/python_server_test.cpp | 12 ++++ be/test/udf/python/python_udf_meta_test.cpp | 3 + .../pythonudaf_p0/test_pythonudaf_drop.groovy | 63 +++++++++++++++++ 8 files changed, 170 insertions(+), 11 deletions(-) diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index 45720833eab8c9..b98815d61a54b3 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -92,6 +92,7 @@ #include "storage/task/engine_storage_migration_task.h" #include "storage/txn/txn_manager.h" #include "storage/utils.h" +#include "udf/python/python_server.h" #include "util/brpc_client_cache.h" #include "util/debug_points.h" #include "util/jni-util.h" @@ -2511,6 +2512,7 @@ void clean_udf_cache_callback(const TAgentTaskRequest& req) { if (clean_req.__isset.function_id && clean_req.function_id > 0) { UserFunctionCache::instance()->drop_function_cache(clean_req.function_id); + PythonServerManager::instance().clear_udaf_state_cache(clean_req.function_id); } LOG(INFO) << "clean udf cache finish: function_signature=" << clean_req.function_signature; diff --git a/be/src/udf/python/python_server.cpp b/be/src/udf/python/python_server.cpp index 2b001a927f05f0..ff9f6fa9133306 100644 --- a/be/src/udf/python/python_server.cpp +++ b/be/src/udf/python/python_server.cpp @@ -31,6 +31,7 @@ #include "arrow/flight/client.h" #include "common/config.h" +#include "common/status.h" #include "udf/python/python_udaf_client.h" #include "udf/python/python_udf_client.h" #include "udf/python/python_udtf_client.h" @@ -436,7 +437,19 
@@ Status PythonServerManager::clear_module_cache(const std::string& location) { } std::string body = fmt::format(R"({{"location": "{}"}})", location); + return _broadcast_action_to_processes("clear_module_cache", body, + fmt::format("location={}", location)); +} + +void PythonServerManager::clear_udaf_state_cache(int64_t function_id) { + std::string body = fmt::format(R"({{"function_id": {}}})", function_id); + THROW_IF_ERROR(_broadcast_action_to_processes("clear_udaf_state_cache", body, + fmt::format("function_id={}", function_id))); +} +Status PythonServerManager::_broadcast_action_to_processes(const std::string& action_type, + const std::string& body, + const std::string& log_name) { int success_count = 0; int fail_count = 0; bool has_active_process = false; @@ -464,7 +477,7 @@ Status PythonServerManager::clear_module_cache(const std::string& location) { auto client = std::move(*client_result); arrow::flight::Action action; - action.type = "clear_module_cache"; + action.type = action_type; action.body = arrow::Buffer::FromString(body); auto result_stream = client->DoAction(action); @@ -490,13 +503,12 @@ Status PythonServerManager::clear_module_cache(const std::string& location) { return Status::OK(); } - LOG(INFO) << "clear_module_cache completed for location=" << location - << ", success=" << success_count << ", failed=" << fail_count; + LOG(INFO) << action_type << " completed for " << log_name << ", success=" << success_count + << ", failed=" << fail_count; if (fail_count > 0) { - return Status::InternalError( - "clear_module_cache failed for location={}, success={}, failed={}", location, - success_count, fail_count); + return Status::InternalError("{} failed for {}, success={}, failed={}", action_type, + log_name, success_count, fail_count); } return Status::OK(); diff --git a/be/src/udf/python/python_server.h b/be/src/udf/python/python_server.h index 1e0b978f3bb49e..4362e95cb1cfd3 100644 --- a/be/src/udf/python/python_server.h +++ 
b/be/src/udf/python/python_server.h @@ -54,6 +54,9 @@ class PythonServerManager { // Clear Python module cache for a specific UDF location across all processes Status clear_module_cache(const std::string& location); + // Clear Python UDAF runtime state after DROP FUNCTION + void clear_udaf_state_cache(int64_t function_id); + Status ensure_pool_initialized(const PythonVersion& version); void shutdown(); @@ -100,6 +103,8 @@ class PythonServerManager { std::shared_ptr _get_process_pool(const PythonVersion& version); std::vector>> _snapshot_process_pools(); + Status _broadcast_action_to_processes(const std::string& action_type, const std::string& body, + const std::string& log_name); std::unordered_map> _process_pools; // Protects the version -> pool handle map only. Per-version process operations are guarded diff --git a/be/src/udf/python/python_server.py b/be/src/udf/python/python_server.py index 665fc46e515799..bf162b871604e8 100644 --- a/be/src/udf/python/python_server.py +++ b/be/src/udf/python/python_server.py @@ -455,6 +455,7 @@ class PythonUDFMeta: def __init__( self, + function_id: int, name: str, symbol: str, location: str, @@ -470,6 +471,7 @@ def __init__( Initialize Python UDF metadata. 
Args: + function_id: FE catalog function id name: UDF function name symbol: Symbol to load (function name or module.function) location: File path or directory containing the UDF @@ -481,6 +483,7 @@ def __init__( output_type: PyArrow data type for return value client_type: 0 for UDF, 1 for UDAF, 2 for UDTF """ + self.id = function_id self.name = name self.symbol = symbol self.location = location @@ -508,7 +511,7 @@ def __str__(self) -> str: """Returns a string representation of the UDF metadata.""" udf_load_type_str = "INLINE" if self.udf_load_type == 0 else "MODULE" return ( - f"PythonUDFMeta(name={self.name}, symbol={self.symbol}, " + f"PythonUDFMeta(id={self.id}, name={self.name}, symbol={self.symbol}, " f"location={self.location}, udf_load_type={udf_load_type_str}, runtime_version={self.runtime_version}, " f"always_nullable={self.always_nullable}, client_type={self.client_type.name}, " f"input_types={self.input_types}, output_type={self.output_type})" @@ -1573,8 +1576,9 @@ def __init__(self, location: str): location: Unix socket path for the server """ super().__init__(location) - # Use a dictionary to maintain separate state managers for each UDAF function - # Key: function signature (name + input_types), Value: UDAFStateManager instance + # Use a dictionary to maintain separate state managers for each UDAF function. + # Key includes function_id so DROP/CREATE with the same name and signature + # cannot reuse a class loaded from old inline code. 
self.udaf_state_managers: Dict[str, UDAFStateManager] = {} self.udaf_managers_lock = threading.Lock() @@ -1591,9 +1595,10 @@ def _get_udaf_state_manager( Returns: UDAFStateManager instance for this specific UDAF """ - # Create a unique key based on function name and argument types type_names = [str(field.type) for field in python_udaf_meta.input_types] - func_key = f"{python_udaf_meta.name}({','.join(type_names)})" + func_key = ( + f"{python_udaf_meta.id}:{python_udaf_meta.name}({','.join(type_names)})" + ) with self.udaf_managers_lock: if func_key not in self.udaf_state_managers: @@ -1605,6 +1610,31 @@ def _get_udaf_state_manager( return self.udaf_state_managers[func_key] + def _clear_udaf_state_cache_by_function_id(self, function_id: int) -> int: + """ + Clear UDAF managers for a dropped function id. + + DROP FUNCTION cache cleanup is asynchronous. The runtime key still includes + function_id for correctness, while this action releases old states and class + objects after the drop task reaches this Python process. 
+ """ + prefix = f"{function_id}:" + cleared = 0 + + with self.udaf_managers_lock: + keys_to_remove = [ + key for key in self.udaf_state_managers if key.startswith(prefix) + ] + for key in keys_to_remove: + manager = self.udaf_state_managers.pop(key) + manager.states.clear() + cleared += 1 + + if cleared: + gc.collect() + + return cleared + @staticmethod def parse_python_udf_meta( descriptor: flight.FlightDescriptor, @@ -1621,6 +1651,7 @@ def parse_python_udf_meta( return None cmd_json = json.loads(descriptor.command) + function_id = cmd_json["id"] name = cmd_json["name"] symbol = cmd_json["symbol"] location = cmd_json["location"] @@ -1646,6 +1677,7 @@ def parse_python_udf_meta( output_type = output_schema.field(0).type python_udf_meta = PythonUDFMeta( + function_id=function_id, name=name, symbol=symbol, location=location, @@ -2526,14 +2558,42 @@ def do_action( Supported actions: - "clear_module_cache": Clear Python module cache for a specific location Body: JSON with "location" field (the UDF cache directory path) + - "clear_udaf_state_cache": Clear UDAF runtime state for a dropped function id + Body: JSON with "function_id" field """ action_type = action.type if action_type == "clear_module_cache": yield from self._handle_clear_module_cache(action.body.to_pybytes()) + elif action_type == "clear_udaf_state_cache": + yield from self._handle_clear_udaf_state_cache(action.body.to_pybytes()) else: raise flight.FlightUnavailableError(f"Unknown action: {action_type}") + def _handle_clear_udaf_state_cache(self, body: bytes): + """ + Clear cached UDAF state managers for a dropped function id. 
+ """ + try: + params = json.loads(body.decode("utf-8")) + function_id = int(params["function_id"]) + + cleared_managers = self._clear_udaf_state_cache_by_function_id(function_id) + + result = { + "success": True, + "cleared_managers": cleared_managers, + "function_id": function_id, + } + yield flight.Result(json.dumps(result).encode("utf-8")) + + except Exception as e: + logging.error("clear_udaf_state_cache failed: %s", e) + yield flight.Result(json.dumps({ + "success": False, + "error": str(e) + }).encode("utf-8")) + def _handle_clear_module_cache(self, body: bytes): """ Clear Python module cache for a specific UDF location. diff --git a/be/src/udf/python/python_udf_meta.cpp b/be/src/udf/python/python_udf_meta.cpp index 49806f82ca944e..5cd855432de70d 100644 --- a/be/src/udf/python/python_udf_meta.cpp +++ b/be/src/udf/python/python_udf_meta.cpp @@ -56,6 +56,7 @@ Status PythonUDFMeta::serialize_arrow_schema(const std::shared_ptr(type)), allocator); diff --git a/be/test/udf/python/python_server_test.cpp b/be/test/udf/python/python_server_test.cpp index 675c72988b64c4..7dfda79515b92d 100644 --- a/be/test/udf/python/python_server_test.cpp +++ b/be/test/udf/python/python_server_test.cpp @@ -301,6 +301,18 @@ TEST_F(PythonServerTest, ShutdownAfterFailedInitializationDoesNotCrash) { EXPECT_NO_THROW(mgr.shutdown()); } +TEST_F(PythonServerTest, ClearUdafStateCacheWithoutProcessesIsNoOp) { + PythonServerManager mgr; + + EXPECT_NO_THROW(mgr.clear_udaf_state_cache(12345)); +} + +TEST_F(PythonServerTest, ClearModuleCacheWithoutProcessesIsNoOp) { + PythonServerManager mgr; + + EXPECT_NO_THROW(mgr.clear_module_cache("/tmp/python_udf_cache")); +} + // ============================================================================ // PythonServerManager::get_client() - client retrieval test // ============================================================================ diff --git a/be/test/udf/python/python_udf_meta_test.cpp b/be/test/udf/python/python_udf_meta_test.cpp index 
fd651ae07d042a..4308543051057e 100644 --- a/be/test/udf/python/python_udf_meta_test.cpp +++ b/be/test/udf/python/python_udf_meta_test.cpp @@ -352,6 +352,9 @@ TEST_F(PythonUDFMetaTest, SerializeToJsonBasic) { doc.Parse(json_str.c_str()); EXPECT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc.HasMember("id")); + EXPECT_EQ(doc["id"].GetInt64(), 1); + EXPECT_TRUE(doc.HasMember("name")); EXPECT_STREQ(doc["name"].GetString(), "test_udf"); diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy index 4b64921676fd0b..e0b0ed8c4668e9 100644 --- a/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy @@ -124,10 +124,73 @@ suite('test_pythonudaf_drop', "nonConcurrent") { qt_py_udaf_drop_5 '''SELECT py_drop_sum_reconnect(v) FROM py_udaf_drop_tbl;''' try_sql('DROP FUNCTION IF EXISTS py_drop_sum_reconnect(INT);') + + // Case 4: inline UDAF drop/recreate must not reuse the old Python class. + // The Python server caches UDAF state managers, so this verifies the cache key + // and drop cleanup both use the FE function id, not just name + argument types. 
+ sql '''DROP FUNCTION IF EXISTS py_drop_inline_recreate(INT)''' + sql """ + CREATE AGGREGATE FUNCTION py_drop_inline_recreate(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "InlineDropRecreateUdaf", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +class InlineDropRecreateUdaf: + def __init__(self): + self.total = 0 + @property + def aggregate_state(self): + return self.total + def accumulate(self, val): + if val is not None: + self.total += val + def merge(self, other): + self.total += other + def finish(self): + return self.total * 10 +\$\$ + """ + def inlineOldResult = sql '''SELECT py_drop_inline_recreate(v) FROM py_udaf_drop_tbl;''' + assert inlineOldResult[0][0].toString() == '60' + + sql '''DROP FUNCTION IF EXISTS py_drop_inline_recreate(INT)''' + sql """ + CREATE AGGREGATE FUNCTION py_drop_inline_recreate(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "InlineDropRecreateUdaf", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +class InlineDropRecreateUdaf: + def __init__(self): + self.total = 0 + @property + def aggregate_state(self): + return self.total + def accumulate(self, val): + if val is not None: + self.total += val + def merge(self, other): + self.total += other + def finish(self): + return self.total * 100 +\$\$ + """ + def inlineNewResult = sql '''SELECT py_drop_inline_recreate(v) FROM py_udaf_drop_tbl;''' + assert inlineNewResult[0][0].toString() == '600' + sql '''DROP FUNCTION IF EXISTS py_drop_inline_recreate(INT)''' } finally { try_sql('DROP FUNCTION IF EXISTS py_drop_sum_once(INT);') try_sql('DROP FUNCTION IF EXISTS py_drop_sum_a(INT);') try_sql('DROP FUNCTION IF EXISTS py_drop_sum_b(INT);') try_sql('DROP FUNCTION IF EXISTS py_drop_sum_reconnect(INT);') + try_sql('DROP FUNCTION IF EXISTS py_drop_inline_recreate(INT);') } }