diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index 45720833eab8c9..b98815d61a54b3 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -92,6 +92,7 @@ #include "storage/task/engine_storage_migration_task.h" #include "storage/txn/txn_manager.h" #include "storage/utils.h" +#include "udf/python/python_server.h" #include "util/brpc_client_cache.h" #include "util/debug_points.h" #include "util/jni-util.h" @@ -2511,6 +2512,7 @@ void clean_udf_cache_callback(const TAgentTaskRequest& req) { if (clean_req.__isset.function_id && clean_req.function_id > 0) { UserFunctionCache::instance()->drop_function_cache(clean_req.function_id); + PythonServerManager::instance().clear_udaf_state_cache(clean_req.function_id); } LOG(INFO) << "clean udf cache finish: function_signature=" << clean_req.function_signature; diff --git a/be/src/exprs/function/function_python_udf.cpp b/be/src/exprs/function/function_python_udf.cpp index b874d3ce14a59d..072930852cab1c 100644 --- a/be/src/exprs/function/function_python_udf.cpp +++ b/be/src/exprs/function/function_python_udf.cpp @@ -54,18 +54,37 @@ Status PythonFunctionCall::open(FunctionContext* context, func_meta.id = _fn.id; func_meta.name = _fn.name.function_name; func_meta.symbol = _fn.scalar_fn.symbol; + LOG(INFO) << fmt::format( + "[pyudf-test] be open raw tfunction name={}, symbol={}, has_hdfs_location={}, " + "hdfs_location={}, has_function_code={}, function_code_empty={}, " + "has_runtime_version={}, " + "runtime_version={}, checksum={}", + _fn.name.function_name, _fn.scalar_fn.symbol, + _fn.__isset.hdfs_location ? "true" : "false", _fn.hdfs_location, + _fn.__isset.function_code ? "true" : "false", + _fn.function_code.empty() ? "true" : "false", + _fn.__isset.runtime_version ? 
"true" : "false", _fn.runtime_version, _fn.checksum); if (!_fn.function_code.empty()) { func_meta.type = PythonUDFLoadType::INLINE; func_meta.location = "inline"; func_meta.inline_code = _fn.function_code; + LOG(INFO) << fmt::format("[pyudf-test] be open inline mode code_length={}", + _fn.function_code.size()); } else if (!_fn.hdfs_location.empty()) { func_meta.type = PythonUDFLoadType::MODULE; func_meta.location = _fn.hdfs_location; func_meta.checksum = _fn.checksum; + LOG(INFO) << fmt::format("[pyudf-test] be open module mode url={}, checksum={}", + _fn.hdfs_location, _fn.checksum); } else { func_meta.type = PythonUDFLoadType::UNKNOWN; func_meta.location = "unknown"; + LOG(INFO) << "[pyudf-test] be open unknown mode because both function_code and " + "hdfs_location are empty"; } + LOG(INFO) << fmt::format( + "[pyudf-test] be open classified load_type={}, location={}, checksum={}", + static_cast(func_meta.type), func_meta.location, func_meta.checksum); func_meta.input_types = _argument_types; func_meta.return_type = _return_type; @@ -81,12 +100,15 @@ Status PythonFunctionCall::open(FunctionContext* context, func_meta.runtime_version = version.full_version; RETURN_IF_ERROR(func_meta.check()); func_meta.always_nullable = _return_type->is_nullable(); - LOG(INFO) << fmt::format("runtime_version: {}, func_meta: {}", version.to_string(), + LOG(INFO) << fmt::format("[pyudf-test] runtime_version: {}, func_meta: {}", version.to_string(), func_meta.to_string()); if (func_meta.type == PythonUDFLoadType::MODULE) { RETURN_IF_ERROR(UserFunctionCache::instance()->get_pypath( func_meta.id, func_meta.location, func_meta.checksum, &func_meta.location)); + LOG(INFO) << fmt::format( + "[pyudf-test] be open resolved module path id={}, resolved_location={}", + func_meta.id, func_meta.location); } PythonUDFClientPtr client = nullptr; @@ -112,7 +134,7 @@ Status PythonFunctionCall::execute_impl(FunctionContext* context, Block& block, return Status::InternalError("Python UDF client is 
null"); } - int64_t input_rows = block.rows(); + int64_t input_rows = num_rows; uint32_t input_columns = block.columns(); DCHECK(input_columns > 0 && result < input_columns && _argument_types.size() == arguments.size()); @@ -141,8 +163,13 @@ Status PythonFunctionCall::execute_impl(FunctionContext* context, Block& block, std::shared_ptr input_batch; std::shared_ptr output_batch; cctz::time_zone _timezone_obj; // default UTC - RETURN_IF_ERROR(convert_to_arrow_batch(input_block, schema, arrow::default_memory_pool(), - &input_batch, _timezone_obj)); + if (arguments.empty()) { + input_batch = arrow::RecordBatch::Make(schema, input_rows, + std::vector> {}); + } else { + RETURN_IF_ERROR(convert_to_arrow_batch(input_block, schema, arrow::default_memory_pool(), + &input_batch, _timezone_obj)); + } RETURN_IF_ERROR(client->evaluate(*input_batch, &output_batch)); int64_t output_rows = output_batch->num_rows(); diff --git a/be/src/exprs/table_function/python_udtf_function.cpp b/be/src/exprs/table_function/python_udtf_function.cpp index a116a3d6785297..d12fb278b10c6b 100644 --- a/be/src/exprs/table_function/python_udtf_function.cpp +++ b/be/src/exprs/table_function/python_udtf_function.cpp @@ -132,17 +132,28 @@ Status PythonUDTFFunction::process_init(Block* block, RuntimeState* state) { for (uint32_t i = 0; i < child_column_idxs.size(); ++i) { input_block.insert(block->get_by_position(child_column_idxs[i])); } + int64_t input_rows = block->rows(); std::shared_ptr input_schema; std::shared_ptr input_batch; RETURN_IF_ERROR(get_arrow_schema_from_block(input_block, &input_schema, TimezoneUtils::default_time_zone)); - RETURN_IF_ERROR(convert_to_arrow_batch(input_block, input_schema, arrow::default_memory_pool(), - &input_batch, _timezone_obj)); + if (child_column_idxs.empty()) { + input_batch = arrow::RecordBatch::Make(input_schema, input_rows, + std::vector> {}); + } else { + RETURN_IF_ERROR(convert_to_arrow_batch(input_block, input_schema, + arrow::default_memory_pool(), 
&input_batch, + _timezone_obj)); + } // Step 3: Call Python UDTF to evaluate all rows at once (similar to Java UDTF's JNI call) // Python returns a ListArray where each element contains outputs for one input row std::shared_ptr list_array; RETURN_IF_ERROR(_udtf_client->evaluate(*input_batch, &list_array)); + if (list_array->length() != input_rows) [[unlikely]] { + return Status::InternalError("Python UDTF output rows {} not equal to input rows {}", + list_array->length(), input_rows); + } // Step 4: Convert Python server output (ListArray) to Doris array column RETURN_IF_ERROR(_convert_list_array_to_array_column(list_array)); diff --git a/be/src/udf/python/python_server.cpp b/be/src/udf/python/python_server.cpp index 646e1e79039b5b..ff9f6fa9133306 100644 --- a/be/src/udf/python/python_server.cpp +++ b/be/src/udf/python/python_server.cpp @@ -27,9 +27,11 @@ #include #include #include +#include #include "arrow/flight/client.h" #include "common/config.h" +#include "common/status.h" #include "udf/python/python_udaf_client.h" #include "udf/python/python_udf_client.h" #include "udf/python/python_udtf_client.h" @@ -37,6 +39,50 @@ namespace doris { +std::shared_ptr +PythonServerManager::_get_or_create_process_pool(const PythonVersion& version) { + std::lock_guard lock(_pools_mutex); + auto& pool = _process_pools[version]; + if (!pool) { + pool = std::make_shared(); + } + return pool; +} + +std::shared_ptr PythonServerManager::_get_process_pool( + const PythonVersion& version) { + std::lock_guard lock(_pools_mutex); + auto it = _process_pools.find(version); + return it == _process_pools.end() ? 
nullptr : it->second; +} + +std::vector>> +PythonServerManager::_snapshot_process_pools() { + std::lock_guard lock(_pools_mutex); + std::vector>> snapshot; + snapshot.reserve(_process_pools.size()); + for (const auto& [version, pool] : _process_pools) { + snapshot.emplace_back(version, pool); + } + return snapshot; +} + +#ifdef BE_TEST +void PythonServerManager::set_process_pool_for_test(const PythonVersion& version, + std::vector processes, + bool initialized) { + auto versioned_pool = _get_or_create_process_pool(version); + std::lock_guard lock(versioned_pool->mutex); + versioned_pool->processes = std::move(processes); + versioned_pool->initialized = initialized; +} + +std::vector& PythonServerManager::process_pool_for_test(const PythonVersion& version) { + auto versioned_pool = _get_or_create_process_pool(version); + return versioned_pool->processes; +} +#endif + template Status PythonServerManager::get_client(const PythonUDFMeta& func_meta, const PythonVersion& version, std::shared_ptr* client, @@ -57,12 +103,12 @@ Status PythonServerManager::get_client(const PythonUDFMeta& func_meta, const Pyt } Status PythonServerManager::ensure_pool_initialized(const PythonVersion& version) { - std::lock_guard lock(_pools_mutex); + auto versioned_pool = _get_or_create_process_pool(version); + std::lock_guard lock(versioned_pool->mutex); // Check if already initialized - if (_initialized_versions.count(version)) return Status::OK(); + if (versioned_pool->initialized) return Status::OK(); - std::vector& pool = _process_pools[version]; // 0 means use CPU core count as default, otherwise use the specified value int max_pool_size = config::max_python_process_num > 0 ? 
config::max_python_process_num : CpuInfo::num_cores(); @@ -91,7 +137,7 @@ Status PythonServerManager::ensure_pool_initialized(const PythonVersion& version for (int i = 0; i < max_pool_size; i++) { Status s = futures[i].get(); if (s.ok() && temp_processes[i]) { - pool.push_back(std::move(temp_processes[i])); + versioned_pool->processes.push_back(std::move(temp_processes[i])); success_count++; } else { failure_count++; @@ -100,7 +146,7 @@ Status PythonServerManager::ensure_pool_initialized(const PythonVersion& version } } - if (pool.empty()) { + if (versioned_pool->processes.empty()) { return Status::InternalError( "Failed to initialize Python process pool: all {} process creation attempts failed", max_pool_size); @@ -110,29 +156,73 @@ Status PythonServerManager::ensure_pool_initialized(const PythonVersion& version << ": created " << success_count << " processes" << (failure_count > 0 ? fmt::format(" ({} failed)", failure_count) : ""); - _initialized_versions.insert(version); + versioned_pool->initialized = true; _start_health_check_thread(); return Status::OK(); } Status PythonServerManager::get_process(const PythonVersion& version, ProcessPtr* process) { - std::lock_guard lock(_pools_mutex); - std::vector& pool = _process_pools[version]; + auto versioned_pool = _get_process_pool(version); + if (!versioned_pool) { + return Status::InternalError("Python process pool is empty for version {}", + version.to_string()); + } + std::lock_guard lock(versioned_pool->mutex); + std::vector& pool = versioned_pool->processes; if (UNLIKELY(pool.empty())) { return Status::InternalError("Python process pool is empty for version {}", version.to_string()); } - // Find process with minimum load (use_count - 1 gives active client count) - auto min_iter = std::min_element( - pool.begin(), pool.end(), - [](const ProcessPtr& a, const ProcessPtr& b) { return a.use_count() < b.use_count(); }); + // Prefer an already-alive process and only use load balancing inside that alive subset. 
+ // keep dead entries stay in the pool for the background health checker + // unless there is no alive process left for the current request. + auto min_alive_iter = std::min_element(pool.begin(), pool.end(), + [](const ProcessPtr& a, const ProcessPtr& b) { + const bool a_alive = a && a->is_alive(); + const bool b_alive = b && b->is_alive(); + if (a_alive != b_alive) { + return a_alive > b_alive; + } + if (!a_alive) { + return false; + } + return a.use_count() < b.use_count(); + }); + + if (min_alive_iter != pool.end() && *min_alive_iter && (*min_alive_iter)->is_alive()) { + *process = *min_alive_iter; + return Status::OK(); + } - // Return process with minimum load - *process = *min_iter; - return Status::OK(); + // Only reach here when the pool has no alive process at all. In that fallback path we + // rebuild one process so the caller can still make progress instead of waiting + // for the next health-check round. + for (size_t i = 0; i < pool.size(); ++i) { + auto& candidate = pool[i]; + ProcessPtr replacement; + Status status = fork(version, &replacement); + if (!status.ok()) { + if (candidate) { + LOG(WARNING) << "Failed to recreate unavailable Python process (pid=" + << candidate->get_child_pid() << ", version=" << version.to_string() + << "): " << status.to_string(); + } else { + LOG(WARNING) << "Failed to create Python process for empty slot, version=" + << version.to_string() << ": " << status.to_string(); + } + continue; + } + + pool[i] = replacement; + *process = std::move(replacement); + return Status::OK(); + } + + return Status::InternalError("Python process pool has no available process for version {}", + version.to_string()); } Status PythonServerManager::fork(const PythonVersion& version, ProcessPtr* process) { @@ -191,39 +281,39 @@ Status PythonServerManager::fork(const PythonVersion& version, ProcessPtr* proce } void PythonServerManager::_start_health_check_thread() { - if (_health_check_thread) return; - - LOG(INFO) << "Starting Python process 
health check thread (interval: 30 seconds)"; - - _health_check_thread = std::make_unique([this]() { - // Health check loop - while (!_shutdown_flag.load(std::memory_order_acquire)) { - // Wait for interval or shutdown signal - { - std::unique_lock lock(_health_check_mutex); - _health_check_cv.wait_for(lock, std::chrono::seconds(30), [this]() { - return _shutdown_flag.load(std::memory_order_acquire); - }); - } + std::call_once(_health_check_once, [this]() { + LOG(INFO) << "Starting Python process health check thread (interval: 30 seconds)"; + + _health_check_thread = std::make_unique([this]() { + // Health check loop + while (!_shutdown_flag.load(std::memory_order_acquire)) { + // Wait for interval or shutdown signal + { + std::unique_lock lock(_health_check_mutex); + _health_check_cv.wait_for(lock, std::chrono::seconds(30), [this]() { + return _shutdown_flag.load(std::memory_order_acquire); + }); + } - if (_shutdown_flag.load(std::memory_order_acquire)) break; + if (_shutdown_flag.load(std::memory_order_acquire)) break; - _check_and_recreate_processes(); - _refresh_memory_stats(); - } + _check_and_recreate_processes(); + _refresh_memory_stats(); + } - LOG(INFO) << "Python process health check thread exiting"; + LOG(INFO) << "Python process health check thread exiting"; + }); }); } void PythonServerManager::_check_and_recreate_processes() { - std::lock_guard lock(_pools_mutex); - int total_checked = 0; int total_dead = 0; int total_recreated = 0; - for (auto& [version, pool] : _process_pools) { + for (auto& [version, versioned_pool] : _snapshot_process_pools()) { + std::lock_guard lock(versioned_pool->mutex); + auto& pool = versioned_pool->processes; for (size_t i = 0; i < pool.size(); ++i) { auto& process = pool[i]; if (!process) continue; @@ -268,15 +358,22 @@ void PythonServerManager::shutdown() { } // Shutdown all processes - std::lock_guard lock(_pools_mutex); - for (auto& [version, pool] : _process_pools) { + for (auto& [version, versioned_pool] : 
_snapshot_process_pools()) { + std::lock_guard lock(versioned_pool->mutex); + auto& pool = versioned_pool->processes; for (auto& process : pool) { if (process) { process->shutdown(); } } + pool.clear(); + versioned_pool->initialized = false; + } + + { + std::lock_guard lock(_pools_mutex); + _process_pools.clear(); } - _process_pools.clear(); } Status PythonServerManager::_read_process_memory(pid_t pid, size_t* rss_bytes) { @@ -305,11 +402,11 @@ Status PythonServerManager::_read_process_memory(pid_t pid, size_t* rss_bytes) { } void PythonServerManager::_refresh_memory_stats() { - std::lock_guard lock(_pools_mutex); - int64_t total_rss = 0; - for (const auto& [version, pool] : _process_pools) { + for (const auto& [version, versioned_pool] : _snapshot_process_pools()) { + std::lock_guard lock(versioned_pool->mutex); + const auto& pool = versioned_pool->processes; for (const auto& process : pool) { if (!process || !process->is_alive()) continue; @@ -339,15 +436,27 @@ Status PythonServerManager::clear_module_cache(const std::string& location) { return Status::InvalidArgument("Empty location for clear_module_cache"); } - std::lock_guard lock(_pools_mutex); - std::string body = fmt::format(R"({{"location": "{}"}})", location); + return _broadcast_action_to_processes("clear_module_cache", body, + fmt::format("location={}", location)); +} +void PythonServerManager::clear_udaf_state_cache(int64_t function_id) { + std::string body = fmt::format(R"({{"function_id": {}}})", function_id); + THROW_IF_ERROR(_broadcast_action_to_processes("clear_udaf_state_cache", body, + fmt::format("function_id={}", function_id))); +} + +Status PythonServerManager::_broadcast_action_to_processes(const std::string& action_type, + const std::string& body, + const std::string& log_name) { int success_count = 0; int fail_count = 0; bool has_active_process = false; - for (auto& [version, pool] : _process_pools) { + for (auto& [version, versioned_pool] : _snapshot_process_pools()) { + std::lock_guard 
lock(versioned_pool->mutex); + auto& pool = versioned_pool->processes; for (auto& process : pool) { if (!process || !process->is_alive()) { continue; @@ -368,7 +477,7 @@ Status PythonServerManager::clear_module_cache(const std::string& location) { auto client = std::move(*client_result); arrow::flight::Action action; - action.type = "clear_module_cache"; + action.type = action_type; action.body = arrow::Buffer::FromString(body); auto result_stream = client->DoAction(action); @@ -394,13 +503,12 @@ Status PythonServerManager::clear_module_cache(const std::string& location) { return Status::OK(); } - LOG(INFO) << "clear_module_cache completed for location=" << location - << ", success=" << success_count << ", failed=" << fail_count; + LOG(INFO) << action_type << " completed for " << log_name << ", success=" << success_count + << ", failed=" << fail_count; if (fail_count > 0) { - return Status::InternalError( - "clear_module_cache failed for location={}, success={}, failed={}", location, - success_count, fail_count); + return Status::InternalError("{} failed for {}, success={}, failed={}", action_type, + log_name, success_count, fail_count); } return Status::OK(); @@ -422,4 +530,4 @@ template Status PythonServerManager::get_client( std::shared_ptr* client, const std::shared_ptr& data_schema); -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/udf/python/python_server.h b/be/src/udf/python/python_server.h index 6427cb7e63c38e..4362e95cb1cfd3 100644 --- a/be/src/udf/python/python_server.h +++ b/be/src/udf/python/python_server.h @@ -20,7 +20,10 @@ #include #include #include +#include #include +#include +#include #include "common/status.h" #include "runtime/memory/mem_tracker.h" @@ -51,6 +54,9 @@ class PythonServerManager { // Clear Python module cache for a specific UDF location across all processes Status clear_module_cache(const std::string& location); + // Clear Python UDAF runtime state after DROP FUNCTION + void 
clear_udaf_state_cache(int64_t function_id); + Status ensure_pool_initialized(const PythonVersion& version); void shutdown(); @@ -59,12 +65,19 @@ class PythonServerManager { // For unit testing only. void check_and_recreate_processes_for_test() { _check_and_recreate_processes(); } - std::unordered_map>& process_pools_for_test() { - return _process_pools; - } + void set_process_pool_for_test(const PythonVersion& version, std::vector processes, + bool initialized = true); + + std::vector& process_pool_for_test(const PythonVersion& version); #endif private: + struct VersionedProcessPool { + std::mutex mutex; + std::vector processes; + bool initialized = false; + }; + /** * Start health check background thread (called once by ensure_pool_initialized) * Thread periodically checks process health and refreshes memory stats @@ -86,17 +99,24 @@ class PythonServerManager { */ void _refresh_memory_stats(); - std::unordered_map> _process_pools; - // Protects _process_pools access + std::shared_ptr _get_or_create_process_pool(const PythonVersion& version); + std::shared_ptr _get_process_pool(const PythonVersion& version); + std::vector>> + _snapshot_process_pools(); + Status _broadcast_action_to_processes(const std::string& action_type, const std::string& body, + const std::string& log_name); + + std::unordered_map> _process_pools; + // Protects the version -> pool handle map only. Per-version process operations are guarded + // by VersionedProcessPool::mutex. 
std::mutex _pools_mutex; - // Track which versions have been initialized - std::unordered_set _initialized_versions; // Health check background thread std::unique_ptr _health_check_thread; + std::once_flag _health_check_once; std::atomic _shutdown_flag {false}; std::condition_variable _health_check_cv; std::mutex _health_check_mutex; MemTracker _mem_tracker {"PythonUDFProcesses"}; }; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/udf/python/python_server.py b/be/src/udf/python/python_server.py index d16fc352178942..bf162b871604e8 100644 --- a/be/src/udf/python/python_server.py +++ b/be/src/udf/python/python_server.py @@ -455,6 +455,7 @@ class PythonUDFMeta: def __init__( self, + function_id: int, name: str, symbol: str, location: str, @@ -470,6 +471,7 @@ def __init__( Initialize Python UDF metadata. Args: + function_id: FE catalog function id name: UDF function name symbol: Symbol to load (function name or module.function) location: File path or directory containing the UDF @@ -481,6 +483,7 @@ def __init__( output_type: PyArrow data type for return value client_type: 0 for UDF, 1 for UDAF, 2 for UDTF """ + self.id = function_id self.name = name self.symbol = symbol self.location = location @@ -508,7 +511,7 @@ def __str__(self) -> str: """Returns a string representation of the UDF metadata.""" udf_load_type_str = "INLINE" if self.udf_load_type == 0 else "MODULE" return ( - f"PythonUDFMeta(name={self.name}, symbol={self.symbol}, " + f"PythonUDFMeta(id={self.id}, name={self.name}, symbol={self.symbol}, " f"location={self.location}, udf_load_type={udf_load_type_str}, runtime_version={self.runtime_version}, " f"always_nullable={self.always_nullable}, client_type={self.client_type.name}, " f"input_types={self.input_types}, output_type={self.output_type})" @@ -628,11 +631,9 @@ def _scalar_call(self, record_batch: pa.RecordBatch) -> pa.Array: converted_args, traceback.format_exc(), ) - # Return None for failed rows if 
always_nullable is True - if self.python_udf_meta.always_nullable: - result.append(None) - else: - raise + raise RuntimeError( + f"Error in scalar UDF execution at row {i}: {e}" + ) from e return pa.array(result, type=self._get_output_type()) @@ -1575,8 +1576,9 @@ def __init__(self, location: str): location: Unix socket path for the server """ super().__init__(location) - # Use a dictionary to maintain separate state managers for each UDAF function - # Key: function signature (name + input_types), Value: UDAFStateManager instance + # Use a dictionary to maintain separate state managers for each UDAF function. + # Key includes function_id so DROP/CREATE with the same name and signature + # cannot reuse a class loaded from old inline code. self.udaf_state_managers: Dict[str, UDAFStateManager] = {} self.udaf_managers_lock = threading.Lock() @@ -1593,9 +1595,10 @@ def _get_udaf_state_manager( Returns: UDAFStateManager instance for this specific UDAF """ - # Create a unique key based on function name and argument types type_names = [str(field.type) for field in python_udaf_meta.input_types] - func_key = f"{python_udaf_meta.name}({','.join(type_names)})" + func_key = ( + f"{python_udaf_meta.id}:{python_udaf_meta.name}({','.join(type_names)})" + ) with self.udaf_managers_lock: if func_key not in self.udaf_state_managers: @@ -1607,6 +1610,31 @@ def _get_udaf_state_manager( return self.udaf_state_managers[func_key] + def _clear_udaf_state_cache_by_function_id(self, function_id: int) -> int: + """ + Clear UDAF managers for a dropped function id. + + DROP FUNCTION cache cleanup is asynchronous. The runtime key still includes + function_id for correctness, while this action releases old states and class + objects after the drop task reaches this Python process. 
+ """ + prefix = f"{function_id}:" + cleared = 0 + + with self.udaf_managers_lock: + keys_to_remove = [ + key for key in self.udaf_state_managers if key.startswith(prefix) + ] + for key in keys_to_remove: + manager = self.udaf_state_managers.pop(key) + manager.states.clear() + cleared += 1 + + if cleared: + gc.collect() + + return cleared + @staticmethod def parse_python_udf_meta( descriptor: flight.FlightDescriptor, @@ -1623,6 +1651,7 @@ def parse_python_udf_meta( return None cmd_json = json.loads(descriptor.command) + function_id = cmd_json["id"] name = cmd_json["name"] symbol = cmd_json["symbol"] location = cmd_json["location"] @@ -1648,6 +1677,7 @@ def parse_python_udf_meta( output_type = output_schema.field(0).type python_udf_meta = PythonUDFMeta( + function_id=function_id, name=name, symbol=symbol, location=location, @@ -1731,7 +1761,9 @@ def _handle_udaf_create( place_id, e, ) - success = False + raise RuntimeError( + f"CREATE operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([success], type=pa.bool_())], ["success"] @@ -1879,7 +1911,9 @@ def _handle_udaf_serialize( place_id, e, ) - serialized = b"" + raise RuntimeError( + f"SERIALIZE operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([serialized], type=pa.binary())], ["serialized_state"] @@ -1908,7 +1942,9 @@ def _handle_udaf_merge( place_id, e, ) - success = False + raise RuntimeError( + f"MERGE operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([success], type=pa.bool_())], ["success"] @@ -1932,7 +1968,9 @@ def _handle_udaf_finalize( place_id, e, ) - result = None + raise RuntimeError( + f"FINALIZE operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([result], type=output_type)], ["result"] @@ -1954,7 +1992,9 @@ def _handle_udaf_reset( place_id, e, ) - success = False + raise RuntimeError( + f"RESET 
operation failed for place_id={place_id}: {e}" + ) from e return pa.RecordBatch.from_arrays( [pa.array([success], type=pa.bool_())], ["success"] @@ -2090,6 +2130,7 @@ def _handle_exchange_udaf( * ACCUMULATE: use success + rows_processed (number of rows processed) * SERIALIZE: use success + serialized_data (serialized_state) * FINALIZE: use success + serialized_data (serialized result) + * Any failed operation: use success=false + serialized_data (UTF-8 error message) """ # Get or create state manager for this specific UDAF function @@ -2262,8 +2303,12 @@ def _handle_exchange_udaf( e, traceback.format_exc(), ) + # Keep the UDAF Flight stream alive so C++ can still send DESTROY. + # On failure, serialized_data carries the user-visible Python error text. result_batch = self._create_unified_response( - success=False, rows_processed=0, data=b"" + success=False, + rows_processed=0, + data=str(e).encode("utf-8", errors="replace"), ) # Begin stream with unified schema on first call @@ -2513,14 +2558,42 @@ def do_action( Supported actions: - "clear_module_cache": Clear Python module cache for a specific location Body: JSON with "location" field (the UDF cache directory path) + - "clear_udaf_state_cache": Clear UDAF runtime state for a dropped function id + Body: JSON with "function_id" field """ action_type = action.type if action_type == "clear_module_cache": yield from self._handle_clear_module_cache(action.body.to_pybytes()) + elif action_type == "clear_udaf_state_cache": + yield from self._handle_clear_udaf_state_cache(action.body.to_pybytes()) else: raise flight.FlightUnavailableError(f"Unknown action: {action_type}") + def _handle_clear_udaf_state_cache(self, body: bytes): + """ + Clear cached UDAF state managers for a dropped function id. 
+ """ + try: + params = json.loads(body.decode("utf-8")) + function_id = int(params["function_id"]) + + cleared_managers = self._clear_udaf_state_cache_by_function_id(function_id) + + result = { + "success": True, + "cleared_managers": cleared_managers, + "function_id": function_id, + } + yield flight.Result(json.dumps(result).encode("utf-8")) + + except Exception as e: + logging.error("clear_udaf_state_cache failed: %s", e) + yield flight.Result(json.dumps({ + "success": False, + "error": str(e) + }).encode("utf-8")) + def _handle_clear_module_cache(self, body: bytes): """ Clear Python module cache for a specific UDF location. diff --git a/be/src/udf/python/python_udaf_client.cpp b/be/src/udf/python/python_udaf_client.cpp index 6a6f6035ea5ca2..90b983c8671a5b 100644 --- a/be/src/udf/python/python_udaf_client.cpp +++ b/be/src/udf/python/python_udaf_client.cpp @@ -30,6 +30,7 @@ #include "common/compiler_util.h" #include "common/status.h" #include "format/arrow/arrow_utils.h" +#include "util/unaligned.h" #include "udf/python/python_udf_meta.h" #include "udf/python/python_udf_runtime.h" @@ -42,6 +43,7 @@ namespace doris { // - ACCUMULATE: use success + rows_processed (number of rows processed) // - SERIALIZE: use success + data (serialized_state) // - FINALIZE: use success + data (serialized result, may be null) +// - Any failed operation: use success=false + data (UTF-8 error message) // // This unified schema allows all operations to return consistent format, // solving Arrow Flight's limitation that all responses must have the same schema. 
@@ -51,6 +53,47 @@ static const std::shared_ptr kUnifiedUDAFResponseSchema = arrow:: arrow::field("serialized_data", arrow::binary()), }); +Status PythonUDAFClient::make_udaf_failure_status( + const std::shared_ptr& response, const char* operation, + int64_t place_id) { + if (response == nullptr || response->num_rows() != 1 || + response->num_columns() != kUnifiedUDAFResponseSchema->num_fields()) [[unlikely]] { + return Status::InternalError("Invalid {} failure response for place_id={}", operation, + place_id); + } + + auto data_array = std::static_pointer_cast(response->column(2)); + if (data_array->IsNull(0)) { + return Status::InternalError("{} operation failed for place_id={}", operation, place_id); + } + + const auto* offsets = data_array->raw_value_offsets(); + if (offsets == nullptr) [[unlikely]] { + return Status::InternalError("Invalid {} failure response for place_id={}: null offsets", + operation, place_id); + } + // Arrow Flight buffers may be unaligned after IPC deserialization + int32_t offset_start = unaligned_load(offsets); + int32_t offset_end = unaligned_load(offsets + 1); + + int32_t length = offset_end - offset_start; + if (length <= 0) { + return Status::InternalError("{} operation failed for place_id={}", operation, place_id); + } + const uint8_t* data = data_array->value_data()->data() + offset_start; + std::string error_message(reinterpret_cast(data), length); + return Status::InternalError("{} operation failed for place_id={}: {}", operation, place_id, + error_message); +} + +#ifdef BE_TEST +Status PythonUDAFClient::make_udaf_failure_status_for_test( + const std::shared_ptr& response, const char* operation, + int64_t place_id) { + return make_udaf_failure_status(response, operation, place_id); +} +#endif + Status PythonUDAFClient::create(const PythonUDFMeta& func_meta, ProcessPtr process, const std::shared_ptr& data_schema, PythonUDAFClientPtr* client) { @@ -89,7 +132,7 @@ Status PythonUDAFClient::create(int64_t place_id) { auto 
success_array = std::static_pointer_cast(response_batch->column(0)); if (!success_array->Value(0)) { - return Status::InternalError("CREATE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response_batch, "CREATE", place_id); } _created_place_id = place_id; @@ -142,16 +185,15 @@ Status PythonUDAFClient::accumulate(int64_t place_id, bool is_single_place, auto rows_processed_array = std::static_pointer_cast(response->column(1)); if (!success_array->Value(0)) { - return Status::InternalError("ACCUMULATE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "ACCUMULATE", place_id); } - // Cast to uint8_t* first to avoid UBSAN misaligned pointer errors - const uint8_t* raw_ptr = reinterpret_cast(rows_processed_array->raw_values()); + // Arrow Flight buffers may be unaligned after IPC deserialization. + const auto* raw_ptr = rows_processed_array->raw_values(); if (raw_ptr == nullptr) { return Status::InternalError("ACCUMULATE response has null rows_processed array"); } - int64_t rows_processed; - memcpy(&rows_processed, raw_ptr, sizeof(int64_t)); + int64_t rows_processed = unaligned_load(raw_ptr); int64_t expected_rows = row_end - row_start; @@ -185,17 +227,16 @@ Status PythonUDAFClient::serialize(int64_t place_id, auto data_array = std::static_pointer_cast(response->column(2)); if (!success_array->Value(0)) { - return Status::InternalError("SERIALIZE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "SERIALIZE", place_id); } - // Cast to uint8_t* first to avoid UBSAN misaligned pointer errors - const uint8_t* offsets = reinterpret_cast(data_array->raw_value_offsets()); + // Arrow Flight buffers may be unaligned after IPC deserialization. 
+ const auto* offsets = data_array->raw_value_offsets(); if (offsets == nullptr) { return Status::InternalError("SERIALIZE response has null offsets"); } - int32_t offset_start, offset_end; - memcpy(&offset_start, offsets, sizeof(int32_t)); - memcpy(&offset_end, offsets + sizeof(int32_t), sizeof(int32_t)); + int32_t offset_start = unaligned_load(offsets); + int32_t offset_end = unaligned_load(offsets + 1); int32_t length = offset_end - offset_start; @@ -233,7 +274,7 @@ Status PythonUDAFClient::merge(int64_t place_id, auto success_array = std::static_pointer_cast(response->column(0)); if (!success_array->Value(0)) { - return Status::InternalError("MERGE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "MERGE", place_id); } return Status::OK(); @@ -260,17 +301,16 @@ Status PythonUDAFClient::finalize(int64_t place_id, std::shared_ptr(response_batch->column(2)); if (!success_array->Value(0)) { - return Status::InternalError("FINALIZE operation failed for place_id={}", place_id); + return make_udaf_failure_status(response_batch, "FINALIZE", place_id); } - // Cast to uint8_t* first to avoid UBSAN misaligned pointer errors - const uint8_t* offsets = reinterpret_cast(data_array->raw_value_offsets()); + // Arrow Flight buffers may be unaligned after IPC deserialization. 
+ const auto* offsets = data_array->raw_value_offsets(); if (offsets == nullptr) { return Status::InternalError("FINALIZE response has null offsets"); } - int32_t offset_start, offset_end; - memcpy(&offset_start, offsets, sizeof(int32_t)); - memcpy(&offset_end, offsets + sizeof(int32_t), sizeof(int32_t)); + int32_t offset_start = unaligned_load(offsets); + int32_t offset_end = unaligned_load(offsets + 1); int32_t length = offset_end - offset_start; @@ -324,7 +364,7 @@ Status PythonUDAFClient::reset(int64_t place_id) { auto success_array = std::static_pointer_cast(response->column(0)); if (!success_array->Value(0)) { - return Status::InternalError("RESET operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "RESET", place_id); } return Status::OK(); @@ -363,7 +403,7 @@ Status PythonUDAFClient::destroy(int64_t place_id) { if (!success_array->Value(0)) { LOG(WARNING) << "DESTROY operation failed for place_id=" << place_id; - return Status::InternalError("DESTROY operation failed for place_id={}", place_id); + return make_udaf_failure_status(response, "DESTROY", place_id); } return Status::OK(); diff --git a/be/src/udf/python/python_udaf_client.h b/be/src/udf/python/python_udaf_client.h index 078c34a39ea967..471716651a4d9a 100644 --- a/be/src/udf/python/python_udaf_client.h +++ b/be/src/udf/python/python_udaf_client.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "udf/python/python_client.h" @@ -173,9 +174,18 @@ class PythonUDAFClient : public PythonClient { */ Status close(); +#ifdef BE_TEST + static Status make_udaf_failure_status_for_test( + const std::shared_ptr& response, const char* operation, + int64_t place_id); +#endif + private: DISALLOW_COPY_AND_ASSIGN(PythonUDAFClient); + static Status make_udaf_failure_status(const std::shared_ptr& response, + const char* operation, int64_t place_id); + /** * Send RecordBatch request to Python server with app_metadata * @param metadata UDAFMetadata structure (will be sent as 
app_metadata) diff --git a/be/src/udf/python/python_udf_meta.cpp b/be/src/udf/python/python_udf_meta.cpp index 88af0c9ff64128..5cd855432de70d 100644 --- a/be/src/udf/python/python_udf_meta.cpp +++ b/be/src/udf/python/python_udf_meta.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -32,7 +33,6 @@ namespace doris { Status PythonUDFMeta::convert_types_to_schema(const DataTypes& types, const std::string& timezone, std::shared_ptr* schema) { - assert(!types.empty()); arrow::SchemaBuilder builder; for (size_t i = 0; i < types.size(); ++i) { std::shared_ptr arrow_type; @@ -56,6 +56,7 @@ Status PythonUDFMeta::serialize_arrow_schema(const std::shared_ptr(type)), allocator); @@ -140,41 +142,68 @@ std::string PythonUDFMeta::to_string() const { } Status PythonUDFMeta::check() const { + LOG(INFO) << fmt::format( + "[pyudf-test] PythonUDFMeta::check name={}, symbol={}, location={}, " + "runtime_version={}, " + "load_type={}, client_type={}, inline_code_empty={}, checksum_empty={}, " + "input_types_size={}, " + "has_return_type={}, always_nullable={}", + name, symbol, location, runtime_version, static_cast(type), + static_cast(client_type), inline_code.empty() ? "true" : "false", + checksum.empty() ? "true" : "false", input_types.size(), return_type ? "true" : "false", + always_nullable ? 
"true" : "false"); if (trim(name).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: empty name"; return Status::InvalidArgument("Python UDF name is empty"); } if (trim(symbol).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: empty symbol"; return Status::InvalidArgument("Python UDF symbol is empty"); } if (trim(runtime_version).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: empty runtime_version"; return Status::InvalidArgument("Python UDF runtime version is empty"); } - if (input_types.empty()) { - return Status::InvalidArgument("Python UDF input types is empty"); + if (input_types.empty() && + (client_type == PythonClientType::UDAF || type == PythonUDFLoadType::UNKNOWN)) { + LOG(WARNING) << fmt::format( + "[pyudf-test] PythonUDFMeta::check failed: empty input_types, client_type={}, " + "load_type={}", + static_cast(client_type), static_cast(type)); + return Status::InvalidArgument("Python UDAF input types is empty"); } if (!return_type) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: empty return_type"; return Status::InvalidArgument("Python UDF return type is empty"); } if (type == PythonUDFLoadType::UNKNOWN) { + LOG(WARNING) << fmt::format( + "[pyudf-test] PythonUDFMeta::check failed: unknown load_type, " + "inline_code_empty={}, " + "location_empty={}", + inline_code.empty() ? "true" : "false", trim(location).empty() ? 
"true" : "false"); return Status::InvalidArgument( "Python UDF load type is invalid, please check inline code or file path"); } if (type == PythonUDFLoadType::MODULE) { if (trim(location).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: module location empty"; return Status::InvalidArgument("Non-inline Python UDF location is empty"); } if (trim(checksum).empty()) { + LOG(WARNING) << "[pyudf-test] PythonUDFMeta::check failed: module checksum empty"; return Status::InvalidArgument("Non-inline Python UDF checksum is empty"); } } + LOG(INFO) << "[pyudf-test] PythonUDFMeta::check passed"; return Status::OK(); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/udf/python/python_udf_meta.h b/be/src/udf/python/python_udf_meta.h index 7993faf3bb7014..55c49abb30ad07 100644 --- a/be/src/udf/python/python_udf_meta.h +++ b/be/src/udf/python/python_udf_meta.h @@ -33,18 +33,18 @@ enum class PythonUDFLoadType : uint8_t { INLINE = 0, MODULE = 1, UNKNOWN = 2 }; enum class PythonClientType : uint8_t { UDF = 0, UDAF = 1, UDTF = 2, UNKNOWN = 3 }; struct PythonUDFMeta { - int64_t id; + int64_t id = 0; std::string name; std::string symbol; std::string location; std::string checksum; std::string runtime_version; std::string inline_code; - bool always_nullable; + bool always_nullable = false; DataTypes input_types; DataTypePtr return_type; - PythonUDFLoadType type; - PythonClientType client_type; + PythonUDFLoadType type = PythonUDFLoadType::UNKNOWN; + PythonClientType client_type = PythonClientType::UNKNOWN; static Status convert_types_to_schema(const DataTypes& types, const std::string& timezone, std::shared_ptr* schema); @@ -70,4 +70,4 @@ struct hash { return std::hash()(meta.id); } }; -} // namespace std \ No newline at end of file +} // namespace std diff --git a/be/test/udf/python/python_server_test.cpp b/be/test/udf/python/python_server_test.cpp index 40e4ab3a11a24d..7dfda79515b92d 100644 --- 
a/be/test/udf/python/python_server_test.cpp +++ b/be/test/udf/python/python_server_test.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include "common/config.h" @@ -99,6 +100,32 @@ class PythonServerTest : public ::testing::Test { return python_path; } + std::string create_fake_python_with_delay_and_socket_creation(const std::string& binary_name, + const std::string& version, + int delay_ms) { + std::string bin_dir = test_dir_ + "/bin"; + std::string python_path = bin_dir + "/" + binary_name; + fs::create_directories(bin_dir); + + std::ofstream ofs(python_path); + ofs << "#!/bin/bash\n"; + ofs << "if [ \"$1\" = \"--version\" ]; then\n"; + ofs << " echo 'Python " << version << "'\n"; + ofs << " exit 0\n"; + ofs << "fi\n"; + ofs << "sleep " << (delay_ms / 1000.0) << "\n"; + ofs << "SOCKET_PREFIX=\"$3\"\n"; + ofs << "SOCKET_BASE=\"${SOCKET_PREFIX#grpc+unix://}\"\n"; + ofs << "SOCKET_FILE=\"${SOCKET_BASE}_$$.sock\"\n"; + ofs << "touch \"$SOCKET_FILE\"\n"; + ofs << "trap 'rm -f \"$SOCKET_FILE\"; exit 0' TERM INT\n"; + ofs << "while true; do sleep 1; done\n"; + ofs.close(); + fs::permissions(python_path, fs::perms::owner_all); + + return python_path; + } + // Set DORIS_HOME and create flight server script directory void setup_doris_home() { setenv("DORIS_HOME", test_dir_.c_str(), 1); @@ -274,6 +301,18 @@ TEST_F(PythonServerTest, ShutdownAfterFailedInitializationDoesNotCrash) { EXPECT_NO_THROW(mgr.shutdown()); } +TEST_F(PythonServerTest, ClearUdafStateCacheWithoutProcessesIsNoOp) { + PythonServerManager mgr; + + EXPECT_NO_THROW(mgr.clear_udaf_state_cache(12345)); +} + +TEST_F(PythonServerTest, ClearModuleCacheWithoutProcessesIsNoOp) { + PythonServerManager mgr; + + EXPECT_NO_THROW(mgr.clear_module_cache("/tmp/python_udf_cache")); +} + // ============================================================================ // PythonServerManager::get_client() - client retrieval test // ============================================================================ @@ 
-417,6 +456,69 @@ TEST_F(PythonServerTest, GetProcessFromInitializedPool) { mgr.shutdown(); } +TEST_F(PythonServerTest, GetProcessRecreatesDeadProcessWhenNoAliveProcess) { + setup_doris_home(); + std::string python_path = create_fake_python_with_socket_creation("3.9.16"); + + config::max_python_process_num = 1; + + PythonServerManager mgr; + PythonVersion version("3.9.16", test_dir_, python_path); + + ASSERT_TRUE(mgr.ensure_pool_initialized(version).ok()); + + ProcessPtr first_process; + ASSERT_TRUE(mgr.get_process(version, &first_process).ok()); + ASSERT_NE(first_process, nullptr); + ASSERT_TRUE(first_process->is_alive()); + pid_t first_pid = first_process->get_child_pid(); + + first_process->shutdown(); + ASSERT_FALSE(first_process->is_alive()); + + ProcessPtr replacement; + Status status = mgr.get_process(version, &replacement); + + EXPECT_TRUE(status.ok()) << status.to_string(); + ASSERT_NE(replacement, nullptr); + EXPECT_TRUE(replacement->is_alive()); + EXPECT_NE(replacement->get_child_pid(), first_pid); + + mgr.shutdown(); +} + +TEST_F(PythonServerTest, GetProcessSkipsDeadProcessWhenAliveProcessExists) { + setup_doris_home(); + std::string python_path = create_fake_python_with_socket_creation("3.9.16"); + + PythonServerManager mgr; + PythonVersion version("3.9.16", test_dir_, python_path); + + ProcessPtr alive_process; + ASSERT_TRUE(mgr.fork(version, &alive_process).ok()); + ASSERT_NE(alive_process, nullptr); + ASSERT_TRUE(alive_process->is_alive()); + + ProcessPtr dead_process; + ASSERT_TRUE(mgr.fork(version, &dead_process).ok()); + ASSERT_NE(dead_process, nullptr); + pid_t dead_pid = dead_process->get_child_pid(); + dead_process->shutdown(); + ASSERT_FALSE(dead_process->is_alive()); + + mgr.set_process_pool_for_test(version, {alive_process, dead_process}); + + ProcessPtr selected; + Status status = mgr.get_process(version, &selected); + + EXPECT_TRUE(status.ok()) << status.to_string(); + EXPECT_EQ(selected, alive_process); + 
EXPECT_FALSE(mgr.process_pool_for_test(version)[1]->is_alive()); + EXPECT_EQ(mgr.process_pool_for_test(version)[1]->get_child_pid(), dead_pid); + + mgr.shutdown(); +} + TEST_F(PythonServerTest, GetProcessLoadBalancing) { setup_doris_home(); std::string python_path = create_fake_python_with_socket_creation("3.9.16"); @@ -523,6 +625,40 @@ TEST_F(PythonServerTest, MultipleVersionPools) { mgr.shutdown(); } +TEST_F(PythonServerTest, EnsurePoolInitializedForDifferentVersionsDoesNotShareVersionLock) { + setup_doris_home(); + + config::max_python_process_num = 1; + + std::string python39_path = + create_fake_python_with_delay_and_socket_creation("python3.9", "3.9.16", 1200); + std::string python310_path = + create_fake_python_with_delay_and_socket_creation("python3.10", "3.10.0", 1200); + + PythonServerManager mgr; + PythonVersion version39("3.9.16", test_dir_, python39_path); + PythonVersion version310("3.10.0", test_dir_, python310_path); + + auto start = std::chrono::steady_clock::now(); + auto future39 = std::async(std::launch::async, + [&]() { return mgr.ensure_pool_initialized(version39); }); + auto future310 = std::async(std::launch::async, + [&]() { return mgr.ensure_pool_initialized(version310); }); + + Status status39 = future39.get(); + Status status310 = future310.get(); + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + EXPECT_TRUE(status39.ok()) << status39.to_string(); + EXPECT_TRUE(status310.ok()) << status310.to_string(); + // If both versions still contended on one manager-wide lock, the elapsed time would + // be close to two serialized 1.2s startups instead of a single startup window. 
+ EXPECT_LT(elapsed.count(), 2200); + + mgr.shutdown(); +} + // ============================================================================ // PythonServerManager::_check_and_recreate_processes() - health-check recreation test // ============================================================================ @@ -546,15 +682,15 @@ TEST_F(PythonServerTest, CheckAndRecreateProcessesRecreatesDeadProcess) { dead_process->shutdown(); ASSERT_FALSE(dead_process->is_alive()); - mgr.process_pools_for_test()[version] = {alive_process, dead_process, nullptr}; + mgr.set_process_pool_for_test(version, {alive_process, dead_process, nullptr}); mgr.check_and_recreate_processes_for_test(); - ASSERT_EQ(mgr.process_pools_for_test()[version].size(), 3); - EXPECT_EQ(mgr.process_pools_for_test()[version][0], alive_process); - EXPECT_EQ(mgr.process_pools_for_test()[version][2], nullptr); + ASSERT_EQ(mgr.process_pool_for_test(version).size(), 3); + EXPECT_EQ(mgr.process_pool_for_test(version)[0], alive_process); + EXPECT_EQ(mgr.process_pool_for_test(version)[2], nullptr); - ProcessPtr recreated = mgr.process_pools_for_test()[version][1]; + ProcessPtr recreated = mgr.process_pool_for_test(version)[1]; ASSERT_NE(recreated, nullptr); EXPECT_TRUE(recreated->is_alive()); EXPECT_NE(recreated->get_child_pid(), dead_pid_before); @@ -582,11 +718,11 @@ TEST_F(PythonServerTest, CheckAndRecreateProcessesErasesDeadProcessWhenRecreateF ASSERT_FALSE(dead_process_2->is_alive()); PythonVersion invalid_version("3.9.16", test_dir_, test_dir_ + "/bin/nonexistent_python"); - mgr.process_pools_for_test()[invalid_version] = {dead_process_1, dead_process_2}; + mgr.set_process_pool_for_test(invalid_version, {dead_process_1, dead_process_2}); mgr.check_and_recreate_processes_for_test(); - EXPECT_TRUE(mgr.process_pools_for_test()[invalid_version].empty()); + EXPECT_TRUE(mgr.process_pool_for_test(invalid_version).empty()); mgr.shutdown(); } diff --git a/be/test/udf/python/python_udaf_client_test.cpp 
b/be/test/udf/python/python_udaf_client_test.cpp new file mode 100644 index 00000000000000..eb1ab5242b8cf0 --- /dev/null +++ b/be/test/udf/python/python_udaf_client_test.cpp @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "udf/python/python_udaf_client.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace doris { + +std::shared_ptr make_udaf_response(const std::optional& error) { + arrow::BooleanBuilder success_builder; + std::shared_ptr success_array; + EXPECT_TRUE(success_builder.Append(false).ok()); + EXPECT_TRUE(success_builder.Finish(&success_array).ok()); + + arrow::Int64Builder rows_processed_builder; + std::shared_ptr rows_processed_array; + EXPECT_TRUE(rows_processed_builder.Append(0).ok()); + EXPECT_TRUE(rows_processed_builder.Finish(&rows_processed_array).ok()); + + arrow::BinaryBuilder data_builder; + std::shared_ptr data_array; + if (error.has_value()) { + EXPECT_TRUE(data_builder.Append(error->data(), static_cast(error->size())).ok()); + } else { + EXPECT_TRUE(data_builder.AppendNull().ok()); + } + EXPECT_TRUE(data_builder.Finish(&data_array).ok()); + + auto schema = arrow::schema({ + arrow::field("success", arrow::boolean()), + arrow::field("rows_processed", arrow::int64()), + arrow::field("serialized_data", arrow::binary()), + }); + return arrow::RecordBatch::Make(schema, 1, {success_array, rows_processed_array, data_array}); +} + +std::shared_ptr make_udaf_response_with_data_array( + const std::shared_ptr& data_array) { + arrow::BooleanBuilder success_builder; + std::shared_ptr success_array; + EXPECT_TRUE(success_builder.Append(false).ok()); + EXPECT_TRUE(success_builder.Finish(&success_array).ok()); + + arrow::Int64Builder rows_processed_builder; + std::shared_ptr rows_processed_array; + EXPECT_TRUE(rows_processed_builder.Append(0).ok()); + EXPECT_TRUE(rows_processed_builder.Finish(&rows_processed_array).ok()); + + auto schema = arrow::schema({ + arrow::field("success", arrow::boolean()), + arrow::field("rows_processed", arrow::int64()), + arrow::field("serialized_data", arrow::binary()), + }); + return arrow::RecordBatch::Make(schema, 1, {success_array, rows_processed_array, 
data_array}); +} + +TEST(PythonUDAFClientTest, FailureStatusIncludesPythonErrorMessage) { + auto response = make_udaf_response("finish failed"); + Status status = PythonUDAFClient::make_udaf_failure_status_for_test(response, "FINALIZE", 7); + + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("FINALIZE operation failed for place_id=7: finish failed"), + std::string::npos); +} + +TEST(PythonUDAFClientTest, FailureStatusHandlesUnalignedBinaryOffsets) { + std::string error = "finalize failed"; + std::vector offset_storage(1 + 2 * sizeof(int32_t)); + uint8_t* unaligned_offsets = offset_storage.data() + 1; + int32_t offset_start = 0; + int32_t offset_end = static_cast(error.size()); + memcpy(unaligned_offsets, &offset_start, sizeof(int32_t)); + memcpy(unaligned_offsets + sizeof(int32_t), &offset_end, sizeof(int32_t)); + + auto offset_buffer = arrow::Buffer::Wrap(unaligned_offsets, 2 * sizeof(int32_t)); + auto value_buffer = + arrow::Buffer::Wrap(reinterpret_cast(error.data()), error.size()); + auto data_array = std::make_shared(1, offset_buffer, value_buffer); + ASSERT_EQ(reinterpret_cast(data_array->raw_value_offsets()) % alignof(int32_t), 1); + + Status status = PythonUDAFClient::make_udaf_failure_status_for_test( + make_udaf_response_with_data_array(data_array), "FINALIZE", 13); + + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("FINALIZE operation failed for place_id=13: finalize failed"), + std::string::npos); +} + +TEST(PythonUDAFClientTest, FailureStatusFallsBackWhenErrorMessageIsNullOrEmpty) { + Status null_status = PythonUDAFClient::make_udaf_failure_status_for_test( + make_udaf_response(std::nullopt), "RESET", 8); + EXPECT_FALSE(null_status.ok()); + EXPECT_NE(null_status.to_string().find("RESET operation failed for place_id=8"), + std::string::npos); + + Status empty_status = + PythonUDAFClient::make_udaf_failure_status_for_test(make_udaf_response(""), "MERGE", 9); + EXPECT_FALSE(empty_status.ok()); + 
EXPECT_NE(empty_status.to_string().find("MERGE operation failed for place_id=9"), + std::string::npos); +} + +TEST(PythonUDAFClientTest, FailureStatusRejectsInvalidResponseShape) { + Status null_status = + PythonUDAFClient::make_udaf_failure_status_for_test(nullptr, "ACCUMULATE", 10); + EXPECT_FALSE(null_status.ok()); + EXPECT_NE(null_status.to_string().find("Invalid ACCUMULATE failure response for place_id=10"), + std::string::npos); + + auto zero_row_response = make_udaf_response("accumulate failed")->Slice(0, 0); + Status zero_row_status = PythonUDAFClient::make_udaf_failure_status_for_test(zero_row_response, + "ACCUMULATE", 11); + EXPECT_FALSE(zero_row_status.ok()); + EXPECT_NE( + zero_row_status.to_string().find("Invalid ACCUMULATE failure response for place_id=11"), + std::string::npos); + + auto response = make_udaf_response("reset failed"); + auto two_column_response = arrow::RecordBatch::Make( + arrow::schema({response->schema()->field(0), response->schema()->field(1)}), 1, + {response->column(0), response->column(1)}); + Status two_column_status = + PythonUDAFClient::make_udaf_failure_status_for_test(two_column_response, "RESET", 12); + EXPECT_FALSE(two_column_status.ok()); + EXPECT_NE(two_column_status.to_string().find("Invalid RESET failure response for place_id=12"), + std::string::npos); +} + +} // namespace doris diff --git a/be/test/udf/python/python_udf_meta_test.cpp b/be/test/udf/python/python_udf_meta_test.cpp index b913f49d19b5f1..4308543051057e 100644 --- a/be/test/udf/python/python_udf_meta_test.cpp +++ b/be/test/udf/python/python_udf_meta_test.cpp @@ -109,7 +109,7 @@ TEST_F(PythonUDFMetaTest, CheckEmptyRuntimeVersion) { EXPECT_TRUE(status.to_string().find("runtime version is empty") != std::string::npos); } -TEST_F(PythonUDFMetaTest, CheckEmptyInputTypes) { +TEST_F(PythonUDFMetaTest, CheckEmptyInputTypesAllowedForUdf) { PythonUDFMeta meta; meta.name = "test_udf"; meta.symbol = "test_func"; @@ -117,6 +117,35 @@ TEST_F(PythonUDFMetaTest, 
CheckEmptyInputTypes) { meta.input_types = {}; meta.return_type = nullable_int32_; meta.type = PythonUDFLoadType::INLINE; + meta.client_type = PythonClientType::UDF; + + Status status = meta.check(); + EXPECT_TRUE(status.ok()) << status.to_string(); +} + +TEST_F(PythonUDFMetaTest, CheckEmptyInputTypesAllowedForUdtf) { + PythonUDFMeta meta; + meta.name = "test_udtf"; + meta.symbol = "test_func"; + meta.runtime_version = "3.9.16"; + meta.input_types = {}; + meta.return_type = nullable_string_; + meta.type = PythonUDFLoadType::INLINE; + meta.client_type = PythonClientType::UDTF; + + Status status = meta.check(); + EXPECT_TRUE(status.ok()) << status.to_string(); +} + +TEST_F(PythonUDFMetaTest, CheckEmptyInputTypesRejectedForUdaf) { + PythonUDFMeta meta; + meta.name = "test_udaf"; + meta.symbol = "test_func"; + meta.runtime_version = "3.9.16"; + meta.input_types = {}; + meta.return_type = nullable_int32_; + meta.type = PythonUDFLoadType::INLINE; + meta.client_type = PythonClientType::UDAF; Status status = meta.check(); EXPECT_FALSE(status.ok()); @@ -323,6 +352,9 @@ TEST_F(PythonUDFMetaTest, SerializeToJsonBasic) { doc.Parse(json_str.c_str()); EXPECT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc.HasMember("id")); + EXPECT_EQ(doc["id"].GetInt64(), 1); + EXPECT_TRUE(doc.HasMember("name")); EXPECT_STREQ(doc["name"].GetString(), "test_udf"); @@ -401,6 +433,27 @@ TEST_F(PythonUDFMetaTest, SerializeToJsonMultipleInputTypes) { EXPECT_TRUE(doc.HasMember("input_types")); } +TEST_F(PythonUDFMetaTest, SerializeToJsonEmptyInputTypesForUdf) { + PythonUDFMeta meta; + meta.name = "zero_arg_udf"; + meta.symbol = "func"; + meta.runtime_version = "3.9.16"; + meta.input_types = {}; + meta.return_type = nullable_int32_; + meta.type = PythonUDFLoadType::INLINE; + meta.client_type = PythonClientType::UDF; + + std::string json_str; + Status status = meta.serialize_to_json(&json_str); + EXPECT_TRUE(status.ok()) << status.to_string(); + + rapidjson::Document doc; + doc.Parse(json_str.c_str()); + 
EXPECT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc.HasMember("input_types")); + EXPECT_FALSE(std::string(doc["input_types"].GetString()).empty()); +} + // ============================================================================ // PythonUDFMeta convert_types_to_schema() tests // ============================================================================ @@ -429,6 +482,17 @@ TEST_F(PythonUDFMetaTest, ConvertTypesToSchemaSingleType) { EXPECT_EQ(schema->num_fields(), 1); } +TEST_F(PythonUDFMetaTest, ConvertTypesToSchemaEmpty) { + DataTypes types = {}; + std::shared_ptr schema; + + Status status = PythonUDFMeta::convert_types_to_schema(types, TimezoneUtils::default_time_zone, + &schema); + EXPECT_TRUE(status.ok()) << status.to_string(); + EXPECT_NE(schema, nullptr); + EXPECT_EQ(schema->num_fields(), 0); +} + // ============================================================================ // PythonUDFMeta serialize_arrow_schema() tests // ============================================================================ diff --git a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java index 5a934cd6ca4e26..07efe49fa84873 100644 --- a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java +++ b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java @@ -29,12 +29,15 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.gson.annotations.SerializedName; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.text.StringCharacterIterator; import java.util.List; // TODO: for aggregations, we need to unify the code paths for builtins and UDAs. 
public class FunctionCallExpr extends Expr { + private static final Logger LOG = LogManager.getLogger(FunctionCallExpr.class); @SerializedName("fnn") private FunctionName fnName; @@ -127,6 +130,15 @@ public FunctionCallExpr(Function function, FunctionParams functionParams, Functi this.originChildSize = children.size(); this.isMergeAggFn = isMergeAggFn; this.nullable = nullable; + if (function.getBinaryType() == Function.BinaryType.PYTHON_UDF) { + LOG.info("[pyudf-test] FunctionCallExpr ctor signature={}, location={}, runtimeVersion={}, " + + "functionCodeEmpty={}, childCount={}, nullable={}", + function.signatureString(), + function.getLocation() == null ? "null" : function.getLocation().getLocation(), + function.getRuntimeVersion(), + function.getFunctionCode() == null || function.getFunctionCode().isEmpty(), + children.size(), nullable); + } } protected FunctionCallExpr(FunctionCallExpr other) { diff --git a/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Function.java b/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Function.java index 5c7c80f4535ef1..4c672c3c6a4a09 100644 --- a/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Function.java +++ b/fe/fe-catalog/src/main/java/org/apache/doris/catalog/Function.java @@ -114,6 +114,8 @@ public enum BinaryType { protected String runtimeVersion; @SerializedName("fc") protected String functionCode; + @SerializedName("det") + protected boolean deterministic = false; // Only used for serialization protected Function() { @@ -174,6 +176,7 @@ public Function(Function other) { this.expirationTime = other.expirationTime; this.runtimeVersion = other.runtimeVersion; this.functionCode = other.functionCode; + this.deterministic = other.deterministic; } public Function clone() { @@ -301,6 +304,14 @@ public void setFunctionCode(String functionCode) { this.functionCode = functionCode; } + public boolean isDeterministic() { + return deterministic; + } + + public void setDeterministic(boolean deterministic) { + 
this.deterministic = deterministic; + } + // TODO(cmy): Currently we judge whether it is UDF by wheter the 'location' is set. // Maybe we should use a separate variable to identify, // but additional variables need to modify the persistence information. @@ -401,7 +412,8 @@ public boolean equals(Object o) { } Function function = (Function) o; return id == function.id && hasVarArgs == function.hasVarArgs && userVisible == function.userVisible - && vectorized == function.vectorized && Objects.equals(name, function.name) + && vectorized == function.vectorized && deterministic == function.deterministic + && Objects.equals(name, function.name) && Objects.equals(retType, function.retType) && Arrays.equals(argTypes, function.argTypes) && Objects.equals(location, function.location) && binaryType == function.binaryType && nullableMode == function.nullableMode && Objects.equals( @@ -411,7 +423,7 @@ public boolean equals(Object o) { @Override public int hashCode() { int result = Objects.hash(id, name, retType, hasVarArgs, userVisible, location, binaryType, nullableMode, - vectorized, checksum); + vectorized, checksum, deterministic); result = 31 * result + Arrays.hashCode(argTypes); return result; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java index d52a0eb8bf0890..6169d231a1d6a5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExprToThriftVisitor.java @@ -19,6 +19,7 @@ import org.apache.doris.analysis.ArithmeticExpr.Operator; import org.apache.doris.catalog.ArrayType; +import org.apache.doris.catalog.Function.BinaryType; import org.apache.doris.catalog.FunctionToThriftConverter; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.StructType; @@ -114,6 +115,15 @@ public static void treeToThriftHelper(Expr expr, TExpr container, msg.type = 
expr.getType().toThrift(); msg.num_children = expr.getChildren().size(); if (expr.getFn() != null) { + if (expr.getFn().getBinaryType() == BinaryType.PYTHON_UDF) { + LOG.info("[pyudf-test] ExprToThriftVisitor exprFn signature={}, location={}, runtimeVersion={}, " + + "functionCodeEmpty={}, childCount={}", + expr.getFn().signatureString(), + expr.getFn().getLocation() == null ? "null" : expr.getFn().getLocation().getLocation(), + expr.getFn().getRuntimeVersion(), + expr.getFn().getFunctionCode() == null || expr.getFn().getFunctionCode().isEmpty(), + expr.getChildren().size()); + } msg.setFn(FunctionToThriftConverter.toThrift(expr.getFn(), expr.getType(), expr.collectChildReturnTypes(), expr.collectChildReturnNullables())); if (expr.getFn().hasVarArgs()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionRegistry.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionRegistry.java index 619de51e50d8cb..1a8402520ef01b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionRegistry.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionRegistry.java @@ -322,6 +322,15 @@ public void dropUdf(String dbName, String name, List argTypes) { } } + public void dropUdfByDb(String dbName) { + if (dbName == null) { + dbName = GLOBAL_FUNCTION; + } + synchronized (name2UdfBuilders) { + name2UdfBuilders.remove(dbName); + } + } + /** * use for search appropriate signature for UDFs if candidate more than one. */ diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToSqlConverter.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToSqlConverter.java index 8709eb5b6de8ed..e79e39fa9d14df 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToSqlConverter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToSqlConverter.java @@ -75,6 +75,13 @@ public static String toSql(ScalarFunction fn, boolean ifNotExists) { .append("\"" + (fn.getLocation() == null ? 
"" : fn.getLocation().toString()) + "\""); boolean isReturnNull = fn.getNullableMode() == NullableMode.ALWAYS_NULLABLE; sb.append(",\n \"ALWAYS_NULLABLE\"=").append("\"" + isReturnNull + "\""); + sb.append(",\n \"DETERMINISTIC\"=").append("\"" + fn.isDeterministic() + "\""); + } else if (fn.getBinaryType() == Function.BinaryType.PYTHON_UDF) { + sb.append(",\n \"FILE\"=") + .append("\"" + (fn.getLocation() == null ? "" : fn.getLocation().toString()) + "\""); + boolean isReturnNull = fn.getNullableMode() == NullableMode.ALWAYS_NULLABLE; + sb.append(",\n \"ALWAYS_NULLABLE\"=").append("\"" + isReturnNull + "\""); + sb.append(",\n \"DETERMINISTIC\"=").append("\"" + fn.isDeterministic() + "\""); } else { sb.append(",\n \"OBJECT_FILE\"=") .append("\"" + (fn.getLocation() == null ? "" : fn.getLocation().toString()) + "\""); @@ -125,6 +132,13 @@ public static String toSql(AggregateFunction fn, boolean ifNotExists) { .append("\"" + (fn.getLocation() == null ? "" : fn.getLocation().toString()) + "\","); boolean isReturnNull = fn.getNullableMode() == NullableMode.ALWAYS_NULLABLE; sb.append("\n \"ALWAYS_NULLABLE\"=").append("\"" + isReturnNull + "\","); + sb.append("\n \"DETERMINISTIC\"=").append("\"" + fn.isDeterministic() + "\","); + } else if (fn.getBinaryType() == Function.BinaryType.PYTHON_UDF) { + sb.append("\n \"FILE\"=") + .append("\"" + (fn.getLocation() == null ? "" : fn.getLocation().toString()) + "\","); + boolean isReturnNull = fn.getNullableMode() == NullableMode.ALWAYS_NULLABLE; + sb.append("\n \"ALWAYS_NULLABLE\"=").append("\"" + isReturnNull + "\","); + sb.append("\n \"DETERMINISTIC\"=").append("\"" + fn.isDeterministic() + "\","); } else { sb.append("\n \"OBJECT_FILE\"=") .append("\"" + (fn.getLocation() == null ? 
"" : fn.getLocation().toString()) + "\","); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToThriftConverter.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToThriftConverter.java index 5c4ca6f8be9994..463f498e89b706 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToThriftConverter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionToThriftConverter.java @@ -25,11 +25,14 @@ import com.google.common.base.Strings; import com.google.common.collect.Lists; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; /** * Converts {@link Function} and its subclasses to their Thrift representations. */ public class FunctionToThriftConverter { + private static final Logger LOG = LogManager.getLogger(FunctionToThriftConverter.class); /** * Converts a {@link Function.BinaryType} to its Thrift representation. @@ -97,6 +100,14 @@ public static TFunction toThrift(ScalarFunction fn, Type realReturnType, Type[] tfn.setFunctionCode(fn.getFunctionCode()); } tfn.setRuntimeVersion(fn.getRuntimeVersion()); + LOG.info("[pyudf-test] scalar toThrift python udf signature={}, location={}, hdfsLocationIsSet={}, " + + "runtimeVersion={}, functionCodeEmpty={}, checksum={}", + fn.signatureString(), + fn.getLocation() == null ? 
"null" : fn.getLocation().getLocation(), + tfn.isSetHdfsLocation(), + fn.getRuntimeVersion(), + Strings.isNullOrEmpty(fn.getFunctionCode()), + fn.getChecksum()); } if (fn.getDictFunction() != null) { tfn.setDictFunction(fn.getDictFunction()); @@ -161,6 +172,16 @@ private static TFunction toThriftBase(Function fn, Type realReturnType, Type[] r if (fn.getLocation() != null) { tfn.setHdfsLocation(fn.getLocation().getLocation()); } + if (fn.getBinaryType() == Function.BinaryType.PYTHON_UDF) { + LOG.info("[pyudf-test] toThriftBase python udf signature={}, location={}, hdfsLocationIsSet={}, " + + "runtimeVersion={}, functionCodeEmpty={}, checksum={}", + fn.signatureString(), + fn.getLocation() == null ? "null" : fn.getLocation().getLocation(), + tfn.isSetHdfsLocation(), + fn.getRuntimeVersion(), + Strings.isNullOrEmpty(fn.getFunctionCode()), + fn.getChecksum()); + } // `realArgTypes.length != argTypes.length` is true iff this is an aggregation // function. // For aggregation functions, `argTypes` here is already its real type with true diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java index 7b48cbb10efce0..17fc587963e8f7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java @@ -42,6 +42,8 @@ import org.apache.doris.catalog.DynamicPartitionProperty; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.EnvFactory; +import org.apache.doris.catalog.Function; +import org.apache.doris.catalog.FunctionUtil; import org.apache.doris.catalog.HashDistributionInfo; import org.apache.doris.catalog.Index; import org.apache.doris.catalog.InfoSchemaDb; @@ -537,6 +539,7 @@ public void dropDb(String dbName, boolean ifExists, boolean force) throws DdlExc // 3. 
remove db from catalog idToDb.remove(db.getId()); fullNameToDb.remove(db.getFullName()); + Env.getCurrentEnv().getFunctionRegistry().dropUdfByDb(db.getFullName()); DropDbInfo info = new DropDbInfo(dbName, force, recycleTime); Env.getCurrentEnv().getQueryStats().clear(Env.getCurrentEnv().getCurrentCatalog().getId(), db.getId()); Env.getCurrentEnv().getDictionaryManager().dropDbDictionaries(dbName); @@ -595,6 +598,7 @@ public void replayDropDb(String dbName, boolean isForceDrop, Long recycleTime) t fullNameToDb.remove(dbName); idToDb.remove(db.getId()); + Env.getCurrentEnv().getFunctionRegistry().dropUdfByDb(dbName); } finally { unlock(); } @@ -644,6 +648,7 @@ public void recoverDatabase(String dbName, long dbId, String newDbName) throws D RecoverInfo recoverInfo = new RecoverInfo(db.getId(), -1L, -1L, newDbName, "", "", "", ""); Env.getCurrentEnv().getEditLog().logRecoverDb(recoverInfo); db.unmarkDropped(); + registerDbFunctionsToNereids(db); } finally { MetaLockUtils.writeUnlockTables(tableList); db.writeUnlock(); @@ -726,9 +731,17 @@ public void replayRecoverDatabase(RecoverInfo info) { // add db to catalog replayCreateDb(db, newDbName); db.unmarkDropped(); + registerDbFunctionsToNereids(db); LOG.info("replay recover db[{}]", dbId); } + private void registerDbFunctionsToNereids(Database db) { + // A recovered database reuses catalog Function objects, so rebuild their Nereids builders. 
+ for (Function function : db.getFunctions()) { + FunctionUtil.translateToNereids(db.getFullName(), function); + } + } + public void alterDatabaseQuota(String dbName, QuotaType quotaType, long quotaValue) throws DdlException { Database db = getDbOrDdlException(dbName); db.writeLockOrDdlException(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java index acc9e59c6809c7..a02659065e7ab0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java @@ -945,7 +945,19 @@ public Expr visitPythonUdf(PythonUdf udf, PlanTranslatorContext context) { FunctionParams exprs = new FunctionParams(udf.children().stream() .map(expression -> expression.accept(this, context)) .collect(Collectors.toList())); - return new FunctionCallExpr(udf.getCatalogFunction(), exprs, udf.nullable()); + org.apache.doris.catalog.Function catalogFunction = udf.getCatalogFunction(); + if (catalogFunction instanceof org.apache.doris.catalog.ScalarFunction) { + org.apache.doris.catalog.ScalarFunction scalarFunction = + (org.apache.doris.catalog.ScalarFunction) catalogFunction; + LOG.info("[pyudf-test] ExpressionTranslator.visitPythonUdf name={}, location={}, runtimeVersion={}, " + + "functionCodeEmpty={}, childCount={}, nullable={}", + udf.getName(), + scalarFunction.getLocation() == null ? 
"null" : scalarFunction.getLocation().getLocation(), + scalarFunction.getRuntimeVersion(), + scalarFunction.getFunctionCode() == null || scalarFunction.getFunctionCode().isEmpty(), + udf.children().size(), udf.nullable()); + } + return new FunctionCallExpr(catalogFunction, exprs, udf.nullable()); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdaf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdaf.java index c3eebfc283fd0c..2a428cd5fd0746 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdaf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdaf.java @@ -62,6 +62,7 @@ public class JavaUdaf extends AggregateFunction implements ExplicitlyCastableSig private final String checkSum; private final boolean isStaticLoad; private final long expirationTime; + private final boolean deterministic; /** * Constructor of UDAF @@ -72,7 +73,8 @@ public JavaUdaf(String name, long functionId, String dbName, Function.BinaryType String objectFile, String symbol, String initFn, String updateFn, String mergeFn, String serializeFn, String finalizeFn, String getValueFn, String removeFn, - boolean isDistinct, String checkSum, boolean isStaticLoad, long expirationTime, Expression... args) { + boolean isDistinct, String checkSum, boolean isStaticLoad, long expirationTime, + boolean deterministic, Expression... 
args) { super(name, isDistinct, args); this.dbName = dbName; this.functionId = functionId; @@ -92,6 +94,7 @@ public JavaUdaf(String name, long functionId, String dbName, Function.BinaryType this.checkSum = checkSum; this.isStaticLoad = isStaticLoad; this.expirationTime = expirationTime; + this.deterministic = deterministic; } @Override @@ -114,6 +117,11 @@ public NullableMode getNullableMode() { return nullableMode; } + @Override + public boolean isDeterministic() { + return deterministic; + } + /** * withChildren. */ @@ -122,7 +130,8 @@ public JavaUdaf withDistinctAndChildren(boolean isDistinct, List chi Preconditions.checkArgument(children.size() == this.children.size()); return new JavaUdaf(getName(), functionId, dbName, binaryType, signature, intermediateType, nullableMode, objectFile, symbol, initFn, updateFn, mergeFn, serializeFn, finalizeFn, getValueFn, removeFn, - isDistinct, checkSum, isStaticLoad, expirationTime, children.toArray(new Expression[0])); + isDistinct, checkSum, isStaticLoad, expirationTime, deterministic, + children.toArray(new Expression[0])); } /** @@ -165,6 +174,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca aggregate.getChecksum(), aggregate.isStaticLoad(), aggregate.getExpirationTime(), + aggregate.isDeterministic(), arguments); JavaUdafBuilder builder = new JavaUdafBuilder(udaf); @@ -201,6 +211,7 @@ public Function getCatalogFunction() { expr.setId(functionId); expr.setStaticLoad(isStaticLoad); expr.setExpirationTime(expirationTime); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdf.java index 07cd4556324f21..974c13b2160f58 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdf.java @@ -56,6 +56,7 @@ public class JavaUdf extends ScalarFunction implements ExplicitlyCastableSignatu private final String checkSum; private final boolean isStaticLoad; private final long expirationTime; + private final boolean deterministic; /** * Constructor of UDF @@ -63,7 +64,7 @@ public class JavaUdf extends ScalarFunction implements ExplicitlyCastableSignatu public JavaUdf(String name, long functionId, String dbName, Function.BinaryType binaryType, FunctionSignature signature, NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, - String checkSum, boolean isStaticLoad, long expirationTime, Expression... args) { + String checkSum, boolean isStaticLoad, long expirationTime, boolean deterministic, Expression... args) { super(name, args); this.dbName = dbName; this.functionId = functionId; @@ -77,6 +78,7 @@ public JavaUdf(String name, long functionId, String dbName, Function.BinaryType this.checkSum = checkSum; this.isStaticLoad = isStaticLoad; this.expirationTime = expirationTime; + this.deterministic = deterministic; } @Override @@ -99,6 +101,11 @@ public NullableMode getNullableMode() { return nullableMode; } + @Override + public boolean isDeterministic() { + return deterministic; + } + /** * withChildren. 
*/ @@ -106,7 +113,7 @@ public NullableMode getNullableMode() { public JavaUdf withChildren(List children) { Preconditions.checkArgument(children.size() == this.children.size()); return new JavaUdf(getName(), functionId, dbName, binaryType, signature, nullableMode, - objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, + objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, deterministic, children.toArray(new Expression[0])); } @@ -135,7 +142,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca scalar.getSymbolName(), scalar.getPrepareFnSymbol(), scalar.getCloseFnSymbol(), - scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(), + scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(), scalar.isDeterministic(), arguments); JavaUdfBuilder builder = new JavaUdfBuilder(udf); @@ -166,6 +173,7 @@ public Function getCatalogFunction() { expr.setId(functionId); expr.setStaticLoad(isStaticLoad); expr.setExpirationTime(expirationTime); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdtf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdtf.java index 2e04dec1d68163..7935c67f6af7ac 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdtf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/JavaUdtf.java @@ -56,6 +56,7 @@ public class JavaUdtf extends TableGeneratingFunction implements ExplicitlyCasta private final String checkSum; private final boolean isStaticLoad; private final long expirationTime; + private final boolean deterministic; /** * Constructor of UDTF @@ -63,7 +64,7 @@ public class JavaUdtf extends TableGeneratingFunction implements 
ExplicitlyCasta public JavaUdtf(String name, long functionId, String dbName, Function.BinaryType binaryType, FunctionSignature signature, NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, - String checkSum, boolean isStaticLoad, long expirationTime, Expression... args) { + String checkSum, boolean isStaticLoad, long expirationTime, boolean deterministic, Expression... args) { super(name, args); this.dbName = dbName; this.functionId = functionId; @@ -77,6 +78,7 @@ public JavaUdtf(String name, long functionId, String dbName, Function.BinaryType this.checkSum = checkSum; this.isStaticLoad = isStaticLoad; this.expirationTime = expirationTime; + this.deterministic = deterministic; } /** @@ -86,7 +88,7 @@ public JavaUdtf(String name, long functionId, String dbName, Function.BinaryType public JavaUdtf withChildren(List children) { Preconditions.checkArgument(children.size() == this.children.size()); return new JavaUdtf(getName(), functionId, dbName, binaryType, signature, nullableMode, - objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, + objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, deterministic, children.toArray(new Expression[0])); } @@ -95,6 +97,11 @@ public List getSignatures() { return ImmutableList.of(signature); } + @Override + public boolean isDeterministic() { + return deterministic; + } + @Override public boolean hasVarArguments() { return signature.hasVarArgs; @@ -125,6 +132,7 @@ public Function getCatalogFunction() { expr.setStaticLoad(isStaticLoad); expr.setExpirationTime(expirationTime); expr.setUDTFunction(true); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); @@ -159,6 +167,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(), + scalar.isDeterministic(), 
arguments); JavaUdtfBuilder builder = new JavaUdtfBuilder(udf); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java index 456e0f1a6eac42..ee03571b731aab 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java @@ -64,6 +64,7 @@ public class PythonUdaf extends AggregateFunction implements ExplicitlyCastableS private final long expirationTime; private final String runtimeVersion; private final String functionCode; + private final boolean deterministic; /** * Constructor of UDAF @@ -75,7 +76,7 @@ public PythonUdaf(String name, long functionId, String dbName, Function.BinaryTy String initFn, String updateFn, String mergeFn, String serializeFn, String finalizeFn, String getValueFn, String removeFn, boolean isDistinct, String checkSum, boolean isStaticLoad, long expirationTime, - String runtimeVersion, String functionCode, Expression... args) { + String runtimeVersion, String functionCode, boolean deterministic, Expression... args) { super(name, isDistinct, args); this.dbName = dbName; this.functionId = functionId; @@ -97,6 +98,7 @@ public PythonUdaf(String name, long functionId, String dbName, Function.BinaryTy this.expirationTime = expirationTime; this.runtimeVersion = runtimeVersion; this.functionCode = functionCode; + this.deterministic = deterministic; } @Override @@ -119,6 +121,11 @@ public NullableMode getNullableMode() { return nullableMode; } + @Override + public boolean isDeterministic() { + return deterministic; + } + /** * withChildren. 
*/ @@ -127,7 +134,7 @@ public PythonUdaf withDistinctAndChildren(boolean isDistinct, List c Preconditions.checkArgument(children.size() == this.children.size()); return new PythonUdaf(getName(), functionId, dbName, binaryType, signature, intermediateType, nullableMode, objectFile, symbol, initFn, updateFn, mergeFn, serializeFn, finalizeFn, getValueFn, removeFn, - isDistinct, checkSum, isStaticLoad, expirationTime, runtimeVersion, functionCode, + isDistinct, checkSum, isStaticLoad, expirationTime, runtimeVersion, functionCode, deterministic, children.toArray(new Expression[0])); } @@ -173,6 +180,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca aggregate.getExpirationTime(), aggregate.getRuntimeVersion(), aggregate.getFunctionCode(), + aggregate.isDeterministic(), arguments); PythonUdafBuilder builder = new PythonUdafBuilder(udaf); @@ -211,6 +219,7 @@ public Function getCatalogFunction() { expr.setExpirationTime(expirationTime); expr.setRuntimeVersion(runtimeVersion); expr.setFunctionCode(functionCode); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java index 98a9e161308417..7d6e409b7d5ef9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java @@ -35,6 +35,8 @@ import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.util.Arrays; import java.util.List; @@ -44,6 +46,8 @@ * Python UDF for Nereids */ public class PythonUdf extends ScalarFunction implements 
ExplicitlyCastableSignature, Udf { + private static final Logger LOG = LogManager.getLogger(PythonUdf.class); + private final String dbName; private final long functionId; private final Function.BinaryType binaryType; @@ -58,6 +62,7 @@ public class PythonUdf extends ScalarFunction implements ExplicitlyCastableSigna private final long expirationTime; private final String runtimeVersion; private final String functionCode; + private final boolean deterministic; /** * Constructor of UDF @@ -66,7 +71,7 @@ public PythonUdf(String name, long functionId, String dbName, Function.BinaryTyp FunctionSignature signature, NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, String checkSum, boolean isStaticLoad, long expirationTime, - String runtimeVersion, String functionCode, Expression... args) { + String runtimeVersion, String functionCode, boolean deterministic, Expression... args) { super(name, args); this.dbName = dbName; this.functionId = functionId; @@ -82,6 +87,7 @@ public PythonUdf(String name, long functionId, String dbName, Function.BinaryTyp this.expirationTime = expirationTime; this.runtimeVersion = runtimeVersion; this.functionCode = functionCode; + this.deterministic = deterministic; } @Override @@ -104,6 +110,11 @@ public NullableMode getNullableMode() { return nullableMode; } + @Override + public boolean isDeterministic() { + return deterministic; + } + /** * withChildren. 
*/ @@ -112,7 +123,7 @@ public PythonUdf withChildren(List children) { Preconditions.checkArgument(children.size() == this.children.size()); return new PythonUdf(getName(), functionId, dbName, binaryType, signature, nullableMode, objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, - runtimeVersion, functionCode, children.toArray(new Expression[0])); + runtimeVersion, functionCode, deterministic, children.toArray(new Expression[0])); } /** @@ -143,8 +154,16 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(), scalar.getRuntimeVersion(), scalar.getFunctionCode(), + scalar.isDeterministic(), arguments); + LOG.info("[pyudf-test] translateToNereidsFunction name={}, dbName={}, location={}, checksum={}, " + + "runtimeVersion={}, functionCodeEmpty={}, deterministic={}", + fnName, dbName, scalar.getLocation() == null ? "null" : scalar.getLocation().getLocation(), + scalar.getChecksum(), scalar.getRuntimeVersion(), + scalar.getFunctionCode() == null || scalar.getFunctionCode().isEmpty(), + scalar.isDeterministic()); + PythonUdfBuilder builder = new PythonUdfBuilder(udf); Env.getCurrentEnv().getFunctionRegistry().addUdf(dbName, fnName, builder); } @@ -175,6 +194,11 @@ public Function getCatalogFunction() { expr.setExpirationTime(expirationTime); expr.setRuntimeVersion(runtimeVersion); expr.setFunctionCode(functionCode); + expr.setDeterministic(deterministic); + LOG.info("[pyudf-test] getCatalogFunction name={}, dbName={}, objectFile={}, checksum={}, " + + "runtimeVersion={}, functionCodeEmpty={}, deterministic={}", + getName(), dbName, objectFile, checkSum, + runtimeVersion, functionCode == null || functionCode.isEmpty(), deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java index 7185594099b87c..2b794a95e7ae58 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java @@ -27,6 +27,8 @@ import com.google.common.base.Suppliers; import com.google.common.collect.Lists; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.util.List; import java.util.Optional; @@ -36,6 +38,8 @@ * function builder for python udf */ public class PythonUdfBuilder extends UdfBuilder { + private static final Logger LOG = LogManager.getLogger(PythonUdfBuilder.class); + private final PythonUdf udf; private final int arity; private final boolean isVarArgs; @@ -88,7 +92,16 @@ public Pair build(String name, List arguments) { for (int i = 0; i < exprs.size(); ++i) { processedExprs.add(TypeCoercionUtils.castIfNotSameType(exprs.get(i), argTypes.get(i))); } - return Pair.ofSame(udf.withChildren(processedExprs)); + PythonUdf built = udf.withChildren(processedExprs); + org.apache.doris.catalog.Function catalogFn = built.getCatalogFunction(); + LOG.info("[pyudf-test] PythonUdfBuilder.build name={}, argCount={}, location={}, runtimeVersion={}, " + + "functionCodeEmpty={}, nullableMode={}", + name, arguments.size(), + catalogFn.getLocation() == null ? 
"null" : catalogFn.getLocation().getLocation(), + catalogFn.getRuntimeVersion(), + catalogFn.getFunctionCode() == null || catalogFn.getFunctionCode().isEmpty(), + catalogFn.getNullableMode()); + return Pair.ofSame(built); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java index 74e662aee7297e..9ee167304ec11f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java @@ -58,6 +58,7 @@ public class PythonUdtf extends TableGeneratingFunction implements ExplicitlyCas private final long expirationTime; private final String runtimeVersion; private final String functionCode; + private final boolean deterministic; /** * Constructor of Python UDTF @@ -66,7 +67,7 @@ public PythonUdtf(String name, long functionId, String dbName, Function.BinaryTy FunctionSignature signature, NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, String checkSum, boolean isStaticLoad, long expirationTime, - String runtimeVersion, String functionCode, Expression... args) { + String runtimeVersion, String functionCode, boolean deterministic, Expression... 
args) { super(name, args); this.dbName = dbName; this.functionId = functionId; @@ -82,6 +83,7 @@ public PythonUdtf(String name, long functionId, String dbName, Function.BinaryTy this.expirationTime = expirationTime; this.runtimeVersion = runtimeVersion; this.functionCode = functionCode; + this.deterministic = deterministic; } /** @@ -92,7 +94,7 @@ public PythonUdtf withChildren(List children) { Preconditions.checkArgument(children.size() == this.children.size()); return new PythonUdtf(getName(), functionId, dbName, binaryType, signature, nullableMode, objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, - runtimeVersion, functionCode, children.toArray(new Expression[0])); + runtimeVersion, functionCode, deterministic, children.toArray(new Expression[0])); } @Override @@ -100,6 +102,11 @@ public List getSignatures() { return ImmutableList.of(signature); } + @Override + public boolean isDeterministic() { + return deterministic; + } + @Override public boolean hasVarArguments() { return signature.hasVarArgs; @@ -132,6 +139,7 @@ public Function getCatalogFunction() { expr.setUDTFunction(true); expr.setRuntimeVersion(runtimeVersion); expr.setFunctionCode(functionCode); + expr.setDeterministic(deterministic); return expr; } catch (Exception e) { throw new AnalysisException(e.getMessage(), e.getCause()); @@ -168,6 +176,7 @@ public static void translateToNereidsFunction(String dbName, org.apache.doris.ca scalar.getExpirationTime(), scalar.getRuntimeVersion(), scalar.getFunctionCode(), + scalar.isDeterministic(), arguments); PythonUdtfBuilder builder = new PythonUdtfBuilder(udtf); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java index eaacae8aaa62e3..b9209797910357 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java @@ -147,6 +147,7 @@ public class CreateFunctionCommand extends Command implements ForwardWithSync { public static final String IS_STATIC_LOAD = "static_load"; public static final String EXPIRATION_TIME = "expiration_time"; public static final String RUNTIME_VERSION = "runtime_version"; + public static final String IS_DETERMINISTIC = "deterministic"; private static final Pattern PYTHON_VERSION_PATTERN = Pattern.compile("^3\\.\\d{1,2}(?:\\.\\d{1,2})?$"); private static final Logger LOG = LogManager.getLogger(CreateFunctionCommand.class); @@ -179,6 +180,7 @@ public class CreateFunctionCommand extends Command implements ForwardWithSync { private NullableMode returnNullMode = NullableMode.ALWAYS_NULLABLE; private String runtimeVersion; private String functionCode; + private boolean deterministic = false; /** * CreateFunctionCommand @@ -365,6 +367,16 @@ private void analyzeCommon(ConnectContext ctx) throws AnalysisException { + "'3.X.X' or '3.XX.XX' (e.g. 
'3.10.2').", runtimeVersionString)); } runtimeVersion = runtimeVersionString; + LOG.info("[pyudf-test] analyzeCommon python udf functionName={}, userFile={}, " + + "originalUserFile={}, runtimeVersion={}, functionCodeEmpty={}, properties={}", + functionName, userFile, originalUserFile, runtimeVersion, + Strings.isNullOrEmpty(functionCode), properties); + } + if (binaryType == Function.BinaryType.JAVA_UDF || binaryType == Function.BinaryType.PYTHON_UDF) { + Boolean deterministicProperty = parseBooleanFromProperties(IS_DETERMINISTIC); + if (deterministicProperty != null) { + deterministic = deterministicProperty; + } } } @@ -476,6 +488,7 @@ private void analyzeUdtf() throws AnalysisException { function.setUDTFunction(true); function.setRuntimeVersion(runtimeVersion); function.setFunctionCode(functionCode); + function.setDeterministic(deterministic); // Todo: maybe in create tables function, need register two function, one is // normal and one is outer as those have different result when result is NULL. } @@ -550,6 +563,11 @@ private void analyzeUdaf() throws AnalysisException { function.setExpirationTime(expirationTime); function.setRuntimeVersion(runtimeVersion); function.setFunctionCode(functionCode); + function.setDeterministic(deterministic); + LOG.info("[pyudf-test] analyzeUdaf created function signature={}, binaryType={}, location={}, " + + "checksum={}, runtimeVersion={}, functionCodeEmpty={}, deterministic={}", + function.signatureString(), binaryType, location == null ? 
"null" : location.getLocation(), + checksum, runtimeVersion, Strings.isNullOrEmpty(functionCode), deterministic); } private void analyzeUdf() throws AnalysisException { @@ -587,6 +605,7 @@ private void analyzeUdf() throws AnalysisException { function.setExpirationTime(expirationTime); function.setRuntimeVersion(runtimeVersion); function.setFunctionCode(functionCode); + function.setDeterministic(deterministic); } private void analyzeJavaUdaf(String clazz) throws AnalysisException { @@ -622,9 +641,13 @@ private void analyzePythonUdaf(String clazz) throws AnalysisException { } if (Strings.isNullOrEmpty(this.functionCode)) { + LOG.info("[pyudf-test] analyzePythonUdaf module mode symbol={}, userFile={}, runtimeVersion={}", + clazz, originalUserFile, runtimeVersion); return; } + LOG.info("[pyudf-test] analyzePythonUdaf inline raw symbol={}, rawCodeLength={}, runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); this.functionCode = this.functionCode.trim(); if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) { throw new AnalysisException("Inline Python UDAF code must be start with $$ and end with $$"); @@ -634,6 +657,9 @@ private void analyzePythonUdaf(String clazz) throws AnalysisException { if (this.functionCode.isEmpty()) { throw new AnalysisException("Inline Python UDAF is empty"); } + LOG.info("[pyudf-test] analyzePythonUdaf inline normalized symbol={}, normalizedCodeLength={}, " + + "runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); } private void checkUdafClass(String clazz, ClassLoader cl, HashMap allMethods) @@ -793,9 +819,13 @@ private void analyzePythonUdf(String clazz) throws AnalysisException { } if (Strings.isNullOrEmpty(this.functionCode)) { + LOG.info("[pyudf-test] analyzePythonUdf module mode symbol={}, userFile={}, runtimeVersion={}", + clazz, originalUserFile, runtimeVersion); return; } + LOG.info("[pyudf-test] analyzePythonUdf inline raw symbol={}, rawCodeLength={}, 
runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); this.functionCode = this.functionCode.trim(); if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) { throw new AnalysisException("Inline Python UDF code must be start with $$ and end with $$"); @@ -805,6 +835,9 @@ private void analyzePythonUdf(String clazz) throws AnalysisException { if (this.functionCode.isEmpty()) { throw new AnalysisException("Inline Python UDF is empty"); } + LOG.info("[pyudf-test] analyzePythonUdf inline normalized symbol={}, normalizedCodeLength={}, " + + "runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); } private void checkUdfClass(String clazz, ClassLoader cl) throws ClassNotFoundException, AnalysisException { @@ -905,9 +938,13 @@ private void analyzePythonUdtf(String clazz) throws AnalysisException { } if (Strings.isNullOrEmpty(this.functionCode)) { + LOG.info("[pyudf-test] analyzePythonUdtf module mode symbol={}, userFile={}, runtimeVersion={}", + clazz, originalUserFile, runtimeVersion); return; } + LOG.info("[pyudf-test] analyzePythonUdtf inline raw symbol={}, rawCodeLength={}, runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); this.functionCode = this.functionCode.trim(); if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) { throw new AnalysisException("Inline Python UDTF code must be start with $$ and end with $$"); @@ -917,6 +954,9 @@ private void analyzePythonUdtf(String clazz) throws AnalysisException { if (this.functionCode.isEmpty()) { throw new AnalysisException("Inline Python UDTF is empty"); } + LOG.info("[pyudf-test] analyzePythonUdtf inline normalized symbol={}, normalizedCodeLength={}, " + + "runtimeVersion={}", + clazz, this.functionCode.length(), runtimeVersion); } private void checkRPCUdf(String symbol) throws AnalysisException { diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateFunctionTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateFunctionTest.java index fc309de75b0c90..8ebc8ea6aada66 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateFunctionTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateFunctionTest.java @@ -24,6 +24,8 @@ import org.apache.doris.common.jmockit.Deencapsulation; import org.apache.doris.nereids.StatementContext; import org.apache.doris.nereids.parser.NereidsParser; +import org.apache.doris.nereids.trees.expressions.functions.FunctionBuilder; +import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdf; import org.apache.doris.nereids.trees.plans.commands.CreateDatabaseCommand; import org.apache.doris.nereids.trees.plans.commands.CreateFunctionCommand; import org.apache.doris.nereids.trees.plans.commands.CreateTableCommand; @@ -43,6 +45,7 @@ import org.junit.Test; import java.io.File; +import java.util.Collections; import java.util.List; import java.util.UUID; @@ -53,6 +56,12 @@ public class CreateFunctionTest { + public static class TestConstantUdf { + public Integer evaluate() { + return 1; + } + } + private static String runningDir = "fe/mocked/CreateFunctionTest/" + UUID.randomUUID().toString() + "/"; private static ConnectContext connectContext; private static DorisAssert dorisAssert; @@ -150,6 +159,47 @@ public void testCreateGlobalFunction() throws Exception { + " right(CAST(CAST(k1 AS BIGINT) AS VARCHAR(65533)), 4))")); } + @Test + public void testCreateJavaUdfDeterministicProperty() throws Exception { + ConnectContext ctx = UtFrameUtils.createDefaultCtx(); + createDatabase(ctx, "create database db_det;"); + + createFunction("CREATE FUNCTION db_det.default_det() RETURNS int PROPERTIES (\n" + + " \"symbol\"=\"" + TestConstantUdf.class.getName() + "\",\n" + + " \"type\"=\"JAVA_UDF\"\n" + + ");", ctx); + + createFunction("CREATE FUNCTION db_det.explicit_det() RETURNS int PROPERTIES (\n" + + " \"symbol\"=\"" + TestConstantUdf.class.getName() + "\",\n" + + " 
\"type\"=\"JAVA_UDF\",\n" + + " \"deterministic\"=\"true\"\n" + + ");", ctx); + + Database db = Env.getCurrentInternalCatalog().getDbNullable("db_det"); + Assert.assertNotNull(db); + + Function defaultFn = db.getFunction( + new FunctionSearchDesc(new FunctionName("db_det", "default_det"), new Type[] {}, false)); + Function explicitFn = db.getFunction( + new FunctionSearchDesc(new FunctionName("db_det", "explicit_det"), new Type[] {}, false)); + + Assert.assertNotNull(defaultFn); + Assert.assertNotNull(explicitFn); + Assert.assertFalse(defaultFn.isDeterministic()); + Assert.assertTrue(explicitFn.isDeterministic()); + + FunctionRegistry functionRegistry = Env.getCurrentEnv().getFunctionRegistry(); + FunctionBuilder defaultBuilder = functionRegistry.findFunctionBuilder( + "db_det", "default_det", Collections.emptyList()); + FunctionBuilder explicitBuilder = functionRegistry.findFunctionBuilder( + "db_det", "explicit_det", Collections.emptyList()); + + JavaUdf defaultNereidsFn = (JavaUdf) defaultBuilder.build("default_det", Collections.emptyList()).first; + JavaUdf explicitNereidsFn = (JavaUdf) explicitBuilder.build("explicit_det", Collections.emptyList()).first; + Assert.assertFalse(defaultNereidsFn.isDeterministic()); + Assert.assertTrue(explicitNereidsFn.isDeterministic()); + } + private void testFunctionQuery(ConnectContext ctx, String queryStr, Boolean isStringLiteral) throws Exception { ctx.getState().reset(); StmtExecutor stmtExecutor = new StmtExecutor(ctx, queryStr); diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/FunctionToSqlConverterTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/FunctionToSqlConverterTest.java index 26b22fa1baed1b..979874df53d95d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/FunctionToSqlConverterTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/FunctionToSqlConverterTest.java @@ -45,6 +45,7 @@ void testScalarFunction_javaUdf_basicSql() { 
Assertions.assertTrue(sql.contains("\"FILE\"=\"\"")); Assertions.assertTrue(sql.contains("\"TYPE\"=\"JAVA_UDF\"")); Assertions.assertTrue(sql.contains("\"ALWAYS_NULLABLE\"=")); + Assertions.assertTrue(sql.contains("\"DETERMINISTIC\"=\"false\"")); Assertions.assertFalse(sql.contains("OBJECT_FILE")); Assertions.assertFalse(sql.contains("IF NOT EXISTS")); Assertions.assertFalse(sql.contains("GLOBAL")); @@ -186,6 +187,7 @@ void testAggregateFunction_javaUdf_basicSql() { Assertions.assertTrue(sql.contains("\"FILE\"=\"\"")); Assertions.assertTrue(sql.contains("\"TYPE\"=\"JAVA_UDF\"")); Assertions.assertTrue(sql.contains("\"ALWAYS_NULLABLE\"=")); + Assertions.assertTrue(sql.contains("\"DETERMINISTIC\"=\"false\"")); Assertions.assertFalse(sql.contains("INIT_FN")); Assertions.assertFalse(sql.contains("UPDATE_FN")); Assertions.assertFalse(sql.contains("MERGE_FN")); diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_deterministic.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_deterministic.out new file mode 100644 index 00000000000000..144af0dc4d1a9c --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_deterministic.out @@ -0,0 +1,20 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !materialized -- +1 1 +2 1 +3 1 + +-- !inlined -- +1 1 +2 1 +3 1 + +-- !materialized_det -- +1 1 +2 1 +3 1 + +-- !inlined_det -- +1 2 +2 2 +3 2 diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_drop.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_drop.out index 79e35e30ee5fcd..8c1eb081162e9b 100644 --- a/regression-test/data/pythonudaf_p0/test_pythonudaf_drop.out +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_drop.out @@ -8,3 +8,9 @@ -- !py_udaf_drop_3 -- 6 +-- !py_udaf_drop_4 -- +6 + +-- !py_udaf_drop_5 -- +6 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_deterministic.out b/regression-test/data/pythonudf_p0/test_pythonudf_deterministic.out new file mode 100644 index 00000000000000..144af0dc4d1a9c --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_deterministic.out @@ -0,0 +1,20 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !materialized -- +1 1 +2 1 +3 1 + +-- !inlined -- +1 1 +2 1 +3 1 + +-- !materialized_det -- +1 1 +2 1 +3 1 + +-- !inlined_det -- +1 2 +2 2 +3 2 diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_drop.out b/regression-test/data/pythonudf_p0/test_pythonudf_drop.out index 254ebe44809dfc..903817f7e7364a 100644 --- a/regression-test/data/pythonudf_p0/test_pythonudf_drop.out +++ b/regression-test/data/pythonudf_p0/test_pythonudf_drop.out @@ -8,3 +8,9 @@ -- !py_udf_drop_3 -- 8 +-- !py_udf_drop_4 -- +32 + +-- !py_udf_drop_5 -- +33 + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_deterministic.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_deterministic.out new file mode 100644 index 00000000000000..144af0dc4d1a9c --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_deterministic.out @@ -0,0 +1,20 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !materialized -- +1 1 +2 1 +3 1 + +-- !inlined -- +1 1 +2 1 +3 1 + +-- !materialized_det -- +1 1 +2 1 +3 1 + +-- !inlined_det -- +1 2 +2 2 +3 2 diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_drop.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_drop.out index 6f1159a95d1d4b..faa6e95841cdcf 100644 --- a/regression-test/data/pythonudtf_p0/test_pythonudtf_drop.out +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_drop.out @@ -7,7 +7,11 @@ 1 2 2 3 --- !py_udtf_drop_3 -- +-- !py_udtf_drop_4 -- +1 +2 + +-- !py_udtf_drop_5 -- 1 2 diff --git a/regression-test/suites/javaudf_p0/test_javaudf_float.groovy b/regression-test/suites/javaudf_p0/test_javaudf_float.groovy index 5372bda71c4f3e..e57e130bda9ded 100644 --- a/regression-test/suites/javaudf_p0/test_javaudf_float.groovy +++ b/regression-test/suites/javaudf_p0/test_javaudf_float.groovy @@ -64,12 +64,24 @@ suite("test_javaudf_float") { qt_select """ SELECT java_udf_float_test(2.83645,null) as result ; """ qt_select """ SELECT java_udf_float_test(cast(2.83645 as float),null) as result ; """ qt_select """ SELECT user_id,java_udf_float_test(float_1, float_2) as sum FROM ${tableName} order by user_id; """ - createMV("create materialized view udf_mv as SELECT user_id as a1,java_udf_float_test(float_1, float_2) as sum FROM test_javaudf_float order by user_id;") + sql """ DROP MATERIALIZED VIEW IF EXISTS udf_mv; """ + sql """ + CREATE MATERIALIZED VIEW udf_mv + BUILD DEFERRED REFRESH AUTO ON MANUAL + DISTRIBUTED BY RANDOM BUCKETS 2 + PROPERTIES ( + 'replication_num' = '1', + 'version_info'='3', + 'enable_nondeterministic_function' = 'true' + ) + AS + SELECT user_id as a1, java_udf_float_test(float_1, float_2) as sum FROM ${tableName}; + """ qt_select """ SELECT user_id,java_udf_float_test(float_1, float_2) as sum FROM ${tableName} order by user_id; """ explain { sql("SELECT user_id,java_udf_float_test(float_1, float_2) as sum FROM 
${tableName} order by user_id; ") - contains "(udf_mv)" + notContains "(udf_mv)" } @@ -90,6 +102,7 @@ suite("test_javaudf_float") { } finally { + try_sql("DROP MATERIALIZED VIEW IF EXISTS udf_mv;") try_sql("DROP FUNCTION IF EXISTS java_udf_double_test(DOUBLE,DOUBLE);") try_sql("DROP FUNCTION IF EXISTS java_udf_float_test(FLOAT,FLOAT);") try_sql("DROP TABLE IF EXISTS ${tableName}") diff --git a/regression-test/suites/mtmv_p0/test_expand_star_mtmv.groovy b/regression-test/suites/mtmv_p0/test_expand_star_mtmv.groovy index f550dc78c3db33..d32041b3531b8b 100644 --- a/regression-test/suites/mtmv_p0/test_expand_star_mtmv.groovy +++ b/regression-test/suites/mtmv_p0/test_expand_star_mtmv.groovy @@ -71,7 +71,8 @@ suite("test_expand_star_mtmv","mtmv") { DISTRIBUTED BY RANDOM BUCKETS 2 PROPERTIES ( 'replication_num' = '1', - 'version_info'='3' + 'version_info'='3', + 'enable_nondeterministic_function' = 'true' ) AS SELECT ${functionName} ('2011-01-01','2011-01-03') as k1 from ${tableName}; diff --git a/regression-test/suites/pythonudaf_p0/test_python_raise_error_propagation.groovy b/regression-test/suites/pythonudaf_p0/test_python_raise_error_propagation.groovy new file mode 100644 index 00000000000000..669bca070f9e47 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_python_raise_error_propagation.groovy @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_python_raise_error_propagation") { + // Keep using the existing type-wide archives under regression-test/suites. + // This avoids introducing extra zip names and preserves the same loader/cache path shape as p0. + def suitePath = context.file.parent + "/.." + def udfPath = """${suitePath}/pythonudf_p0/udf_scripts/pyudf.zip""" + def udafPath = """${suitePath}/pythonudaf_p0/udaf_scripts/pyudaf.zip""" + def udtfPath = """${suitePath}/pythonudtf_p0/udtf_scripts/pyudtf.zip""" + scp_udf_file_to_all_be(udfPath) + scp_udf_file_to_all_be(udafPath) + scp_udf_file_to_all_be(udtfPath) + def runtime_version = getPythonUdfRuntimeVersion() + log.info("Python UDF zip path: ${udfPath}".toString()) + log.info("Python UDAF zip path: ${udafPath}".toString()) + log.info("Python UDTF zip path: ${udtfPath}".toString()) + + try { + sql """ DROP TABLE IF EXISTS python_raise_error_test; """ + sql """ + CREATE TABLE python_raise_error_test ( + id INT, + val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO python_raise_error_test VALUES + (1, 1), + (2, 2); + """ + + sql """ DROP FUNCTION IF EXISTS py_inline_raise_udf(INT); """ + sql """ + CREATE FUNCTION py_inline_raise_udf(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) AS \$\$ +def evaluate(x): + raise TypeError("inline_udf_error_42") +\$\$; + """ + + test { + sql """ SELECT py_inline_raise_udf(1); """ 
+ exception "inline_udf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_module_raise_udf(INT); """ + sql """ + CREATE FUNCTION py_module_raise_udf(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${udfPath}", + "symbol" = "udf_errors.raise_in_module", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + test { + sql """ SELECT py_module_raise_udf(1); """ + exception "module_udf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_inline_raise_udaf(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_inline_raise_udaf(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "InlineFinishErrorUDAF", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) AS \$\$ +class InlineFinishErrorUDAF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + raise TypeError("inline_udaf_error_42") +\$\$; + """ + + test { + sql """ SELECT py_inline_raise_udaf(val) FROM python_raise_error_test; """ + exception "inline_udaf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_module_raise_udaf(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_module_raise_udaf(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${udafPath}", + "symbol" = "udaf_errors.ModuleFinishErrorUDAF", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + test { + sql """ SELECT py_module_raise_udaf(val) FROM python_raise_error_test; """ + exception "module_udaf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_inline_raise_udtf(INT); """ + sql """ + CREATE TABLES FUNCTION py_inline_raise_udtf(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "inline_raise_udtf", + "runtime_version" = "${runtime_version}" + ) AS 
\$\$ +def inline_raise_udtf(x): + if False: + yield x + raise TypeError("inline_udtf_error_42") +\$\$; + """ + + test { + sql """ + SELECT tmp.col + FROM python_raise_error_test + LATERAL VIEW py_inline_raise_udtf(val) tmp AS col; + """ + exception "inline_udtf_error_42" + } + + sql """ DROP FUNCTION IF EXISTS py_module_raise_udtf(INT); """ + sql """ + CREATE TABLES FUNCTION py_module_raise_udtf(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${udtfPath}", + "symbol" = "pyudtf_module.exceptions_udtf.raise_in_module_udtf", + "runtime_version" = "${runtime_version}" + ); + """ + + test { + sql """ + SELECT tmp.col + FROM python_raise_error_test + LATERAL VIEW py_module_raise_udtf(val) tmp AS col; + """ + exception "module_udtf_error_42" + } + } finally { + try_sql("DROP FUNCTION IF EXISTS py_inline_raise_udf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_module_raise_udf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_inline_raise_udaf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_module_raise_udaf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_inline_raise_udtf(INT);") + try_sql("DROP FUNCTION IF EXISTS py_module_raise_udtf(INT);") + try_sql("DROP TABLE IF EXISTS python_raise_error_test;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_deterministic.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_deterministic.groovy new file mode 100644 index 00000000000000..ecfcb3ba3aa5a9 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_deterministic.groovy @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.junit.Assert; + +suite("test_pythonudaf_deterministic") { + def runtime_version = getPythonUdfRuntimeVersion() + + try { + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP TABLE IF EXISTS mtmv_uuid_seed; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS py_uuid_agg_mtmv; """ + sql """ DROP FUNCTION IF EXISTS py_uuid_agg_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_agg_det(INT); """ + sql """ + CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO cte_uuid_seed VALUES (1), (2), (3); """ + sql """ sync; """ + + sql """ SET enable_nereids_planner = true; """ + sql """ SET enable_fallback_to_original_planner = false; """ + + sql """ DROP FUNCTION IF EXISTS py_uuid_agg(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_uuid_agg(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "PyUuidAgg", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import uuid + +class PyUuidAgg: + def __init__(self): + self.last = None + + def accumulate(self, value): + if value is not None: + self.last = value + + def merge(self, other_state): + if other_state is not None: + self.last = other_state + + def finish(self): + return f"{self.last}-{uuid.uuid4()}" + + @property + def aggregate_state(self): + return self.last +\$\$; + """ + def showDefault = sql """ SHOW CREATE FUNCTION py_uuid_agg(INT); """ + assertTrue(showDefault.size() == 1) + 
assertTrue(showDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE AGGREGATE FUNCTION py_uuid_agg_false(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "PyUuidAggFalse", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}", + "deterministic" = "false" + ) + AS \$\$ +import uuid + +class PyUuidAggFalse: + def __init__(self): + self.last = None + + def accumulate(self, value): + if value is not None: + self.last = value + + def merge(self, other_state): + if other_state is not None: + self.last = other_state + + def finish(self): + return f"{self.last}-{uuid.uuid4()}" + + @property + def aggregate_state(self): + return self.last +\$\$; + """ + def showExplicitFalse = sql """ SHOW CREATE FUNCTION py_uuid_agg_false(INT); """ + assertTrue(showExplicitFalse.size() == 1) + assertTrue(showExplicitFalse[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE AGGREGATE FUNCTION py_uuid_agg_det(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "PyUuidAggDet", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}", + "deterministic" = "true" + ) + AS \$\$ +import uuid + +class PyUuidAggDet: + def __init__(self): + self.last = None + + def accumulate(self, value): + if value is not None: + self.last = value + + def merge(self, other_state): + if other_state is not None: + self.last = other_state + + def finish(self): + return f"{self.last}-{uuid.uuid4()}" + + @property + def aggregate_state(self): + return self.last +\$\$; + """ + def showDet = sql """ SHOW CREATE FUNCTION py_uuid_agg_det(INT); """ + assertTrue(showDet.size() == 1) + assertTrue(showDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized """ + WITH cte AS ( + SELECT id, py_uuid_agg(id) AS token + FROM cte_uuid_seed + GROUP BY id + ) + SELECT id, COUNT(DISTINCT token) AS 
distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined """ + WITH cte AS ( + SELECT id, py_uuid_agg(id) AS token + FROM cte_uuid_seed + GROUP BY id + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized_det """ + WITH cte AS ( + SELECT id, py_uuid_agg_det(id) AS token + FROM cte_uuid_seed + GROUP BY id + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined_det """ + WITH cte AS ( + SELECT id, py_uuid_agg_det(id) AS token + FROM cte_uuid_seed + GROUP BY id + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ + CREATE TABLE mtmv_uuid_seed (id INT, v INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO mtmv_uuid_seed VALUES (1, 10), (1, 11), (2, 20), (2, 21), (3, 30), (3, 31); """ + sql """ sync; """ + + try { + sql """ + CREATE MATERIALIZED VIEW py_uuid_agg_mtmv + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, py_uuid_agg(v) AS token + FROM mtmv_uuid_seed + GROUP BY id; + """ + Assert.fail() + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("can not 
contain nonDeterministic expression")) + } + } finally { + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS py_uuid_agg_mtmv; """) + sql """ DROP FUNCTION IF EXISTS py_uuid_agg(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_agg_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_agg_det(INT); """ + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP TABLE IF EXISTS mtmv_uuid_seed; """ + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy index 964413828ab7d3..e0b0ed8c4668e9 100644 --- a/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_drop.groovy @@ -15,10 +15,23 @@ // specific language governing permissions and limitations // under the License. -suite('test_pythonudaf_drop') { +suite('test_pythonudaf_drop', "nonConcurrent") { def runtime_version = getPythonUdfRuntimeVersion() def zipA = """${context.file.parent}/udaf_scripts/python_udaf_drop_a/python_udaf_drop_test.zip""" def zipB = """${context.file.parent}/udaf_scripts/python_udaf_drop_b/python_udaf_drop_test.zip""" + def localDorisHome = System.getenv("DORIS_HOME") + def localUdfRoot = localDorisHome != null ? 
"${localDorisHome}/lib/udf" : "/tmp" + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort) + + def execOnBackend = { be_ip, localCmd, remoteCmd -> + if (be_ip == "127.0.0.1" || be_ip == "localhost") { + cmd(localCmd) + } else { + sshExec("root", be_ip, remoteCmd, false) + } + } scp_udf_file_to_all_be(zipA) scp_udf_file_to_all_be(zipB) @@ -88,9 +101,96 @@ suite('test_pythonudaf_drop') { sql '''SELECT py_drop_sum_a(v) FROM py_udaf_drop_tbl;''' exception 'Can not found function' } + + // Case 3: kill Python servers between two aggregate queries, next CREATE handshake should recover + sql '''DROP FUNCTION IF EXISTS py_drop_sum_reconnect(INT)''' + sql """ + CREATE AGGREGATE FUNCTION py_drop_sum_reconnect(INT) RETURNS BIGINT PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipA}", + "symbol" = "drop_udaf.SumAgg", + "runtime_version" = "${runtime_version}" + ) + """ + + qt_py_udaf_drop_4 '''SELECT py_drop_sum_reconnect(v) FROM py_udaf_drop_tbl;''' + + backendId_to_backendIP.values().each { be_ip -> + execOnBackend( + be_ip, + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true", + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true") + } + + qt_py_udaf_drop_5 '''SELECT py_drop_sum_reconnect(v) FROM py_udaf_drop_tbl;''' + try_sql('DROP FUNCTION IF EXISTS py_drop_sum_reconnect(INT);') + + // Case 4: inline UDAF drop/recreate must not reuse the old Python class. + // The Python server caches UDAF state managers, so this verifies the cache key + // and drop cleanup both use the FE function id, not just name + argument types. 
+ sql '''DROP FUNCTION IF EXISTS py_drop_inline_recreate(INT)''' + sql """ + CREATE AGGREGATE FUNCTION py_drop_inline_recreate(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "InlineDropRecreateUdaf", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +class InlineDropRecreateUdaf: + def __init__(self): + self.total = 0 + @property + def aggregate_state(self): + return self.total + def accumulate(self, val): + if val is not None: + self.total += val + def merge(self, other): + self.total += other + def finish(self): + return self.total * 10 +\$\$ + """ + def inlineOldResult = sql '''SELECT py_drop_inline_recreate(v) FROM py_udaf_drop_tbl;''' + assert inlineOldResult[0][0].toString() == '60' + + sql '''DROP FUNCTION IF EXISTS py_drop_inline_recreate(INT)''' + sql """ + CREATE AGGREGATE FUNCTION py_drop_inline_recreate(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "InlineDropRecreateUdaf", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +class InlineDropRecreateUdaf: + def __init__(self): + self.total = 0 + @property + def aggregate_state(self): + return self.total + def accumulate(self, val): + if val is not None: + self.total += val + def merge(self, other): + self.total += other + def finish(self): + return self.total * 100 +\$\$ + """ + def inlineNewResult = sql '''SELECT py_drop_inline_recreate(v) FROM py_udaf_drop_tbl;''' + assert inlineNewResult[0][0].toString() == '600' + sql '''DROP FUNCTION IF EXISTS py_drop_inline_recreate(INT)''' } finally { try_sql('DROP FUNCTION IF EXISTS py_drop_sum_once(INT);') try_sql('DROP FUNCTION IF EXISTS py_drop_sum_a(INT);') try_sql('DROP FUNCTION IF EXISTS py_drop_sum_b(INT);') + try_sql('DROP FUNCTION IF EXISTS py_drop_sum_reconnect(INT);') + try_sql('DROP FUNCTION IF EXISTS py_drop_inline_recreate(INT);') } } diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/pyudaf.zip 
b/regression-test/suites/pythonudaf_p0/udaf_scripts/pyudaf.zip index 1dc76099d43326..835aa5af0ad6dc 100644 Binary files a/regression-test/suites/pythonudaf_p0/udaf_scripts/pyudaf.zip and b/regression-test/suites/pythonudaf_p0/udaf_scripts/pyudaf.zip differ diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/udaf_errors.py b/regression-test/suites/pythonudaf_p0/udaf_scripts/udaf_errors.py new file mode 100644 index 00000000000000..e7ea9ed3c3db60 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/udaf_scripts/udaf_errors.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Module-based UDAF error cases for regression tests.""" + + +class ModuleFinishErrorUDAF: + """Raise a stable error from finish() to verify propagation.""" + + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def reset(self): + self.count = 0 + + def finish(self): + raise TypeError("module_udaf_error_42") diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_deterministic.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_deterministic.groovy new file mode 100644 index 00000000000000..69e15a924d1740 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_deterministic.groovy @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.junit.Assert; + +suite("test_pythonudf_deterministic") { + def runtime_version = getPythonUdfRuntimeVersion() + + try { + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP TABLE IF EXISTS mtmv_uuid_seed; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS py_uuid_token_mtmv; """ + sql """ DROP FUNCTION IF EXISTS py_uuid_token_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_token_det(INT); """ + sql """ + CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO cte_uuid_seed VALUES (1), (2), (3); """ + sql """ sync; """ + + sql """ SET enable_nereids_planner = true; """ + sql """ SET enable_fallback_to_original_planner = false; """ + + sql """ DROP FUNCTION IF EXISTS py_uuid_token(INT); """ + sql """ + CREATE FUNCTION py_uuid_token(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_token_impl", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import uuid + +def py_uuid_token_impl(x): + return f"{x}-{uuid.uuid4()}" +\$\$; + """ + def showDefault = sql """ SHOW CREATE FUNCTION py_uuid_token(INT); """ + assertTrue(showDefault.size() == 1) + assertTrue(showDefault[0][1].contains("DETERMINISTIC")) + assertTrue(showDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE FUNCTION py_uuid_token_false(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_token_impl", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}", + "deterministic" = "false" + ) + AS \$\$ +import uuid + +def py_uuid_token_impl(x): + return f"{x}-{uuid.uuid4()}" +\$\$; + """ + def showExplicitFalse = sql """ SHOW CREATE FUNCTION py_uuid_token_false(INT); """ + assertTrue(showExplicitFalse.size() == 1) + assertTrue(showExplicitFalse[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE FUNCTION 
py_uuid_token_det(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_token_det_impl", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}", + "deterministic" = "true" + ) + AS \$\$ +import uuid + +def py_uuid_token_det_impl(x): + return f"{x}-{uuid.uuid4()}" +\$\$; + """ + def showDet = sql """ SHOW CREATE FUNCTION py_uuid_token_det(INT); """ + assertTrue(showDet.size() == 1) + assertTrue(showDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized """ + WITH cte AS ( + SELECT id, py_uuid_token(id) AS token + FROM cte_uuid_seed + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined """ + WITH cte AS ( + SELECT id, py_uuid_token(id) AS token + FROM cte_uuid_seed + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized_det """ + WITH cte AS ( + SELECT id, py_uuid_token_det(id) AS token + FROM cte_uuid_seed + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined_det """ + WITH cte AS ( + SELECT id, py_uuid_token_det(id) AS token + FROM cte_uuid_seed + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + 
GROUP BY id + ORDER BY id; + """ + + sql """ + CREATE TABLE mtmv_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO mtmv_uuid_seed VALUES (1), (2), (3); """ + sql """ sync; """ + + try { + sql """ + CREATE MATERIALIZED VIEW py_uuid_token_mtmv + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, py_uuid_token(id) AS token + FROM mtmv_uuid_seed; + """ + Assert.fail() + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("can not contain nonDeterministic expression")) + } + } finally { + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS py_uuid_token_mtmv; """) + sql """ DROP FUNCTION IF EXISTS py_uuid_token(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_token_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_token_det(INT); """ + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP TABLE IF EXISTS mtmv_uuid_seed; """ + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy index ab103c21f25111..b2ab740f6ea0ec 100644 --- a/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_drop.groovy @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-suite("test_pythonudf_drop") { +suite("test_pythonudf_drop", "nonConcurrent") { def runtime_version = getPythonUdfRuntimeVersion() def zipA = """${context.file.parent}/udf_scripts/python_udf_drop_a/python_udf_drop_test.zip""" def zipB = """${context.file.parent}/udf_scripts/python_udf_drop_b/python_udf_drop_test.zip""" @@ -88,9 +88,138 @@ suite("test_pythonudf_drop") { sql """SELECT py_drop_a(1);""" exception "Can not found function" } + + // Case 3: kill Python servers between two queries, next client handshake should recover + sql """DROP FUNCTION IF EXISTS py_drop_reconnect(INT)""" + sql """ + CREATE FUNCTION py_drop_reconnect(INT) RETURNS INT PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipA}", + "symbol" = "drop_udf.evaluate", + "runtime_version" = "${runtime_version}" + ) + """ + + qt_py_udf_drop_4 """SELECT py_drop_reconnect(31);""" + + backendId_to_backendIP.values().each { be_ip -> + execOnBackend( + be_ip, + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true", + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true") + } + + qt_py_udf_drop_5 """SELECT py_drop_reconnect(32);""" + try_sql("DROP FUNCTION IF EXISTS py_drop_reconnect(INT);") + + // Case 4: recreating the same signature must use the new inline function body. 
+ sql """DROP FUNCTION IF EXISTS py_drop_recreate(INT)""" + sql """ + CREATE FUNCTION py_drop_recreate(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 1 +\$\$ + """ + def recreateOldResult = sql """SELECT py_drop_recreate(10);""" + assert recreateOldResult[0][0] == 11 + + sql """DROP FUNCTION IF EXISTS py_drop_recreate(INT)""" + sql """ + CREATE FUNCTION py_drop_recreate(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 999 +\$\$ + """ + def recreateNewResult = sql """SELECT py_drop_recreate(10);""" + assert recreateNewResult[0][0] == 1009 + sql """DROP FUNCTION IF EXISTS py_drop_recreate(INT)""" + + // Case 5: dropping a database must also clear Nereids UDF registry. + // SHOW FUNCTIONS reads catalog metadata, while SELECT resolves from FunctionRegistry. + // Without registry cleanup, SELECT could still bind the stale x + 1 inline UDF + // after the database had been dropped and recreated. 
+ def originalDb = sql("SELECT DATABASE()")[0][0] + def registryDb = "${originalDb}_registry_cleanup" + try { + sql """DROP DATABASE IF EXISTS ${registryDb} FORCE""" + sql """CREATE DATABASE ${registryDb}""" + sql """USE ${registryDb}""" + sql """ + CREATE FUNCTION py_drop_db_registry(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 1 +\$\$ + """ + def oldResult = sql """SELECT py_drop_db_registry(10);""" + assert oldResult[0][0] == 11 + + sql """DROP DATABASE ${registryDb} FORCE""" + sql """CREATE DATABASE ${registryDb}""" + sql """USE ${registryDb}""" + def functions = sql """SHOW FUNCTIONS LIKE 'py_drop_db_registry'""" + assert functions.isEmpty() + test { + sql """SELECT py_drop_db_registry(10);""" + exception "Can not found function" + } + + sql """ + CREATE FUNCTION py_drop_db_registry(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 999 +\$\$ + """ + def rebuiltResult = sql """SELECT py_drop_db_registry(10);""" + assert rebuiltResult[0][0] == 1009 + } finally { + sql """USE ${originalDb}""" + try_sql("DROP DATABASE IF EXISTS ${registryDb} FORCE") + } } finally { try_sql("DROP FUNCTION IF EXISTS py_drop_once(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_a(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_b(INT);") + try_sql("DROP FUNCTION IF EXISTS py_drop_reconnect(INT);") + try_sql("DROP FUNCTION IF EXISTS py_drop_recreate(INT);") } } diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy index 151de115abafa1..be11b64d339a41 100644 --- 
a/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy @@ -55,19 +55,20 @@ suite("test_pythonudf_file_protocol") { qt_select_file_string """ SELECT py_file_string_mask('1234567890', 3, 3) AS result; """ // Test 3: Load float_test.py from zip package using file:// protocol - sql """ DROP FUNCTION IF EXISTS py_file_float_process(FLOAT); """ + sql """ DROP FUNCTION IF EXISTS py_file_float_process(FLOAT, FLOAT); """ sql """ - CREATE FUNCTION py_file_float_process(FLOAT) + CREATE FUNCTION py_file_float_process(FLOAT, FLOAT) RETURNS FLOAT PROPERTIES ( "type" = "PYTHON_UDF", "file" = "file://${zipPath}", "symbol" = "float_test.evaluate", - "runtime_version" = "${runtime_version}" + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" ); """ - qt_select_file_float """ SELECT py_file_float_process(3.14) AS result; """ + qt_select_file_float """ SELECT py_file_float_process(3.14, null) AS result; """ // Test 4: Load boolean_test.py from zip package using file:// protocol sql """ DROP FUNCTION IF EXISTS py_file_bool_not(BOOLEAN); """ @@ -120,7 +121,7 @@ suite("test_pythonudf_file_protocol") { } finally { try_sql("DROP FUNCTION IF EXISTS py_file_int_add(INT);") try_sql("DROP FUNCTION IF EXISTS py_file_string_mask(STRING, INT, INT);") - try_sql("DROP FUNCTION IF EXISTS py_file_float_process(FLOAT);") + try_sql("DROP FUNCTION IF EXISTS py_file_float_process(FLOAT, FLOAT);") try_sql("DROP FUNCTION IF EXISTS py_file_bool_not(BOOLEAN);") try_sql("DROP TABLE IF EXISTS file_protocol_test_table;") } diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy index 7a26136ed2d41c..cab7d580c61c22 100644 --- a/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy @@ -59,12 +59,24 @@ suite("test_pythonudf_float") { 
qt_select """ SELECT python_udf_float_test(cast(2.83645 as float),cast(111.1111111 as float)) as result; """ qt_select """ SELECT python_udf_float_test(2.83645,111.1111111) as result ; """ qt_select """ SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; """ - createMV("create materialized view udf_mv as SELECT user_id as a1,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id;") + sql """ DROP MATERIALIZED VIEW IF EXISTS udf_mv; """ + sql """ + CREATE MATERIALIZED VIEW udf_mv + BUILD DEFERRED REFRESH AUTO ON MANUAL + DISTRIBUTED BY RANDOM BUCKETS 2 + PROPERTIES ( + 'replication_num' = '1', + 'version_info'='3', + 'enable_nondeterministic_function' = 'true' + ) + AS + SELECT user_id as a1, python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float; + """ qt_select """ SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; """ explain { sql("SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; ") - contains "(udf_mv)" + notContains "(udf_mv)" } sql """ CREATE FUNCTION python_udf_double_test(DOUBLE,DOUBLE) RETURNS DOUBLE PROPERTIES ( @@ -80,6 +92,7 @@ suite("test_pythonudf_float") { qt_select """ SELECT user_id,python_udf_double_test(double_1, double_1) as sum FROM test_pythonudf_float order by user_id; """ } finally { + try_sql("DROP MATERIALIZED VIEW IF EXISTS udf_mv;") try_sql("DROP FUNCTION IF EXISTS python_udf_double_test(DOUBLE,DOUBLE);") try_sql("DROP FUNCTION IF EXISTS python_udf_float_test(FLOAT,FLOAT);") try_sql("DROP TABLE IF EXISTS test_pythonudf_float") diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_no_input.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_no_input.groovy new file mode 100644 index 00000000000000..c81ed89e0628bd --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_no_input.groovy @@ -0,0 
+1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_no_input") { + def runtime_version = getPythonUdfRuntimeVersion() + def table_name = "test_pythonudf_no_input_tbl" + + try { + sql """ DROP FUNCTION IF EXISTS py_const_no_input(); """ + sql """ DROP TABLE IF EXISTS ${table_name}; """ + + sql """ + CREATE FUNCTION py_const_no_input() + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(): + return 7 +\$\$; + """ + + assert sql(""" SELECT py_const_no_input(); """)[0][0] == 7 + + sql """ + CREATE TABLE ${table_name} ( + id INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ INSERT INTO ${table_name} VALUES (1), (2), (3); """ + + def rows = sql(""" + SELECT id, py_const_no_input() AS v + FROM ${table_name} + ORDER BY id + """) + + assert rows.size() == 3 : "Expected 3 rows, got ${rows.size()}" + assert rows.collect { it[0] as int } == [1, 2, 3] + assert rows.every { (it[1] as int) == 7 } + } finally { + try_sql(""" DROP FUNCTION IF EXISTS py_const_no_input(); """) + try_sql(""" DROP TABLE IF EXISTS ${table_name}; 
""") + } +} diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py index ef3020985d4a4d..2e751c80ffa512 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py @@ -17,6 +17,8 @@ def evaluate(res): + if res is None: + return None value = 0 for data in res: if data is not None: diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py index 7781d788f0794c..45292ab5499c0d 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py @@ -17,6 +17,8 @@ def evaluate(res): + if res is None: + return None value = 0 for data in res: if data is not None: diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py index 92864bc800cb1f..4991fb395957ff 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py @@ -17,6 +17,8 @@ def evaluate(res): + if res is None: + return None value = "" for data in res: if data is not None: diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py index ede02c1201e713..4539a27cb6aba1 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py @@ -17,6 +17,8 @@ def evaluate(res): + if res is None: + return None value = "" for data in res: if data is not None: diff --git 
a/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py index 3b2d726ff406f7..1ce8ca82010bb7 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py @@ -17,4 +17,6 @@ def evaluate(arg1, arg2): + if arg1 is None or arg2 is None: + return None return arg1 - arg2 \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py index b96f6b0d4029ef..23df4b06ce616a 100644 --- a/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py +++ b/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py @@ -17,4 +17,6 @@ def evaluate(arg): + if arg is None: + return None return int(arg + 1) \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip b/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip index b4ed70a402bc02..7a33bc3e20d22a 100644 Binary files a/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip and b/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip differ diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/udf_errors.py b/regression-test/suites/pythonudf_p0/udf_scripts/udf_errors.py new file mode 100644 index 00000000000000..336167e37b1373 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/udf_errors.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module-based UDF error cases for regression tests.""" + + +def raise_in_module(value): + raise TypeError("module_udf_error_42") diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_deterministic.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_deterministic.groovy new file mode 100644 index 00000000000000..d1dbdb63f7e437 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_deterministic.groovy @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_pythonudtf_deterministic") { + def runtime_version = getPythonUdfRuntimeVersion() + + try { + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + sql """ DROP FUNCTION IF EXISTS py_uuid_expand_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_expand_det(INT); """ + sql """ + CREATE TABLE cte_uuid_seed (id INT) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO cte_uuid_seed VALUES (1), (2), (3); """ + sql """ sync; """ + + sql """ SET enable_nereids_planner = true; """ + sql """ SET enable_fallback_to_original_planner = false; """ + + sql """ DROP FUNCTION IF EXISTS py_uuid_expand(INT); """ + sql """ + CREATE TABLES FUNCTION py_uuid_expand(INT) + RETURNS ARRAY<STRING> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_expand_impl", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import uuid + +def py_uuid_expand_impl(x): + if x is not None: + yield (f"{x}-{uuid.uuid4()}",) +\$\$; + """ + def showDefault = sql """ SHOW CREATE FUNCTION py_uuid_expand(INT); """ + assertTrue(showDefault.size() == 1) + assertTrue(showDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE TABLES FUNCTION py_uuid_expand_false(INT) + RETURNS ARRAY<STRING> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_expand_false_impl", + "runtime_version" = "${runtime_version}", + "deterministic" = "false" + ) + AS \$\$ +import uuid + +def py_uuid_expand_false_impl(x): + if x is not None: + yield (f"{x}-{uuid.uuid4()}",) +\$\$; + """ + def showExplicitFalse = sql """ SHOW CREATE FUNCTION py_uuid_expand_false(INT); """ + assertTrue(showExplicitFalse.size() == 1) + assertTrue(showExplicitFalse[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE TABLES FUNCTION py_uuid_expand_det(INT) + RETURNS ARRAY<STRING> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_uuid_expand_det_impl", + "runtime_version" = "${runtime_version}", + "deterministic" = "true" 
+ ) + AS \$\$ +import uuid + +def py_uuid_expand_det_impl(x): + if x is not None: + yield (f"{x}-{uuid.uuid4()}",) +\$\$; + """ + def showDet = sql """ SHOW CREATE FUNCTION py_uuid_expand_det(INT); """ + assertTrue(showDet.size() == 1) + assertTrue(showDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized """ + WITH cte AS ( + SELECT id, token + FROM cte_uuid_seed + LATERAL VIEW py_uuid_expand(id) tmp AS token + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined """ + WITH cte AS ( + SELECT id, token + FROM cte_uuid_seed + LATERAL VIEW py_uuid_expand(id) tmp AS token + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 1; """ + qt_materialized_det """ + WITH cte AS ( + SELECT id, token + FROM cte_uuid_seed + LATERAL VIEW py_uuid_expand_det(id) tmp AS token + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + + sql """ SET enable_cte_materialize = true; """ + sql """ SET inline_cte_referenced_threshold = 10; """ + qt_inlined_det """ + WITH cte AS ( + SELECT id, token + FROM cte_uuid_seed + LATERAL VIEW py_uuid_expand_det(id) tmp AS token + ) + SELECT id, COUNT(DISTINCT token) AS distinct_tokens + FROM ( + SELECT id, token FROM cte + UNION ALL + SELECT id, token FROM cte + ) u + GROUP BY id + ORDER BY id; + """ + } finally { + sql """ DROP FUNCTION IF EXISTS 
py_uuid_expand(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_expand_false(INT); """ + sql """ DROP FUNCTION IF EXISTS py_uuid_expand_det(INT); """ + sql """ DROP TABLE IF EXISTS cte_uuid_seed; """ + } +} diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_drop.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_drop.groovy index 1f454243fb051c..04abde6c146b50 100644 --- a/regression-test/suites/pythonudtf_p0/test_pythonudtf_drop.groovy +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_drop.groovy @@ -15,10 +15,23 @@ // specific language governing permissions and limitations // under the License. -suite("test_pythonudtf_drop") { +suite("test_pythonudtf_drop", "nonConcurrent") { def runtime_version = getPythonUdfRuntimeVersion() def zipA = """${context.file.parent}/udtf_scripts/python_udtf_drop_a/python_udtf_drop_test.zip""" def zipB = """${context.file.parent}/udtf_scripts/python_udtf_drop_b/python_udtf_drop_test.zip""" + def localDorisHome = System.getenv("DORIS_HOME") + def localUdfRoot = localDorisHome != null ? 
"${localDorisHome}/lib/udf" : "/tmp" + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort) + + def execOnBackend = { be_ip, localCmd, remoteCmd -> + if (be_ip == "127.0.0.1" || be_ip == "localhost") { + cmd(localCmd) + } else { + sshExec("root", be_ip, remoteCmd, false) + } + } scp_udf_file_to_all_be(zipA) scp_udf_file_to_all_be(zipB) @@ -122,9 +135,45 @@ suite("test_pythonudtf_drop") { """ exception "Can not found function" } + + // Case 4: kill Python servers between two table-function queries, next handshake should recover + sql """DROP FUNCTION IF EXISTS py_drop_t_reconnect(INT)""" + sql """ + CREATE TABLES FUNCTION py_drop_t_reconnect(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipA}", + "symbol" = "drop_udtf.process", + "runtime_version" = "${runtime_version}" + ) + """ + + qt_py_udtf_drop_4 """ + SELECT c + FROM py_udtf_drop_tbl + LATERAL VIEW py_drop_t_reconnect(v) tmp AS c + ORDER BY c; + """ + + backendId_to_backendIP.values().each { be_ip -> + execOnBackend( + be_ip, + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true", + "pkill -f 'python_server.py grpc+unix:///tmp/doris_python_udf' || true") + } + + qt_py_udtf_drop_5 """ + SELECT c + FROM py_udtf_drop_tbl + LATERAL VIEW py_drop_t_reconnect(v) tmp AS c + ORDER BY c; + """ + try_sql("DROP FUNCTION IF EXISTS py_drop_t_reconnect(INT);") } finally { try_sql("DROP FUNCTION IF EXISTS py_drop_t_once(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_t_a(INT);") try_sql("DROP FUNCTION IF EXISTS py_drop_t_b(INT);") + try_sql("DROP FUNCTION IF EXISTS py_drop_t_reconnect(INT);") } } diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_no_input.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_no_input.groovy new file mode 100644 index 00000000000000..0136aabca5e9ba --- /dev/null +++ 
b/regression-test/suites/pythonudtf_p0/test_pythonudtf_no_input.groovy @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_no_input") { + def runtime_version = getPythonUdfRuntimeVersion() + def table_name = "test_pythonudtf_no_input_tbl" + + try { + sql """ DROP FUNCTION IF EXISTS py_emit_no_input(); """ + sql """ DROP TABLE IF EXISTS ${table_name}; """ + + sql """ + CREATE TABLES FUNCTION py_emit_no_input() + RETURNS ARRAY<STRING> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "emit_values", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def emit_values(): + yield ('left',) + yield ('right',) +\$\$; + """ + + sql """ + CREATE TABLE ${table_name} ( + id INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ INSERT INTO ${table_name} VALUES (1), (2); """ + + def rows = sql(""" + SELECT id, value + FROM ${table_name} + LATERAL VIEW py_emit_no_input() tmp AS value + ORDER BY id, value + """) + + assert rows.size() == 4 : "Expected 4 rows, got ${rows.size()}" + assert rows.collect { [(it[0] as int), it[1].toString()] } == [ + [1, "left"], + [1, "right"], + [2, "left"], + [2, "right"] + ] + } 
finally { + try_sql(""" DROP FUNCTION IF EXISTS py_emit_no_input(); """) + try_sql(""" DROP TABLE IF EXISTS ${table_name}; """) + } +} diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf.zip b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf.zip index f04942c97849c9..7e4bbe0820546a 100644 Binary files a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf.zip and b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf.zip differ diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py index b663c7aa878dc7..837426742406c7 100644 --- a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py +++ b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py @@ -211,3 +211,10 @@ def validate_date(dt): status = 'normal' yield (dt, year, is_leap, status) + + +def raise_in_module_udtf(value): + """Raise a stable error to verify UDTF exception propagation.""" + if False: + yield value + raise TypeError("module_udtf_error_42") diff --git a/regression-test/suites/query_p0/javaudf/test_javaudf_deterministic.groovy b/regression-test/suites/query_p0/javaudf/test_javaudf_deterministic.groovy new file mode 100644 index 00000000000000..54364f88ed72cb --- /dev/null +++ b/regression-test/suites/query_p0/javaudf/test_javaudf_deterministic.groovy @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.junit.Assert + +suite("test_javaudf_deterministic") { + def tableName = "test_javaudf_deterministic_seed" + def udfMvName = "java_udf_deterministic_false_mtmv" + def udafMvName = "java_udaf_deterministic_false_mtmv" + def udfDetMvName = "java_udf_deterministic_true_mtmv" + def udafDetMvName = "java_udaf_deterministic_true_mtmv" + def jarPath = """${context.file.parent}/../../javaudf_p0/jars/java-udf-case-jar-with-dependencies.jar""" + scp_udf_file_to_all_be(jarPath) + + try { + sql """ SET enable_nereids_planner = true; """ + sql """ SET enable_fallback_to_original_planner = false; """ + + sql """ DROP TABLE IF EXISTS ${tableName}; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS ${udfMvName}; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS ${udafMvName}; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS ${udfDetMvName}; """ + sql """ DROP MATERIALIZED VIEW IF EXISTS ${udafDetMvName}; """ + sql """ DROP FUNCTION IF EXISTS java_udf_deterministic_test(INT); """ + sql """ DROP FUNCTION IF EXISTS java_udaf_deterministic_test(INT); """ + sql """ DROP FUNCTION IF EXISTS java_udf_deterministic_true_test(INT); """ + sql """ DROP FUNCTION IF EXISTS java_udaf_deterministic_true_test(INT); """ + + sql """ + CREATE TABLE ${tableName} ( + id INT, + v INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + sql """ INSERT INTO ${tableName} VALUES (1, 10), (2, 20), (3, 30); """ + sql """ sync; """ + + File jarFile = new File(jarPath) + if (!jarFile.exists()) { + throw new 
IllegalStateException("""${jarPath} doesn't exist! """) + } + + sql """ + CREATE FUNCTION java_udf_deterministic_test(INT) RETURNS INT PROPERTIES ( + "file"="file://${jarPath}", + "symbol"="org.apache.doris.udf.IntTest", + "type"="JAVA_UDF" + ); + """ + def udfShowDefault = sql """ SHOW CREATE FUNCTION java_udf_deterministic_test(INT); """ + assertTrue(udfShowDefault.size() == 1) + assertTrue(udfShowDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE AGGREGATE FUNCTION java_udaf_deterministic_test(INT) RETURNS BIGINT PROPERTIES ( + "file"="file://${jarPath}", + "symbol"="org.apache.doris.udf.MySumInt", + "always_nullable"="false", + "type"="JAVA_UDF" + ); + """ + def udafShowDefault = sql """ SHOW CREATE FUNCTION java_udaf_deterministic_test(INT); """ + assertTrue(udafShowDefault.size() == 1) + assertTrue(udafShowDefault[0][1].contains("\"DETERMINISTIC\"=\"false\"")) + + sql """ + CREATE FUNCTION java_udf_deterministic_true_test(INT) RETURNS INT PROPERTIES ( + "file"="file://${jarPath}", + "symbol"="org.apache.doris.udf.IntTest", + "type"="JAVA_UDF", + "deterministic"="true" + ); + """ + def udfShowDet = sql """ SHOW CREATE FUNCTION java_udf_deterministic_true_test(INT); """ + assertTrue(udfShowDet.size() == 1) + assertTrue(udfShowDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + sql """ + CREATE AGGREGATE FUNCTION java_udaf_deterministic_true_test(INT) RETURNS BIGINT PROPERTIES ( + "file"="file://${jarPath}", + "symbol"="org.apache.doris.udf.MySumInt", + "always_nullable"="false", + "type"="JAVA_UDF", + "deterministic"="true" + ); + """ + def udafShowDet = sql """ SHOW CREATE FUNCTION java_udaf_deterministic_true_test(INT); """ + assertTrue(udafShowDet.size() == 1) + assertTrue(udafShowDet[0][1].contains("\"DETERMINISTIC\"=\"true\"")) + + try { + sql """ + CREATE MATERIALIZED VIEW ${udfMvName} + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, 
java_udf_deterministic_test(v) AS result + FROM ${tableName}; + """ + Assert.fail() + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("can not contain nonDeterministic expression")) + } + + try { + sql """ + CREATE MATERIALIZED VIEW ${udafMvName} + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, java_udaf_deterministic_test(v) AS result + FROM ${tableName} + GROUP BY id; + """ + Assert.fail() + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("can not contain nonDeterministic expression")) + } + + sql """ + CREATE MATERIALIZED VIEW ${udfDetMvName} + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, java_udf_deterministic_true_test(v) AS result + FROM ${tableName}; + """ + def udfDetShow = sql """ SHOW CREATE MATERIALIZED VIEW ${udfDetMvName}; """ + assertTrue(udfDetShow.size() == 1) + + sql """ + CREATE MATERIALIZED VIEW ${udafDetMvName} + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + AS + SELECT id, java_udaf_deterministic_true_test(v) AS result + FROM ${tableName} + GROUP BY id; + """ + def udafDetShow = sql """ SHOW CREATE MATERIALIZED VIEW ${udafDetMvName}; """ + assertTrue(udafDetShow.size() == 1) + } finally { + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS ${udfMvName}; """) + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS ${udafMvName}; """) + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS ${udfDetMvName}; """) + try_sql(""" DROP MATERIALIZED VIEW IF EXISTS ${udafDetMvName}; """) + try_sql(""" DROP FUNCTION IF EXISTS java_udf_deterministic_test(INT); """) + try_sql(""" DROP FUNCTION IF EXISTS java_udaf_deterministic_test(INT); """) + try_sql(""" DROP FUNCTION IF EXISTS java_udf_deterministic_true_test(INT); """) + try_sql(""" DROP 
FUNCTION IF EXISTS java_udaf_deterministic_true_test(INT); """) + try_sql(""" DROP TABLE IF EXISTS ${tableName}; """) + } +}