From 2fe590b2e660cb68e16210557baff059f0dc0ba3 Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Sat, 2 May 2026 00:06:34 +0200 Subject: [PATCH 1/6] Named Scalars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add server-side named, refreshable scalar values backed by either a local on-disk cache or a shared Keeper-backed cache, accessed via getNamedScalar / getNamedScalarOrDefault and surfaced in system.named_scalars. Surface: - DDL: CREATE [OR REPLACE] [LOCAL|SHARED] NAMED SCALAR [IF NOT EXISTS] [ON CLUSTER ...] [DEFINER = ...] [SQL SECURITY DEFINER] [REFRESH EVERY ] AS ; +``` + +The `AS` clause must be a `SELECT`. The query must produce exactly one row, +one column at evaluation time — empty or multi-row results are errors. + +`` is one of `SECOND`, `MINUTE`, `HOUR`, `DAY` (with optional plural). +A failed refresh retries on the next scheduled tick — there is no separate +backoff. The previously-good value continues to be served while the refresh +is broken (`current_value_is_valid` flips to 0; `consecutive_failures` bumps). + +Named scalars always use `SQL SECURITY DEFINER`: the source query is evaluated +with the privileges of the definer user, both during creation and during +refresh. If `DEFINER` is omitted, `CURRENT_USER` is used. `SQL SECURITY INVOKER` +and `SQL SECURITY NONE` are not supported for named scalars. + +## Cache Kinds + +| Modifier | Value cache | Definition store | +|---|---|---| +| (none) | Uses `` (`local` by default) | selected at server start | +| `LOCAL` | per-server disk cache under `` (default `/named_scalars_cache/`) | selected at server start | +| `SHARED` | cluster-wide Keeper cache, leader-elected refresh | Keeper-backed definitions only | + +`LOCAL` / `SHARED` is a value-cache property, not a separate namespace. The +server has one active definition store: disk via `` +or Keeper via ``. `SHARED` cache +requires the Keeper definition store. 
+`SHARED` cannot be combined with `ON CLUSTER` — Keeper already distributes the +scalar across the cluster. + +## OR REPLACE + +`CREATE OR REPLACE` atomically swaps the definition and value: in-flight +queries see either the previous or the new scalar, never an intermediate +state. The new SELECT result is what gets published; nothing carries over +from the previous definition (including its declared type). + +## Examples + +A per-node computed value, refreshed every hour: + +```sql +CREATE NAMED SCALAR fx_rate + REFRESH EVERY 1 HOUR + AS SELECT rate FROM rates WHERE pair = 'EUR/USD'; + +SELECT amount * getNamedScalar('fx_rate') FROM orders; +``` + +A cluster-wide watermark shared via Keeper: + +```sql +CREATE SHARED NAMED SCALAR max_event_time + REFRESH EVERY 1 MINUTE + AS SELECT max(event_time) FROM events; + +SELECT count() FROM events WHERE event_time > getNamedScalar('max_event_time'); +``` + +Fan-out to every node in a cluster (local scope): + +```sql +CREATE NAMED SCALAR node_name ON CLUSTER my_cluster AS SELECT hostName(); +-- each node evaluates hostName() on itself; values differ across nodes. +``` + +Default fallback when the scalar isn't defined: + +```sql +SELECT getNamedScalarOrDefault('flap', 0); +``` + +## When to use + +A named scalar is the right tool when the value (a) needs background +evaluation with no query to attach to, or (b) needs to swap atomically +across a cluster, or (c) is computed under privileges the reader +doesn't have. The patterns below all lean on at least one of those. + +**ETL high-water mark across replicas (Shared).** A SHARED scalar +holds the last successfully-loaded id; every replica reads it, +exactly one replica refreshes it after a batch lands. No app-side +cursor that drifts between nodes. 
+ +```sql +CREATE SHARED NAMED SCALAR last_loaded_id + REFRESH EVERY 30 SECOND + AS SELECT max(event_id) FROM staging.events_loaded; + +INSERT INTO main.events + SELECT * FROM staging.events + WHERE event_id > getNamedScalar('last_loaded_id'); +``` + +**Cached top-N (or any small set) as an inline filter.** Refresh +hourly; reads are constant-time. Without scalars this is either a +correlated subquery scanning the fact table on every dashboard hit +or a one-row table with `ARRAY JOIN`. + +```sql +CREATE NAMED SCALAR top_1k_users + REFRESH EVERY 1 HOUR + AS SELECT groupArray(user_id) FROM ( + SELECT user_id FROM events + GROUP BY user_id ORDER BY count() DESC LIMIT 1000); + +SELECT * FROM events WHERE user_id IN getNamedScalar('top_1k_users'); +``` + +**Slow-changing reference values.** FX rates, holiday calendars, the +current fiscal period, p99 baselines. The atomic single-value swap +sidesteps the "what does an in-flight query see while the table is +being rewritten" question that comes with a tiny reference table + +`OPTIMIZE FINAL`. + +**Anomaly / alert thresholds derived from rolling history.** The +threshold is a query over `system.query_log` (or wherever); the alert +predicate uses it as a literal. The whole feedback loop lives inside +the database. + +```sql +CREATE NAMED SCALAR p99_24h + REFRESH EVERY 1 HOUR + AS SELECT quantile(0.99)(query_duration_ms) + FROM system.query_log WHERE event_time > now() - INTERVAL 1 DAY; + +SELECT * FROM live_queries WHERE query_duration_ms > getNamedScalar('p99_24h'); +``` + +**Salt / key rotation for PII hashing.** Materialized views and +visible columns reference `getNamedScalar('salt_v')`; rotating the +salt is one `CREATE OR REPLACE`. No DDL churn across dependent +objects, no coordinated cutover. + +**Cheap broadcast of expensive estimates.** A `uniqHLL12` / +approximate-quantile / Bloom summary that takes minutes to compute and +microseconds to read. Stored once as a scalar, used to short-circuit +planning logic in views.
Without it: re-computed per query, or +maintained as a one-row `ReplacingMergeTree` with all the staleness +and atomicity caveats that brings. + +**Last-known-good model coefficients for inline scoring.** A nightly +training job writes the coefficients via `CREATE OR REPLACE`. The +atomic swap means no half-updated coefficient set is ever readable; +queries see either the old set or the new set. + +```sql +CREATE OR REPLACE NAMED SCALAR scoring_intercept AS SELECT 0.42; +CREATE OR REPLACE NAMED SCALAR scoring_alpha AS SELECT 1.7; + +SELECT + getNamedScalar('scoring_alpha') * x + getNamedScalar('scoring_intercept') + AS score +FROM features; +``` + +**Feature flags / experiment cohorts.** `cohort_pct` as a SHARED +scalar; `multiIf(user_id % 100 < getNamedScalar('cohort_pct'), +'A', 'B')`. Toggle without redeploying DDL; atomic across the +cluster. + +**Privilege-bridge for sensitive aggregates.** Admin defines +`total_revenue AS SELECT sum(amount) FROM orders`; analysts have +`getNamedScalar` but not `SELECT` on `orders`. The DEFINER-style +refresh removes the read-only summary table + grant ladder +bookkeeping that would otherwise be needed. + +```sql +CREATE NAMED SCALAR total_revenue + DEFINER = analytics_admin SQL SECURITY DEFINER + REFRESH EVERY 5 MINUTE + AS SELECT sum(amount) FROM orders; +``` + +A small reference table is usually a better fit for *many-row* +data. A named scalar is for the single-value case where (a)/(b)/(c) +above outweigh the natural relational shape. + +## Persistence + +`CREATE` validates the source query by running it once, then persists the +definition. The first cached value is populated asynchronously by the +background refresh machinery; reads can throw `NAMED_SCALAR_HAS_NO_VALUE` +until that first populate succeeds. + +Definitions are stored in exactly one active store per server. If +`` is configured, definitions are +cluster-wide in Keeper. Otherwise definitions are local files under +`` (default `/named_scalars/`). 
+ +Local cached values are stored on this server under +`` (default `/named_scalars_cache/`). +Shared cached values are stored cluster-wide under the same Keeper root as +the definitions. Refresh is leader-elected: +exactly one replica per initial populate or scheduled tick runs the +SELECT and publishes the result, and every other replica picks up the +published value without re-evaluating. `CREATE`, `DROP`, and `OR REPLACE` of a +Keeper-backed scalar are atomic from every replica's perspective. + +## Server configuration + +```xml + + + /var/lib/clickhouse/named_scalars/ + + /clickhouse/named_scalars + + + /var/lib/clickhouse/named_scalars_cache/ + + + local + + + 1048576 + + + 16 + +``` + +## Access control + +- `CREATE_NAMED_SCALAR` — required for `CREATE [OR REPLACE] NAMED SCALAR`. +- `DROP_NAMED_SCALAR` — required for `DROP NAMED SCALAR`. +- `SET DEFINER` on the target user — required to create a named scalar with + `DEFINER = ` when `` is not the current user. +- `getNamedScalar` (function-execute grant) — required for `getNamedScalar` and + `getNamedScalarOrDefault`. +- `SHOW_NAMED_SCALARS` — required for unrestricted `system.named_scalars` + reads. +- `SYSTEM REFRESH NAMED SCALAR` — required for `SYSTEM REFRESH NAMED SCALAR`. +- `SYSTEM NAMED SCALAR REFRESHES` — required for + `SYSTEM START NAMED SCALAR REFRESHES` and `SYSTEM STOP NAMED SCALAR REFRESHES`. + +:::note +The `getNamedScalar` grant is **all-or-nothing**: any role that holds it +can read the value of every named scalar on the server. There is no +per-scalar ACL. Do not store secrets, credentials, or other +narrowly-scoped sensitive data in named scalars unless every grantee +should be able to read every scalar. +::: + +## Operating shared (Keeper-backed) scalars + +`SHARED` scalars store both their definitions and their cached values +in Keeper, under ``. 
The named +scalar manager trusts this Keeper subtree the same way ClickHouse +trusts the Keeper paths of `Database engine = Replicated` and +`ReplicatedMergeTree` — anyone with write access to the path can mutate +metadata, including the `DEFINER` clause that determines under which +identity the refresh `SELECT` runs. + +**Operationally that means: the named-scalar Keeper subtree must be +ACL-restricted to the ClickHouse server identity.** Use Keeper ACLs to +deny write access to anyone other than the server's own credentials. +This is the same requirement that already applies to replicated table +metadata; it is not a new responsibility introduced by named scalars. + +## See also + +- [`DROP NAMED SCALAR`](/sql-reference/statements/drop#drop-named-scalar) — removes a scalar. +- [`SYSTEM REFRESH NAMED SCALAR`](/sql-reference/statements/system#refresh-named-scalar) — re-runs the + source query on demand. +- [`SYSTEM STOP / START NAMED SCALAR REFRESHES`](/sql-reference/statements/system#stop-named-scalar-refreshes) — pause/resume periodic refresh on the current server. +- [`getNamedScalar` / `getNamedScalarOrDefault`](/sql-reference/functions/named-scalar-functions) — functions to read scalar values. +- [`system.named_scalars`](/operations/system-tables/named_scalars) — runtime + introspection. +- [`CREATE MATERIALIZED VIEW`](/sql-reference/statements/create/view) — for + cached *relational* (many-row) results. diff --git a/docs/en/sql-reference/statements/drop.md b/docs/en/sql-reference/statements/drop.md index 79c73d4e12a5..26f5c47b8f73 100644 --- a/docs/en/sql-reference/statements/drop.md +++ b/docs/en/sql-reference/statements/drop.md @@ -137,6 +137,36 @@ CREATE FUNCTION linear_equation AS (x, k, b) -> k*x + b; DROP FUNCTION linear_equation; ``` +## DROP NAMED SCALAR {#drop-named-scalar} + +Removes a named cached scalar created by [`CREATE NAMED SCALAR`](./create/named-scalar.md). +Requires the `DROP_NAMED_SCALAR` privilege. 
+ +**Syntax** + +```sql +DROP NAMED SCALAR [IF EXISTS] name [ON CLUSTER cluster] +``` + +`IF EXISTS` suppresses the error when the scalar does not exist. + +`ON CLUSTER` fans the command out to every node in the specified cluster. +`SHARED` scalars cannot be combined with `ON CLUSTER` — Keeper already +propagates the removal cluster-wide; use `ON CLUSTER` only for `LOCAL` scalars +where each server holds its own definition copy. + +**Example** + +```sql +CREATE NAMED SCALAR fx_rate REFRESH EVERY 1 HOUR AS SELECT rate FROM rates WHERE pair = 'EUR/USD'; +DROP NAMED SCALAR fx_rate; +``` + +**See also** + +- [`CREATE NAMED SCALAR`](./create/named-scalar.md) +- [`system.named_scalars`](/operations/system-tables/named_scalars) + ## DROP NAMED COLLECTION {#drop-named-collection} Deletes a named collection. diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 933bd4d32aca..cbe17e618f63 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -758,3 +758,55 @@ If the view is in a Replicated or Shared database, and refresh is running on ano ```sql SYSTEM WAIT VIEW [db.]name ``` + +## Managing Named Scalars {#named-scalars} + +Commands to control background tasks performed by +[Named Scalars](../../sql-reference/statements/create/named-scalar.md). + +Keep an eye on [`system.named_scalars`](../../operations/system-tables/named_scalars.md) +while using them. `SYSTEM REFRESH NAMED SCALAR` requires the +`SYSTEM REFRESH NAMED SCALAR` privilege; `SYSTEM START NAMED SCALAR REFRESHES` +and `SYSTEM STOP NAMED SCALAR REFRESHES` require the +`SYSTEM NAMED SCALAR REFRESHES` privilege. + +### SYSTEM REFRESH NAMED SCALAR {#refresh-named-scalar} + +Trigger an immediate out-of-schedule refresh of a given named scalar. +For scalars without a `REFRESH` clause (static scalars), this throws +`NAMED_SCALAR_NOT_REFRESHABLE`. 
+ +The command is asynchronous: it schedules the refresh and returns immediately. +Use `system.named_scalars` to observe `refresh_in_flight` and `last_update`. + +```sql +SYSTEM REFRESH NAMED SCALAR name +``` + +### SYSTEM STOP NAMED SCALAR REFRESHES {#stop-named-scalar-refreshes} + +Pause periodic refreshing of named scalars on the **current server only**. +With a name argument, only that scalar is paused; without a name, every +named scalar on the server is paused. Any in-progress refresh is allowed +to complete. + +:::note +The paused state is not propagated to peer servers and is not persisted across +server restarts. After a restart the scalar will resume its configured refresh +schedule. +::: + +```sql +SYSTEM STOP NAMED SCALAR REFRESHES [name] +``` + +### SYSTEM START NAMED SCALAR REFRESHES {#start-named-scalar-refreshes} + +Resume periodic refreshing for named scalars that were previously stopped +with `SYSTEM STOP NAMED SCALAR REFRESHES`. With a name argument, only that +scalar is resumed; without a name, every named scalar on the server is +resumed. No immediate refresh is triggered. + +```sql +SYSTEM START NAMED SCALAR REFRESHES [name] +``` diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 5e7e188c77df..e84da15a5532 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -1149,6 +1149,10 @@ void LocalServer::processConfig() DatabaseCatalog::instance().startupBackgroundTasks(); } + /// Always wire the manager, even without --path, so getNamedScalar + /// / system.named_scalars work. 
+ global_context->initializeNamedScalars(); + std::string default_database = getClientConfiguration().getString("database", server_default_database); if (default_database.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "default_database cannot be empty"); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 7b45a62f4834..7b3d2957faa9 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -2861,6 +2861,7 @@ try database_catalog.assertDatabaseExists(default_database); /// Load user-defined SQL functions. global_context->getUserDefinedSQLObjectsStorage().loadObjects(); + global_context->initializeNamedScalars(); global_context->getRefreshSet().setRefreshesStopped(false); } diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 4ffbfebfbd47..166c157ed296 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -153,7 +153,15 @@ enum class AccessType : uint8_t /// node_type either specifies access type's level (GLOBAL/NAMED_COLLECTION/USER_NAME/SOURCE/DATABASE/TABLE/DICTIONARY/VIEW/COLUMNS), /// or specifies that the access type is a GROUP of other access types; /// parent_group_name is the name of the group containing this access type (or NONE if there is no such group). -/// NOTE A parent group must be declared AFTER all its children. +/// NOTE A parent group must be declared AFTER all its (direct *and* indirect) descendants. +/// `Helper::addChild` in AccessFlags.cpp ORs the child's flags into +/// the immediate parent only — it does NOT walk back up to ancestors. +/// So if you add a new entry whose parent is `SHOW`, and the SHOW group +/// itself has already been declared (and therefore already attached to +/// ALL), the new bit lands on SHOW.flags but does NOT reach ALL.flags; +/// `GRANT ALL ON *.*` will silently miss the new permission. Place new +/// children among their group's existing children, BEFORE the group's +/// own M(...) line. 
#define APPLY_FOR_ACCESS_TYPES(M) \ M(SHOW_DATABASES, "", DATABASE, SHOW) /* allows to execute SHOW DATABASES, SHOW CREATE DATABASE, USE ; implicitly enabled by any grant on the database */\ @@ -247,6 +255,7 @@ enum class AccessType : uint8_t M(CREATE_WORKLOAD, "", GLOBAL, CREATE) /* allows to execute CREATE WORKLOAD */ \ M(CREATE_RESOURCE, "", GLOBAL, CREATE) /* allows to execute CREATE RESOURCE */ \ M(CREATE_NAMED_COLLECTION, "", NAMED_COLLECTION, NAMED_COLLECTION_ADMIN) /* allows to execute CREATE NAMED COLLECTION */ \ + M(CREATE_NAMED_SCALAR, "", GLOBAL, CREATE) /* allows to execute CREATE NAMED SCALAR */ \ M(CREATE, "", GROUP, ALL) /* allows to execute {CREATE|ATTACH} */ \ \ M(DROP_DATABASE, "", DATABASE, DROP) /* allows to execute {DROP|DETACH|TRUNCATE} DATABASE */\ @@ -258,6 +267,7 @@ enum class AccessType : uint8_t M(DROP_WORKLOAD, "", GLOBAL, DROP) /* allows to execute DROP WORKLOAD */\ M(DROP_RESOURCE, "", GLOBAL, DROP) /* allows to execute DROP RESOURCE */\ M(DROP_NAMED_COLLECTION, "", NAMED_COLLECTION, NAMED_COLLECTION_ADMIN) /* allows to execute DROP NAMED COLLECTION */\ + M(DROP_NAMED_SCALAR, "", GLOBAL, DROP) /* allows to execute DROP NAMED SCALAR */\ M(DROP, "", GROUP, ALL) /* allows to execute {DROP|DETACH} */\ \ M(UNDROP_TABLE, "", TABLE, ALL) /* allows to execute {UNDROP} TABLE */\ @@ -387,10 +397,14 @@ enum class AccessType : uint8_t M(SYSTEM_UNLOAD_PRIMARY_KEY, "SYSTEM UNLOAD PRIMARY KEY", TABLE, SYSTEM) \ M(SYSTEM_INSTRUMENT_ADD, "SYSTEM INSTRUMENT ADD", GLOBAL, SYSTEM) \ M(SYSTEM_INSTRUMENT_REMOVE, "SYSTEM INSTRUMENT REMOVE", GLOBAL, SYSTEM) \ + M(SYSTEM_REFRESH_NAMED_SCALAR, "SYSTEM REFRESH NAMED SCALAR", GLOBAL, SYSTEM) \ + M(SYSTEM_NAMED_SCALAR_REFRESHES, "SYSTEM START NAMED SCALAR REFRESHES, SYSTEM STOP NAMED SCALAR REFRESHES", GLOBAL, SYSTEM) \ M(SYSTEM, "", GROUP, ALL) /* allows to execute SYSTEM {SHUTDOWN|RELOAD CONFIG|...} */ \ \ M(dictGet, "dictHas, dictGetHierarchy, dictIsIn", DICTIONARY, ALL) /* allows to execute functions 
dictGet(), dictHas(), dictGetHierarchy(), dictIsIn() */\ M(displaySecretsInShowAndSelect, "", GLOBAL, ALL) /* allows to show plaintext secrets in SELECT and SHOW queries. display_secrets_in_show_and_select format and server settings must be turned on */\ + M(getNamedScalar, "getNamedScalarOrDefault", GLOBAL, ALL) /* allows to execute functions getNamedScalar(), getNamedScalarOrDefault() */\ + M(SHOW_NAMED_SCALARS, "", GLOBAL, ALL) /* allows to read system.named_scalars (operator-tier columns); named scalars are flat (no per-entity grant) so the scope is GLOBAL */\ \ M(addressToLine, "", GLOBAL, INTROSPECTION) /* allows to execute function addressToLine() */\ M(addressToLineWithInlines, "", GLOBAL, INTROSPECTION) /* allows to execute function addressToLineWithInlines() */\ diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 8aaa28a8e829..e57f6b0f5c95 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -269,6 +269,11 @@ AccessRights ContextAccess::addImplicitAccessRights(const AccessRights & access, if (max_flags.contains(AccessType::SHOW_QUOTAS)) res.grant(AccessType::SELECT, DatabaseCatalog::SYSTEM_DATABASE, "quotas"); + + /// Per-column tier split is enforced inside fillData; granting + /// SELECT here is safe. 
+ if (max_flags.contains(AccessType::SHOW_NAMED_SCALARS) || max_flags.contains(AccessType::getNamedScalar)) + res.grant(AccessType::SELECT, DatabaseCatalog::SYSTEM_DATABASE, "named_scalars"); } else { @@ -776,15 +781,16 @@ bool ContextAccess::checkAccessImplHelper(const ContextPtr & context, AccessFlag const AccessFlags function_ddl = AccessType::CREATE_FUNCTION | AccessType::DROP_FUNCTION; const AccessFlags workload_ddl = AccessType::CREATE_WORKLOAD | AccessType::DROP_WORKLOAD; const AccessFlags resource_ddl = AccessType::CREATE_RESOURCE | AccessType::DROP_RESOURCE; + const AccessFlags named_scalar_ddl = AccessType::CREATE_NAMED_SCALAR | AccessType::DROP_NAMED_SCALAR; const AccessFlags table_and_dictionary_ddl = table_ddl | dictionary_ddl; const AccessFlags table_and_dictionary_and_function_ddl = table_ddl | dictionary_ddl | function_ddl; const AccessFlags write_table_access = AccessType::INSERT | AccessType::OPTIMIZE; const AccessFlags write_dcl_access = AccessType::ACCESS_MANAGEMENT - AccessType::SHOW_ACCESS; - const AccessFlags not_readonly_flags = write_table_access | table_and_dictionary_and_function_ddl | workload_ddl | resource_ddl | write_dcl_access | AccessType::SYSTEM | AccessType::KILL_QUERY; + const AccessFlags not_readonly_flags = write_table_access | table_and_dictionary_and_function_ddl | workload_ddl | resource_ddl | named_scalar_ddl | write_dcl_access | AccessType::SYSTEM | AccessType::KILL_QUERY; const AccessFlags not_readonly_1_flags = AccessType::CREATE_TEMPORARY_TABLE; - const AccessFlags ddl_flags = table_ddl | dictionary_ddl | function_ddl | workload_ddl | resource_ddl; + const AccessFlags ddl_flags = table_ddl | dictionary_ddl | function_ddl | workload_ddl | resource_ddl | named_scalar_ddl; const AccessFlags introspection_flags = AccessType::INTROSPECTION; }; static const PrecalculatedFlags precalc; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 88dcaa1fa6ab..305f9e2e8a40 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt 
@@ -292,6 +292,7 @@ add_object_library(clickhouse_interpreters_mysql Interpreters/MySQL) add_object_library(clickhouse_interpreters_clusterproxy Interpreters/ClusterProxy) add_object_library(clickhouse_interpreters_jit Interpreters/JIT) add_object_library(clickhouse_interpreters_wasm Interpreters/WebAssembly) +add_object_library(clickhouse_interpreters_named_scalars Interpreters/NamedScalars) add_object_library(clickhouse_columns Columns) add_object_library(clickhouse_storages Storages) add_object_library(clickhouse_storages_mysql Storages/MySQL) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index f398128cfebf..fe6e065548a7 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -33,6 +33,8 @@ M(BackgroundDistributedSchedulePoolSize, "Limit on number of tasks in BackgroundDistributedSchedulePool") \ M(BackgroundMessageBrokerSchedulePoolTask, "Number of active tasks in BackgroundMessageBrokerSchedulePool for message streaming") \ M(BackgroundMessageBrokerSchedulePoolSize, "Limit on number of tasks in BackgroundMessageBrokerSchedulePool for message streaming") \ + M(BackgroundNamedScalarRefreshPoolTask, "Number of active tasks in BackgroundNamedScalarRefreshPool, used for periodic named-scalar refresh.") \ + M(BackgroundNamedScalarRefreshPoolSize, "Limit on number of tasks in BackgroundNamedScalarRefreshPool") \ M(CacheDictionaryUpdateQueueBatches, "Number of 'batches' (a set of keys) in update queue in CacheDictionaries.") \ M(CacheDictionaryUpdateQueueKeys, "Exact number of keys in update queue in CacheDictionaries.") \ M(DiskSpaceReservedForMerge, "Disk space reserved for currently running background merges. 
It is slightly more than the total size of currently merging parts.") \ diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 6bfe68a6a24e..7827cf18440a 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -650,6 +650,12 @@ M(768, CANNOT_EXECUTE_PROMQL_QUERY) \ M(769, NAMED_COLLECTION_IS_USED) \ M(770, WASM_ERROR) \ + M(771, NAMED_SCALAR_NOT_FOUND) \ + M(772, NAMED_SCALAR_ALREADY_EXISTS) \ + M(773, SHARED_NAMED_SCALARS_NOT_CONFIGURED) \ + M(774, NAMED_SCALAR_NOT_REFRESHABLE) \ + M(775, NAMED_SCALAR_VALUE_TOO_LARGE) \ + M(776, NAMED_SCALAR_HAS_NO_VALUE) \ \ M(900, DISTRIBUTED_CACHE_ERROR) \ M(901, CANNOT_USE_DISTRIBUTED_CACHE) \ diff --git a/src/Common/FailPoint.cpp b/src/Common/FailPoint.cpp index 56a4a55d07a5..667546363072 100644 --- a/src/Common/FailPoint.cpp +++ b/src/Common/FailPoint.cpp @@ -130,6 +130,8 @@ static struct InitFiu ONCE(backup_add_empty_memory_table) \ PAUSEABLE_ONCE(backup_pause_on_start) \ PAUSEABLE_ONCE(restore_pause_on_start) \ + ONCE(shared_named_scalars_store_value_fail_once) \ + PAUSEABLE(named_scalar_create_after_publish_pause) \ PAUSEABLE(sc_state_application_pause) \ PAUSEABLE(sc_state_application_pause_after_fetch) \ REGULAR(sc_intentions_commit_fail) \ diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 87368fd76272..15bdf8484ef2 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -1348,6 +1348,12 @@ The server successfully detected this situation and will download merged part fr M(RefreshableViewSyncReplicaRetry, "How many times a SELECT from refreshable materialized view failed and retried an implicit SYNC REPLICA", ValueType::Number) \ M(RefreshableViewLockTableRetry, "How many times a SELECT from refreshable materialized view had to switch to a new table because the old table was dropped", ValueType::Number) \ \ + M(NamedScalarRefreshAttempts, "How many named-scalar refresh ticks proceeded past the per-tick lock (i.e. 
ran the SELECT)", ValueType::Number) \ + M(NamedScalarRefreshSuccesses, "How many named-scalar refresh ticks evaluated successfully and published a new value", ValueType::Number) \ + M(NamedScalarRefreshFailures, "How many named-scalar refresh ticks failed (eval threw or persist failed); the previously-good value is retained", ValueType::Number) \ + M(NamedScalarRefreshSkippedByPeer, "Shared scalars only: refresh ticks where the per-scalar Keeper lock was held by a peer replica", ValueType::Number) \ + M(NamedScalarRefreshDurationMicroseconds, "Cumulative wall-clock duration of named-scalar refresh bodies (divide by Attempts for mean)", ValueType::Microseconds) \ + \ M(AsyncLoggingConsoleTotalMessages, "How many messages (accepted or dropped) have been sent to the async queue for the console log", ValueType::Number) \ M(AsyncLoggingFileLogTotalMessages, "How many messages (accepted or dropped) have been sent to the async queue for the file log", ValueType::Number) \ M(AsyncLoggingErrorFileLogTotalMessages, "How many messages (accepted or dropped) have been sent to the async queue for the error file log", ValueType::Number) \ diff --git a/src/Common/setThreadName.h b/src/Common/setThreadName.h index 9d94523958c5..74a4820e99d4 100644 --- a/src/Common/setThreadName.h +++ b/src/Common/setThreadName.h @@ -24,6 +24,7 @@ namespace DB M(AZURE_COPY_POOL, "AzureObjCopy") \ M(AZURE_LIST_POOL, "AzureObjList") \ M(BACKGROUND_BUFFER_FLUSH_SCHEDULE_POOL, "BgBufSchPool") \ + M(BACKGROUND_NAMED_SCALAR_REFRESH_POOL, "BgNSRefSchPool") \ M(BACKGROUND_SCHEDULE_POOL, "BgSchPool") \ M(BACKUP_ASYNC, "BackupAsync") \ M(BACKUP_ASYNC_INTERNAL, "BackupAsyncInt") \ @@ -142,6 +143,7 @@ namespace DB M(S3_LIST_POOL, "ListObjectS3") \ M(SESSION_CLEANUP, "SessionCleanup") \ M(SEND_TO_SHELL_CMD, "SendToShellCmd") \ + M(SHARED_NAMED_SCALARS, "NSCoord") \ M(SUGGEST, "Suggest") \ M(SYSTEM_LOG_FLUSH, "SystemLogFlush") \ M(SYSTEM_REPLICAS, "SysReplicas") \ diff --git a/src/Core/ServerSettings.cpp 
b/src/Core/ServerSettings.cpp index 09520a9768d5..f2cfc28b3608 100644 --- a/src/Core/ServerSettings.cpp +++ b/src/Core/ServerSettings.cpp @@ -31,6 +31,7 @@ extern const Metric BackgroundSchedulePoolSize; extern const Metric BackgroundBufferFlushSchedulePoolSize; extern const Metric BackgroundDistributedSchedulePoolSize; extern const Metric BackgroundMessageBrokerSchedulePoolSize; +extern const Metric BackgroundNamedScalarRefreshPoolSize; } namespace DB @@ -906,6 +907,7 @@ The policy on how to perform a scheduling of CPU slots specified by `concurrent_ DECLARE(Float, background_schedule_pool_max_parallel_tasks_per_type_ratio, 0.8f, R"(The maximum ratio of threads in the pool that can execute tasks of the same type simultaneously.)", 0) \ DECLARE(UInt64, background_message_broker_schedule_pool_size, 16, R"(The maximum number of threads that will be used for executing background operations for message streaming.)", 0) \ DECLARE(UInt64, background_distributed_schedule_pool_size, 16, R"(The maximum number of threads that will be used for executing distributed sends.)", 0) \ + DECLARE(UInt64, background_named_scalar_refresh_pool_size, 16, R"(The maximum number of threads that will be used for refreshing named scalars in the background. The pool is instantiated lazily on the first CREATE NAMED SCALAR; servers without any named scalars pay no thread overhead.)", 0) \ DECLARE(UInt64, tables_loader_foreground_pool_size, 0, R"( Sets the number of threads performing load jobs in foreground pool. The foreground pool is used for loading table synchronously before server start listening on a port and for loading tables that are waited for. Foreground pool has higher priority than background pool. It means that no job starts in background pool while there are jobs running in foreground pool. 
@@ -1634,6 +1636,7 @@ void ServerSettingsImpl::loadSettingsFromConfig(const Poco::Util::AbstractConfig "background_schedule_pool_size", "background_message_broker_schedule_pool_size", "background_distributed_schedule_pool_size", + "background_named_scalar_refresh_pool_size", "max_remote_read_network_bandwidth_for_server", "max_remote_write_network_bandwidth_for_server", @@ -1743,6 +1746,8 @@ void ServerSettings::dumpToSystemServerSettingsColumns(ServerSettingColumnsParam {std::to_string(CurrentMetrics::get(CurrentMetrics::BackgroundMessageBrokerSchedulePoolSize)), ChangeableWithoutRestart::IncreaseOnly}}, {"background_distributed_schedule_pool_size", {std::to_string(CurrentMetrics::get(CurrentMetrics::BackgroundDistributedSchedulePoolSize)), ChangeableWithoutRestart::IncreaseOnly}}, + {"background_named_scalar_refresh_pool_size", + {std::to_string(CurrentMetrics::get(CurrentMetrics::BackgroundNamedScalarRefreshPoolSize)), ChangeableWithoutRestart::IncreaseOnly}}, {"mark_cache_size", {std::to_string(context->getMarkCache()->maxSizeInBytes()), ChangeableWithoutRestart::Yes}}, {"uncompressed_cache_size", {std::to_string(context->getUncompressedCache()->maxSizeInBytes()), ChangeableWithoutRestart::Yes}}, diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 257d1a6b3c53..337fdaaa52da 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -7591,6 +7591,10 @@ The heartbeat interval in seconds to indicate watch query is alive. )", EXPERIMENTAL) \ DECLARE(Seconds, wait_for_window_view_fire_signal_timeout, 10, R"( Timeout for waiting for window view fire signal in event time processing +)", EXPERIMENTAL) \ + \ + DECLARE(Bool, allow_experimental_named_scalars, false, R"( +Enables experimental named scalars. 
)", EXPERIMENTAL) \ \ DECLARE(Bool, stop_refreshable_materialized_views_on_startup, false, R"( diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 93c39a64bc99..91378901cc27 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -108,6 +108,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() }); addSettingsChanges(settings_changes_history, "26.1.3.20001.altinityantalya", { + {"allow_experimental_named_scalars", false, false, "Experimental gate for CREATE NAMED SCALAR."}, // {"iceberg_partition_timezone", "", "", "New setting."}, // {"s3_propagate_credentials_to_other_storages", false, false, "New setting"}, // {"export_merge_tree_part_filename_pattern", "", "{part_name}_{checksum}", "New setting"}, diff --git a/src/Functions/getNamedScalar.cpp b/src/Functions/getNamedScalar.cpp new file mode 100644 index 000000000000..9745fc1138ad --- /dev/null +++ b/src/Functions/getNamedScalar.cpp @@ -0,0 +1,254 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int NAMED_SCALAR_HAS_NO_VALUE; + extern const int NAMED_SCALAR_NOT_FOUND; +} + +namespace +{ + +enum class ErrorHandlingMode : uint8_t +{ + Exception, + Default, +}; + +template +constexpr const char * functionNameFor() +{ + return Mode == ErrorHandlingMode::Exception ? "getNamedScalar" : "getNamedScalarOrDefault"; +} + +String extractNamedScalarName(const ColumnsWithTypeAndName & arguments, const String & function_name) +{ + const auto * column = arguments[0].column.get(); + const auto * const_col = column ? 
checkAndGetColumnConstStringOrFixedString(column) : nullptr; + if (!isString(arguments[0].type) || !const_col) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The argument of function {} should be a constant string with the name of a scalar", + function_name); + + String name(column->getDataAt(0)); + if (name.empty()) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The argument of function {} must be a non-empty string", + function_name); + return name; +} + +Field extractDefaultValue(const ColumnsWithTypeAndName & arguments, const String & function_name) +{ + if (!arguments[1].column || !isColumnConst(*arguments[1].column)) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "The second argument of function {} should be a constant default value", + function_name); + return (*arguments[1].column)[0]; +} + +class ExecutableFunctionGetNamedScalar final : public IExecutableFunction +{ +public: + ExecutableFunctionGetNamedScalar(String name_, Field value_) + : name(std::move(name_)) + , value(std::move(value_)) + { + } + + String getName() const override { return name; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr & result_type, size_t input_rows_count) const override + { + return result_type->createColumnConst(input_rows_count, value); + } + +private: + String name; + Field value; +}; + +class FunctionBaseGetNamedScalar final : public IFunctionBase +{ +public: + FunctionBaseGetNamedScalar(String name_, DataTypes argument_types_, DataTypePtr result_type_, Field value_) + : name(std::move(name_)) + , argument_types(std::move(argument_types_)) + , result_type(std::move(result_type_)) + , value(std::move(value_)) + { + } + + String getName() const override { return name; } + const DataTypes & getArgumentTypes() const override { return argument_types; } + const DataTypePtr & getResultType() const override { return result_type; } + + ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName &) const override + { + return 
std::make_unique(name, value); + } + + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override { return false; } + +private: + String name; + DataTypes argument_types; + DataTypePtr result_type; + Field value; +}; + +template +class GetNamedScalarOverloadResolver final : public IFunctionOverloadResolver +{ +public: + static constexpr auto name = functionNameFor(); + + static FunctionOverloadResolverPtr create(ContextPtr context_) + { + return std::make_unique(context_); + } + + explicit GetNamedScalarOverloadResolver(ContextPtr context_) + : context(std::move(context_)) + { + } + + String getName() const override { return name; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return true; } + bool isVariadic() const override { return false; } + size_t getNumberOfArguments() const override { return (Mode == ErrorHandlingMode::Default) ? 2 : 1; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override + { + if constexpr (Mode == ErrorHandlingMode::Default) + return {0, 1}; + else + return {0}; + } + + FunctionBasePtr build(const ColumnsWithTypeAndName & arguments) const override + { + checkNumberOfArguments(arguments.size()); + context->checkAccess(AccessType::getNamedScalar); + + const String bare_name = extractNamedScalarName(arguments, getName()); + DataTypes argument_types; + argument_types.reserve(arguments.size()); + for (const auto & argument : arguments) + argument_types.push_back(argument.type); + + /// Single lookup: tryGetScopedScalar returns the scalar plus its + /// cache_kind so the error path's `kind_str` can't race a concurrent + /// drop and report the wrong kind. + auto scoped = context->getNamedScalarsManager().tryGetScopedScalar(bare_name); + const auto & scalar = scoped ? 
scoped->scalar : nullptr; + if (scalar) + { + if (auto snapshot = scalar->tryGetValue()) + { + return std::make_unique( + getName(), + std::move(argument_types), + snapshot->type, + std::move(snapshot->value)); + } + } + + if constexpr (Mode == ErrorHandlingMode::Default) + { + return std::make_unique( + getName(), + std::move(argument_types), + arguments[1].type, + extractDefaultValue(arguments, getName())); + } + else + { + if (!scalar) + throw Exception( + ErrorCodes::NAMED_SCALAR_NOT_FOUND, + "No named scalar '{}'", + bare_name); + + const std::string_view kind_str = toString(scoped->cache_kind); + + /// last_error is the definer's exception text; it can disclose + /// schema / permission detail under the definer's privileges. + /// Only surface it to readers who already have operator-tier + /// visibility (SHOW_NAMED_SCALARS); other callers get the short + /// form and are pointed at system.named_scalars. + const auto status = scalar->getInfo(); + const bool show_full = context->getAccess()->isGranted(AccessType::SHOW_NAMED_SCALARS); + if (show_full && !status.last_error.empty()) + throw Exception( + ErrorCodes::NAMED_SCALAR_HAS_NO_VALUE, + "{} scalar '{}' has no value (last error [{}]: {})", + kind_str, + bare_name, + status.last_error_type, + status.last_error); + + throw Exception( + ErrorCodes::NAMED_SCALAR_HAS_NO_VALUE, + "{} scalar '{}' has no value; query system.named_scalars " + "with SHOW_NAMED_SCALARS for the last error", + kind_str, + bare_name); + } + } + +private: + ContextPtr context; +}; + +} + +REGISTER_FUNCTION(GetNamedScalar) +{ + using M = ErrorHandlingMode; + using FunctionFactory_Case = FunctionFactory::Case; + + factory.registerFunction>( + FunctionDocumentation{ + .description = "Returns the current value of a named scalar. 
Throws if the named scalar does not exist or has no value.", + .syntax = "getNamedScalar(named_scalar_name)", + .arguments = {{"named_scalar_name", "The bare scalar name.", {"const String"}}}, + .returned_value = {"Current value of the named scalar.", {"Any"}}, + .introduced_in = {26, 3}, + .category = FunctionDocumentation::Category::Other, + }, + FunctionFactory_Case::Sensitive); + + factory.registerFunction>( + FunctionDocumentation{ + .description = "Returns the current value of a named scalar, or the provided default if it is not defined or has no value.", + .syntax = "getNamedScalarOrDefault(named_scalar_name, default_value)", + .arguments = { + {"named_scalar_name", "The bare scalar name.", {"const String"}}, + {"default_value", "Value to return if the named scalar is not defined or has no value.", {"Any"}}, + }, + .returned_value = {"Current value, or default_value.", {"Any"}}, + .introduced_in = {26, 3}, + .category = FunctionDocumentation::Category::Other, + }, + FunctionFactory_Case::Sensitive); +} + +} diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 2304133f54f8..58917cb11222 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -99,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -217,6 +219,8 @@ namespace CurrentMetrics extern const Metric BackgroundDistributedSchedulePoolSize; extern const Metric BackgroundMessageBrokerSchedulePoolTask; extern const Metric BackgroundMessageBrokerSchedulePoolSize; + extern const Metric BackgroundNamedScalarRefreshPoolTask; + extern const Metric BackgroundNamedScalarRefreshPoolSize; extern const Metric BackgroundMergesAndMutationsPoolTask; extern const Metric BackgroundMergesAndMutationsPoolSize; extern const Metric BackgroundFetchesPoolTask; @@ -353,6 +357,7 @@ namespace ServerSetting extern const ServerSettingsUInt64 background_buffer_flush_schedule_pool_size; extern
const ServerSettingsUInt64 background_common_pool_size; extern const ServerSettingsUInt64 background_distributed_schedule_pool_size; + extern const ServerSettingsUInt64 background_named_scalar_refresh_pool_size; extern const ServerSettingsUInt64 background_fetches_pool_size; extern const ServerSettingsFloat background_merges_mutations_concurrency_ratio; extern const ServerSettingsString background_merges_mutations_scheduling_policy; @@ -408,6 +413,7 @@ namespace ErrorCodes extern const int NO_ELEMENTS_IN_CONFIG; extern const int TABLE_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT; extern const int LOGICAL_ERROR; + extern const int INVALID_CONFIG_PARAMETER; extern const int INVALID_SETTING_VALUE; extern const int NOT_IMPLEMENTED; extern const int UNKNOWN_FUNCTION; @@ -502,6 +508,9 @@ struct ContextSharedPart : boost::noncopyable mutable OnceFlag user_defined_sql_objects_storage_initialized; mutable std::unique_ptr user_defined_sql_objects_storage; + /// Constructed eagerly by Context::initializeNamedScalars() at boot. + std::unique_ptr named_scalars_manager; + mutable OnceFlag workload_entity_storage_initialized; mutable std::unique_ptr workload_entity_storage; @@ -585,6 +594,8 @@ struct ContextSharedPart : boost::noncopyable mutable BackgroundSchedulePoolPtr message_broker_schedule_pool; /// A thread pool that can run different jobs in background (used for message brokers, like RabbitMQ and Kafka) OnceFlag iceberg_schedule_pool_initialized; mutable BackgroundSchedulePoolPtr iceberg_schedule_pool; /// A thread pool that runs background metadata refresh for all active Iceberg tables + OnceFlag named_scalar_refresh_pool_initialized; + mutable BackgroundSchedulePoolPtr named_scalar_refresh_pool; mutable OnceFlag readers_initialized; mutable std::unique_ptr asynchronous_remote_fs_reader; @@ -941,6 +952,11 @@ struct ContextSharedPart : boost::noncopyable // Workload entity storage must be destructed when no queries or merges are running because PipelineExecutor may access it. 
SHUTDOWN(log, "workload entity storage", workload_entity_storage, stopWatching()); + /// Stop the shared named-scalar watcher and drain scalar refresh + /// tasks before AccessControl, Keeper, and schedule pools start + /// shutting down. Destructors are only an idempotent fallback. + SHUTDOWN(log, "named scalars manager", named_scalars_manager, shutdown()); + std::unique_ptr delete_system_logs; std::unique_ptr delete_embedded_dictionaries; std::unique_ptr delete_external_dictionaries_loader; @@ -948,12 +964,14 @@ struct ContextSharedPart : boost::noncopyable std::unique_ptr delete_user_defined_sql_objects_storage; std::unique_ptr delete_workload_entity_storage; std::unique_ptr delete_ddl_worker; + std::unique_ptr delete_named_scalars_manager; BackgroundSchedulePoolPtr delete_buffer_flush_schedule_pool; BackgroundSchedulePoolPtr delete_schedule_pool; BackgroundSchedulePoolPtr delete_distributed_schedule_pool; BackgroundSchedulePoolPtr delete_message_broker_schedule_pool; BackgroundSchedulePoolPtr delete_iceberg_schedule_pool; + BackgroundSchedulePoolPtr delete_named_scalar_refresh_pool; std::unique_ptr delete_access_control; @@ -1027,12 +1045,17 @@ struct ContextSharedPart : boost::noncopyable delete_user_defined_sql_objects_storage = std::move(user_defined_sql_objects_storage); delete_workload_entity_storage = std::move(workload_entity_storage); delete_ddl_worker = std::move(ddl_worker); + /// Explicit shutdown above already stopped the watcher and + /// drained refresh tasks. Keep the object alive until after + /// dependent services are moved out; destruction is fallback. 
+ delete_named_scalars_manager = std::move(named_scalars_manager); delete_buffer_flush_schedule_pool = std::move(buffer_flush_schedule_pool); delete_schedule_pool = std::move(schedule_pool); delete_distributed_schedule_pool = std::move(distributed_schedule_pool); delete_message_broker_schedule_pool = std::move(message_broker_schedule_pool); delete_iceberg_schedule_pool = std::move(iceberg_schedule_pool); + delete_named_scalar_refresh_pool = std::move(named_scalar_refresh_pool); delete_access_control = std::move(access_control); @@ -1090,6 +1113,7 @@ struct ContextSharedPart : boost::noncopyable join_background_pool(std::move(delete_distributed_schedule_pool)); join_background_pool(std::move(delete_message_broker_schedule_pool)); join_background_pool(std::move(delete_iceberg_schedule_pool)); + join_background_pool(std::move(delete_named_scalar_refresh_pool)); delete_access_control.reset(); @@ -3674,6 +3698,81 @@ IUserDefinedSQLObjectsStorage & Context::getUserDefinedSQLObjectsStorage() return *shared->user_defined_sql_objects_storage; } +/// Returns the server-wide named scalars manager created by initializeNamedScalars(). +NamedScalarsManager & Context::getNamedScalarsManager() const +{ + /// Report a missing manager as an exception instead of dereferencing a null pointer. + if (!shared->named_scalars_manager) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Named scalars manager is not initialized"); + return *shared->named_scalars_manager; +} + +/// Reads the named-scalar keys from the server config, validates them, and +/// constructs and initializes the named scalars manager. +void Context::initializeNamedScalars() +{ + auto global = getGlobalContext(); + const auto & config = global->getConfigRef(); + + static constexpr std::string_view definitions_disk_path_key = "named_scalar_definitions_path"; + static constexpr std::string_view definitions_zookeeper_path_key = "named_scalar_definitions_zookeeper_path"; + static constexpr std::string_view local_cache_path_key = "named_scalar_local_cache_path"; + static constexpr std::string_view default_cache_key = "default_named_scalar_cache"; + + const bool has_disk_definitions = config.has(String(definitions_disk_path_key)); + /// An empty Keeper path selects the disk definition store, so only a + /// non-empty value counts as a Keeper-backed configuration. + const bool has_keeper_definitions = !config.getString(String(definitions_zookeeper_path_key), "").empty(); + if (has_disk_definitions && has_keeper_definitions) + throw Exception( +
ErrorCodes::INVALID_CONFIG_PARAMETER, + "Only one named scalar definition store can be configured: use either <{}> or <{}>, not both", + definitions_disk_path_key, + definitions_zookeeper_path_key); + + String definitions_disk_path = config.getString( + String(definitions_disk_path_key), + std::filesystem::path{global->getPath()} / "named_scalars" / ""); + + /// Empty Keeper path -> disk definition store. In that mode SHARED + /// value cache is rejected at CREATE time because there is no + /// cluster-wide definition owner. + String definitions_zookeeper_path; + if (has_keeper_definitions) + definitions_zookeeper_path = config.getString(String(definitions_zookeeper_path_key)); + + String local_cache_path = config.getString( + String(local_cache_path_key), + std::filesystem::path{global->getPath()} / "named_scalars_cache" / ""); + + auto parse_cache_kind = [](const String & value) + { + const auto lower = Poco::toLower(value); + if (lower == "local") + return NamedScalarCacheKind::Local; + if (lower == "shared") + return NamedScalarCacheKind::Shared; + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "Invalid <{}> value '{}': expected 'local' or 'shared'", + default_cache_key, + value); + }; + + const auto default_cache_kind = parse_cache_kind(config.getString(String(default_cache_key), "local")); + if (default_cache_kind == NamedScalarCacheKind::Shared && !has_keeper_definitions) + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "<{}>='shared' requires Keeper-backed named scalar definitions (configure <{}>)", + default_cache_key, + definitions_zookeeper_path_key); + + shared->named_scalars_manager = std::make_unique( + definitions_disk_path, definitions_zookeeper_path, local_cache_path, default_cache_kind, global); + + shared->named_scalars_manager->initialize(global); +} + IWorkloadEntityStorage & Context::getWorkloadEntityStorage() const { callOnce(shared->workload_entity_storage_initialized, [&] { @@ -4630,6 +4729,20 @@
BackgroundSchedulePool & Context::getDistributedSchedulePool() const return *shared->distributed_schedule_pool; } +BackgroundSchedulePool & Context::getNamedScalarRefreshPool() const +{ + callOnce(shared->named_scalar_refresh_pool_initialized, [&] { + shared->named_scalar_refresh_pool = BackgroundSchedulePool::create( + shared->server_settings[ServerSetting::background_named_scalar_refresh_pool_size], + /*max_parallel_tasks_per_type*/ 0, + CurrentMetrics::BackgroundNamedScalarRefreshPoolTask, + CurrentMetrics::BackgroundNamedScalarRefreshPoolSize, + DB::ThreadName::BACKGROUND_NAMED_SCALAR_REFRESH_POOL); + }); + + return *shared->named_scalar_refresh_pool; +} + BackgroundSchedulePool & Context::getMessageBrokerSchedulePool() const { callOnce(shared->message_broker_schedule_pool_initialized, [&] { diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 74730f22107e..3379a8a01aad 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -88,6 +88,7 @@ class EmbeddedDictionaries; class ExternalDictionariesLoader; class ExternalUserDefinedExecutableFunctionsLoader; class IUserDefinedSQLObjectsStorage; +class NamedScalarsManager; class IWorkloadEntityStorage; class InterserverCredentials; using InterserverCredentialsPtr = std::shared_ptr; @@ -1125,6 +1126,10 @@ class Context: public ContextData, public std::enable_shared_from_this IUserDefinedSQLObjectsStorage & getUserDefinedSQLObjectsStorage(); void loadOrReloadUserDefinedExecutableFunctions(const Poco::Util::AbstractConfiguration & config); + NamedScalarsManager & getNamedScalarsManager() const; + /// Called once at server start. 
+ void initializeNamedScalars(); + IWorkloadEntityStorage & getWorkloadEntityStorage() const; bool hasWasmModuleManager() const; @@ -1463,6 +1468,8 @@ class Context: public ContextData, public std::enable_shared_from_this BackgroundSchedulePool & getMessageBrokerSchedulePool() const; BackgroundSchedulePool & getDistributedSchedulePool() const; BackgroundSchedulePool & getIcebergSchedulePool() const; + /// Dedicated pool isolated from getSchedulePool(); lazily created. + BackgroundSchedulePool & getNamedScalarRefreshPool() const; /// Has distributed_ddl configuration or not. bool hasDistributedDDL() const; diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index 31183bb47b0c..d7df319f1f73 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -347,6 +348,10 @@ InterpreterFactory::InterpreterPtr InterpreterFactory::get(ASTPtr & query, Conte { interpreter_name = "InterpreterCreateFunctionQuery"; } + else if (query->as()) + { + interpreter_name = "InterpreterNamedScalarDDLQuery"; + } else if (query->as()) { interpreter_name = "InterpreterDropFunctionQuery"; diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index c5058e1879f8..5a37005ee871 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -196,7 +197,6 @@ void executeCommandsAndThrowIfError(std::vector> commands) throw Exception::createDeprecated(result.message, result.code); } - AccessType getRequiredAccessType(StorageActionBlockType action_type) { if (action_type == ActionLocks::PartsMerge) @@ -855,6 +855,27 @@ BlockIO InterpreterSystemQuery::execute() for (const auto & task : getRefreshTasks()) task->run(); break; + case Type::REFRESH_NAMED_SCALAR: + { + 
getContext()->checkAccess(AccessType::SYSTEM_REFRESH_NAMED_SCALAR); + if (query.named_scalar_name.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Named scalar name must be specified"); + getContext()->getNamedScalarsManager().refreshNow(query.named_scalar_name); + break; + } + case Type::START_NAMED_SCALAR_REFRESHES: + case Type::STOP_NAMED_SCALAR_REFRESHES: + { + getContext()->checkAccess(AccessType::SYSTEM_NAMED_SCALAR_REFRESHES); + const bool stop = query.type == Type::STOP_NAMED_SCALAR_REFRESHES; + auto & manager = getContext()->getNamedScalarsManager(); + /// Empty name -> all named scalar refreshes. + if (query.named_scalar_name.empty()) + manager.setAllRefreshesPaused(stop); + else + manager.setRefreshPaused(query.named_scalar_name, stop); + break; + } case Type::WAIT_VIEW: for (const auto & task : getRefreshTasks()) task->wait(); @@ -2353,6 +2374,17 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() required_access.emplace_back(AccessType::SYSTEM_REDUCE_BLOCKING_PARTS, query.getDatabase(), query.getTable()); break; } + case Type::REFRESH_NAMED_SCALAR: + { + required_access.emplace_back(AccessType::SYSTEM_REFRESH_NAMED_SCALAR); + break; + } + case Type::START_NAMED_SCALAR_REFRESHES: + case Type::STOP_NAMED_SCALAR_REFRESHES: + { + required_access.emplace_back(AccessType::SYSTEM_NAMED_SCALAR_REFRESHES); + break; + } case Type::REFRESH_VIEW: case Type::WAIT_VIEW: case Type::START_VIEW: diff --git a/src/Interpreters/NamedScalars/INamedScalarDefinitionStore.h b/src/Interpreters/NamedScalars/INamedScalarDefinitionStore.h new file mode 100644 index 000000000000..91f1ebb88b7a --- /dev/null +++ b/src/Interpreters/NamedScalars/INamedScalarDefinitionStore.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include + +#include +#include +#include + +namespace DB +{ + +struct Settings; + +struct LoadedNamedScalarDefinition +{ + String name; + String definition_blob; +}; + +class INamedScalarDefinitionStore +{ +public: + virtual 
~INamedScalarDefinitionStore() = default; + + virtual bool isKeeperBacked() const = 0; + virtual void initialize() = 0; + virtual std::vector loadAll() = 0; + + virtual bool definitionExists(const String & name) = 0; + virtual bool publishDefinition( + const String & name, + const String & definition_blob, + bool if_not_exists, + bool or_replace, + const Settings & settings) = 0; + + virtual bool removeDefinition(const String & name, bool throw_if_not_exists) = 0; + virtual bool readDefinition(const String & name, String & out) = 0; +}; + +class IWatchableNamedScalarDefinitionStore : public INamedScalarDefinitionStore +{ +public: + virtual Strings listDefinitionsWithChildrenWatch(std::function on_change) = 0; + virtual bool readDefinitionWithDataWatch( + const String & name, + String & out, + std::function on_change) = 0; +}; + +using NamedScalarDefinitionStorePtr = std::shared_ptr; +using WatchableNamedScalarDefinitionStorePtr = std::shared_ptr; + +} diff --git a/src/Interpreters/NamedScalars/INamedScalarValueBackend.h b/src/Interpreters/NamedScalars/INamedScalarValueBackend.h new file mode 100644 index 000000000000..92c3ceb902f6 --- /dev/null +++ b/src/Interpreters/NamedScalars/INamedScalarValueBackend.h @@ -0,0 +1,78 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +/// Outcome of a refresh body's publish attempt. Storage failures +/// propagate as exceptions instead; those are recorded as refresh errors. +enum class RefreshPublishResult : UInt8 +{ + Published, + /// The lease was lost or the definition no longer points at this UUID + /// (peer OR REPLACE / DROP). Caller treats this as a no-op outcome; + /// the watcher will reconcile the local catalog. 
+ Diverged, +}; + +enum class NamedScalarCacheKind : UInt8 +{ + Local, + Shared, +}; + +inline std::string_view toString(NamedScalarCacheKind cache_kind) +{ + switch (cache_kind) + { + case NamedScalarCacheKind::Local: + return "local"; + case NamedScalarCacheKind::Shared: + return "shared"; + } + UNREACHABLE(); +} + +class NamedScalarRefreshLease +{ +public: + NamedScalarRefreshLease() = default; + explicit NamedScalarRefreshLease(std::function publish_callback_) + : publish_callback(std::move(publish_callback_)) {} + + explicit operator bool() const { return static_cast(publish_callback); } + + RefreshPublishResult publish(const String & value_blob) const + { + return publish_callback(value_blob); + } + +private: + std::function publish_callback; +}; + +/// Serialized value backend used by NamedScalar. Catalogs provide concrete +/// local/shared implementations; the scalar owns decoding, lifetime, and +/// refresh semantics. +class INamedScalarValueBackend +{ +public: + virtual ~INamedScalarValueBackend() = default; + + virtual bool supportsValueWatches() const = 0; + virtual std::optional readValueBlob(const String & value_key) = 0; + virtual std::optional readValueBlobAndWatch(const String & value_key, std::function on_change) = 0; + virtual void removeValue(const String & value_key) = 0; + virtual std::optional tryAcquireRefreshLease( + const String & name, + const String & value_key) = 0; +}; + +} diff --git a/src/Interpreters/NamedScalars/InterpreterNamedScalarDDLQuery.cpp b/src/Interpreters/NamedScalars/InterpreterNamedScalarDDLQuery.cpp new file mode 100644 index 000000000000..b21be8863f06 --- /dev/null +++ b/src/Interpreters/NamedScalars/InterpreterNamedScalarDDLQuery.cpp @@ -0,0 +1,271 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace Setting +{ + extern const SettingsBool 
allow_experimental_named_scalars; +} + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int NAMED_SCALAR_ALREADY_EXISTS; + extern const int SUPPORT_IS_DISABLED; + extern const int SYNTAX_ERROR; +} + +namespace +{ +String getNamedScalarName(const ASTPtr & ast) +{ + const auto * identifier = ast ? ast->as() : nullptr; + if (!identifier) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Named scalar name is not an identifier"); + + String name; + if (!tryGetIdentifierNameInto(identifier, name)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Named scalar name must be an identifier"); + + NamedScalarsManager::checkName(name); + return name; +} + +NamedScalarCacheKind resolveCacheKind( + ASTNamedScalarDDLQuery::CacheKind ast_cache_kind, + const NamedScalarsManager & manager) +{ + switch (ast_cache_kind) + { + case ASTNamedScalarDDLQuery::CacheKind::Default: + return manager.getDefaultCacheKind(); + case ASTNamedScalarDDLQuery::CacheKind::Local: + return NamedScalarCacheKind::Local; + case ASTNamedScalarDDLQuery::CacheKind::Shared: + return NamedScalarCacheKind::Shared; + } + UNREACHABLE(); +} + +ASTNamedScalarDDLQuery::CacheKind toASTCacheKind(NamedScalarCacheKind cache_kind) +{ + return cache_kind == NamedScalarCacheKind::Shared + ? 
ASTNamedScalarDDLQuery::CacheKind::Shared + : ASTNamedScalarDDLQuery::CacheKind::Local; +} + +String resolveDefiner(ASTNamedScalarDDLQuery & create_query, const ContextMutablePtr & current_context) +{ + if (!create_query.sql_security) + { + auto sql_security = make_intrusive(); + sql_security->type = SQLSecurityType::DEFINER; + sql_security->is_definer_current_user = true; + create_query.sql_security = sql_security; + create_query.children.push_back(create_query.sql_security); + } + + auto & sql_security = create_query.sql_security->as(); + InterpreterCreateQuery::processSQLSecurityOption( + current_context, + sql_security, + /* is_materialized_view */ true); + + if (sql_security.type != SQLSecurityType::DEFINER) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Only SQL SECURITY DEFINER is supported for NAMED SCALAR"); + + if (!sql_security.definer) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "CREATE NAMED SCALAR requires a user identity (SQL SECURITY DEFINER)"); + + sql_security.type = SQLSecurityType::DEFINER; + sql_security.is_definer_current_user = false; + + return sql_security.definer->toString(); +} + +BlockIO executeCreate(const ASTPtr & query_ptr, ContextMutablePtr current_context) +{ + /// Experimental gate. Operators must opt in explicitly to create + /// named scalars on this server. Reads via getNamedScalar are NOT + /// gated — once a scalar exists, dependent queries should not need + /// to re-assert the experimental flag on every access. + if (!current_context->getSettingsRef()[Setting::allow_experimental_named_scalars]) + throw Exception( + ErrorCodes::SUPPORT_IS_DISABLED, + "Named scalars are an experimental feature. 
" + "Enable them by setting `allow_experimental_named_scalars = 1` " + "before issuing CREATE NAMED SCALAR."); + + auto & create_query = query_ptr->as(); + auto scalar_name = getNamedScalarName(create_query.named_scalar_name); + auto & manager = current_context->getNamedScalarsManager(); + const auto cache_kind = resolveCacheKind(create_query.cache_kind, manager); + create_query.cache_kind = toASTCacheKind(cache_kind); + + AccessRightsElements access_rights_elements; + access_rights_elements.emplace_back(AccessType::CREATE_NAMED_SCALAR); + if (create_query.or_replace) + access_rights_elements.emplace_back(AccessType::DROP_NAMED_SCALAR); + + current_context->checkAccess(access_rights_elements); + + if (!create_query.cluster.empty() && cache_kind == NamedScalarCacheKind::Shared) + throw Exception(ErrorCodes::SYNTAX_ERROR, + "ON CLUSTER is not supported for SHARED NAMED SCALAR"); + + const String definer_name = resolveDefiner(create_query, current_context); + + AddDefaultDatabaseVisitor add_default_database(current_context, current_context->getCurrentDatabase()); + add_default_database.visit(create_query.expression); + + if (create_query.uuid == UUIDHelpers::Nil) + create_query.uuid = UUIDHelpers::generateV4(); + + if (!create_query.cluster.empty()) + { + DDLQueryOnClusterParams params; + params.access_to_check = access_rights_elements; + return executeDDLQueryOnCluster(query_ptr, current_context, params); + } + + /// Reject CREATE OR REPLACE that changes the cache kind — mixing LOCAL + /// and SHARED backends for the same scalar corrupts state. Consult + /// the persisted definition (Keeper RTT for SHARED) so the guard + /// works even when the watcher hasn't reconciled the local map yet + /// or the scalar exists only on peers. 
+    if (create_query.or_replace)
+    {
+        /// An empty result skips the comparison: there is no existing kind
+        /// to conflict with.
+        auto existing_kind = manager.getCacheKind(
+            scalar_name, current_context, getLogger("InterpreterNamedScalarDDLQuery"));
+        if (existing_kind && *existing_kind != cache_kind)
+            throw Exception(
+                ErrorCodes::BAD_ARGUMENTS,
+                "Cannot change cache kind of named scalar '{}' via CREATE OR REPLACE "
+                "(was {}, requested {}). DROP and CREATE it instead.",
+                scalar_name,
+                toString(*existing_kind),
+                toString(cache_kind));
+    }
+
+    /// Fail fast for SHARED on an unconfigured node, before evaluating
+    /// the source SELECT.
+    manager.ensureCreatable(cache_kind);
+
+    /// Avoid evaluating on duplicate / IF NOT EXISTS hits. Catalogs do
+    /// their own atomic check for crash-safety.
+    if (!create_query.or_replace && manager.definitionExists(scalar_name))
+    {
+        if (create_query.if_not_exists)
+            return {};
+        throw Exception(
+            ErrorCodes::NAMED_SCALAR_ALREADY_EXISTS,
+            "Named scalar '{}' already exists",
+            scalar_name);
+    }
+
+    /// Format the normalized AST (multi-line form) and hand that text to the
+    /// manager as the definition. `scalar_name` is moved into the request
+    /// and must not be used afterwards.
+    WriteBufferFromOwnString ddl_buf;
+    IAST::FormatSettings format_settings(/*one_line=*/false);
+    query_ptr->format(ddl_buf, format_settings);
+
+    manager.create(NamedScalarCreateRequest{
+        .cache_kind = cache_kind,
+        .name = std::move(scalar_name),
+        .formatted_create_query = ddl_buf.str(),
+        .if_not_exists = create_query.if_not_exists,
+        .or_replace = create_query.or_replace,
+    }, current_context);
+    return {};
+}
+
+/// Handles DROP NAMED SCALAR.
+///
+/// Requires DROP_NAMED_SCALAR. With ON CLUSTER the query is forwarded through
+/// executeDDLQueryOnCluster unless the scalar is SHARED (which throws);
+/// otherwise the scalar is dropped through the local manager.
+BlockIO executeDrop(const ASTPtr & query_ptr, ContextMutablePtr current_context)
+{
+    const auto & drop_query = query_ptr->as();
+    auto scalar_name = getNamedScalarName(drop_query.named_scalar_name);
+
+    AccessRightsElements access_rights_elements;
+    access_rights_elements.emplace_back(AccessType::DROP_NAMED_SCALAR);
+
+    /// Check access before any catalog probe so users without
+    /// DROP_NAMED_SCALAR can't side-channel the existence/kind of a
+    /// scalar via getCacheKind's Keeper RTT. Mirrors executeCreate.
+    current_context->checkAccess(access_rights_elements);
+
+    if (!drop_query.cluster.empty())
+    {
+        /// Determine cache kind from the persisted definition (queries Keeper
+        /// for SHARED). The previous local-map-only check would silently fall
+        /// through to a single-node DROP when the scalar wasn't yet in the
+        /// coordinator's map (just-restarted node, lagging watcher, or a
+        /// LOCAL scalar that exists only on peers) — leaving cluster state
+        /// inconsistent.
+        auto & manager = current_context->getNamedScalarsManager();
+        auto kind = manager.getCacheKind(
+            scalar_name, current_context, getLogger("InterpreterNamedScalarDDLQuery"));
+
+        if (kind == NamedScalarCacheKind::Shared)
+            throw Exception(ErrorCodes::SYNTAX_ERROR,
+                "ON CLUSTER is not supported for SHARED NAMED SCALAR");
+
+        /// LOCAL or non-existent: broadcast. Per-node DROP IF EXISTS no-ops
+        /// if the scalar isn't present on that node.
+        DDLQueryOnClusterParams params;
+        params.access_to_check = std::move(access_rights_elements);
+        return executeDDLQueryOnCluster(query_ptr, current_context, params);
+    }
+
+    /// Local drop. The second argument is the negation of IF EXISTS;
+    /// presumably it tells the manager to throw when the scalar is missing -
+    /// confirm against NamedScalarsManager::drop.
+    current_context->getNamedScalarsManager().drop(
+        scalar_name, !drop_query.if_exists);
+    return {};
+}
+}
+
+/// Interpreter entry point: adjusts the ON CLUSTER clause and dispatches on
+/// the query's action to executeCreate / executeDrop.
+BlockIO InterpreterNamedScalarDDLQuery::execute()
+{
+    /// Strip the ON CLUSTER clause if it targets our own local cluster,
+    /// so fan-out targets dispatch on cluster.empty() correctly. Same
+    /// pattern as InterpreterCreateFunctionQuery /
+    /// InterpreterCreateNamedCollectionQuery.
+    const auto updated_query_ptr = removeOnClusterClauseIfNeeded(query_ptr, getContext());
+    const auto & query = updated_query_ptr->as();
+    /// Both handlers receive the adjusted query, not the original `query_ptr`.
+    switch (query.action)
+    {
+        case ASTNamedScalarDDLQuery::Action::Create:
+            return executeCreate(updated_query_ptr, getContext());
+        case ASTNamedScalarDDLQuery::Action::Drop:
+            return executeDrop(updated_query_ptr, getContext());
+    }
+    UNREACHABLE();
+}
+
+/// Registers this interpreter with `factory` under the name
+/// "InterpreterNamedScalarDDLQuery"; the creator builds the interpreter from
+/// the factory arguments' query and context.
+void registerInterpreterNamedScalarDDLQuery(InterpreterFactory & factory)
+{
+    auto create_fn = [] (const InterpreterFactory::Arguments & args)
+    {
+        return std::make_unique(args.query, args.context);
+    };
+    factory.registerInterpreter("InterpreterNamedScalarDDLQuery", create_fn);
+}
+
+}
diff --git a/src/Interpreters/NamedScalars/InterpreterNamedScalarDDLQuery.h b/src/Interpreters/NamedScalars/InterpreterNamedScalarDDLQuery.h
new file mode 100644
index 000000000000..c870ceb0a13a
--- /dev/null
+++ b/src/Interpreters/NamedScalars/InterpreterNamedScalarDDLQuery.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include
+
+namespace DB
+{
+
+class InterpreterNamedScalarDDLQuery : public IInterpreter, WithMutableContext
+{
+public:
+    InterpreterNamedScalarDDLQuery(const ASTPtr & query_ptr_, ContextMutablePtr context_)
+        : WithMutableContext(context_), query_ptr(query_ptr_)
+    {
+    }
+
+    BlockIO execute() override;
+
+private:
+    ASTPtr query_ptr;
+};
+
+}
diff --git a/src/Interpreters/NamedScalars/NamedScalar.cpp b/src/Interpreters/NamedScalars/NamedScalar.cpp
new file mode 100644
index 000000000000..f1f6308e0e27
--- /dev/null
+++ b/src/Interpreters/NamedScalars/NamedScalar.cpp
@@ -0,0 +1,819 @@
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+namespace ProfileEvents
+{
+    extern const Event NamedScalarRefreshAttempts;
+    extern const Event
NamedScalarRefreshSuccesses;
+    extern const Event NamedScalarRefreshFailures;
+    extern const Event NamedScalarRefreshSkippedByPeer;
+    extern const Event NamedScalarRefreshDurationMicroseconds;
+}
+
+namespace DB
+{
+
+/// Delay, in milliseconds, used in this file when the refresh task is
+/// rescheduled after the lease was held by a peer or a publish diverged.
+constexpr size_t REFRESH_LEASE_BUSY_RETRY_MS = 5000;
+
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+    extern const int INCORRECT_RESULT_OF_SCALAR_SUBQUERY;
+    extern const int NAMED_SCALAR_NOT_REFRESHABLE;
+}
+
+namespace
+{
+
+/// Recursively walks `expression` and throws BAD_ARGUMENTS if any function
+/// node is named exactly `getNamedScalar` or `getNamedScalarOrDefault`.
+/// A null `expression` is accepted and ignored.
+void assertNoNamedScalarAccess(const ASTPtr & expression)
+{
+    static constexpr std::string_view forbidden_names[] = {
+        "getNamedScalar",
+        "getNamedScalarOrDefault",
+    };
+    if (!expression)
+        return;
+
+    if (const auto * function = expression->as())
+    {
+        for (const auto & forbidden : forbidden_names)
+        {
+            if (function->name == forbidden)
+                throw Exception(
+                    ErrorCodes::BAD_ARGUMENTS,
+                    "Named scalar definition cannot reference function {}()",
+                    function->name);
+        }
+    }
+
+    /// Descend into every child, whatever the node type.
+    for (const auto & child : expression->children)
+        assertNoNamedScalarAccess(child);
+}
+
+/// Copies `context` into a new query context with an empty query id, no
+/// process-list element and the internal-query flag set.
+ContextMutablePtr createEvaluationContext(const ContextPtr & context)
+{
+    auto eval_context = Context::createCopy(context);
+    eval_context->makeQueryContext();
+    eval_context->setCurrentQueryId({});
+    eval_context->setProcessListElement({});
+    eval_context->setInternalQuery(true);
+    return eval_context;
+}
+
+/// Builds a query context, copied from `global_context`, whose user is
+/// `definer`. `name` is only used in the error message.
+/// Throws BAD_ARGUMENTS when `definer` is empty; the lookup through
+/// AccessControl presumably throws for an unknown user - confirm.
+ContextMutablePtr buildDefinerEvalContext(const ContextPtr & global_context, const String & name, const String & definer)
+{
+    if (definer.empty())
+        throw Exception(
+            ErrorCodes::BAD_ARGUMENTS,
+            "named scalar '{}': missing definer in persisted definition",
+            name);
+
+    auto & access_control = global_context->getAccessControl();
+    const UUID definer_id = access_control.getID(definer);
+
+    auto ctx = Context::createCopy(global_context);
+    ctx->makeQueryContext();
+    ctx->setInternalQuery(true);
+    ctx->setUser(definer_id);
+    return ctx;
+}
+
+/// Runs `select_query` through executeQuery in `context` and returns the
+/// single resulting value together with its type.
+/// Throws INCORRECT_RESULT_OF_SCALAR_SUBQUERY unless the result is exactly
+/// one row and one column. When `owner` is set, the running query is
+/// published to it for cancellation and cleared again on exit.
+std::pair executeScalarSelect(
+    const ASTPtr & select_query,
+    const ContextMutablePtr & context,
+    const
NamedScalar * owner)
+{
+    /// Route through executeQuery so the refresh body shows up in
+    /// system.processes (KILL QUERY support) and system.query_log
+    /// (audit / perf), and so the running PipelineExecutor honors
+    /// QueryStatus::cancelQuery() driven by NamedScalar::cancelInFlight().
+    /// QueryScope attaches a per-thread ThreadGroup that ProcessList::insert
+    /// requires. Synchronous CREATE-time evaluation runs on a client thread
+    /// that already has a group; only the background-pool refresh path
+    /// needs us to attach.
+    /// NOTE(review): the query is started with QueryFlags::internal set.
+    /// Confirm that executeQuery still registers a process-list element in
+    /// that mode; if it does not, getProcessListElement() below returns null
+    /// and no cancellation handle is ever published.
+    std::optional query_scope;
+    if (!CurrentThread::getGroup())
+        query_scope.emplace(QueryScope::create(context));
+    auto io = executeQuery(select_query->formatWithSecretsOneLine(), context, QueryFlags{.internal = true}).second;
+
+    /// Publish the in-flight QueryStatus before pulling — cancellation
+    /// triggers between executeQuery and the first pull must reach this
+    /// query. Owner is null only on the synchronous CREATE-time eval,
+    /// where there is no running NamedScalar to attach to.
+    /// NOTE(review): in this file the only caller chain is
+    /// NamedScalar::evaluateAndEncode -> evaluateExpression, which always
+    /// passes `this`, including on the CREATE-time path through
+    /// evaluateAndStoreValue. `owner` therefore looks non-null on every
+    /// visible path - confirm whether the sentence above is still accurate.
+    if (owner)
+    {
+        if (auto status = context->getProcessListElement())
+            owner->setInFlight(status);
+    }
+    /// Registered after setInFlight so the handle is cleared on every exit
+    /// from here on, including exceptions thrown while pulling.
+    SCOPE_EXIT({ if (owner) owner->clearInFlight(); });
+
+    /// Pull the result and finalize the BlockIO so the QueryFinish/Exception
+    /// row is written to system.query_log (the finish_callbacks are wired
+    /// inside executeQuery; they fire from io.onFinish / onException).
+    Block block;
+    Block next_block;
+    try
+    {
+        PullingPipelineExecutor executor(io.pipeline);
+        /// Skip empty blocks until the first one that carries rows (or the
+        /// pipeline is exhausted).
+        while (block.rows() == 0 && executor.pull(block)) {}
+
+        if (block.rows() == 0)
+            throw Exception(ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY, "Scalar query returned empty result");
+        if (block.rows() != 1)
+            throw Exception(ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY, "Scalar query returned more than one row");
+
+        /// Keep pulling: a second non-empty block also means more than one row.
+        while (next_block.rows() == 0 && executor.pull(next_block)) {}
+        if (next_block.rows() != 0)
+            throw Exception(ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY, "Scalar query returned more than one row");
+
+        block = materializeBlock(block);
+        if (block.columns() != 1)
+            throw Exception(ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY, "Scalar query returned more than one column");
+    }
+    catch (...)
+    {
+        /// Finalize the BlockIO on the failure path as well, then rethrow.
+        io.onException();
+        throw;
+    }
+    io.onFinish();
+
+    const auto & column_with_type = block.getByPosition(0);
+    Field field;
+    column_with_type.column->get(0, field);
+    return {std::move(field), column_with_type.type};
+}
+
+/// Value/type pair produced by evaluateExpression.
+struct EvaluatedExpression { Field value; DataTypePtr type; };
+
+/// Rejects expressions that call getNamedScalar / getNamedScalarOrDefault,
+/// then evaluates `expression` in a fresh evaluation context derived from
+/// `context`. `owner` is forwarded to executeScalarSelect.
+EvaluatedExpression evaluateExpression(
+    const ASTPtr & expression,
+    const ContextPtr & context,
+    const NamedScalar * owner)
+{
+    assertNoNamedScalarAccess(expression);
+    auto eval_context = createEvaluationContext(context);
+    auto [value, type] = executeScalarSelect(expression, eval_context, owner);
+    return {std::move(value), std::move(type)};
+}
+
+/// Returns `last_completed` advanced by `period_seconds` seconds.
+std::chrono::system_clock::time_point nextRefreshTime(
+    UInt64 period_seconds,
+    std::chrono::system_clock::time_point last_completed)
+{
+    return last_completed + std::chrono::seconds(period_seconds);
+}
+
+}
+
+/// Takes ownership of the parsed definition and keeps a reference to the
+/// value backend. The refresh runtime is created only when the definition
+/// has a refresh period.
+NamedScalar::NamedScalar(ParsedDefinition definition_, INamedScalarValueBackend & value_backend_)
+    : definition(std::move(definition_))
+    , value_backend(value_backend_)
+{
+    if (definition.refresh_period_seconds)
+        refresh.emplace(*definition.refresh_period_seconds);
+}
+
+NamedScalar::~NamedScalar()
+{
+    
/// Move the task holder out from under refresh_mutex; its destructor + /// (deactivate + join) must not run while holding a catalog mutex. + BackgroundSchedulePoolTaskHolder local_task; + { + std::lock_guard lock(refresh_mutex); + local_task = std::move(task); + } +} + +NamedScalar::StoredValuePtr NamedScalar::loadCurrentValue() const +{ + std::shared_lock lock(current_value_mutex); + return current_value; +} + +void NamedScalar::storeCurrentValue(NamedScalar::StoredValuePtr ptr) +{ + std::unique_lock lock(current_value_mutex); + current_value = std::move(ptr); +} + +NamedScalar::EvaluatedSnapshot NamedScalar::evaluateAndEncode(const ContextPtr & context) const +{ + auto eval_context = buildDefinerEvalContext(context, definition.name, definition.definer); + + auto evaluated = evaluateExpression(definition.expression, eval_context, this); + + auto value = StoredValue::fromEvaluationSuccess( + evaluated.type, + std::move(evaluated.value), + getFQDNOrHostName(), + std::chrono::system_clock::now()); + String payload = encodeNamedScalarValueAndCheckSize(*value, context); + + return {std::move(value), std::move(payload)}; +} + +void NamedScalar::evaluateAndStoreValue(const ContextPtr & context) +{ + auto lease = value_backend.tryAcquireRefreshLease(definition.name, definition.uuid); + if (!lease) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Could not acquire refresh lease for named scalar '{}' " + "(another node may be creating the same scalar concurrently)", + definition.name); + + auto snapshot = evaluateAndEncode(context); + + std::lock_guard publish_lock(publish_mutex); + if (lease->publish(snapshot.payload) != RefreshPublishResult::Published) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Could not publish initial value for named scalar '{}' (lease publish lost the race)", + definition.name); + + storeCurrentValue(std::move(snapshot.value)); +} + +void NamedScalar::start(const ContextPtr & context) +{ + /// Soft-bail if already shut down. 
Reachable when a watcher reconcile
+    /// installs a replacement (and shuts down this scalar) between our
+    /// caller's `swapScalar` and `start()` — expected race in the watcher
+    /// path, not a bug. The replacement's own `start()` will run and
+    /// own the slot.
+    if (!live.load(std::memory_order_acquire))
+        return;
+    {
+        std::lock_guard refresh_lock(refresh_mutex);
+        if (refresh)
+        {
+            /// Seed the schedule anchor from the most recent successful
+            /// publish so refresh-on-restart lands at the next aligned
+            /// tick instead of immediately. If we have no value yet,
+            /// anchor to "now" so the first tick fires after one period.
+            /// NOTE(review): decideRefreshAction() evaluates immediately
+            /// whenever the cached value is missing or has no value, so the
+            /// "now" anchor only delays the first tick when a value is
+            /// present but not valid.
+            auto value = loadCurrentValue();
+            auto seed = (value && value->is_valid())
+                ? value->last_successful_update_time
+                : std::chrono::system_clock::now();
+            refresh->last_completed_timeslot = std::chrono::floor(seed);
+            refresh->next_refresh_time = nextRefreshTime(refresh->period_seconds, refresh->last_completed_timeslot);
+        }
+        /// The task is created and scheduled even without a refresh period.
+        wireTaskLocked(context);
+        task->schedule();
+    }
+
+    /// Arm the backend value watch (idempotent re-arm if Manager already
+    /// called loadValueFromBackend). Without this, a local-CREATE'd
+    /// scalar would not pick up subsequent peer publishes against its
+    /// own UUID until its first scheduled refresh tick.
+    if (value_backend.supportsValueWatches())
+        loadValueFromBackend();
+}
+
+/// Backend value-watch callback: records that a reload is pending and
+/// schedules the task if it exists and the scalar is still live.
+void NamedScalar::onValueChanged()
+{
+    /// Runs on the Keeper IO thread. Must remain non-blocking: flag the
+    /// reload, kick the refresh task to drain it.
+    reload_requested.store(true, std::memory_order_release);
+    std::lock_guard refresh_lock(refresh_mutex);
+    if (task && live.load(std::memory_order_acquire))
+        task->schedule();
+}
+
+/// Marks the scalar as no longer live, cancels a running SELECT if one is
+/// published, and releases the task holder.
+void NamedScalar::shutdown()
+{
+    live.store(false);
+    /// Interrupt the running SELECT (if any) before joining the task,
+    /// so the holder destructor doesn't block on the definer's
+    /// max_execution_time.
+    cancelInFlight();
+    deactivate();
+}
+
+/// Publishes the QueryStatus of the SELECT that is about to be pulled so
+/// that cancelInFlight() can interrupt it.
+///
+/// If the scalar was already shut down by the time the handle is published,
+/// the query is cancelled right here. shutdown() calls cancelInFlight() only
+/// once; without this check a shutdown that ran before the handle was
+/// published would miss the query, and deactivate() would then wait for the
+/// whole SELECT to finish.
+void NamedScalar::setInFlight(std::weak_ptr<QueryStatus> handle) const
+{
+    std::shared_ptr<QueryStatus> missed_cancel;
+    {
+        std::lock_guard lock(in_flight_mutex);
+        /// Invariant: BackgroundSchedulePoolTaskInfo::execute holds exec_mutex
+        /// across the whole function() call, so a scalar's task never runs
+        /// concurrently with itself.
+        /// NOTE(review): evaluateAndEncode() passes `this` on the CREATE-time
+        /// path too, so this function can be reached from there as well.
+        chassert(in_flight_query.expired() && "setInFlight called while previous refresh is still in flight");
+        in_flight_query = std::move(handle);
+
+        /// shutdown() clears `live` before it takes in_flight_mutex in
+        /// cancelInFlight(). So either cancelInFlight() runs after this
+        /// critical section and sees the handle, or it ran before and
+        /// `live` is already observed as false here.
+        if (!live.load(std::memory_order_acquire))
+            missed_cancel = in_flight_query.lock();
+    }
+    /// Cancel outside the lock, the same way cancelInFlight() does.
+    if (missed_cancel)
+        missed_cancel->cancelQuery(CancelReason::CANCELLED_BY_USER);
+}
+
+/// Drops the published in-flight handle.
+void NamedScalar::clearInFlight() const
+{
+    std::lock_guard lock(in_flight_mutex);
+    in_flight_query.reset();
+}
+
+/// Cancels the published in-flight query, if it is still alive. The handle
+/// is resolved under in_flight_mutex; the cancel call runs outside it.
+void NamedScalar::cancelInFlight()
+{
+    std::shared_ptr<QueryStatus> status;
+    {
+        std::lock_guard lock(in_flight_mutex);
+        status = in_flight_query.lock();
+    }
+    if (status)
+        status->cancelQuery(CancelReason::CANCELLED_BY_USER);
+}
+
+/// Releases the task holder. The holder is moved out under refresh_mutex and
+/// destroyed after the lock is released.
+void NamedScalar::deactivate()
+{
+    BackgroundSchedulePoolTaskHolder old;
+    {
+        std::lock_guard refresh_lock(refresh_mutex);
+        old = std::move(task);
+    }
+}
+
+/// Requests an out-of-schedule refresh and schedules the task if it exists.
+/// Throws NAMED_SCALAR_NOT_REFRESHABLE when the scalar has no refresh runtime.
+void NamedScalar::requestRefreshNow()
+{
+    std::lock_guard refresh_lock(refresh_mutex);
+    if (!refresh)
+        throw Exception(
+            ErrorCodes::NAMED_SCALAR_NOT_REFRESHABLE,
+            "named scalar '{}' is not refreshable",
+            definition.name);
+    refresh->out_of_schedule_refresh_requested = true;
+    if (task)
+        task->schedule();
+}
+
+/// Sets the paused flag; a no-op for scalars without a refresh runtime.
+/// Un-pausing schedules the task so the schedule is re-evaluated at once.
+void NamedScalar::setRefreshPaused(bool paused)
+{
+    std::lock_guard refresh_lock(refresh_mutex);
+    if (!refresh)
+        return;
+    refresh->paused = paused;
+    if (!paused && task)
+        task->schedule();
+}
+
+/// True when the scalar has a refresh runtime.
+bool NamedScalar::isRefreshable() const
+{
+    std::lock_guard refresh_lock(refresh_mutex);
+    return refresh.has_value();
+}
+
+/// Returns the cached value, its type and validity, or nullopt when nothing
+/// is cached or the cached entry carries no value. Never throws on absence.
+std::optional<NamedScalar::CurrentValue> NamedScalar::tryGetValue() const
+{
+    auto value = loadCurrentValue();
+    if (!value || !value->has_value())
+        return std::nullopt;
+    return CurrentValue{
+        .value = value->value,
+        .type = value->type,
+        .is_valid = value->is_valid(),
+    };
+}
+
+NamedScalar::Info
NamedScalar::getInfo() const
+{
+    /// Assembles a point-in-time description of the scalar: the refresh
+    /// snapshot, fields copied from the definition, and - when a value entry
+    /// is cached - its value, timestamps, hostname and last error.
+    Info info;
+    info.refresh = snapshotRefresh();
+    info.loading_start_time = definition.load_time;
+    info.expression = definition.expression;
+    info.definer = definition.definer;
+
+    auto value = loadCurrentValue();
+    if (value)
+    {
+        /// `info.value` stays empty when the entry carries no value; the
+        /// remaining fields are still filled from the entry.
+        if (value->has_value())
+        {
+            info.value = CurrentValue{
+                .value = value->value,
+                .type = value->type,
+                .is_valid = value->is_valid(),
+            };
+        }
+        info.last_refresh_time = value->last_update_time;
+        /// Reported only when the timestamp is past the clock's epoch.
+        if (value->last_successful_update_time.time_since_epoch().count() > 0)
+            info.last_success_time = value->last_successful_update_time;
+        info.last_refresh_hostname = value->last_update_hostname;
+        info.last_error = value->last_error;
+        info.last_error_type = value->last_error_type;
+    }
+    return info;
+}
+
+/// Copies the refresh runtime fields under refresh_mutex. For a scalar
+/// without a refresh runtime the default-constructed snapshot is returned.
+NamedScalar::RefreshSnapshot NamedScalar::snapshotRefresh() const
+{
+    std::lock_guard refresh_lock(refresh_mutex);
+    RefreshSnapshot snap;
+    if (refresh)
+    {
+        snap.refreshable = true;
+        snap.period_seconds = refresh->period_seconds;
+        snap.next_refresh_time = refresh->next_refresh_time;
+        snap.refresh_started_at = refresh->refresh_started_at;
+        snap.consecutive_failures = refresh->consecutive_failures;
+    }
+    return snap;
+}
+
+/// Creates the background task. The `Locked` suffix and the call site in
+/// start() indicate that refresh_mutex is held by the caller.
+void NamedScalar::wireTaskLocked(const ContextPtr & context)
+{
+    chassert(!task); // Wired exactly once by start(); never re-wired.
+
+    /// The single task handles refresh ticks AND drains `reload_requested`.
+    /// Even non-refreshable scalars get one because shared-cache scalars
+    /// still need to react to peer-driven value-watch fires.
+    const auto storage_id = StorageID("", fmt::format("named_scalar.{}", definition.name));
+
+    /// The task body holds only a weak reference to the scalar, so a pending
+    /// task does not keep it alive; it does hold a copy of `context`.
+    std::weak_ptr weak_self = weak_from_this();
+    task = context->getNamedScalarRefreshPool().createTask(storage_id, "NamedScalarRefresh",
+        [context, weak_self]
+        {
+            if (auto self = weak_self.lock())
+                self->runTask(context);
+        });
+}
+
+/// Computes the next scheduled tick from the anchor and stores the time the
+/// task should run at in `refresh->next_refresh_time`.
+///
+/// Outputs: `timeslot` is always the scheduled tick (anchor + period);
+/// `when` is that tick, or `now` for an out-of-schedule run.
+/// NOTE(review): for an out-of-schedule run `timeslot` therefore names a
+/// slot that has not come due yet. executeRefreshAction() passes it to
+/// acquireLeaseOrWait(), which records it as completed when the lease is
+/// busy - confirm that skipping that upcoming tick is intended.
+void NamedScalar::planNextRefreshLocked(
+    std::chrono::system_clock::time_point now,
+    bool out_of_schedule,
+    std::chrono::system_clock::time_point & when,
+    std::chrono::sys_seconds & timeslot)
+{
+    /// Equal-cadence schedule: next tick is `last_completed + period`.
+    const auto last_completed = std::chrono::sys_seconds(refresh->last_completed_timeslot);
+    when = nextRefreshTime(refresh->period_seconds, last_completed);
+    timeslot = std::chrono::floor(when);
+    if (out_of_schedule)
+        when = now;
+    refresh->next_refresh_time = when;
+}
+
+/// Moves the schedule anchor to `timeslot` without touching the failure
+/// counter.
+void NamedScalar::recordSkipByPeerLocked(std::chrono::sys_seconds timeslot)
+{
+    /// Mark this tick completed locally so we advance to the NEXT tick;
+    /// without this, losing the per-tick lock would loop us straight
+    /// back into the same timeslot, defeating one-leader-per-tick.
+    refresh->last_completed_timeslot = timeslot;
+}
+
+/// Updates the consecutive-failure counter and, for scheduled runs, moves
+/// the schedule anchor to `timeslot`.
+void NamedScalar::recordRefreshOutcomeLocked(
+    bool succeeded,
+    bool out_of_schedule,
+    std::chrono::sys_seconds timeslot)
+{
+    if (succeeded)
+        refresh->consecutive_failures = 0;
+    else
+        refresh->consecutive_failures += 1;
+
+    /// Advance schedule anchor on success OR failure - otherwise a
+    /// failure leaves `now >= next` and the next tick fires immediately,
+    /// producing a tight retry loop. Manual SYSTEM REFRESH (out_of_schedule)
+    /// is additive: it does NOT advance the anchor, so the next
+    /// scheduled tick still happens.
+    if (!out_of_schedule)
+        refresh->last_completed_timeslot = timeslot;
+}
+
+/// Reads the value blob for this scalar's UUID from the backend, arming a
+/// value watch when none is active, and caches the decoded value.
+///
+/// Returns without changing the cache when the scalar is not live, the blob
+/// is missing, or it cannot be decoded. Exceptions do not escape: they are
+/// logged and the reload is retried through the task.
+void NamedScalar::loadValueFromBackend()
+{
+    try
+    {
+        if (!live.load(std::memory_order_acquire))
+            return;
+
+        std::optional value_blob;
+        if (value_watch_active.load(std::memory_order_acquire))
+        {
+            /// A data-watch is already registered for our value znode; just
+            /// refresh our snapshot without re-arming. Saves a Keeper round
+            /// trip on the redundant start()-after-loadValueFromBackend
+            /// path used by the watch-driven peer-CREATE / restart flows.
+            value_blob = value_backend.readValueBlob(definition.uuid);
+        }
+        else
+        {
+            /// Claim the right to arm BEFORE registering. If the callback
+            /// races with us and clears the flag while we're inside
+            /// readValueBlobAndWatch, the next loadValueFromBackend will
+            /// re-arm. Setting the flag AFTER would leave us with the flag
+            /// true and no watch (callback already fired and cleared).
+            /// Rollback on throw.
+            value_watch_active.store(true, std::memory_order_release);
+            try
+            {
+                /// The callback holds only a weak reference; it is a no-op
+                /// once the scalar has been destroyed.
+                value_blob = value_backend.readValueBlobAndWatch(
+                    definition.uuid,
+                    [weak_self = weak_from_this()]
+                    {
+                        if (auto self = weak_self.lock())
+                        {
+                            self->value_watch_active.store(false, std::memory_order_release);
+                            self->onValueChanged();
+                        }
+                    });
+            }
+            catch (...)
+            {
+                value_watch_active.store(false, std::memory_order_release);
+                throw;
+            }
+        }
+
+        if (!value_blob)
+            return;
+
+        auto snapshot = tryDecodeNamedScalarValueBlob(*value_blob, definition.name, getLogger("NamedScalar"));
+        if (!snapshot)
+            return;
+        auto value = std::make_shared(*snapshot);
+
+        /// Re-check `live` under publish_mutex so a value is not installed
+        /// after shutdown() has begun.
+        std::lock_guard publish_lock(publish_mutex);
+        if (live.load(std::memory_order_acquire))
+            storeCurrentValue(std::move(value));
+    }
+    catch (...)
+    {
+        /// Best effort: log, mark the reload as still pending and retry
+        /// through the task in one second (if the task exists and the
+        /// scalar is live).
+        tryLogCurrentException(getLogger("NamedScalar"),
+            fmt::format("reloading persisted value for named scalar '{}'", definition.name));
+        reload_requested.store(true, std::memory_order_release);
+        std::lock_guard refresh_lock(refresh_mutex);
+        if (task && live.load(std::memory_order_acquire))
+            task->scheduleAfter(1000);
+    }
+}
+
+/// Tries to acquire the refresh lease for this scalar.
+///
+/// Returns the lease on success. Otherwise returns nullopt after having
+/// rescheduled the task itself: after 1 s when the backend threw, or after a
+/// bounded delay when the lease was unavailable. In the latter case
+/// `timeslot` is recorded as completed for refreshable scalars.
+std::optional NamedScalar::acquireLeaseOrWait(std::chrono::sys_seconds timeslot)
+{
+    /// Treat transient store errors as "unavailable": retry without
+    /// advancing the schedule anchor. A missing lease means a peer won
+    /// the refresh race for this tick.
+    try
+    {
+        auto lease = value_backend.tryAcquireRefreshLease(definition.name, definition.uuid);
+        if (lease)
+            return lease;
+    }
+    catch (...)
+    {
+        tryLogCurrentException(getLogger("NamedScalar"),
+            fmt::format("named scalar '{}' refresh: acquiring refresh lease threw, retrying in 1s", definition.name));
+        std::lock_guard refresh_lock(refresh_mutex);
+        if (task && live.load(std::memory_order_acquire))
+            task->scheduleAfter(1000);
+        return std::nullopt;
+    }
+
+    /// Reached only when tryAcquireRefreshLease returned no lease.
+    ProfileEvents::increment(ProfileEvents::NamedScalarRefreshSkippedByPeer);
+    {
+        std::lock_guard refresh_lock(refresh_mutex);
+        if (refresh)
+            recordSkipByPeerLocked(timeslot);
+    }
+
+    /// Bounded fallback so a peer dying with the ephemeral lock does not
+    /// leave this replica stale until much later operator action.
+    /// Retry after REFRESH_LEASE_BUSY_RETRY_MS, but never later than one
+    /// refresh period when the definition has one.
+    size_t retry_ms = REFRESH_LEASE_BUSY_RETRY_MS;
+    if (definition.refresh_period_seconds)
+        retry_ms = std::min(
+            retry_ms,
+            static_cast(std::chrono::milliseconds(std::chrono::seconds(*definition.refresh_period_seconds)).count()));
+
+    std::lock_guard refresh_lock(refresh_mutex);
+    if (task && live.load(std::memory_order_acquire))
+        task->scheduleAfter(retry_ms);
+    return std::nullopt;
+}
+
+/// Evaluates the expression and publishes the outcome through
+/// `refresh_lease`, updating the cached value to match.
+///
+/// Result encoding, as read by executeRefreshAction():
+///   * `published == true` - a fresh value was published and cached;
+///   * `published == false`, empty `error_message` - nothing was recorded
+///     (publish diverged, or the scalar was shut down before publishing);
+///   * `published == false`, non-empty `error_message` - the run failed.
+/// `previous` is the value cached before the run; it is only used to build
+/// the failure marker.
+NamedScalar::RefreshRunResult NamedScalar::evaluateAndPublishUnderLease(
+    const ContextPtr & context,
+    StoredValuePtr previous,
+    NamedScalarRefreshLease & refresh_lease)
+{
+    RefreshRunResult result;
+
+    try
+    {
+        auto snapshot = evaluateAndEncode(context);
+
+        /// The SELECT ran without publish_mutex; take it only for the
+        /// publish and the cache swap, and drop the result if the scalar
+        /// was shut down in the meantime.
+        std::lock_guard publish_lock(publish_mutex);
+        if (!live.load(std::memory_order_acquire))
+            return result;
+
+        switch (refresh_lease.publish(snapshot.payload))
+        {
+            case RefreshPublishResult::Published:
+                storeCurrentValue(std::move(snapshot.value));
+                result.published = true;
+                break;
+            case RefreshPublishResult::Diverged:
+                /// Peer OR REPLACE / DROP. Watcher will install a fresh
+                /// scalar; this run is a no-op (empty error_message marks
+                /// it as such, so we don't bump consecutive_failures).
+                break;
+        }
+    }
+    catch (...)
+    {
+        /// Failure path. Note that this handler also catches exceptions
+        /// thrown by the publish call above, not only by the evaluation.
+        result.error_message = getCurrentExceptionMessage(false);
+        result.error_type = ErrorCodes::getName(getCurrentExceptionCode());
+
+        LOG_ERROR(getLogger("NamedScalar"),
+            "named scalar '{}' refresh failed [{}]: {}",
+            definition.name, result.error_type, result.error_message);
+
+        /// Build a failure marker from the value that was cached before the
+        /// run (`previous` may be null).
+        auto failure = StoredValue::fromEvaluationFailure(
+            previous.get(), result.error_message, result.error_type, getFQDNOrHostName(),
+            std::chrono::system_clock::now());
+
+        std::lock_guard publish_lock(publish_mutex);
+        if (!live.load(std::memory_order_acquire))
+            return result;
+
+        try
+        {
+            const String payload = encodeNamedScalarValueAndCheckSize(*failure, context);
+            switch (refresh_lease.publish(payload))
+            {
+                case RefreshPublishResult::Published:
+                    storeCurrentValue(failure);
+                    break;
+                case RefreshPublishResult::Diverged:
+                    /// Peer divergence: pretend the failure didn't happen.
+                    result.error_message.clear();
+                    result.error_type.clear();
+                    break;
+            }
+        }
+        catch (...)
+        {
+            /// The marker could not be encoded or published: keep it in the
+            /// local cache only and log the secondary error.
+            storeCurrentValue(failure);
+            tryLogCurrentException(getLogger("NamedScalar"),
+                fmt::format("persisting failure marker for '{}'", definition.name));
+        }
+    }
+
+    return result;
+}
+
+/// Decides what the task should do on this run: Skip, WaitUntil a point in
+/// time, or EvaluateNow.
+///
+/// `needs_population` is true when nothing is cached or the cached entry has
+/// no value; `populate_only` is true for scalars without a refresh runtime.
+/// Consumes the pending out-of-schedule request as a side effect.
+NamedScalar::RefreshAction NamedScalar::decideRefreshAction()
+{
+    auto value_at_start = loadCurrentValue();
+    const bool needs_population = !(value_at_start && value_at_start->has_value());
+
+    std::lock_guard refresh_lock(refresh_mutex);
+    const bool populate_only = !refresh;
+
+    /// `populate_only && !needs_population` is reachable for non-refreshable
+    /// shared-cache scalars whose initial value is already published by a
+    /// peer (we read it via loadValueFromBackend at start). Nothing to do.
+    if (populate_only && !needs_population)
+        return {RefreshActionKind::Skip, {}, {}, false};
+
+    /// Manual SYSTEM REFRESH bypasses `paused`.
+    /// A paused scalar that already has a value just re-checks after one
+    /// period. A scalar that still needs population is not held back by
+    /// `paused`.
+    if (!populate_only && !needs_population && refresh->paused
+        && !refresh->out_of_schedule_refresh_requested)
+    {
+        const auto delay = std::chrono::milliseconds(std::chrono::seconds(refresh->period_seconds));
+        return {RefreshActionKind::WaitUntil, std::chrono::system_clock::now() + delay, {}, false};
+    }
+
+    /// Defaults cover the populate-only case: run now, out of schedule,
+    /// with the current second as the timeslot and `when` left at its
+    /// default-constructed value.
+    const auto now = std::chrono::system_clock::now();
+    bool out_of_schedule = true;
+    std::chrono::system_clock::time_point when;
+    std::chrono::sys_seconds timeslot = std::chrono::floor(now);
+
+    if (!populate_only)
+    {
+        if (needs_population)
+        {
+            /// First population ignores the schedule; any pending manual
+            /// request is consumed by it.
+            out_of_schedule = true;
+            refresh->out_of_schedule_refresh_requested = false;
+        }
+        else
+        {
+            out_of_schedule = refresh->out_of_schedule_refresh_requested;
+            refresh->out_of_schedule_refresh_requested = false;
+            planNextRefreshLocked(now, out_of_schedule, when, timeslot);
+        }
+    }
+
+    /// Scheduled run that is not due yet: sleep until the planned tick.
+    if (!populate_only && !needs_population && !out_of_schedule && now < when)
+        return {RefreshActionKind::WaitUntil, when, timeslot, false};
+
+    return {RefreshActionKind::EvaluateNow, when, timeslot, out_of_schedule};
+}
+
+/// Carries out the action chosen by decideRefreshAction(): returns for Skip,
+/// reschedules the task for WaitUntil, or runs one refresh under the lease
+/// for EvaluateNow and reschedules according to its outcome.
+void NamedScalar::executeRefreshAction(const ContextPtr & context, RefreshAction action)
+{
+    if (action.kind == RefreshActionKind::Skip)
+        return;
+
+    if (action.kind == RefreshActionKind::WaitUntil)
+    {
+        /// A target time in the past is clamped to "run immediately".
+        auto delay = std::chrono::duration_cast(
+            action.when - std::chrono::system_clock::now());
+        size_t delay_ms = delay.count() > 0 ?
static_cast(delay.count()) : 0; + std::lock_guard refresh_lock(refresh_mutex); + if (task && live.load(std::memory_order_acquire)) + task->scheduleAfter(delay_ms); + return; + } + + chassert(action.kind == RefreshActionKind::EvaluateNow); + + std::optional refresh_lease = acquireLeaseOrWait(action.timeslot); + if (!refresh_lease) + return; + + ProfileEvents::increment(ProfileEvents::NamedScalarRefreshAttempts); + const auto refresh_body_started = std::chrono::steady_clock::now(); + + { + std::lock_guard refresh_lock(refresh_mutex); + if (refresh) + refresh->refresh_started_at = std::chrono::system_clock::now(); + } + + auto value_at_start = loadCurrentValue(); + RefreshRunResult result = evaluateAndPublishUnderLease(context, value_at_start, *refresh_lease); + + ProfileEvents::increment(ProfileEvents::NamedScalarRefreshDurationMicroseconds, + std::chrono::duration_cast( + std::chrono::steady_clock::now() - refresh_body_started).count()); + + std::lock_guard refresh_lock(refresh_mutex); + if (!refresh) + { + if (!result.published && task && live.load(std::memory_order_acquire)) + task->scheduleAfter(1000); + return; + } + refresh->refresh_started_at.reset(); + + /// Peer divergence (no-op outcome): don't bump consecutive_failures. + /// Back off so we don't tight-loop while a peer DROP+watcher reconcile + /// is still propagating (acquire lease → Diverged → schedule()). + if (!result.published && result.error_message.empty()) + { + if (task && live.load(std::memory_order_acquire)) + task->scheduleAfter(REFRESH_LEASE_BUSY_RETRY_MS); + return; + } + + ProfileEvents::increment(result.published + ? 
ProfileEvents::NamedScalarRefreshSuccesses + : ProfileEvents::NamedScalarRefreshFailures); + recordRefreshOutcomeLocked(result.published, action.out_of_schedule, action.timeslot); + + if (task && live.load(std::memory_order_acquire)) + task->schedule(); +} + +void NamedScalar::runTask(const ContextPtr & context) +{ + if (!live.load(std::memory_order_acquire)) + return; + + /// Drain any pending backend value-watch fire BEFORE deciding whether + /// to refresh: a peer's fresh value should be visible to the refresh + /// logic so it can skip an evaluation that's no longer needed. + if (reload_requested.exchange(false, std::memory_order_acq_rel)) + loadValueFromBackend(); + + executeRefreshAction(context, decideRefreshAction()); +} + +} diff --git a/src/Interpreters/NamedScalars/NamedScalar.h b/src/Interpreters/NamedScalars/NamedScalar.h new file mode 100644 index 000000000000..9d092f274646 --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalar.h @@ -0,0 +1,278 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class NamedScalar; +class QueryStatus; +using NamedScalarPtr = std::shared_ptr; + +/// Per-scalar lifecycle owner. +/// +/// What it owns: +/// * a parsed, immutable definition (UUID + expression + definer + +/// optional refresh period); +/// * the current cached value; +/// * a single refresh task that evaluates SELECT under the DEFINER +/// and publishes via the value backend; +/// * an atomic reload flag flipped by the backend's value-watch +/// callback (peer published a fresh value) and drained at task entry. +/// +/// What it does NOT own: +/// * SQL parsing - manager builds ParsedDefinition via +/// NamedScalarDefinitionParse and hands it in; +/// * definition persistence - manager's definition_store does that; +/// * UUID minting - manager extracts/generates the UUID before construction. 
+/// +/// Definition lifecycle: definitions are immutable. CREATE OR REPLACE +/// produces a NEW NamedScalar with a NEW UUID; the manager swaps it +/// into the catalog map and shuts down the old one. There is no +/// in-place reconcile path here. +/// +/// Concurrency: +/// * `current_value_mutex` guards the cached value (RW lock; readers +/// copy out a shared_ptr and drop the lock immediately). +/// * `publish_mutex` serialises every publish: refresh task body, +/// watch-driven reload, initial evaluate. Holds across encode/store +/// and the post-publish current_value swap. +/// * `refresh_mutex` guards the refresh runtime + the task holder. +/// Task holders are always moved out from under it before +/// destruction (joining-while-locked deadlocks against the body's +/// own exit path). +/// * `live` + task destruction in shutdown() fences DROP / OR REPLACE +/// against in-flight refresh: the holder's destructor joins, the +/// task body checks `live` and returns early on subsequent runs. +class NamedScalar : public std::enable_shared_from_this +{ +public: + struct RefreshSnapshot + { + bool refreshable = false; + UInt64 period_seconds = 0; + std::optional next_refresh_time; + std::optional refresh_started_at; + UInt64 consecutive_failures = 0; + }; + + struct CurrentValue + { + Field value; + DataTypePtr type; + bool is_valid = false; + }; + + struct Info + { + std::optional value; + RefreshSnapshot refresh; + std::chrono::system_clock::time_point loading_start_time; + std::optional last_refresh_time; + std::optional last_success_time; + ASTPtr expression; + String definer; + String last_refresh_hostname; + String last_error; + String last_error_type; + }; + + NamedScalar(ParsedDefinition definition, INamedScalarValueBackend & value_backend); + ~NamedScalar(); + + NamedScalar(const NamedScalar &) = delete; + NamedScalar & operator=(const NamedScalar &) = delete; + + /// Initial-value paths. 
Manager calls exactly one of these before + /// `start()`: + /// * `evaluateAndStoreValue` - local CREATE: run SELECT under + /// DEFINER, encode, publish via backend lease. Throws on any + /// failure (missing definer, lease unavailable, SELECT error, + /// publish race lost) - caller rolls back the CREATE. + /// * `loadValueFromBackend` - restart-from-store / peer-CREATE + /// seen via watch: read the existing blob and arm the data-watch. + /// Returns silently if the blob is missing (refresh task will + /// populate). Logs and reschedules the refresh task on transient + /// backend errors; never throws past the catch. + /// Also used internally on watch-fire to drain `reload_requested`. + void evaluateAndStoreValue(const ContextPtr & context); + void loadValueFromBackend(); + + /// Wire the refresh task and schedule it. After `start()` the + /// scalar runs autonomously: refresh on schedule, reload on + /// backend value-watch fire. + void start(const ContextPtr & context); + + /// Fence DROP/OR-REPLACE against in-flight tasks. The caller's + /// subsequent durable removal happens only after any running + /// publish has returned. Must run on a non-schedule-pool thread. + void shutdown(); + + /// Throws NAMED_SCALAR_NOT_REFRESHABLE if no REFRESH clause. + void requestRefreshNow(); + void setRefreshPaused(bool paused); + bool isRefreshable() const; + + /// Refresh-body in-flight cancellation hooks (set by executeScalarSelect + /// around the running PipelineExecutor; cancelled from shutdown()). + void setInFlight(std::weak_ptr handle) const; + void clearInFlight() const; + void cancelInFlight(); + + /// Non-throwing peek for callers that have their own fallback + /// (`getNamedScalarOrDefault`, `system.named_scalars`). + std::optional tryGetValue() const; + + /// Snapshot for system.named_scalars. 
+ Info getInfo() const; + + const String & getName() const { return definition.name; } + const String & getUUID() const { return definition.uuid; } + +private: + using StoredValuePtr = std::shared_ptr; + + struct RefreshRuntime + { + explicit RefreshRuntime(UInt64 period_seconds_) : period_seconds(period_seconds_) {} + + UInt64 period_seconds = 0; + /// Schedule anchor; next tick fires at last_completed + period. + std::chrono::sys_seconds last_completed_timeslot; + /// Bumped on failure, reset on success. Surfaced via + /// system.named_scalars; not used for refresh scheduling. + UInt64 consecutive_failures = 0; + std::chrono::system_clock::time_point next_refresh_time; + std::optional refresh_started_at; + bool paused = false; + bool out_of_schedule_refresh_requested = false; + }; + + struct RefreshRunResult + { + bool published = false; + String error_message; + String error_type; + }; + + struct EvaluatedSnapshot + { + StoredValuePtr value; + String payload; + }; + + enum class RefreshActionKind : UInt8 + { + /// Nothing to do this tick (populate-only with already-populated + /// value, or refresh paused without override). + Skip, + /// Run a refresh evaluation now (out-of-schedule or due). + EvaluateNow, + /// Wait until `when` then re-enter the task. + WaitUntil, + }; + + struct RefreshAction + { + RefreshActionKind kind = RefreshActionKind::Skip; + std::chrono::system_clock::time_point when; + std::chrono::sys_seconds timeslot; + bool out_of_schedule = false; + }; + + StoredValuePtr loadCurrentValue() const; + void storeCurrentValue(StoredValuePtr ptr); + + /// Build definer context, run SELECT, wrap into StoredValue, encode + /// to wire-format and check size. Throws on definer-missing, + /// SELECT failure, or oversize blob. + EvaluatedSnapshot evaluateAndEncode(const ContextPtr & context) const; + + RefreshSnapshot snapshotRefresh() const; + void deactivate(); + + /// Backend value-watch fire. Sets `reload_requested` and schedules + /// the refresh task. 
Cheap enough to run on the Keeper IO thread. + void onValueChanged(); + + /// Refresh task body. Drains reload_requested, then decides + executes. + void runTask(const ContextPtr & context); + + /// Decide what to do under refresh_mutex; returns a value object the + /// caller acts on without holding any lock. + RefreshAction decideRefreshAction(); + + /// Acquire lease, evaluate, publish, record outcome. Re-enters + /// refresh_mutex internally; never holds it across I/O. + void executeRefreshAction(const ContextPtr & context, RefreshAction action); + + std::optional acquireLeaseOrWait(std::chrono::sys_seconds timeslot); + RefreshRunResult evaluateAndPublishUnderLease( + const ContextPtr & context, + StoredValuePtr previous, + NamedScalarRefreshLease & refresh_lease); + + /// Create the refresh task. Caller holds `refresh_mutex`. Wired + /// exactly once by `start()`; never re-wired in this scalar's life. + void wireTaskLocked(const ContextPtr & context); + + void planNextRefreshLocked(std::chrono::system_clock::time_point now, + bool out_of_schedule, + std::chrono::system_clock::time_point & when, + std::chrono::sys_seconds & timeslot); + void recordSkipByPeerLocked(std::chrono::sys_seconds timeslot); + void recordRefreshOutcomeLocked(bool succeeded, + bool out_of_schedule, + std::chrono::sys_seconds timeslot); + + const ParsedDefinition definition; + INamedScalarValueBackend & value_backend; + + mutable std::shared_mutex current_value_mutex; + StoredValuePtr current_value; + + std::atomic live{true}; + + /// Set by the backend value-watch callback (peer wrote a fresh value). + /// Drained by `runTask` at entry: if set, reload from the backend + /// before doing the regular refresh-due check. + std::atomic reload_requested{false}; + + /// True iff a Keeper data-watch on the value znode is currently + /// registered. Set when we register, cleared by the watch callback + /// when it fires (Keeper watches are one-shot). 
Spurious false ⇒ + /// next loadValueFromBackend re-arms; spurious true is impossible + /// because we never set it after the registration completes. + std::atomic value_watch_active{false}; + + mutable std::mutex publish_mutex; + + mutable std::mutex refresh_mutex; + std::optional refresh; + BackgroundSchedulePoolTaskHolder task; + + /// Handle to the in-flight refresh query, if any. Set while the + /// refresh body executes; cleared on completion. Used by + /// cancelInFlight() to interrupt a running SELECT on shutdown / + /// DROP / OR REPLACE. + mutable std::mutex in_flight_mutex; + mutable std::weak_ptr in_flight_query; +}; + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarDefinitionParse.cpp b/src/Interpreters/NamedScalars/NamedScalarDefinitionParse.cpp new file mode 100644 index 000000000000..ca1984a86b82 --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarDefinitionParse.cpp @@ -0,0 +1,159 @@ +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace Setting +{ + extern const SettingsUInt64 max_parser_depth; + extern const SettingsUInt64 max_parser_backtracks; +} + +namespace +{ + +ASTPtr parseDDLBlob(const String & blob, const ContextPtr & context) +{ + ParserNamedScalarDDLQuery parser; + auto ast = parseQuery( + parser, + blob.data(), + blob.data() + blob.size(), + "", + 0, + context->getSettingsRef()[Setting::max_parser_depth], + context->getSettingsRef()[Setting::max_parser_backtracks]); + if (ast && !ast->as()) + return nullptr; + return ast; +} + +} + +ParsedDefinitionBlob parseDefinitionBlob(const String & blob, const ContextPtr & context) +{ + ParsedDefinitionBlob result; + + result.ast = parseDDLBlob(blob, context); + const auto * create_query = result.ast ? 
result.ast->as() : nullptr; + if (!create_query) + return result; + + if (create_query->uuid == UUIDHelpers::Nil) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Named scalar definition SQL must contain an explicit UUID"); + result.uuid = toString(create_query->uuid); + + const auto * sql_security = create_query->sql_security ? create_query->sql_security->as() : nullptr; + if (!sql_security || sql_security->type != SQLSecurityType::DEFINER || !sql_security->definer) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Named scalar definition SQL must contain an explicit SQL SECURITY DEFINER identity"); + result.definer = sql_security->definer->toString(); + + return result; +} + +std::optional parseAndValidateDefinition( + const String & name, + const String & definition_blob, + std::chrono::system_clock::time_point load_time, + const ContextPtr & context, + LoggerPtr log) +{ + ParsedDefinitionBlob blob; + try + { + blob = parseDefinitionBlob(definition_blob, context); + } + catch (...) + { + tryLogCurrentException(log, fmt::format("parsing named scalar definition '{}'", name)); + return std::nullopt; + } + + const auto * create_query = blob.ast ? blob.ast->as() : nullptr; + if (!create_query) + { + LOG_WARNING(log, + "Persisted definition for named scalar '{}' is not a CREATE NAMED SCALAR query", + name); + return std::nullopt; + } + + return ParsedDefinition{ + .name = name, + .uuid = blob.uuid, + .expression = create_query->expression, + .definer = blob.definer, + .refresh_period_seconds = create_query->refresh_period_seconds, + .load_time = load_time, + }; +} + +std::optional getNamedScalarUUIDFromSerializedDefinition( + const String & definition_blob, + const ContextPtr & context, + LoggerPtr log) +{ + try + { + return parseDefinitionBlob(definition_blob, context).uuid; + } + catch (...) 
+ { + tryLogCurrentException(log, "parsing named scalar UUID"); + return std::nullopt; + } +} + +std::optional getNamedScalarCacheKindFromSerializedDefinition( + const String & definition_blob, + const ContextPtr & context, + LoggerPtr log) +{ + try + { + auto parsed = parseDefinitionBlob(definition_blob, context); + const auto * create_query = parsed.ast ? parsed.ast->as() : nullptr; + if (!create_query) + return std::nullopt; + switch (create_query->cache_kind) + { + case ASTNamedScalarDDLQuery::CacheKind::Shared: + return NamedScalarCacheKind::Shared; + case ASTNamedScalarDDLQuery::CacheKind::Default: + case ASTNamedScalarDDLQuery::CacheKind::Local: + return NamedScalarCacheKind::Local; + } + UNREACHABLE(); + } + catch (...) + { + tryLogCurrentException(log, "parsing named scalar cache kind"); + return std::nullopt; + } +} + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarDefinitionParse.h b/src/Interpreters/NamedScalars/NamedScalarDefinitionParse.h new file mode 100644 index 000000000000..72b1c2d1d65f --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarDefinitionParse.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +enum class NamedScalarCacheKind : UInt8; + +/// Fully resolved named-scalar definition. Manager builds one from a +/// definition blob (CREATE-NAMED-SCALAR text envelope) and hands it to +/// NamedScalar's constructor. Once built, the definition is immutable - +/// any subsequent CREATE OR REPLACE produces a brand-new ParsedDefinition +/// (and a brand-new NamedScalar) instead of mutating an existing one. +struct ParsedDefinition +{ + String name; + + /// UUID materialised at CREATE time. Used as the value-backend key, + /// so OR REPLACE picks up a fresh value slot. + String uuid; + + /// SELECT expression body. Parsed once; refresh task evaluates it + /// under DEFINER context on every tick. + ASTPtr expression; + + /// SQL SECURITY DEFINER user (mandatory). 
+ String definer; + + /// Optional REFRESH EVERY N period. Absent ⇒ scalar is one-shot. + std::optional refresh_period_seconds; + + /// When the scalar was first parsed by this server. Stable across + /// restarts only insofar as the CREATE query is unchanged. + std::chrono::system_clock::time_point load_time; +}; + +/// Result of parsing a named-scalar definition blob (the textual envelope +/// stored in the definition store: full CREATE query + UUID + DEFINER). +struct ParsedDefinitionBlob +{ + /// Parsed CREATE query AST. Null if the blob did not parse cleanly + /// (logged at the call site that uses the result). + ASTPtr ast; + + /// SQL SECURITY DEFINER identity (mandatory). + String definer; + + /// Materialized UUID from the CREATE query (mandatory). Used by value + /// backends as the value-key, so OR REPLACE picks up a fresh slot. + String uuid; +}; + +/// Parse a definition blob. Throws BAD_ARGUMENTS if the blob parses but +/// is missing the mandatory UUID or DEFINER. Returns a result with null +/// `ast` if the blob does not parse as a CREATE NAMED SCALAR query. +ParsedDefinitionBlob parseDefinitionBlob(const String & blob, const ContextPtr & context); + +/// Build a fully-validated ParsedDefinition from a definition blob. +/// Returns nullopt and logs if the blob does not parse cleanly; throws +/// BAD_ARGUMENTS if the blob parses but is missing required fields. +std::optional parseAndValidateDefinition( + const String & name, + const String & definition_blob, + std::chrono::system_clock::time_point load_time, + const ContextPtr & context, + LoggerPtr log); + +/// Extract just the UUID. Returns nullopt and logs if parsing fails. +std::optional getNamedScalarUUIDFromSerializedDefinition( + const String & definition_blob, + const ContextPtr & context, + LoggerPtr log); + +/// Extract just the cache kind. Returns nullopt and logs if parsing fails. 
+std::optional getNamedScalarCacheKindFromSerializedDefinition( + const String & definition_blob, + const ContextPtr & context, + LoggerPtr log); + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreLocal.cpp b/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreLocal.cpp new file mode 100644 index 000000000000..a6eefd4313a1 --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreLocal.cpp @@ -0,0 +1,245 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +#include +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +namespace Setting +{ + extern const SettingsBool fsync_metadata; +} + +namespace ErrorCodes +{ + extern const int DIRECTORY_DOESNT_EXIST; + extern const int NAMED_SCALAR_ALREADY_EXISTS; + extern const int NAMED_SCALAR_NOT_FOUND; +} + +namespace +{ + +constexpr std::string_view file_prefix = "named_scalar_"; +constexpr std::string_view definition_suffix = ".sql"; + +String makeDirectoryPathCanonical(const String & directory_path) +{ + auto canonical_directory_path = std::filesystem::weakly_canonical(directory_path); + if (canonical_directory_path.has_filename()) + canonical_directory_path += std::filesystem::path::preferred_separator; + return canonical_directory_path; +} + +void createDirectory(const String & dir_path) +{ + std::error_code create_dir_error_code; + fs::create_directories(dir_path, create_dir_error_code); + if (!fs::exists(dir_path) || !fs::is_directory(dir_path) || create_dir_error_code) + throw Exception( + ErrorCodes::DIRECTORY_DOESNT_EXIST, + "Couldn't create directory {} reason: '{}'", + dir_path, + create_dir_error_code.message()); +} + +String getDefinitionFilePath(const String & dir_path, const String & name) +{ + return dir_path + String(file_prefix) + escapeForFileName(name) + String(definition_suffix); +} + +std::optional parseDefinitionFileName(const String & 
file_name) +{ + if (!file_name.starts_with(file_prefix) || !file_name.ends_with(definition_suffix)) + return std::nullopt; + + size_t prefix_length = file_prefix.size(); + size_t suffix_length = definition_suffix.size(); + String escaped_name = file_name.substr(prefix_length, file_name.length() - prefix_length - suffix_length); + + String name = unescapeForFileName(escaped_name); + if (name.empty()) + return std::nullopt; + + return name; +} + +} + +NamedScalarDefinitionStoreLocal::NamedScalarDefinitionStoreLocal(String dir_path_, LoggerPtr log_) + : dir_path(makeDirectoryPathCanonical(dir_path_)) + , log(std::move(log_)) +{ +} + +void NamedScalarDefinitionStoreLocal::initialize() +{ + createDirectory(dir_path); +} + +std::vector NamedScalarDefinitionStoreLocal::loadAll() +{ + LOG_INFO(log, "Loading named scalar definitions from {}", dir_path); + + if (!std::filesystem::exists(dir_path)) + { + LOG_DEBUG(log, "The directory for named scalars ({}) does not exist: nothing to load", dir_path); + return {}; + } + + { + Poco::DirectoryIterator sweep_end; + for (Poco::DirectoryIterator it(dir_path); it != sweep_end; ++it) + { + const String & file_name = it.name(); + if (file_name.find(".tmp.") != String::npos) + { + try { std::filesystem::remove(dir_path + file_name); } + catch (...) { tryLogCurrentException(log, fmt::format("while sweeping orphan temp {}", file_name)); } + } + } + } + + std::vector objects; + + Poco::DirectoryIterator dir_end; + for (Poco::DirectoryIterator it(dir_path); it != dir_end; ++it) + { + if (it->isDirectory()) + continue; + + const String & file_name = it.name(); + auto object_name = parseDefinitionFileName(file_name); + if (!object_name) + continue; + + try { NamedScalarsManager::checkName(*object_name); } + catch (...) 
+ { + tryLogCurrentException(log, fmt::format("rejecting on-disk scalar '{}': name violates cap", *object_name)); + continue; + } + + const String path = dir_path + file_name; + + try + { + ReadBufferFromFile in(path); + String contents; + readStringUntilEOF(contents, in); + + objects.push_back({ + *object_name, + std::move(contents), + }); + } + catch (...) + { + tryLogCurrentException(log, fmt::format("while loading named scalar definition from {}", path)); + } + } + + return objects; +} + +bool NamedScalarDefinitionStoreLocal::definitionExists(const String & name) +{ + return fs::exists(getDefinitionFilePath(dir_path, name)); +} + +bool NamedScalarDefinitionStoreLocal::publishDefinition( + const String & name, + const String & definition_blob, + bool if_not_exists, + bool or_replace, + const Settings & settings) +{ + const String file_path = getDefinitionFilePath(dir_path, name); + LOG_DEBUG(log, "Storing named scalar definition {} to file {}", name, file_path); + + const bool exists = fs::exists(file_path); + if (exists && !or_replace) + { + if (if_not_exists) + return false; + throw Exception(ErrorCodes::NAMED_SCALAR_ALREADY_EXISTS, "Named scalar '{}' already exists", name); + } + + /// Random-suffix temp file: two concurrent writers for the same + /// name would otherwise corrupt each other's temp / catch path. + const String temp_file_path = file_path + ".tmp." + getRandomASCIIString(8); + + try + { + WriteBufferFromFile out(temp_file_path, definition_blob.size()); + writeString(definition_blob, out); + out.next(); + if (settings[Setting::fsync_metadata]) + out.sync(); + out.close(); + + if (or_replace) + fs::rename(temp_file_path, file_path); + else + renameNoReplace(temp_file_path, file_path); + } + catch (...) 
+ { + fs::remove(temp_file_path); + throw; + } + + LOG_TRACE(log, "Named scalar definition {} stored", name); + return true; +} + +bool NamedScalarDefinitionStoreLocal::removeDefinition(const String & name, bool throw_if_not_exists) +{ + const String file_path = getDefinitionFilePath(dir_path, name); + LOG_DEBUG(log, "Removing named scalar definition {} from file {}", name, file_path); + + const bool existed = fs::remove(file_path); + if (!existed) + { + if (throw_if_not_exists) + throw Exception(ErrorCodes::NAMED_SCALAR_NOT_FOUND, "Named scalar '{}' doesn't exist", name); + return false; + } + + LOG_TRACE(log, "Named scalar definition {} removed", name); + return true; +} + +bool NamedScalarDefinitionStoreLocal::readDefinition(const String & name, String & out) +{ + const String path = getDefinitionFilePath(dir_path, name); + if (!fs::exists(path)) + return false; + + ReadBufferFromFile in(path); + readStringUntilEOF(out, in); + return true; +} + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreLocal.h b/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreLocal.h new file mode 100644 index 000000000000..92cdb93a0622 --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreLocal.h @@ -0,0 +1,37 @@ +#pragma once + +#include + +#include + +namespace DB +{ + +class NamedScalarDefinitionStoreLocal : public INamedScalarDefinitionStore +{ +public: + NamedScalarDefinitionStoreLocal(String dir_path_, LoggerPtr log_); + + const String & directoryPath() const { return dir_path; } + + bool isKeeperBacked() const override { return false; } + void initialize() override; + std::vector loadAll() override; + bool definitionExists(const String & name) override; + + bool publishDefinition( + const String & name, + const String & definition_blob, + bool if_not_exists, + bool or_replace, + const Settings & settings) override; + + bool removeDefinition(const String & name, bool throw_if_not_exists) override; + bool readDefinition(const 
String & name, String & out) override; + +private: + String dir_path; + LoggerPtr log; +}; + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreShared.cpp b/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreShared.cpp new file mode 100644 index 000000000000..e448b2033401 --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreShared.cpp @@ -0,0 +1,237 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NAMED_SCALAR_ALREADY_EXISTS; + extern const int NAMED_SCALAR_NOT_FOUND; + extern const int NO_ZOOKEEPER; + extern const int KEEPER_EXCEPTION; +} + +namespace FailPoints +{ + extern const char shared_named_scalars_store_value_fail_once[]; +} + +namespace +{ + +zkutil::GetZooKeeper makeGetZooKeeper(const ContextPtr & context) +{ + std::weak_ptr weak_ctx = context; + return [weak_ctx]() -> zkutil::ZooKeeperPtr + { + auto locked = weak_ctx.lock(); + if (!locked) + throw Exception(ErrorCodes::NO_ZOOKEEPER, "Global context for shared named_scalars definition storage is gone"); + return locked->getZooKeeper(); + }; +} + +} + +NamedScalarDefinitionStoreShared::NamedScalarDefinitionStoreShared( + const ContextPtr & global_context_, + const String & zookeeper_root_) + : global_context(global_context_) + , zookeeper_root(zookeeper_root_) + , zookeeper_getter(makeGetZooKeeper(global_context_)) + , log(getLogger("NamedScalarDefinitionStoreShared")) +{ + while (!zookeeper_root.empty() && zookeeper_root.back() == '/') + zookeeper_root.pop_back(); + if (zookeeper_root.empty()) + zookeeper_root = "/"; +} + +zkutil::ZooKeeperPtr NamedScalarDefinitionStoreShared::getZooKeeper() +{ + auto [zookeeper, session_status] = zookeeper_getter.getZooKeeper(); + if (session_status == zkutil::ZooKeeperCachingGetter::SessionStatus::New) + { + /// We may have reconnected to a different Keeper member; sync to + /// guarantee read-your-writes 
after failover. + zookeeper->sync(zookeeper_root); + createRootNodesIfNeeded(); + } + return zookeeper; +} + +String NamedScalarDefinitionStoreShared::definitionsRootPath() const +{ + return childGroup("defs"); +} + +String NamedScalarDefinitionStoreShared::definitionPath(const String & name) const +{ + return definitionsRootPath() + "/" + escapeForFileName(name); +} + +void NamedScalarDefinitionStoreShared::createRootNodesIfNeeded() +{ + auto [zookeeper, _] = zookeeper_getter.getZooKeeper(); + zookeeper->createAncestors(zookeeper_root); + zookeeper->createIfNotExists(zookeeper_root, ""); + zookeeper->createIfNotExists(definitionsRootPath(), ""); +} + +bool NamedScalarDefinitionStoreShared::definitionExists(const String & name) +{ + auto zookeeper = getZooKeeper(); + return zookeeper->exists(definitionPath(name)); +} + +size_t NamedScalarDefinitionStoreShared::definitionCount() +{ + auto zookeeper = getZooKeeper(); + return zookeeper->getChildren(definitionsRootPath()).size(); +} + +bool NamedScalarDefinitionStoreShared::removeDefinition(const String & name, bool throw_if_not_exists) +{ + auto zookeeper = getZooKeeper(); + const auto def_path = definitionPath(name); + Coordination::Error code = zookeeper->tryRemoveRecursive(def_path); + if (code == Coordination::Error::ZOK) + return true; + if (code == Coordination::Error::ZNONODE) + { + if (throw_if_not_exists) + throw Exception(ErrorCodes::NAMED_SCALAR_NOT_FOUND, "Shared scalar '{}' doesn't exist", name); + return false; + } + throw zkutil::KeeperException::fromPath(code, def_path); +} + +bool NamedScalarDefinitionStoreShared::publishDefinition( + const String & name, + const String & definition_blob, + bool if_not_exists, + bool or_replace, + const Settings &) +{ + fiu_do_on(FailPoints::shared_named_scalars_store_value_fail_once, + throw Exception(ErrorCodes::KEEPER_EXCEPTION, "Injected failure while storing shared scalar '{}'", name);); + + auto zookeeper = getZooKeeper(); + const auto def_path = 
definitionPath(name); + + Coordination::Stat def_stat; + String existing_def; + const bool def_exists = zookeeper->tryGet(def_path, existing_def, &def_stat); + + if (def_exists && !or_replace) + { + if (if_not_exists) + return false; + throw Exception(ErrorCodes::NAMED_SCALAR_ALREADY_EXISTS, "Shared scalar '{}' already exists", name); + } + + if (def_exists) + { + const auto code = zookeeper->trySet(def_path, definition_blob, def_stat.version); + if (code == Coordination::Error::ZBADVERSION) + throw Exception( + ErrorCodes::NAMED_SCALAR_ALREADY_EXISTS, + "Shared scalar '{}' was modified concurrently by another node; retry the OR REPLACE", + name); + if (code != Coordination::Error::ZOK) + throw zkutil::KeeperException::fromPath(code, def_path); + return true; + } + + const auto code = zookeeper->tryCreate(def_path, definition_blob, zkutil::CreateMode::Persistent); + if (code == Coordination::Error::ZNODEEXISTS && if_not_exists) + return false; + if (code == Coordination::Error::ZNODEEXISTS) + throw Exception(ErrorCodes::NAMED_SCALAR_ALREADY_EXISTS, "Shared scalar '{}' already exists", name); + if (code != Coordination::Error::ZOK) + throw zkutil::KeeperException::fromPath(code, def_path); + + if (!zookeeper->exists(def_path)) + throw Exception(ErrorCodes::KEEPER_EXCEPTION, "Definition for shared scalar '{}' disappeared after successful publish", name); + return true; +} + +std::vector NamedScalarDefinitionStoreShared::loadAll() +{ + auto zookeeper = getZooKeeper(); + Strings children = zookeeper->getChildren(definitionsRootPath()); + + std::vector definitions; + definitions.reserve(children.size()); + + for (const auto & escaped : children) + { + auto name = unescapeForFileName(escaped); + if (name.empty()) + continue; + + String blob; + if (zookeeper->tryGet(definitionPath(name), blob)) + definitions.push_back({std::move(name), std::move(blob)}); + } + + return definitions; +} + +Strings 
NamedScalarDefinitionStoreShared::listDefinitionsWithChildrenWatch(std::function on_change) +{ + auto zookeeper = getZooKeeper(); + auto watcher = zookeeper->createWatchFromRawCallback( + fmt::format("SharedNamedScalarsWatcher(named_scalars/)"), + [cb = std::move(on_change)]() -> Coordination::WatchCallback + { + return [cb](const Coordination::WatchResponse &) { cb(); }; + }); + + Coordination::Stat stat; + Strings children = zookeeper->getChildrenWatch(definitionsRootPath(), &stat, watcher); + + Strings names; + names.reserve(children.size()); + for (const auto & escaped : children) + { + auto n = unescapeForFileName(escaped); + if (!n.empty()) + names.push_back(std::move(n)); + } + return names; +} + +bool NamedScalarDefinitionStoreShared::readDefinition(const String & name, String & out) +{ + auto zookeeper = getZooKeeper(); + return zookeeper->tryGet(definitionPath(name), out); +} + +bool NamedScalarDefinitionStoreShared::readDefinitionWithDataWatch( + const String & name, + String & out, + std::function on_change) +{ + auto zookeeper = getZooKeeper(); + auto watcher = zookeeper->createWatchFromRawCallback( + fmt::format("NamedScalar(definition/{})", name), + [cb = std::move(on_change)]() -> Coordination::WatchCallback + { + return [cb](const Coordination::WatchResponse &) { cb(); }; + }); + Coordination::Stat stat; + return zookeeper->tryGetWatch(definitionPath(name), out, &stat, watcher); +} + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreShared.h b/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreShared.h new file mode 100644 index 000000000000..2157f29314ef --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarDefinitionStoreShared.h @@ -0,0 +1,71 @@ +#pragma once + +#include +#include + +#include +#include + +#include + +namespace zkutil +{ +class ZooKeeper; +using ZooKeeperPtr = std::shared_ptr; +} + +namespace DB +{ + +/// Keeper I/O for shared named-scalar definitions. 
+/// +/// Layout under ``: +/// `defs/` -- text envelope with UUID, definer, and CREATE query. +class NamedScalarDefinitionStoreShared : public IWatchableNamedScalarDefinitionStore +{ +public: + NamedScalarDefinitionStoreShared(const ContextPtr & global_context_, const String & zookeeper_root_); + + void createRootNodesIfNeeded(); + + bool isKeeperBacked() const override { return true; } + void initialize() override { createRootNodesIfNeeded(); } + std::vector loadAll() override; + bool removeDefinition(const String & name, bool throw_if_not_exists) override; + bool definitionExists(const String & name) override; + size_t definitionCount(); + + bool publishDefinition( + const String & name, + const String & definition_blob, + bool if_not_exists, + bool or_replace, + const Settings & settings) override; + + Strings listDefinitionsWithChildrenWatch(std::function on_change) override; + bool readDefinition(const String & name, String & out) override; + bool readDefinitionWithDataWatch( + const String & name, + String & out, + std::function on_change) override; + + zkutil::ZooKeeperPtr getZooKeeper(); + +private: + String definitionsRootPath() const; + String definitionPath(const String & name) const; + + String childGroup(const String & child) const + { + return zookeeper_root == "/" ? 
"/" + child : zookeeper_root + "/" + child; + } + + ContextPtr global_context; + String zookeeper_root; + zkutil::ZooKeeperCachingGetter zookeeper_getter; + LoggerPtr log; +}; + +using NamedScalarDefinitionStoreSharedPtr = std::shared_ptr; + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarValueBackendLocal.cpp b/src/Interpreters/NamedScalars/NamedScalarValueBackendLocal.cpp new file mode 100644 index 000000000000..5a75eab728dd --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarValueBackendLocal.cpp @@ -0,0 +1,189 @@ +#include + +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +namespace Setting +{ + extern const SettingsBool fsync_metadata; +} + +namespace ErrorCodes +{ + extern const int DIRECTORY_DOESNT_EXIST; +} + +namespace +{ + +constexpr std::string_view value_suffix = ".bin"; + +String makeDirectoryPathCanonical(const String & directory_path) +{ + auto canonical_directory_path = std::filesystem::weakly_canonical(directory_path); + if (canonical_directory_path.has_filename()) + canonical_directory_path += std::filesystem::path::preferred_separator; + return canonical_directory_path; +} + +void createDirectory(const String & dir_path) +{ + std::error_code create_dir_error_code; + fs::create_directories(dir_path, create_dir_error_code); + if (!fs::exists(dir_path) || !fs::is_directory(dir_path) || create_dir_error_code) + throw Exception( + ErrorCodes::DIRECTORY_DOESNT_EXIST, + "Couldn't create directory {} reason: '{}'", + dir_path, + create_dir_error_code.message()); +} + +String getValueFilePath(const String & dir_path, const String & value_key) +{ + return dir_path + escapeForFileName(value_key) + String(value_suffix); +} + +/// Rename to .bad.; fresh-eval will rewrite the file later. 
+void quarantineValueFile(const String & file_path, LoggerPtr log, const String & reason)
+{
+    /// The quarantine name is "<file_path>.bad.<ts>", with `ts` derived from the system clock,
+    /// so repeated quarantines of the same value file do not overwrite each other
+    /// (as long as the timestamps differ).
+    const auto ts = std::chrono::duration_cast(
+        std::chrono::system_clock::now().time_since_epoch()).count();
+    const String quarantine_path = fmt::format("{}.bad.{}", file_path, ts);
+    LOG_ERROR(log, "Quarantining named scalar value file {} (reason: {}) -> {}; will re-evaluate", file_path, reason, quarantine_path);
+    /// Best effort: a failed rename is logged and not rethrown.
+    try { fs::rename(file_path, quarantine_path); }
+    catch (...) { tryLogCurrentException(log, fmt::format("renaming {} to {}", file_path, quarantine_path)); }
+}
+
+/// Reads the complete value file that belongs to `value_key` inside `dir_path`.
+/// Returns std::nullopt when the file does not exist. If opening or reading the file
+/// throws, the file is quarantined (see quarantineValueFile) and std::nullopt is returned,
+/// so a broken file is treated the same way as a missing one by the caller.
+std::optional readValueBlobFromDisk(
+    const String & dir_path, const String & value_key, LoggerPtr log)
+{
+    const String file_path = getValueFilePath(dir_path, value_key);
+    LOG_DEBUG(log, "Loading raw named scalar value for {} from {}", value_key, file_path);
+
+    if (!fs::exists(file_path))
+        return std::nullopt;
+
+    try
+    {
+        ReadBufferFromFile in(file_path);
+        String contents;
+        readStringUntilEOF(contents, in);
+        return contents;
+    }
+    catch (...)
+    {
+        const String reason = fmt::format("raw read failed: {}", getCurrentExceptionMessage(false));
+        quarantineValueFile(file_path, log, reason);
+        return std::nullopt;
+    }
+}
+
+/// Refresh-publish hot path: payload is already encoded by NamedScalar.
+///
+/// The payload is written to a temporary file next to the target
+/// ("<target>.tmp.<8 random characters>") and then renamed onto the target path, so the
+/// target is replaced by a single rename instead of being rewritten in place.
+/// The temporary file is synced before the rename only if `fsync_metadata` is enabled.
+/// On any failure the temporary file is removed (best effort) and the original
+/// exception is rethrown to the caller.
+void writeValueFile(
+    const String & dir_path,
+    const String & value_key,
+    const String & payload,
+    const Settings & settings,
+    LoggerPtr log)
+{
+    const String file_path = getValueFilePath(dir_path, value_key);
+    const String temp_file_path = file_path + ".tmp." + getRandomASCIIString(8);
+    LOG_DEBUG(log, "Storing named scalar value for {} to {}", value_key, file_path);
+
+    try
+    {
+        WriteBufferFromFile out(temp_file_path, payload.size());
+        writeString(payload, out);
+        out.next();
+        if (settings[Setting::fsync_metadata])
+            out.sync();
+        out.close();
+
+        fs::rename(temp_file_path, file_path);
+    }
+    catch (...)
+    {
+        /// Use the non-throwing overload here: if the cleanup itself fails, a thrown
+        /// filesystem_error would replace the exception that describes the actual
+        /// write/rename failure, and the caller would see the wrong error.
+        std::error_code remove_error;
+        fs::remove(temp_file_path, remove_error);
+        if (remove_error)
+            LOG_WARNING(log, "Could not remove temporary named scalar value file {}: {}", temp_file_path, remove_error.message());
+        throw;
+    }
+}
+
+/// Deletes the value file that belongs to `value_key` inside `dir_path`.
+/// A file that does not exist is not an error (fs::remove just reports that nothing was
+/// removed); other filesystem errors propagate as exceptions.
+void removeValueFile(const String & dir_path, const String & value_key, LoggerPtr log)
+{
+    const String file_path = getValueFilePath(dir_path, value_key);
+    LOG_DEBUG(log, "Removing named scalar value for {} from {}", value_key, file_path);
+    fs::remove(file_path);
+}
+
+}
+
+/// The directory path is stored in canonical form (see makeDirectoryPathCanonical);
+/// the directory itself is only created later, by initialize().
+NamedScalarValueBackendLocal::NamedScalarValueBackendLocal(String dir_path_, LoggerPtr log_)
+    : dir_path(makeDirectoryPathCanonical(dir_path_))
+    , log(std::move(log_))
+{
+}
+
+/// Creates the value directory; throws if it cannot be created.
+void NamedScalarValueBackendLocal::initialize()
+{
+    createDirectory(dir_path);
+}
+
+/// Stores the context whose settings are consulted when values are written (see writeValue).
+void NamedScalarValueBackendLocal::setGlobalContext(ContextPtr global_context_)
+{
+    global_context = std::move(global_context_);
+}
+
+/// Returns the raw stored blob for `value_key`, or std::nullopt if there is no readable file.
+std::optional NamedScalarValueBackendLocal::readValueBlob(const String & value_key)
+{
+    return readValueBlobFromDisk(dir_path, value_key, log);
+}
+
+/// This backend does not register any change notification: the callback argument is
+/// ignored and the call is equivalent to readValueBlob().
+std::optional NamedScalarValueBackendLocal::readValueBlobAndWatch(const String & value_key, std::function)
+{
+    return readValueBlob(value_key);
+}
+
+/// Removes the stored blob for `value_key`; filesystem errors propagate to the caller.
+void NamedScalarValueBackendLocal::removeValue(const String & value_key)
+{
+    removeValueFile(dir_path, value_key, log);
+}
+
+/// Always hands out a lease: this backend performs no locking of its own here.
+/// The first argument is unused. The lease's publish callback writes the blob to disk
+/// and reports RefreshPublishResult::Published; write errors propagate as exceptions.
+/// NOTE(review): the callback captures `this`, so the lease presumably must not outlive
+/// this backend - confirm against the owner of NamedScalarRefreshLease.
+std::optional NamedScalarValueBackendLocal::tryAcquireRefreshLease(
+    const String &,
+    const String & value_key)
+{
+    return NamedScalarRefreshLease(
+        [this, value_key](const String & value_blob)
+        {
+            writeValue(value_key, value_blob);
+            return RefreshPublishResult::Published;
+        });
+}
+
+/// Persists `value_blob` for `value_key` using the settings of the global context.
+/// NOTE(review): setGlobalContext() must have been called before the first write; the
+/// chassert below is the only guard in front of the dereference.
+void NamedScalarValueBackendLocal::writeValue(const String & value_key, const String & value_blob)
+{
+    chassert(global_context);
+    writeValueFile(dir_path, value_key, value_blob, global_context->getSettingsRef(), log);
+}
+
+}
diff --git a/src/Interpreters/NamedScalars/NamedScalarValueBackendLocal.h b/src/Interpreters/NamedScalars/NamedScalarValueBackendLocal.h
new file mode 100644
index 000000000000..b4125d5e3d26
--- /dev/null
+++ b/src/Interpreters/NamedScalars/NamedScalarValueBackendLocal.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include
+#include + +#include + +namespace DB +{ + +class NamedScalarValueBackendLocal final : public INamedScalarValueBackend +{ +public: + NamedScalarValueBackendLocal(String dir_path_, LoggerPtr log_); + + const String & directoryPath() const { return dir_path; } + + void initialize(); + void setGlobalContext(ContextPtr global_context_); + + std::optional readValueBlob(const String & value_key) override; + bool supportsValueWatches() const override { return false; } + std::optional readValueBlobAndWatch(const String & value_key, std::function on_change) override; + void removeValue(const String & value_key) override; + std::optional tryAcquireRefreshLease( + const String & name, + const String & value_key) override; + +private: + void writeValue(const String & value_key, const String & value_blob); + + String dir_path; + LoggerPtr log; + ContextPtr global_context; +}; + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarValueBackendShared.cpp b/src/Interpreters/NamedScalars/NamedScalarValueBackendShared.cpp new file mode 100644 index 000000000000..3aab59e27831 --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarValueBackendShared.cpp @@ -0,0 +1,255 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int KEEPER_EXCEPTION; + extern const int NO_ZOOKEEPER; +} + +namespace FailPoints +{ + extern const char shared_named_scalars_store_value_fail_once[]; +} + +namespace +{ + +zkutil::GetZooKeeper makeGetZooKeeper(const ContextPtr & context) +{ + std::weak_ptr weak_ctx = context; + return [weak_ctx]() -> zkutil::ZooKeeperPtr + { + auto locked = weak_ctx.lock(); + if (!locked) + throw Exception(ErrorCodes::NO_ZOOKEEPER, "Global context for shared named_scalars value backend is gone"); + return locked->getZooKeeper(); + }; +} + +} + +NamedScalarValueBackendShared::NamedScalarValueBackendShared( + const ContextPtr & global_context_, + const String & 
zookeeper_root_, + std::function poke_resync_all_) + : global_context(global_context_) + , zookeeper_root(zookeeper_root_) + , zookeeper_getter(makeGetZooKeeper(global_context_)) + , log(getLogger("NamedScalarValueBackendShared")) + , poke_resync_all(std::move(poke_resync_all_)) +{ + while (!zookeeper_root.empty() && zookeeper_root.back() == '/') + zookeeper_root.pop_back(); + if (zookeeper_root.empty()) + zookeeper_root = "/"; +} + +zkutil::ZooKeeperPtr NamedScalarValueBackendShared::getZooKeeper() +{ + auto [zookeeper, session_status] = zookeeper_getter.getZooKeeper(); + if (session_status == zkutil::ZooKeeperCachingGetter::SessionStatus::New) + { + /// We may have reconnected to a different Keeper member; sync to + /// guarantee read-your-writes after failover. + zookeeper->sync(zookeeper_root); + createRootNodesIfNeeded(); + } + return zookeeper; +} + +String NamedScalarValueBackendShared::valuesRootPath() const +{ + return childGroup("values"); +} + +String NamedScalarValueBackendShared::valueGroupPath(const String & value_key) const +{ + return valuesRootPath() + "/" + escapeForFileName(value_key); +} + +String NamedScalarValueBackendShared::valuePath(const String & value_key) const +{ + return valueGroupPath(value_key) + "/value"; +} + +String NamedScalarValueBackendShared::lockPath(const String & value_key) const +{ + return valueGroupPath(value_key) + "/lock"; +} + +void NamedScalarValueBackendShared::createRootNodesIfNeeded() +{ + auto [zookeeper, _] = zookeeper_getter.getZooKeeper(); + zookeeper->createAncestors(zookeeper_root); + zookeeper->createIfNotExists(zookeeper_root, ""); + zookeeper->createIfNotExists(valuesRootPath(), ""); +} + +std::optional NamedScalarValueBackendShared::readValueBlob(const String & value_key) +{ + auto zookeeper = getZooKeeper(); + const auto path = valuePath(value_key); + String data; + if (!zookeeper->tryGet(path, data)) + return std::nullopt; + return data; +} + +bool NamedScalarValueBackendShared::readValueWithDataWatch( 
+ const String & value_key, + String & out, + std::function on_change) +{ + auto zookeeper = getZooKeeper(); + const String path = valuePath(value_key); + + auto watcher = zookeeper->createWatchFromRawCallback( + fmt::format("NamedScalar(value/{})", value_key), + [cb = std::move(on_change)]() -> Coordination::WatchCallback + { + return [cb](const Coordination::WatchResponse &) { cb(); }; + }); + + Coordination::Stat stat; + if (zookeeper->tryGetWatch(path, out, &stat, watcher)) + return true; + /// Race between tryGetWatch and existsWatch: a peer can create the + /// znode in between. Detect and re-read. + if (zookeeper->existsWatch(path, &stat, watcher)) + return zookeeper->tryGetWatch(path, out, &stat, watcher); + return false; +} + +std::optional NamedScalarValueBackendShared::readValueBlobAndWatch( + const String & value_key, + std::function on_change) +{ + String value_blob; + /// `on_change` is `NamedScalar::onValueChanged` and is non-blocking + /// (atomic flip + task->schedule()), so we can hand it to the Keeper + /// raw-callback layer directly. + if (!readValueWithDataWatch(value_key, value_blob, std::move(on_change))) + return std::nullopt; + return value_blob; +} + +void NamedScalarValueBackendShared::removeValue(const String & value_key) +{ + auto zookeeper = getZooKeeper(); + const auto path = valueGroupPath(value_key); + auto code = zookeeper->tryRemoveRecursive(path); + if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNONODE) + throw zkutil::KeeperException::fromPath(code, path); +} + +std::unique_ptr NamedScalarValueBackendShared::tryReserveRefresh( + const String & value_key, + const String & lock_holder_message) +{ + auto zookeeper = getZooKeeper(); + const String path = lockPath(value_key); + zookeeper->createAncestors(path); + auto holder = zkutil::EphemeralNodeHolder::tryCreate( + path, + *zookeeper, + lock_holder_message.empty() ? 
getFQDNOrHostName() : lock_holder_message); + if (!holder) + return nullptr; + + auto reservation = std::make_unique(); + reservation->zookeeper = zookeeper; + reservation->holder = std::move(holder); + reservation->path = path; + + Coordination::Stat value_stat; + if (zookeeper->exists(valuePath(value_key), &value_stat)) + reservation->value_version = value_stat.version; + return reservation; +} + +RefreshPublishResult NamedScalarValueBackendShared::publishRefreshValue( + const RefreshReservation & reservation, + const String & name, + const String & value_key, + const String & payload) +{ + fiu_do_on(FailPoints::shared_named_scalars_store_value_fail_once, + throw Exception(ErrorCodes::KEEPER_EXCEPTION, "Injected failure while storing shared scalar '{}'", name);); + + auto zookeeper = reservation.zookeeper; + const auto path = valuePath(value_key); + + Coordination::Requests ops; + if (reservation.value_version) + ops.emplace_back(zkutil::makeSetRequest(path, payload, *reservation.value_version)); + else + ops.emplace_back(zkutil::makeCreateRequest(path, payload, zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeRemoveRequest(reservation.path, -1)); + + Coordination::Responses responses; + const auto code = zookeeper->tryMulti(ops, responses, /* check_session_valid */ true); + if (code == Coordination::Error::ZOK) + { + if (reservation.holder) + reservation.holder->setAlreadyRemoved(); + return RefreshPublishResult::Published; + } + + const auto release_code = zookeeper->tryRemove(reservation.path); + if ((release_code == Coordination::Error::ZOK || release_code == Coordination::Error::ZNONODE) && reservation.holder) + reservation.holder->setAlreadyRemoved(); + if (release_code != Coordination::Error::ZOK && release_code != Coordination::Error::ZNONODE) + LOG_WARNING( + log, + "Could not release refresh lock for shared scalar '{}' after failed value publish: {}", + name, + Coordination::errorMessage(release_code)); + + /// All of these mean "this 
scalar's local idea of who owns the slot + /// no longer matches Keeper" — peer OR REPLACE, peer DROP, or our + /// ephemeral lease was lost. Callers treat them identically. + if (code == Coordination::Error::ZNONODE + || code == Coordination::Error::ZBADVERSION + || code == Coordination::Error::ZNODEEXISTS) + return RefreshPublishResult::Diverged; + throw zkutil::KeeperException::fromPath(code, path); +} + +std::optional NamedScalarValueBackendShared::tryAcquireRefreshLease( + const String & name, + const String & value_key) +{ + auto reservation = tryReserveRefresh(value_key, getFQDNOrHostName()); + if (!reservation) + return std::nullopt; + + auto acquired_reservation = std::shared_ptr(std::move(reservation)); + return NamedScalarRefreshLease( + [this, name, value_key, retained_reservation = std::move(acquired_reservation)](const String & value_blob) + { + const auto result = publishRefreshValue( + *retained_reservation, + name, + value_key, + value_blob); + if (result != RefreshPublishResult::Published && poke_resync_all) + poke_resync_all(); + return result; + }); +} + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarValueBackendShared.h b/src/Interpreters/NamedScalars/NamedScalarValueBackendShared.h new file mode 100644 index 000000000000..ef55a5a53c71 --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarValueBackendShared.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace zkutil +{ +class EphemeralNodeHolder; +class ZooKeeper; +using EphemeralNodeHolderPtr = std::shared_ptr; +using ZooKeeperPtr = std::shared_ptr; +} + +namespace DB +{ + +class NamedScalarValueBackendShared final : public INamedScalarValueBackend +{ +public: + NamedScalarValueBackendShared( + const ContextPtr & global_context_, + const String & zookeeper_root_, + std::function poke_resync_all_); + + std::optional readValueBlob(const String & value_key) override; + bool supportsValueWatches() const 
override { return true; } + std::optional readValueBlobAndWatch(const String & value_key, std::function on_change) override; + void removeValue(const String & value_key) override; + std::optional tryAcquireRefreshLease( + const String & name, + const String & value_key) override; + +private: + struct RefreshReservation + { + zkutil::ZooKeeperPtr zookeeper; + zkutil::EphemeralNodeHolderPtr holder; + String path; + std::optional value_version; + }; + + /// Lazily creates `/values`. + void createRootNodesIfNeeded(); + + RefreshPublishResult publishRefreshValue( + const RefreshReservation & reservation, + const String & name, + const String & value_key, + const String & payload); + + std::unique_ptr tryReserveRefresh( + const String & value_key, + const String & lock_holder_message); + + bool readValueWithDataWatch(const String & value_key, String & out, std::function on_change); + + zkutil::ZooKeeperPtr getZooKeeper(); + + /// `zookeeper_root` is either a non-empty path with no trailing + /// slash, or "/". childGroup() collapses the latter so we never + /// emit empty path components. + String valuesRootPath() const; + String valueGroupPath(const String & value_key) const; + String valuePath(const String & value_key) const; + String lockPath(const String & value_key) const; + + String childGroup(const String & child) const + { + return zookeeper_root == "/" ? 
"/" + child : zookeeper_root + "/" + child; + } + + ContextPtr global_context; + String zookeeper_root; + zkutil::ZooKeeperCachingGetter zookeeper_getter; + LoggerPtr log; + std::function poke_resync_all; +}; + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarValueCodec.cpp b/src/Interpreters/NamedScalars/NamedScalarValueCodec.cpp new file mode 100644 index 000000000000..877961734869 --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarValueCodec.cpp @@ -0,0 +1,218 @@ +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int NAMED_SCALAR_VALUE_TOO_LARGE; +} + +namespace +{ + +constexpr UInt64 stored_value_format_version = 1; + +String storedValueStatusToString(StoredValueStatus state) +{ + switch (state) + { + case StoredValueStatus::Empty: + return "empty"; + case StoredValueStatus::Valid: + return "valid"; + case StoredValueStatus::StaleAfterFailure: + return "stale_after_failure"; + } + UNREACHABLE(); +} + +StoredValueStatus parseStoredValueStatus(const String & state) +{ + if (state == "empty") + return StoredValueStatus::Empty; + if (state == "valid") + return StoredValueStatus::Valid; + if (state == "stale_after_failure") + return StoredValueStatus::StaleAfterFailure; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown named scalar stored value state: {}", state); +} + +Int64 timePointToUnixSeconds(std::chrono::system_clock::time_point timepoint) +{ + return std::chrono::duration_cast(timepoint.time_since_epoch()).count(); +} + +std::chrono::system_clock::time_point timePointFromUnixSeconds(Int64 seconds) +{ + return std::chrono::system_clock::time_point{std::chrono::seconds(seconds)}; +} + +} + +void StoredValue::serialize(WriteBuffer & out) const +{ + String value_blob; + if (has_value()) + { + WriteBufferFromOwnString value_out; + encodeField(value, 
value_out); + value_blob = value_out.str(); + } + + out << "format version: " << stored_value_format_version << "\n"; + out << "state: " << storedValueStatusToString(state) << "\n"; + out << "type: " << escape << (type ? type->getName() : String{}) << "\n"; + out << "last_update_time: " << timePointToUnixSeconds(last_update_time) << "\n"; + out << "last_successful_update_time: " << timePointToUnixSeconds(last_successful_update_time) << "\n"; + out << "last_update_hostname: " << escape << last_update_hostname << "\n"; + out << "last_error_type: " << escape << last_error_type << "\n"; + out << "last_error: " << escape << last_error << "\n"; + out << "value: " << escape << value_blob << "\n"; +} + +std::optional StoredValue::deserialize(ReadBuffer & in) +{ + UInt64 version = 0; + in >> "format version: " >> version >> "\n"; + if (version != stored_value_format_version) + return std::nullopt; + + StoredValue snapshot; + String state; + String type_name; + String value_blob; + Int64 last_update_time_seconds = 0; + Int64 last_successful_update_time_seconds = 0; + + in >> "state: " >> state >> "\n"; + snapshot.state = parseStoredValueStatus(state); + + in >> "type: " >> escape >> type_name >> "\n"; + if (!type_name.empty()) + snapshot.type = DataTypeFactory::instance().get(type_name); + + in >> "last_update_time: " >> last_update_time_seconds >> "\n"; + snapshot.last_update_time = timePointFromUnixSeconds(last_update_time_seconds); + + in >> "last_successful_update_time: " >> last_successful_update_time_seconds >> "\n"; + snapshot.last_successful_update_time = timePointFromUnixSeconds(last_successful_update_time_seconds); + + in >> "last_update_hostname: " >> escape >> snapshot.last_update_hostname >> "\n"; + in >> "last_error_type: " >> escape >> snapshot.last_error_type >> "\n"; + in >> "last_error: " >> escape >> snapshot.last_error >> "\n"; + in >> "value: " >> escape >> value_blob >> "\n"; + + if (snapshot.has_value()) + { + ReadBufferFromString value_in(value_blob); 
+ snapshot.value = decodeField(value_in); + } + return snapshot; +} + +std::shared_ptr StoredValue::fromEvaluationSuccess( + DataTypePtr type_, + Field value_, + const String & hostname, + std::chrono::system_clock::time_point now) +{ + auto snapshot = std::make_shared(); + snapshot->type = std::move(type_); + snapshot->value = std::move(value_); + snapshot->last_update_time = now; + snapshot->last_successful_update_time = now; + snapshot->last_update_hostname = hostname; + snapshot->state = StoredValueStatus::Valid; + return snapshot; +} + +std::shared_ptr StoredValue::fromEvaluationFailure( + const StoredValue * prev, + std::string_view error_message, + std::string_view error_type, + const String & hostname, + std::chrono::system_clock::time_point now) +{ + auto snapshot = std::make_shared(); + if (prev && prev->has_value()) + { + snapshot->type = prev->type; + snapshot->value = prev->value; + snapshot->last_successful_update_time = prev->last_successful_update_time; + snapshot->state = StoredValueStatus::StaleAfterFailure; + } + + snapshot->last_update_time = now; + snapshot->last_update_hostname = hostname; + snapshot->last_error = String(error_message); + snapshot->last_error_type = String(error_type); + return snapshot; +} + +String encodeNamedScalarValueAndCheckSize(const StoredValue & snapshot, const ContextPtr & context) +{ + static constexpr UInt64 default_max = 1ULL << 20; + const UInt64 max_size = context + ? 
context->getConfigRef().getUInt64("named_scalar_max_value_size", default_max) + : default_max; + + WriteBufferFromOwnString buf; + snapshot.serialize(buf); + String payload = buf.str(); + if (payload.size() > max_size) + throw Exception( + ErrorCodes::NAMED_SCALAR_VALUE_TOO_LARGE, + "Named scalar value is too large ({} bytes, max {} bytes)", + payload.size(), + max_size); + return payload; +} + +std::optional tryDecodeNamedScalarValueBlob( + const String & value_blob, + const String & name, + LoggerPtr log) +{ + try + { + ReadBufferFromString rb(value_blob); + auto snapshot = StoredValue::deserialize(rb); + if (!snapshot) + LOG_WARNING( + log, + "Ignoring persisted value for named scalar '{}' because it uses an unsupported format; it will be rebuilt", + name); + return snapshot; + } + catch (...) + { + LOG_WARNING( + log, + "Ignoring persisted value for named scalar '{}' because decoding failed: {}; it will be rebuilt", + name, + getCurrentExceptionMessage(false)); + return std::nullopt; + } +} + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarValueCodec.h b/src/Interpreters/NamedScalars/NamedScalarValueCodec.h new file mode 100644 index 000000000000..ea31c31790fe --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarValueCodec.h @@ -0,0 +1,71 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +class IDataType; +using DataTypePtr = std::shared_ptr; +class ReadBuffer; +class WriteBuffer; + +enum class StoredValueStatus : UInt8 +{ + Empty, + Valid, + StaleAfterFailure, +}; + +struct StoredValue +{ + static std::shared_ptr fromEvaluationSuccess( + DataTypePtr type, + Field value, + const String & hostname, + std::chrono::system_clock::time_point now); + + static std::shared_ptr fromEvaluationFailure( + const StoredValue * prev, + std::string_view error_message, + std::string_view error_type, + const String & hostname, + std::chrono::system_clock::time_point now); + + static 
std::optional deserialize(ReadBuffer & in); + void serialize(WriteBuffer & out) const; + + DataTypePtr type; + Field value; + std::chrono::system_clock::time_point last_update_time; + std::chrono::system_clock::time_point last_successful_update_time; + String last_update_hostname; + String last_error; + String last_error_type; + StoredValueStatus state = StoredValueStatus::Empty; + + bool has_value() const { return state != StoredValueStatus::Empty; } + bool is_valid() const { return state == StoredValueStatus::Valid; } +}; + +/// Serialize StoredValue to a blob and enforce the configured max size +/// (`named_scalar_max_value_size`, default 1 MiB). Throws +/// NAMED_SCALAR_VALUE_TOO_LARGE if the encoded blob exceeds the limit. +String encodeNamedScalarValueAndCheckSize(const StoredValue & snapshot, const ContextPtr & context); + +/// Decode a value blob; returns nullopt and logs a warning if the blob +/// uses an unsupported format version or fails to decode. Never throws. +std::optional tryDecodeNamedScalarValueBlob( + const String & value_blob, + const String & name, + LoggerPtr log); + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarsManager.cpp b/src/Interpreters/NamedScalars/NamedScalarsManager.cpp new file mode 100644 index 000000000000..de47a5fbca57 --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarsManager.cpp @@ -0,0 +1,599 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int NAMED_SCALAR_ALREADY_EXISTS; + extern const int NAMED_SCALAR_NOT_FOUND; + extern const int NAMED_SCALAR_NOT_REFRESHABLE; + extern const int SHARED_NAMED_SCALARS_NOT_CONFIGURED; +} + +void NamedScalarsManager::checkName(const String & name) +{ + static constexpr size_t max_name_len = 200; + if (name.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, 
"Named scalar name cannot be empty"); + if (name.size() > max_name_len) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Named scalar name is too long ({} bytes, max {} bytes)", + name.size(), max_name_len); + const auto escaped = escapeForFileName(name); + if (escaped.size() > max_name_len) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Named scalar name escapes to too long a filename ({} bytes after escape, max {} bytes); avoid characters that require escaping", + escaped.size(), max_name_len); +} + +namespace +{ +constexpr std::string_view value_suffix = ".bin"; + +void sweepOrphanLocalValueFiles( + const String & dir_path, + const std::vector & definitions, + const ContextPtr & context, + LoggerPtr log) +{ + std::unordered_set live_value_files; + for (const auto & definition : definitions) + { + auto uuid = getNamedScalarUUIDFromSerializedDefinition(definition.definition_blob, context, log); + if (uuid) + live_value_files.insert(escapeForFileName(*uuid) + String(value_suffix)); + } + + Poco::DirectoryIterator dir_end; + for (Poco::DirectoryIterator it(dir_path); it != dir_end; ++it) + { + const String & file_name = it.name(); + if (!file_name.ends_with(value_suffix) || live_value_files.contains(file_name)) + continue; + try { std::filesystem::remove(dir_path + file_name); } + catch (...) 
{ tryLogCurrentException(log, fmt::format("while sweeping orphan value {}", file_name)); } + } +} + +} + +NamedScalarsManager::NamedScalarsManager( + const String & definitions_disk_path, + const String & definitions_zookeeper_path, + const String & local_cache_path, + NamedScalarCacheKind default_cache_kind_, + const ContextPtr & global_context) + : default_cache_kind(default_cache_kind_) + , local_value_backend(std::make_unique( + local_cache_path, + getLogger("NamedScalarsManager"))) +{ + if (definitions_zookeeper_path.empty()) + { + definition_store = std::make_shared( + definitions_disk_path, + getLogger("NamedScalarsManager")); + } + else + { + auto keeper_store = std::make_shared(global_context, definitions_zookeeper_path); + definition_store = keeper_store; + watchable_definition_store = keeper_store; + shared_value_backend = std::make_unique( + global_context, + definitions_zookeeper_path, + [this] + { + if (watcher) + watcher->pokeResyncAll(); + }); + } +} + +NamedScalarsManager::~NamedScalarsManager() = default; + +void NamedScalarsManager::initialize(const ContextPtr & global_context) +{ + local_value_backend->setGlobalContext(global_context); + local_value_backend->initialize(); + definition_store->initialize(); + + if (usesKeeperDefinitions()) + { + chassert(watchable_definition_store); + chassert(shared_value_backend); + watcher = std::make_unique( + global_context, + watchable_definition_store, + *this); + watcher->start(); + return; + } + + auto definitions = definition_store->loadAll(); + sweepOrphanLocalValueFiles(local_value_backend->directoryPath(), definitions, global_context, getLogger("NamedScalarsManager")); + + auto log = getLogger("NamedScalarsManager"); + for (auto & loaded : definitions) + { + auto cache_kind = cacheKindFromDefinitionBlob(loaded.definition_blob, global_context, log); + if (cache_kind == NamedScalarCacheKind::Shared) + { + LOG_WARNING( + log, + "Ignoring disk definition for shared-cache named scalar '{}': shared cache 
requires Keeper definitions", + loaded.name); + continue; + } + + auto parsed = parseAndValidateDefinition( + loaded.name, loaded.definition_blob, + std::chrono::system_clock::now(), global_context, log); + if (!parsed) + continue; + + auto scalar = std::make_shared(std::move(*parsed), valueBackendFor(cache_kind)); + scalar->loadValueFromBackend(); + installScalar(scalar, cache_kind); + scalar->start(global_context); + } +} + +void NamedScalarsManager::shutdown() +{ + try + { + if (watcher) + watcher->stop(); + } + catch (...) + { + tryLogCurrentException(getLogger("NamedScalarsManager"), "while shutting down named scalars watcher"); + } + + try + { + shutdownScalars(); + } + catch (...) + { + tryLogCurrentException(getLogger("NamedScalarsManager"), "while shutting down named scalars"); + } +} + +INamedScalarValueBackend & NamedScalarsManager::valueBackendFor(NamedScalarCacheKind cache_kind) const +{ + if (cache_kind != NamedScalarCacheKind::Shared) + return *local_value_backend; + if (!shared_value_backend) + throw Exception( + ErrorCodes::SHARED_NAMED_SCALARS_NOT_CONFIGURED, + "Shared named scalar cache requires Keeper-backed named scalar definitions " + "(configure )"); + return *shared_value_backend; +} + +NamedScalarCacheKind NamedScalarsManager::cacheKindFromDefinitionBlob(const String & definition_blob, const ContextPtr & context, LoggerPtr log) const +{ + auto cache_kind = getNamedScalarCacheKindFromSerializedDefinition(definition_blob, context, log); + if (!cache_kind) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine named scalar cache kind from persisted definition"); + return *cache_kind; +} + +NamedScalarPtr NamedScalarsManager::tryGetScalar(const String & name) const +{ + std::shared_lock lock(scalars_mutex); + auto it = scalars.find(name); + if (it == scalars.end()) + return nullptr; + return it->second.scalar; +} + +std::optional NamedScalarsManager::tryGetScopedScalar(const String & name) const +{ + std::shared_lock 
lock(scalars_mutex); + auto it = scalars.find(name); + if (it == scalars.end()) + return std::nullopt; + return it->second; +} + +std::optional NamedScalarsManager::getCacheKind( + const String & name, + const ContextPtr & context, + LoggerPtr log) const +{ + if (auto scoped = tryGetScopedScalar(name)) + return scoped->cache_kind; + + String definition_blob; + if (!definition_store->readDefinition(name, definition_blob)) + return std::nullopt; + return getNamedScalarCacheKindFromSerializedDefinition(definition_blob, context, log); +} + +std::vector NamedScalarsManager::listAllScalars() const +{ + std::vector out; + std::shared_lock lock(scalars_mutex); + out.reserve(scalars.size()); + for (const auto & [_, scoped] : scalars) + out.push_back(scoped.scalar); + return out; +} + +std::vector NamedScalarsManager::listScalars() const +{ + std::vector out; + std::shared_lock lock(scalars_mutex); + out.reserve(scalars.size()); + for (const auto & [_, scoped] : scalars) + out.push_back(scoped); + return out; +} + +void NamedScalarsManager::ensureCreatable(NamedScalarCacheKind cache_kind) const +{ + if (cache_kind == NamedScalarCacheKind::Shared && !usesKeeperDefinitions()) + throw Exception( + ErrorCodes::SHARED_NAMED_SCALARS_NOT_CONFIGURED, + "Shared named scalar cache requires Keeper-backed named scalar definitions " + "(configure )"); +} + +bool NamedScalarsManager::definitionExists(const String & name) const +{ + return definition_store->definitionExists(name); +} + +NamedScalarPtr NamedScalarsManager::swapScalar(NamedScalarPtr scalar, NamedScalarCacheKind cache_kind) +{ + chassert(scalar); + const String name = scalar->getName(); + NamedScalarPtr replaced; + std::unique_lock lock(scalars_mutex); + auto it = scalars.find(name); + if (it == scalars.end()) + { + scalars.emplace(name, NamedScalarWithScope{.cache_kind = cache_kind, .scalar = std::move(scalar)}); + } + else + { + replaced = std::move(it->second.scalar); + it->second = NamedScalarWithScope{.cache_kind = 
cache_kind, .scalar = std::move(scalar)}; + } + return replaced; +} + +void NamedScalarsManager::installScalar(NamedScalarPtr scalar, NamedScalarCacheKind cache_kind) +{ + auto replaced = swapScalar(std::move(scalar), cache_kind); + if (replaced) + replaced->shutdown(); +} + +bool NamedScalarsManager::dropScalar(const String & name) +{ + NamedScalarPtr removed; + { + std::unique_lock lock(scalars_mutex); + auto it = scalars.find(name); + if (it == scalars.end()) + return false; + removed = std::move(it->second.scalar); + scalars.erase(it); + } + removed->shutdown(); + return true; +} + +void NamedScalarsManager::shutdownScalars() +{ + std::unordered_map drained; + { + std::unique_lock lock(scalars_mutex); + drained = std::move(scalars); + scalars.clear(); + } + for (auto & [_, scoped] : drained) + scoped.scalar->shutdown(); +} + +bool NamedScalarsManager::create(NamedScalarCreateRequest request, const ContextPtr & context) +{ + const auto cache_kind = request.cache_kind; + const auto & name = request.name; + ensureCreatable(cache_kind); + + const ContextPtr task_context = context->getGlobalContext(); + auto log = getLogger("NamedScalarsManager"); + + /// Fast existence pre-check against the local map so IF NOT EXISTS + /// short-circuits without parsing + evaluating the SELECT. Authoritative + /// serialisation happens in publishDefinition; missing the local map + /// (peer just CREATEd, watcher hasn't reconciled) only costs a wasted + /// SELECT — publishDefinition will still serve IF NOT EXISTS correctly. + if (!request.or_replace) + { + std::shared_lock lock(scalars_mutex); + if (scalars.contains(name)) + { + if (request.if_not_exists) + return false; + throw Exception(ErrorCodes::NAMED_SCALAR_ALREADY_EXISTS, "Named scalar '{}' already exists", name); + } + } + + /// Parse + build the in-memory ParsedDefinition. The interpreter has + /// already materialised an explicit UUID into request.formatted_create_query. 
+ auto parsed = parseAndValidateDefinition( + name, request.formatted_create_query, + std::chrono::system_clock::now(), task_context, log); + if (!parsed) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid serialized definition for named scalar '{}'", name); + + auto & backend = valueBackendFor(cache_kind); + auto scalar = std::make_shared(*parsed, backend); + + /// Best-effort orphan-cleanup helper: any failure path between + /// writing the value blob and publishing the definition leaves an + /// orphan at parsed->uuid that we want to remove. removeValue may + /// itself throw (transient backend error) - that's fine, log and + /// move on; for the local backend the startup orphan-sweep is the + /// long-term safety net. + auto remove_orphan_value = [&](const String & reason) noexcept + { + try { backend.removeValue(parsed->uuid); } + catch (...) { tryLogCurrentException(log, reason); } + }; + + /// Run the SELECT and write the value blob BEFORE we publish the + /// definition. If evaluation throws, no durable definition exists + /// (the caller sees the exception). On crash between this write and + /// the definition publish, the value blob is orphaned at this UUID. + try + { + scalar->evaluateAndStoreValue(task_context); + } + catch (...) + { + remove_orphan_value(fmt::format("removing orphaned value for failed CREATE '{}'", name)); + throw; + } + + String old_uuid_to_remove; + NamedScalarPtr replaced; + + try + { + std::lock_guard catalog_lock(create_drop_mutex); + + /// Cap check uses the local map, not the definition store: doing + /// a `definitionExists` here would be a second Keeper round-trip + /// for shared mode just to gate a soft guardrail. In the rare + /// case where a peer just CREATEd a scalar but our watcher + /// hasn't installed it locally yet, we may be off by one - that's + /// acceptable for `max_named_scalars` (it's not a hard invariant). 
+ const UInt64 cap = context->getConfigRef().getUInt64("max_named_scalars", 500); + bool name_exists_locally; + size_t local_count; + { + std::shared_lock lock(scalars_mutex); + name_exists_locally = scalars.contains(name); + local_count = scalars.size(); + } + if (!name_exists_locally && local_count >= cap) + { + remove_orphan_value("removing orphaned value after exceeding cap"); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Too many named scalars ({} already, max {}); raise in the server config to grow", + local_count, cap); + } + + /// Capture the predecessor's UUID so we can clean up its value + /// blob after a successful OR REPLACE. Same cache_kind is assumed + /// (the interpreter rejects KIND changes via OR REPLACE). + if (request.or_replace) + { + std::shared_lock lock(scalars_mutex); + auto it = scalars.find(name); + if (it != scalars.end() && it->second.scalar) + old_uuid_to_remove = it->second.scalar->getUUID(); + } + + /// publishDefinition is the authoritative serialiser: it throws + /// NAMED_SCALAR_ALREADY_EXISTS for non-IF-NOT-EXISTS races, and + /// returns false for the legitimate IF-NOT-EXISTS-with-existing + /// case. No manual re-check needed. + const bool definition_published = definition_store->publishDefinition( + name, + request.formatted_create_query, + request.if_not_exists, + request.or_replace, + context->getSettingsRef()); + if (!definition_published) + { + remove_orphan_value("removing orphaned value after publishDefinition declined"); + return false; + } + + /// Swap in the new scalar (atomic map update) and start its task, + /// but do NOT shut down the predecessor here: holding + /// create_drop_mutex through `replaced->shutdown()` would + /// serialise all DDL on the predecessor's in-flight refresh + /// drain. The predecessor's `live` flag is already false from + /// this point on; we just defer the cancel-and-join. + replaced = swapScalar(scalar, cache_kind); + scalar->start(task_context); + } + catch (...) 
+ { + remove_orphan_value( + fmt::format("removing orphaned value after publishDefinition or installScalar threw for '{}'", name)); + throw; + } + + /// Slow path: predecessor shutdown JOINS an in-flight refresh body. + /// Outside the catalog lock so concurrent DDL on other scalars is + /// not blocked. + if (replaced) + replaced->shutdown(); + + /// Best-effort: remove the predecessor's value blob now that the new + /// scalar is live. Not catastrophic if it fails - leaves an orphan. + if (!old_uuid_to_remove.empty() && old_uuid_to_remove != parsed->uuid) + { + try { backend.removeValue(old_uuid_to_remove); } + catch (...) { tryLogCurrentException(log, fmt::format("removing prior value for OR REPLACE of '{}'", name)); } + } + + if (watcher) + watcher->pokeResyncAll(); + return true; +} + +bool NamedScalarsManager::drop(const String & name, bool throw_if_not_exists) +{ + std::lock_guard catalog_lock(create_drop_mutex); + + NamedScalarCacheKind cache_kind = NamedScalarCacheKind::Local; + String value_key; + if (auto found = tryGetScopedScalar(name)) + { + cache_kind = found->cache_kind; + value_key = found->scalar->getUUID(); + } + else + { + String definition_blob; + if (definition_store->readDefinition(name, definition_blob)) + { + cache_kind = cacheKindFromDefinitionBlob(definition_blob, Context::getGlobalContextInstance(), getLogger("NamedScalarsManager")); + if (auto uuid = getNamedScalarUUIDFromSerializedDefinition( + definition_blob, + Context::getGlobalContextInstance(), + getLogger("NamedScalarsManager"))) + value_key = *uuid; + } + } + + const bool removed = definition_store->removeDefinition(name, throw_if_not_exists); + if (!removed) + return false; + + dropScalar(name); + if (!value_key.empty()) + { + try + { + valueBackendFor(cache_kind).removeValue(value_key); + } + catch (...) 
+ { + tryLogCurrentException(getLogger("NamedScalarsManager"), + fmt::format("removing value blob for dropped scalar '{}'", name)); + } + } + return true; +} + +void NamedScalarsManager::refreshNow(const String & name) +{ + auto scalar = tryGetScalar(name); + if (!scalar) + throw Exception(ErrorCodes::NAMED_SCALAR_NOT_FOUND, "named scalar '{}' doesn't exist", name); + scalar->requestRefreshNow(); +} + +void NamedScalarsManager::setRefreshPaused(const String & name, bool paused) +{ + auto scalar = tryGetScalar(name); + if (!scalar) + throw Exception(ErrorCodes::NAMED_SCALAR_NOT_FOUND, "named scalar '{}' doesn't exist", name); + /// Symmetric with refreshNow's NAMED_SCALAR_NOT_REFRESHABLE: when an + /// operator names a specific scalar, surface the mismatch instead of + /// silently no-opping. The iterate-all path keeps silent-skip. + if (!scalar->isRefreshable()) + throw Exception( + ErrorCodes::NAMED_SCALAR_NOT_REFRESHABLE, + "named scalar '{}' is not refreshable", + name); + scalar->setRefreshPaused(paused); +} + +void NamedScalarsManager::setAllRefreshesPaused(bool paused) +{ + for (const auto & scalar : listAllScalars()) + scalar->setRefreshPaused(paused); +} + +void NamedScalarsManager::installStoredDefinition( + const ContextPtr & context, + const String & name, + const String & definition_blob, + LoggerPtr log) +{ + auto cache_kind = cacheKindFromDefinitionBlob(definition_blob, context, log); + + auto parsed = parseAndValidateDefinition( + name, definition_blob, std::chrono::system_clock::now(), context, log); + if (!parsed) + return; + + /// Same-UUID fast path: nothing to do. The watcher has re-armed its + /// own data-watch as part of reading the blob; the scalar's value + /// watch is independently armed by the scalar itself. 
+ /// + /// This check is intentionally NOT serialised against `create_drop_mutex`: + /// a local CREATE racing with this watcher reconcile may end up + /// installing the same UUID twice (via both paths), in which case + /// the second call to `installScalar` just swaps in an equivalent + /// instance and shuts the first one down. End state is consistent; + /// holding `create_drop_mutex` here would block local CREATE on + /// every reconcile (which does I/O), and the wasted work is rare. + { + std::shared_lock lock(scalars_mutex); + auto it = scalars.find(name); + if (it != scalars.end() + && it->second.scalar + && it->second.scalar->getUUID() == parsed->uuid) + return; + } + + /// Different-UUID (peer CREATE / OR REPLACE / DROP-and-recreate): + /// drop+create. Swap the new scalar in atomically, then shutdown the + /// predecessor outside the swap so the watcher loop is not blocked + /// by an in-flight refresh drain on the old scalar. + auto scalar = std::make_shared(std::move(*parsed), valueBackendFor(cache_kind)); + scalar->loadValueFromBackend(); + auto replaced = swapScalar(scalar, cache_kind); + scalar->start(context); + if (replaced) + replaced->shutdown(); +} + +} diff --git a/src/Interpreters/NamedScalars/NamedScalarsManager.h b/src/Interpreters/NamedScalars/NamedScalarsManager.h new file mode 100644 index 000000000000..586133a56b5f --- /dev/null +++ b/src/Interpreters/NamedScalars/NamedScalarsManager.h @@ -0,0 +1,176 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class NamedScalarValueBackendLocal; +class SharedNamedScalarsWatcher; + +/// CREATE request at the interpreter/manager boundary. The interpreter has +/// validated permissions, materialized UUID/cache kind/definer into SQL, +/// and formatted the CREATE query that the definition store will persist. 
struct NamedScalarCreateRequest
{
    /// Value-cache kind of the new scalar. NamedScalarsManager::create passes it to
    /// ensureCreatable and uses it to pick the value backend.
    NamedScalarCacheKind cache_kind = NamedScalarCacheKind::Local;
    /// Name of the scalar; key in the manager's scalar map and in the definition store.
    String name;
    /// Formatted CREATE statement. The manager parses it into the in-memory
    /// definition and hands the same text to publishDefinition.
    String formatted_create_query;
    /// When set, create() returns false instead of throwing if the name already exists.
    bool if_not_exists = false;
    /// When set, create() skips the existence pre-check and, after a successful
    /// publish, removes the predecessor's value blob.
    bool or_replace = false;
};
+/// * Publish path - initial evaluate, refresh body, backend-triggered +/// value reload - serialises on the per-scalar `publish_mutex`. +/// * `NamedScalar::shutdown()` is the publish-vs-DROP fence: it blocks +/// until any in-flight publish returns, so the caller's subsequent +/// durable removal can't be overtaken by a stale publish. +/// +/// Shutdown is explicit: the shared watcher is stopped first, then scalar +/// refresh tasks are drained while Context services, Keeper, and schedule +/// pools are still alive. Destructors are idempotent fallbacks only. +class NamedScalarsManager +{ +public: + static void checkName(const String & name); + + NamedScalarsManager(const String & definitions_disk_path, + const String & definitions_zookeeper_path, + const String & local_cache_path, + NamedScalarCacheKind default_cache_kind_, + const ContextPtr & global_context); + ~NamedScalarsManager(); + + NamedScalarsManager(const NamedScalarsManager &) = delete; + NamedScalarsManager & operator=(const NamedScalarsManager &) = delete; + + void initialize(const ContextPtr & global_context); + void shutdown(); + + /// Hot path for the getNamedScalar UDF and other readers that + /// don't care about the cache kind. Returns null if no scalar with + /// this name. Use tryGetScopedScalar when cache_kind is needed. + NamedScalarPtr tryGetScalar(const String & name) const; + std::optional tryGetScopedScalar(const String & name) const; + std::vector listScalars() const; + + /// Authoritative cache-kind lookup: prefers the local map (cheap), + /// falls back to the persisted definition (Keeper RTT for SHARED). + /// Returns nullopt if no definition exists for the name. Used by + /// CREATE OR REPLACE kind-change guard and DROP ON CLUSTER routing + /// — both want the persisted truth, not the live-map view. 
+ std::optional getCacheKind( + const String & name, + const ContextPtr & context, + LoggerPtr log) const; + NamedScalarCacheKind getDefaultCacheKind() const { return default_cache_kind; } + + /// Throws SHARED_NAMED_SCALARS_NOT_CONFIGURED when the server uses + /// disk definitions. A shared value cache needs a Keeper-backed + /// definition store so DROP / OR REPLACE have one cluster-wide owner. + void ensureCreatable(NamedScalarCacheKind cache_kind) const; + + /// Authoritative existence check; for shared scope queries Keeper + /// directly so an IF NOT EXISTS short-circuit can't miss entries the + /// watcher hasn't reconciled yet. + bool definitionExists(const String & name) const; + + bool create(NamedScalarCreateRequest request, const ContextPtr & context); + bool drop(const String & name, bool throw_if_not_exists); + void refreshNow(const String & name); + /// Per-server pause; not propagated to peers, not persisted. + void setRefreshPaused(const String & name, bool paused); + void setAllRefreshesPaused(bool paused); + +private: + friend class SharedNamedScalarsWatcher; + + bool usesKeeperDefinitions() const { return definition_store->isKeeperBacked(); } + INamedScalarValueBackend & valueBackendFor(NamedScalarCacheKind cache_kind) const; + NamedScalarCacheKind cacheKindFromDefinitionBlob(const String & definition_blob, const ContextPtr & context, LoggerPtr log) const; + + void installScalar(NamedScalarPtr scalar, NamedScalarCacheKind cache_kind); + + /// Like `installScalar` but returns the replaced predecessor (or null + /// if no entry existed) WITHOUT shutting it down. The caller is + /// responsible for calling `shutdown()` on the returned scalar AFTER + /// any catalog mutex is released. Predecessor `shutdown()` cancels + /// and joins the in-flight refresh body; doing that under + /// `create_drop_mutex` would serialise all named-scalar DDL on the + /// slowest predecessor. 
Use this when adding + /// the new scalar inside a catalog lock and dropping the predecessor + /// outside it. + [[nodiscard]] NamedScalarPtr swapScalar(NamedScalarPtr scalar, NamedScalarCacheKind cache_kind); + bool dropScalar(const String & name); + void shutdownScalars(); + std::vector listAllScalars() const; + void installStoredDefinition( + const ContextPtr & context, + const String & name, + const String & definition_blob, + LoggerPtr log); + + NamedScalarDefinitionStorePtr definition_store; + WatchableNamedScalarDefinitionStorePtr watchable_definition_store; + NamedScalarCacheKind default_cache_kind = NamedScalarCacheKind::Local; + std::unique_ptr local_value_backend; + std::unique_ptr shared_value_backend; + std::unique_ptr watcher; + + mutable std::shared_mutex scalars_mutex; + std::unordered_map scalars; + std::mutex create_drop_mutex; +}; + +} diff --git a/src/Interpreters/NamedScalars/SharedNamedScalarsWatcher.cpp b/src/Interpreters/NamedScalars/SharedNamedScalarsWatcher.cpp new file mode 100644 index 000000000000..d9d90c867e7d --- /dev/null +++ b/src/Interpreters/NamedScalars/SharedNamedScalarsWatcher.cpp @@ -0,0 +1,187 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +namespace DB +{ + +namespace +{ +constexpr size_t WATCHER_QUEUE_POLL_TIMEOUT_MS = 10000; +} + +SharedNamedScalarsWatcher::SharedNamedScalarsWatcher( + ContextPtr global_context_, + WatchableNamedScalarDefinitionStorePtr definition_store_, + NamedScalarsManager & manager_) + : global_context(std::move(global_context_)) + , definition_store(std::move(definition_store_)) + , manager(manager_) + , log(getLogger("SharedNamedScalarsWatcher")) + , queue(std::make_shared>(std::numeric_limits::max())) + , resync_requested(std::make_shared>(false)) +{ +} + +SharedNamedScalarsWatcher::~SharedNamedScalarsWatcher() +{ + stop(); +} + +void SharedNamedScalarsWatcher::start() +{ + if (running.exchange(true)) + return; + /// First 
reconcile runs synchronously: avoids a "not found" window + /// for entries that already exist in Keeper. A wedged Keeper just + /// throws; the watch thread's `!loaded` loop retries. + try + { + initialLoad(); + loaded = true; + } + catch (...) + { + tryLogCurrentException(log, "initial load failed; watcher will retry in the background"); + } + thread = ThreadFromGlobalPool(&SharedNamedScalarsWatcher::watchLoop, this); +} + +void SharedNamedScalarsWatcher::stop() +{ + if (!running.exchange(false)) + return; + queue->finish(); + if (thread.joinable()) + thread.join(); +} + +void SharedNamedScalarsWatcher::pokeResyncAll() +{ + if (!running) + return; + requestResync(queue, resync_requested); +} + +void SharedNamedScalarsWatcher::requestResync( + const std::shared_ptr> & queue, + const std::shared_ptr> & resync_requested) +{ + if (!resync_requested->exchange(true)) + { + if (!queue->emplace(Wakeup{})) + resync_requested->store(false); + } +} + +void SharedNamedScalarsWatcher::watchLoop() +{ + setThreadName(ThreadName::SHARED_NAMED_SCALARS); + LOG_DEBUG(log, "Shared named_scalars watcher started"); + + while (running) + { + try + { + if (!loaded) + { + initialLoad(); + loaded = true; + } + + Wakeup wakeup; + if (!queue->tryPop(wakeup, WATCHER_QUEUE_POLL_TIMEOUT_MS)) + continue; + + /// Drain the flag in a loop: a request that arrived between + /// `exchange(false)` returning true and `resyncAll()` finishing + /// must still be served before we go back to sleep. + while (resync_requested->exchange(false)) + resyncAll(); + } + catch (...) + { + /// Don't reset resync_requested here: a watch callback that fired + /// during the failing reconcile would have its wake-up dropped. + /// The next iteration will exchange(false) and serve it. 
+ tryLogCurrentException(log, "Shared named_scalars watcher loop"); + loaded = false; + sleepForSeconds(5); + } + } + + LOG_DEBUG(log, "Shared named_scalars watcher stopped"); +} + +void SharedNamedScalarsWatcher::initialLoad() +{ + Strings names = readDefinitionsAndInstallChildrenWatch(); + for (const auto & name : names) + reconcileScalar(name); +} + +void SharedNamedScalarsWatcher::resyncAll() +{ + Strings names = readDefinitionsAndInstallChildrenWatch(); + std::unordered_set present(names.begin(), names.end()); + + for (const auto & scalar : manager.listAllScalars()) + { + const auto & scalar_name = scalar->getName(); + if (!present.contains(scalar_name)) + manager.dropScalar(scalar_name); + } + + for (const auto & name : names) + reconcileScalar(name); +} + +void SharedNamedScalarsWatcher::reconcileScalar(const String & name) +{ + String definition_blob; + if (!readDefinitionData(name, definition_blob)) + { + manager.dropScalar(name); + return; + } + + manager.installStoredDefinition( + global_context, + name, + definition_blob, + log); +} + +Strings SharedNamedScalarsWatcher::readDefinitionsAndInstallChildrenWatch() +{ + return definition_store->listDefinitionsWithChildrenWatch( + [q = queue, requested = resync_requested] + { + requestResync(q, requested); + }); +} + +bool SharedNamedScalarsWatcher::readDefinitionData(const String & name, String & out) +{ + return definition_store->readDefinitionWithDataWatch( + name, + out, + [q = queue, requested = resync_requested] + { + requestResync(q, requested); + }); +} + +} diff --git a/src/Interpreters/NamedScalars/SharedNamedScalarsWatcher.h b/src/Interpreters/NamedScalars/SharedNamedScalarsWatcher.h new file mode 100644 index 000000000000..59e153785a51 --- /dev/null +++ b/src/Interpreters/NamedScalars/SharedNamedScalarsWatcher.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include + +#include +#include +#include + +#include +#include + +template class ConcurrentBoundedQueue; + +namespace DB +{ + +class 
NamedScalarsManager; + +/// Reconciles the shared-scope scalar map with Keeper state. +/// Owned privately by NamedScalarsManager. See NamedScalarsManager.h +/// for the module overview. +/// +/// Watch budget: 1 children-watch on the definitions directory + per-scalar +/// data-watches on `/defs/` and +/// `/values//value`. Capped by +/// `` (default 500). +/// +/// Reconciliation paths: +/// * definitions children-fire (peer CREATE / DROP): re-read +/// getChildren, diff against the runtime, install fresh + drop missing. +/// * `/defs/` data-fire (peer OR REPLACE): targeted +/// reconcile for that scalar. +/// +/// Value-side watches (`/values//value`) do NOT go through +/// here: the backend invokes `NamedScalar::onValueChanged()` directly, +/// which is cheap enough to run on the Keeper IO thread (atomic flip + +/// task->schedule()). +class SharedNamedScalarsWatcher +{ +public: + SharedNamedScalarsWatcher( + ContextPtr global_context_, + WatchableNamedScalarDefinitionStorePtr definition_store_, + NamedScalarsManager & manager_); + ~SharedNamedScalarsWatcher(); + + SharedNamedScalarsWatcher(const SharedNamedScalarsWatcher &) = delete; + SharedNamedScalarsWatcher & operator=(const SharedNamedScalarsWatcher &) = delete; + + void start(); + void stop(); + + /// Self-pickup nudge after a local CREATE / DROP, without waiting + /// for the Keeper children-watch echo. + void pokeResyncAll(); + +private: + /// The queue carries no payload - it's just a wakeup mechanism for + /// the watch thread. Real state lives in `resync_requested`. 
+ struct Wakeup {}; + + static void requestResync( + const std::shared_ptr> & queue, + const std::shared_ptr> & resync_requested); + + void watchLoop(); + void initialLoad(); + void resyncAll(); + void reconcileScalar(const String & name); + + Strings readDefinitionsAndInstallChildrenWatch(); + bool readDefinitionData(const String & name, String & out); + + ContextPtr global_context; + WatchableNamedScalarDefinitionStorePtr definition_store; + NamedScalarsManager & manager; + LoggerPtr log; + + std::shared_ptr> queue; + std::shared_ptr> resync_requested; + ThreadFromGlobalPool thread; + std::atomic running{false}; + std::atomic loaded{false}; +}; + +} diff --git a/src/Interpreters/registerInterpreters.cpp b/src/Interpreters/registerInterpreters.cpp index 21795e8a5be8..f7debcde8cf5 100644 --- a/src/Interpreters/registerInterpreters.cpp +++ b/src/Interpreters/registerInterpreters.cpp @@ -52,6 +52,7 @@ void registerInterpreterShowPrivilegesQuery(InterpreterFactory & factory); void registerInterpreterTransactionControlQuery(InterpreterFactory & factory); void registerInterpreterCreateFunctionQuery(InterpreterFactory & factory); void registerInterpreterDropFunctionQuery(InterpreterFactory & factory); +void registerInterpreterNamedScalarDDLQuery(InterpreterFactory & factory); void registerInterpreterCreateWorkloadQuery(InterpreterFactory & factory); void registerInterpreterDropWorkloadQuery(InterpreterFactory & factory); void registerInterpreterCreateResourceQuery(InterpreterFactory & factory); @@ -118,6 +119,7 @@ void registerInterpreters() registerInterpreterTransactionControlQuery(factory); registerInterpreterCreateFunctionQuery(factory); registerInterpreterDropFunctionQuery(factory); + registerInterpreterNamedScalarDDLQuery(factory); registerInterpreterCreateWorkloadQuery(factory); registerInterpreterDropWorkloadQuery(factory); registerInterpreterCreateResourceQuery(factory); diff --git a/src/Parsers/ASTNamedScalarDDLQuery.cpp b/src/Parsers/ASTNamedScalarDDLQuery.cpp 
new file mode 100644 index 000000000000..9bc1734a8470 --- /dev/null +++ b/src/Parsers/ASTNamedScalarDDLQuery.cpp @@ -0,0 +1,90 @@ +#include +#include +#include +#include + +namespace DB +{ + +ASTPtr ASTNamedScalarDDLQuery::clone() const +{ + chassert(named_scalar_name); + auto res = make_intrusive(*this); + res->children.clear(); + + res->named_scalar_name = named_scalar_name->clone(); + res->children.push_back(res->named_scalar_name); + + if (sql_security) + { + res->sql_security = sql_security->clone(); + res->children.push_back(res->sql_security); + } + + if (expression) + { + res->expression = expression->clone(); + res->children.push_back(res->expression); + } + + return res; +} + +void ASTNamedScalarDDLQuery::formatImpl( + WriteBuffer & ostr, const IAST::FormatSettings & settings, IAST::FormatState & state, IAST::FormatStateStacked frame) const +{ + chassert(named_scalar_name); + if (action == Action::Create) + { + ostr << "CREATE "; + if (or_replace) + ostr << "OR REPLACE "; + } + else + { + ostr << "DROP "; + } + + if (action == Action::Create && cache_kind == CacheKind::Local) + ostr << "LOCAL "; + else if (action == Action::Create && cache_kind == CacheKind::Shared) + ostr << "SHARED "; + + ostr << "NAMED SCALAR "; + + if (action == Action::Create && if_not_exists) + ostr << "IF NOT EXISTS "; + else if (action == Action::Drop && if_exists) + ostr << "IF EXISTS "; + + named_scalar_name->format(ostr, settings, state, frame); + + if (action == Action::Create && uuid != UUIDHelpers::Nil) + ostr << " UUID " << quoteString(toString(uuid)); + + formatOnCluster(ostr, settings); + + if (action == Action::Create) + { + if (sql_security) + { + ostr << " "; + sql_security->format(ostr, settings, state, frame); + } + + if (refresh_period_seconds) + ostr << " REFRESH EVERY " << *refresh_period_seconds << " SECOND"; + + ostr << " AS "; + expression->format(ostr, settings, state, frame); + } +} + +String ASTNamedScalarDDLQuery::getNamedScalarName() const +{ + String 
name; + tryGetIdentifierNameInto(named_scalar_name, name); + return name; +} + +} diff --git a/src/Parsers/ASTNamedScalarDDLQuery.h b/src/Parsers/ASTNamedScalarDDLQuery.h new file mode 100644 index 000000000000..e6840558570c --- /dev/null +++ b/src/Parsers/ASTNamedScalarDDLQuery.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include + +#include +#include + +#include + +namespace DB +{ + +/// AST for CREATE and DROP NAMED SCALAR. +class ASTNamedScalarDDLQuery : public IAST, public ASTQueryWithOnCluster +{ +public: + enum class Action : uint8_t { Create, Drop }; + enum class CacheKind : uint8_t { Default, Local, Shared }; + + Action action = Action::Create; + CacheKind cache_kind = CacheKind::Default; + + ASTPtr named_scalar_name; + + /// CREATE. + ASTPtr expression; + ASTPtr sql_security; + std::optional refresh_period_seconds; + bool if_not_exists = false; + bool or_replace = false; + UUID uuid = UUIDHelpers::Nil; + + /// DROP. + bool if_exists = false; + + String getID(char delim) const override + { + const char * tag = action == Action::Create ? "CreateScalarQuery" : "DropScalarQuery"; + return String(tag) + (delim + getNamedScalarName()); + } + + ASTPtr clone() const override; + + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override + { + return removeOnCluster(clone()); + } + + String getNamedScalarName() const; + + QueryKind getQueryKind() const override + { + return action == Action::Create ? 
QueryKind::Create : QueryKind::Drop; + } + +protected: + void formatImpl(WriteBuffer & ostr, const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; +}; + +} diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index 49c6611b7b5e..d05882356477 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -470,6 +470,22 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti print_database_table(); break; } + case Type::REFRESH_NAMED_SCALAR: + { + ostr << ' '; + print_identifier(named_scalar_name); + break; + } + case Type::START_NAMED_SCALAR_REFRESHES: + case Type::STOP_NAMED_SCALAR_REFRESHES: + { + if (!named_scalar_name.empty()) + { + ostr << ' '; + print_identifier(named_scalar_name); + } + break; + } case Type::TEST_VIEW: { ostr << ' '; diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 3d4b087207d1..eae1630823fd 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -119,6 +119,9 @@ class ASTSystemQuery : public IAST, public ASTQueryWithOnCluster STOP_CLEANUP, START_CLEANUP, RESET_COVERAGE, + REFRESH_NAMED_SCALAR, + START_NAMED_SCALAR_REFRESHES, + STOP_NAMED_SCALAR_REFRESHES, REFRESH_VIEW, WAIT_VIEW, START_VIEW, @@ -192,6 +195,10 @@ class ASTSystemQuery : public IAST, public ASTQueryWithOnCluster String fail_point_name; + /// SYSTEM { REFRESH | { START | STOP } REFRESHES } NAMED SCALAR . + /// Empty name = the "{START|STOP} ... REFRESHES" no-arg form. 
+ String named_scalar_name; + enum class FailPointAction { UNSPECIFIED, diff --git a/src/Parsers/CommonParsers.h b/src/Parsers/CommonParsers.h index 21f51ddbfab7..e14e08e819be 100644 --- a/src/Parsers/CommonParsers.h +++ b/src/Parsers/CommonParsers.h @@ -355,6 +355,7 @@ namespace DB MR_MACROS(N, "N") \ MR_MACROS(NAME, "NAME") \ MR_MACROS(NAMED_COLLECTION, "NAMED COLLECTION") \ + MR_MACROS(NAMED_SCALAR, "NAMED SCALAR") \ MR_MACROS(NANOSECOND, "NANOSECOND") \ MR_MACROS(NANOSECONDS, "NANOSECONDS") \ MR_MACROS(NEXT, "NEXT") \ @@ -485,6 +486,7 @@ namespace DB MR_MACROS(SET, "SET") \ MR_MACROS(SETTING, "SETTING") \ MR_MACROS(SETTINGS, "SETTINGS") \ + MR_MACROS(SHARED, "SHARED") \ MR_MACROS(SHOW_ACCESS, "SHOW ACCESS") \ MR_MACROS(SHOW_CREATE, "SHOW CREATE") \ MR_MACROS(SHOW_ENGINES, "SHOW ENGINES") \ diff --git a/src/Parsers/ParserNamedScalarDDLQuery.cpp b/src/Parsers/ParserNamedScalarDDLQuery.cpp new file mode 100644 index 000000000000..75d295e3641e --- /dev/null +++ b/src/Parsers/ParserNamedScalarDDLQuery.cpp @@ -0,0 +1,220 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int SYNTAX_ERROR; +} + +namespace +{ +/// Parse the body of `REFRESH EVERY ` after the `REFRESH` keyword +/// has already been consumed. Output is the period normalised to seconds. +/// Units: SECOND, MINUTE, HOUR, DAY (with optional plural). 
+bool parseRefreshEvery(IParser::Pos & pos, Expected & expected, UInt64 & out_seconds) +{ + ParserKeyword s_every(Keyword::EVERY); + if (!s_every.ignore(pos, expected)) + return false; + + ParserNumber number_p; + ASTPtr number_ast; + if (!number_p.parse(pos, number_ast, expected)) + return false; + const auto & number_field = number_ast->as().value; + UInt64 amount = 0; + if (number_field.getType() == Field::Types::UInt64) + amount = number_field.safeGet(); + else if (number_field.getType() == Field::Types::Int64) + { + Int64 v = number_field.safeGet(); + if (v <= 0) + return false; + amount = static_cast(v); + } + else + return false; + if (amount == 0) + return false; + + UInt64 unit_seconds = 0; + if (ParserKeyword(Keyword::SECOND).ignore(pos, expected) || ParserKeyword(Keyword::SECONDS).ignore(pos, expected)) + unit_seconds = 1; + else if (ParserKeyword(Keyword::MINUTE).ignore(pos, expected) || ParserKeyword(Keyword::MINUTES).ignore(pos, expected)) + unit_seconds = 60; + else if (ParserKeyword(Keyword::HOUR).ignore(pos, expected) || ParserKeyword(Keyword::HOURS).ignore(pos, expected)) + unit_seconds = 3600; + else if (ParserKeyword(Keyword::DAY).ignore(pos, expected) || ParserKeyword(Keyword::DAYS).ignore(pos, expected)) + unit_seconds = 86400; + else + return false; + + /// Reject overflow + cap at UInt32::max seconds (~136 years). Without + /// this, `amount * unit_seconds` wraps to 0 and the refresh task + /// fires every tick, hammering source/Keeper. + if (amount > std::numeric_limits::max() / unit_seconds) + throw Exception(ErrorCodes::SYNTAX_ERROR, + "REFRESH EVERY {} {}: refresh interval overflows after unit conversion", + amount, unit_seconds == 1 ? "SECOND" + : unit_seconds == 60 ? "MINUTE" + : unit_seconds == 3600 ? 
"HOUR" : "DAY"); + out_seconds = amount * unit_seconds; + if (out_seconds == 0) + return false; + constexpr UInt64 max_period_seconds = std::numeric_limits::max(); // ~136 years + if (out_seconds > max_period_seconds) + throw Exception(ErrorCodes::SYNTAX_ERROR, + "REFRESH EVERY interval exceeds the supported maximum of {} seconds (~136 years)", + max_period_seconds); + return true; +} +} + +bool ParserNamedScalarDDLQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & expected) +{ + ParserKeyword s_create(Keyword::CREATE); + ParserKeyword s_drop(Keyword::DROP); + ParserKeyword s_named_scalar(Keyword::NAMED_SCALAR); + ParserKeyword s_or_replace(Keyword::OR_REPLACE); + ParserKeyword s_if_not_exists(Keyword::IF_NOT_EXISTS); + ParserKeyword s_if_exists(Keyword::IF_EXISTS); + ParserKeyword s_on(Keyword::ON); + ParserKeyword s_refresh(Keyword::REFRESH); + ParserKeyword s_as(Keyword::AS); + ParserKeyword s_local(Keyword::LOCAL); + ParserKeyword s_shared(Keyword::SHARED); + ParserKeyword s_uuid(Keyword::UUID); + ParserIdentifier name_p; + ParserSelectWithUnionQuery select_p; + + using Action = ASTNamedScalarDDLQuery::Action; + Action action; + if (s_create.ignore(pos, expected)) + action = Action::Create; + else if (s_drop.ignore(pos, expected)) + action = Action::Drop; + else + return false; + + bool if_not_exists = false; + bool or_replace = false; + auto cache_kind = ASTNamedScalarDDLQuery::CacheKind::Default; + bool if_exists = false; + String cluster_str; + + ASTPtr named_scalar_name; + ASTPtr expression; + ASTPtr sql_security; + std::optional refresh_period_seconds; + UUID uuid = UUIDHelpers::Nil; + + if (action == Action::Create && s_or_replace.ignore(pos, expected)) + or_replace = true; + + if (action == Action::Create) + { + if (s_local.ignore(pos, expected)) + cache_kind = ASTNamedScalarDDLQuery::CacheKind::Local; + else if (s_shared.ignore(pos, expected)) + cache_kind = ASTNamedScalarDDLQuery::CacheKind::Shared; + } + + if (!s_named_scalar.ignore(pos, 
expected)) + return false; + + if (action == Action::Create) + { + if (s_if_not_exists.ignore(pos, expected)) + { + /// OR REPLACE and IF NOT EXISTS are mutually exclusive. + if (or_replace) + return false; + if_not_exists = true; + } + } + else + { + if (s_if_exists.ignore(pos, expected)) + if_exists = true; + } + + if (!name_p.parse(pos, named_scalar_name, expected)) + return false; + + if (action == Action::Create && s_uuid.ignore(pos, expected)) + { + ParserStringLiteral uuid_p; + ASTPtr ast_uuid; + if (!uuid_p.parse(pos, ast_uuid, expected)) + return false; + uuid = parseFromString(ast_uuid->as()->value.safeGet()); + } + + if (s_on.ignore(pos, expected)) + { + if (!ASTQueryWithOnCluster::parse(pos, cluster_str, expected)) + return false; + } + + if (action == Action::Create) + { + ParserSQLSecurity sql_security_p; + sql_security_p.parse(pos, sql_security, expected); + + if (s_refresh.ignore(pos, expected)) + { + UInt64 seconds = 0; + if (!parseRefreshEvery(pos, expected, seconds)) + return false; + refresh_period_seconds = seconds; + } + + if (!s_as.ignore(pos, expected)) + return false; + + /// AS SELECT only. 
+ if (!select_p.parse(pos, expression, expected)) + return false; + } + + auto query = make_intrusive(); + query->action = action; + query->cache_kind = cache_kind; + query->named_scalar_name = named_scalar_name; + query->children.push_back(named_scalar_name); + + if (action == Action::Create) + { + query->sql_security = sql_security; + if (query->sql_security) + query->children.push_back(query->sql_security); + query->expression = expression; + query->children.push_back(expression); + query->refresh_period_seconds = refresh_period_seconds; + query->if_not_exists = if_not_exists; + query->or_replace = or_replace; + query->uuid = uuid; + } + else + { + query->if_exists = if_exists; + } + + query->cluster = std::move(cluster_str); + node = query; + return true; +} + +} diff --git a/src/Parsers/ParserNamedScalarDDLQuery.h b/src/Parsers/ParserNamedScalarDDLQuery.h new file mode 100644 index 000000000000..81fed9ee0be8 --- /dev/null +++ b/src/Parsers/ParserNamedScalarDDLQuery.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace DB +{ + +/// Parses both CREATE and DROP NAMED SCALAR forms: +/// CREATE [OR REPLACE] [SHARED] NAMED SCALAR [IF NOT EXISTS] name +/// [ON CLUSTER cluster] [REFRESH EVERY ] AS