Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sdk/cosmos/azure-cosmos/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
### 4.16.2 (Unreleased)

#### Features Added
* Added cold-start metadata cache cross-region hedging. When enabled, the SDK hedges the first-time population of the container and partition-key-range metadata caches to a second region if the primary region is slow, returning the first acceptable response. Controlled by the new `enable_metadata_hedging_for_cold_start` client keyword (tri-state: `None` follows the account's PPAF state, `True` forces it on, `False` disables it).

#### Breaking Changes

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,24 @@
DEFAULT_THRESHOLD_MS = 500
DEFAULT_THRESHOLD_STEPS_MS = 100

# Defaults for cold-start metadata cache hedging. These are SDK-derived and not
# customer-configurable. The threshold is an aggressive tail-latency trigger: metadata
# (control-plane) reads use a short read timeout (DBAReadTimeout, 3s by default), so the
# 1.5s hedge fires well before the primary's own timeout, giving a slow region a chance
# to be raced by a second region while the primary attempt is still outstanding. The
# concurrency budget caps the number of in-flight metadata hedges per client.
DEFAULT_METADATA_HEDGING_THRESHOLD_MS = 1500
DEFAULT_METADATA_HEDGING_THRESHOLD_STEPS_MS = 500
DEFAULT_METADATA_HEDGING_CONCURRENCY_BUDGET = 8


class CrossRegionHedgingStrategy:
"""Configuration for cross-region request hedging strategy.

:param config: Dictionary containing configuration values, defaults to None
:type config: Optional[Dict[str, Any]]
:raises ValueError: If configuration values are invalid

The config dictionary can contain:
- threshold_ms: Time in ms before routing to alternate region (default: 500)
- threshold_steps_ms: Time interval between routing attempts (default: 100)
Expand All @@ -53,11 +63,49 @@ def __init__(self, config: Optional[dict[str, Any]] = None) -> None:
raise ValueError("threshold_steps_ms must be positive")


class MetadataCrossRegionHedgingStrategy(CrossRegionHedgingStrategy):
"""Cold-start metadata cache cross-region hedging configuration.

Unlike :class:`CrossRegionHedgingStrategy`, the metadata hedging threshold is a
fixed, SDK-derived value and is not customer-configurable. The strategy is bounded
to the primary request plus a single cross-region hedge.
"""

def __init__(self) -> None:
super().__init__(
{
"threshold_ms": DEFAULT_METADATA_HEDGING_THRESHOLD_MS,
"threshold_steps_ms": DEFAULT_METADATA_HEDGING_THRESHOLD_STEPS_MS,
}
)


def resolve_metadata_hedging_opt_in(opt_in: Optional[bool], ppaf_enabled: bool) -> bool:
"""Resolve the tri-state cold-start metadata hedging opt-in to a concrete bool.

When the customer leaves the opt-in ``None`` (the default), cold-start metadata
hedging follows the account's PPAF (Per-Partition Automatic Failover) state: it is
enabled when PPAF is enabled and disabled otherwise. An explicit ``True`` enables
hedging even when PPAF is disabled, and an explicit ``False`` disables it regardless
of PPAF.

:param opt_in: The customer-supplied tri-state opt-in (True/False/None).
:type opt_in: Optional[bool]
:param ppaf_enabled: Whether Per-Partition Automatic Failover is enabled for the account.
:type ppaf_enabled: bool
:returns: True if cold-start metadata hedging should be applied, False otherwise.
:rtype: bool
"""
if opt_in is None:
return ppaf_enabled
return opt_in


def _validate_request_hedging_strategy(
config: Optional[Union[bool, dict[str, Any]]]
) -> Union[CrossRegionHedgingStrategy, bool, None]:
"""Validate and create a CrossRegionHedgingStrategy for a request.

:param config: Configuration for availability strategy. Can be:
- None: Returns None (no strategy, uses client default if available)
- True: Returns strategy with default values (threshold_ms=500, threshold_steps_ms=100)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
from . import http_constants, exceptions
from ._auth_policy import CosmosBearerTokenCredentialPolicy
from ._availability_strategy_config import validate_client_hedging_strategy, CrossRegionHedgingStrategy
from ._metadata_hedging import MetadataCrossRegionHedgingHandler
from ._base import _build_properties_cache
from ._change_feed.change_feed_iterable import ChangeFeedIterable
from ._change_feed.change_feed_state import ChangeFeedState
Expand All @@ -71,7 +72,7 @@
from ._cosmos_responses import CosmosDict, CosmosList, CosmosItemPaged
from ._range_partition_resolver import RangePartitionResolver
from ._read_items_helper import ReadItemsHelperSync
from ._request_object import RequestObject
from ._request_object import RequestObject, METADATA_CACHE_POPULATION_OPTION
from ._retry_utility import ConnectionRetryPolicy
from ._routing import routing_map_provider
from ._query_advisor import get_query_advice_info
Expand Down Expand Up @@ -143,6 +144,7 @@ def __init__( # pylint: disable=too-many-statements
consistency_level: Optional[str] = None,
availability_strategy: Union[bool, dict[str, Any]] = False,
availability_strategy_executor: Optional[ThreadPoolExecutor] = None,
enable_metadata_hedging_for_cold_start: Optional[bool] = None,
**kwargs: Any
) -> None:
"""
Expand Down Expand Up @@ -170,6 +172,12 @@ def __init__( # pylint: disable=too-many-statements
self.availability_strategy: Union[CrossRegionHedgingStrategy, None] =\
validate_client_hedging_strategy(availability_strategy)
self.availability_strategy_executor: Optional[ThreadPoolExecutor] = availability_strategy_executor
# Tri-state opt-in for cold-start metadata cache cross-region hedging. None follows the
# account's PPAF state, True forces it on, False disables it (no handler is created).
self._metadata_hedging_opt_in: Optional[bool] = enable_metadata_hedging_for_cold_start
self._metadata_hedging_handler: Optional[MetadataCrossRegionHedgingHandler] = (
None if enable_metadata_hedging_for_cold_start is False else MetadataCrossRegionHedgingHandler()
)
self.master_key: Optional[str] = None

self.resource_tokens: Optional[Mapping[str, Any]] = None
Expand Down Expand Up @@ -2975,6 +2983,7 @@ def Read(
headers,
options.get("partitionKey", None))
request_params.set_excluded_location_from_options(options)
request_params.set_metadata_cache_population_from_options(options)
request_params.set_availability_strategy(options, self.availability_strategy)
request_params.availability_strategy_executor = self.availability_strategy_executor
base.set_session_token_header(self, headers, path, request_params, options)
Expand Down Expand Up @@ -3310,6 +3319,7 @@ def __GetBodiesFromQueryResult(result: dict[str, Any]) -> list[dict[str, Any]]:
options.get("partitionKey", None)
)
request_params.set_excluded_location_from_options(options)
request_params.set_metadata_cache_population_from_options(options)
request_params.set_availability_strategy(options, self.availability_strategy)
request_params.availability_strategy_executor = self.availability_strategy_executor

Expand Down Expand Up @@ -3948,7 +3958,8 @@ def refresh_routing_map_provider(

def _refresh_container_properties_cache(self, container_link: str):
# If container properties cache is stale, refresh it by reading the container.
container = self.ReadContainer(container_link, options=None)
container = self.ReadContainer(
container_link, options={METADATA_CACHE_POPULATION_OPTION: True})
# Only cache Container Properties that will not change in the lifetime of the container
self._set_container_properties_cache(container_link, _build_properties_cache(container, container_link))

Expand Down Expand Up @@ -3995,7 +4006,9 @@ def _get_partition_key_definition(
partition_key_definition = cached_container.get("partitionKey")
# Else read the collection from backend and add it to the cache
else:
container = self.ReadContainer(collection_link, options)
read_options = {**options} if options else {}
read_options[METADATA_CACHE_POPULATION_OPTION] = True
container = self.ReadContainer(collection_link, read_options)
partition_key_definition = container.get("partitionKey")
self._set_container_properties_cache(collection_link, _build_properties_cache(container, collection_link))
return partition_key_definition
Loading
Loading