From 2d932032c89de4f69c91b23e3e6e1666823bfb7e Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Fri, 22 May 2026 13:22:11 -0700 Subject: [PATCH 1/3] Add VoiceLive beta API updates --- sdk/voicelive/azure-ai-voicelive/CHANGELOG.md | 15 + sdk/voicelive/azure-ai-voicelive/README.md | 42 +- .../azure-ai-voicelive/_metadata.json | 4 +- .../apiview-properties.json | 42 +- .../azure/ai/voicelive/_types.py | 6 +- .../azure/ai/voicelive/_utils/model_base.py | 374 +++++++++++++-- .../ai/voicelive/_utils/serialization.py | 23 +- .../azure/ai/voicelive/_version.py | 2 +- .../azure/ai/voicelive/aio/_patch.py | 12 +- .../azure/ai/voicelive/models/__init__.py | 20 + .../azure/ai/voicelive/models/_enums.py | 52 ++ .../azure/ai/voicelive/models/_models.py | 448 ++++++++++++++++-- .../azure-ai-voicelive/pyproject.toml | 2 +- .../samples/BASIC_VOICE_ASSISTANT.md | 16 +- .../azure-ai-voicelive/samples/README.md | 17 +- .../samples/async_mcp_sample.py | 4 +- .../samples/basic_voice_assistant_async.py | 6 +- .../tests/test_live_realtime_service.py | 49 +- .../tests/test_unit_client_events.py | 25 + .../tests/test_unit_connection.py | 4 +- .../tests/test_unit_enums.py | 33 ++ .../tests/test_unit_models.py | 93 ++++ ...st_unit_models_interim_response_foundry.py | 21 + 23 files changed, 1142 insertions(+), 168 deletions(-) diff --git a/sdk/voicelive/azure-ai-voicelive/CHANGELOG.md b/sdk/voicelive/azure-ai-voicelive/CHANGELOG.md index 11d1229e22de..ca0fca5574da 100644 --- a/sdk/voicelive/azure-ai-voicelive/CHANGELOG.md +++ b/sdk/voicelive/azure-ai-voicelive/CHANGELOG.md @@ -1,5 +1,20 @@ # Release History +## 1.3.0b1 (2026-05-22) + +### Features Added + +- **Azure Realtime Native Voice Support**: Added `AzureRealtimeNativeVoice` and + `AzureRealtimeNativeVoiceName`, and expanded `voice` fields to accept Azure realtime native voices. +- **WebRTC Call Negotiation Support**: Added `ClientEventRtcCallSdpCreate`, `ServerEventRtcCallSdpCreated`, + `ServerEventRtcCallError`, and `RtcCallErrorDetails` for SDP-based WebRTC call setup. +- **Hosted Agent Invocation Input**: Added `invoke_input` to `ResponseCreateParams` and + `ServerEventResponseInvocationDelta` for hosted agent invocation passthrough data. +- **Audio Playback Lifecycle Events**: Added `ServerEventOutputAudioBufferStarted` and + `ServerEventOutputAudioBufferStopped` to track model audio playback start and stop. +- **Echo Cancellation Configuration**: Added `EchoCancellationReferenceSource` and new + `reference_source` / `channels` options on `AudioEchoCancellation` for client-provided stereo echo reference input. + ## 1.2.0 (2026-05-22) ### Features Added diff --git a/sdk/voicelive/azure-ai-voicelive/README.md b/sdk/voicelive/azure-ai-voicelive/README.md index cf1b1a2d8bb4..4eecdca3aa8a 100644 --- a/sdk/voicelive/azure-ai-voicelive/README.md +++ b/sdk/voicelive/azure-ai-voicelive/README.md @@ -5,7 +5,7 @@ This package provides a **real-time, speech-to-speech** client for Azure AI Voic It opens a WebSocket session to stream microphone audio to the service and receive typed server events (including audio) for responsive, interruptible conversations. -> **Status:** General Availability (GA). This is a stable release suitable for production use. +> **Status:** Preview (`1.3.0b1`). This beta release includes the latest SDK and sample updates and may change before the next stable release. > **Important:** As of version 1.0.0, this SDK is **async-only**. The synchronous API has been removed to focus exclusively on async patterns. All examples and samples use `async`/`await` syntax. @@ -16,34 +16,35 @@ Getting started ### Prerequisites -- **Python 3.9+** +- **Python 3.10+** - An **Azure subscription** - A **VoiceLive** resource and endpoint - A working **microphone** and **speakers/headphones** if you run the voice samples ### Install -Install the stable GA version: +Install the latest preview version: ```bash # Base install (core client only) -python -m pip install azure-ai-voicelive +python -m pip install --pre azure-ai-voicelive # For asynchronous streaming (uses aiohttp) -python -m pip install "azure-ai-voicelive[aiohttp]" +python -m pip install --pre "azure-ai-voicelive[aiohttp]" # For voice samples (includes audio processing) # First install PyAudio dependencies for your platform: # Linux: sudo apt-get install -y portaudio19-dev libasound2-dev # macOS: brew install portaudio -python -m pip install azure-ai-voicelive[aiohttp] pyaudio python-dotenv +python -m pip install --pre "azure-ai-voicelive[aiohttp]" azure-identity pyaudio python-dotenv ``` The SDK provides async-only WebSocket connections using `aiohttp` for optimal performance and reliability. ### Authenticate -You can authenticate with an **API key** or an **Azure Active Directory (AAD) token**. +You can authenticate with an **API key** or a Microsoft Entra ID token. +The samples default to `DefaultAzureCredential`; for local development, `az login` is usually the simplest path. #### API Key Authentication (Quick Start) @@ -66,7 +67,7 @@ async def main(): async with connect( endpoint="your-endpoint", credential=AzureKeyCredential("your-api-key"), - model="gpt-4o-realtime-preview" + model="gpt-realtime" ) as connection: # Your async code here pass @@ -76,7 +77,7 @@ asyncio.run(main()) #### AAD Token Authentication -For production applications, AAD authentication is recommended: +For production applications, Entra ID authentication is recommended: ```python import asyncio @@ -85,14 +86,17 @@ from azure.ai.voicelive import connect async def main(): credential = DefaultAzureCredential() - - async with connect( - endpoint="your-endpoint", - credential=credential, - model="gpt-4o-realtime-preview" - ) as connection: - # Your async code here - pass + + try: + async with connect( + endpoint="your-endpoint", + credential=credential, + model="gpt-realtime" + ) as connection: + # Your async code here + pass + finally: + await credential.close() asyncio.run(main()) ``` @@ -142,7 +146,7 @@ The Basic Voice Assistant sample demonstrates full-featured voice interaction wi python samples/basic_voice_assistant_async.py # With custom parameters -python samples/basic_voice_assistant_async.py --model gpt-4o-realtime-preview --voice alloy --instructions "You're a helpful assistant" +python samples/basic_voice_assistant_async.py --model gpt-realtime --voice alloy --instructions "You're a helpful assistant" ``` ### Minimal example @@ -157,7 +161,7 @@ from azure.ai.voicelive.models import ( API_KEY = "your-api-key" ENDPOINT = "wss://your-endpoint.com/openai/realtime" -MODEL = "gpt-4o-realtime-preview" +MODEL = "gpt-realtime" async def main(): async with connect( diff --git a/sdk/voicelive/azure-ai-voicelive/_metadata.json b/sdk/voicelive/azure-ai-voicelive/_metadata.json index 5786f7fc266c..312af8013e92 100644 --- a/sdk/voicelive/azure-ai-voicelive/_metadata.json +++ b/sdk/voicelive/azure-ai-voicelive/_metadata.json @@ -1,6 +1,6 @@ { - "apiVersion": "2026-04-10", + "apiVersion": "2026-06-01-preview", "apiVersions": { - "VoiceLive": "2026-04-10" + "VoiceLive": "2026-06-01-preview" } } \ No newline at end of file diff --git a/sdk/voicelive/azure-ai-voicelive/apiview-properties.json b/sdk/voicelive/azure-ai-voicelive/apiview-properties.json index 8a3964ca1de4..e90b8a3a7b9e 100644 --- a/sdk/voicelive/azure-ai-voicelive/apiview-properties.json +++ b/sdk/voicelive/azure-ai-voicelive/apiview-properties.json @@ -18,6 +18,7 @@ "azure.ai.voicelive.models.AzureAvatarVoiceSyncVoice": "VoiceLive.AzureAvatarVoiceSyncVoice", "azure.ai.voicelive.models.AzureCustomVoice": "VoiceLive.AzureCustomVoice", "azure.ai.voicelive.models.AzurePersonalVoice": "VoiceLive.AzurePersonalVoice", + "azure.ai.voicelive.models.AzureRealtimeNativeVoice": "VoiceLive.AzureRealtimeNativeVoice", "azure.ai.voicelive.models.EouDetection": "VoiceLive.EouDetection", "azure.ai.voicelive.models.AzureSemanticDetection": "VoiceLive.AzureSemanticDetection", "azure.ai.voicelive.models.AzureSemanticDetectionEn": "VoiceLive.AzureSemanticDetectionEn", @@ -45,6 +46,7 @@ "azure.ai.voicelive.models.ClientEventOutputAudioBufferClear": "VoiceLive.ClientEventOutputAudioBufferClear", "azure.ai.voicelive.models.ClientEventResponseCancel": "VoiceLive.ClientEventResponseCancel", "azure.ai.voicelive.models.ClientEventResponseCreate": "VoiceLive.ClientEventResponseCreate", + "azure.ai.voicelive.models.ClientEventRtcCallSdpCreate": "VoiceLive.ClientEventRtcCallSdpCreate", "azure.ai.voicelive.models.ClientEventSessionAvatarConnect": "VoiceLive.ClientEventSessionAvatarConnect", "azure.ai.voicelive.models.ClientEventSessionUpdate": "VoiceLive.ClientEventSessionUpdate", "azure.ai.voicelive.models.ContentPart": "VoiceLive.ContentPart", @@ -92,6 +94,7 @@ "azure.ai.voicelive.models.ResponseSession": "VoiceLive.ResponseSession", "azure.ai.voicelive.models.ResponseTextContentPart": "VoiceLive.ResponseTextContentPart", "azure.ai.voicelive.models.ResponseWebSearchCallItem": "VoiceLive.ResponseWebSearchCallItem", + "azure.ai.voicelive.models.RtcCallErrorDetails": "VoiceLive.RtcCallErrorDetails", "azure.ai.voicelive.models.Scene": "VoiceLive.Scene", "azure.ai.voicelive.models.ServerEvent": "VoiceLive.ServerEvent", "azure.ai.voicelive.models.ServerEventConversationItemCreated": "VoiceLive.ServerEventConversationItemCreated", @@ -111,6 +114,8 @@ "azure.ai.voicelive.models.ServerEventMcpListToolsFailed": "VoiceLive.ServerEventMcpListToolsFailed", "azure.ai.voicelive.models.ServerEventMcpListToolsInProgress": "VoiceLive.ServerEventMcpListToolsInProgress", "azure.ai.voicelive.models.ServerEventOutputAudioBufferCleared": "VoiceLive.ServerEventOutputAudioBufferCleared", + "azure.ai.voicelive.models.ServerEventOutputAudioBufferStarted": "VoiceLive.ServerEventOutputAudioBufferStarted", + "azure.ai.voicelive.models.ServerEventOutputAudioBufferStopped": "VoiceLive.ServerEventOutputAudioBufferStopped", "azure.ai.voicelive.models.ServerEventResponseAnimationBlendshapeDelta": "VoiceLive.ServerEventResponseAnimationBlendshapeDelta", "azure.ai.voicelive.models.ServerEventResponseAnimationBlendshapeDone": "VoiceLive.ServerEventResponseAnimationBlendshapeDone", "azure.ai.voicelive.models.ServerEventResponseAnimationVisemeDelta": "VoiceLive.ServerEventResponseAnimationVisemeDelta", @@ -131,6 +136,7 @@ "azure.ai.voicelive.models.ServerEventResponseFileSearchCallSearching": "VoiceLive.ServerEventResponseFileSearchCallSearching", "azure.ai.voicelive.models.ServerEventResponseFunctionCallArgumentsDelta": "VoiceLive.ServerEventResponseFunctionCallArgumentsDelta", "azure.ai.voicelive.models.ServerEventResponseFunctionCallArgumentsDone": "VoiceLive.ServerEventResponseFunctionCallArgumentsDone", + "azure.ai.voicelive.models.ServerEventResponseInvocationDelta": "VoiceLive.ServerEventResponseInvocationDelta", "azure.ai.voicelive.models.ServerEventResponseMcpCallArgumentsDelta": "VoiceLive.ServerEventResponseMcpCallArgumentsDelta", "azure.ai.voicelive.models.ServerEventResponseMcpCallArgumentsDone": "VoiceLive.ServerEventResponseMcpCallArgumentsDone", "azure.ai.voicelive.models.ServerEventResponseMcpCallCompleted": "VoiceLive.ServerEventResponseMcpCallCompleted", @@ -144,6 +150,8 @@ "azure.ai.voicelive.models.ServerEventResponseWebSearchCallCompleted": "VoiceLive.ServerEventResponseWebSearchCallCompleted", "azure.ai.voicelive.models.ServerEventResponseWebSearchCallInProgress": "VoiceLive.ServerEventResponseWebSearchCallInProgress", "azure.ai.voicelive.models.ServerEventResponseWebSearchCallSearching": "VoiceLive.ServerEventResponseWebSearchCallSearching", + "azure.ai.voicelive.models.ServerEventRtcCallError": "VoiceLive.ServerEventRtcCallError", + "azure.ai.voicelive.models.ServerEventRtcCallSdpCreated": "VoiceLive.ServerEventRtcCallSdpCreated", "azure.ai.voicelive.models.ServerEventSessionAvatarConnecting": "VoiceLive.ServerEventSessionAvatarConnecting", "azure.ai.voicelive.models.ServerEventSessionAvatarSwitchToIdle": "VoiceLive.ServerEventSessionAvatarSwitchToIdle", "azure.ai.voicelive.models.ServerEventSessionAvatarSwitchToSpeaking": "VoiceLive.ServerEventSessionAvatarSwitchToSpeaking", @@ -165,35 +173,37 @@ "azure.ai.voicelive.models.VideoParams": "VoiceLive.VideoParams", "azure.ai.voicelive.models.VideoResolution": "VoiceLive.VideoResolution", "azure.ai.voicelive.models.VoiceLiveErrorDetails": "VoiceLive.VoiceLiveErrorDetails", - "azure.ai.voicelive.models.ClientEventType": "VoiceLive.ClientEventType", - "azure.ai.voicelive.models.ItemType": "VoiceLive.ItemType", - "azure.ai.voicelive.models.ItemParamStatus": "VoiceLive.ItemParamStatus", - "azure.ai.voicelive.models.MessageRole": "VoiceLive.MessageRole", - "azure.ai.voicelive.models.ContentPartType": "VoiceLive.ContentPartType", - "azure.ai.voicelive.models.Modality": "VoiceLive.Modality", + "azure.ai.voicelive.models.AnimationOutputType": "VoiceLive.AnimationOutputType", "azure.ai.voicelive.models.OpenAIVoiceName": "VoiceLive.OAIVoice", "azure.ai.voicelive.models.AzureVoiceType": "VoiceLive.AzureVoiceType", "azure.ai.voicelive.models.PersonalVoiceModels": "VoiceLive.PersonalVoiceModels", - "azure.ai.voicelive.models.OutputAudioFormat": "VoiceLive.OutputAudioFormat", + "azure.ai.voicelive.models.AzureRealtimeNativeVoiceName": "VoiceLive.AzureRealtimeNativeVoiceName", + "azure.ai.voicelive.models.EouThresholdLevel": "VoiceLive.EouThresholdLevel", + "azure.ai.voicelive.models.TurnDetectionType": "VoiceLive.TurnDetectionType", + "azure.ai.voicelive.models.EchoCancellationReferenceSource": "VoiceLive.EchoCancellationReferenceSource", + "azure.ai.voicelive.models.AvatarConfigTypes": "VoiceLive.AvatarConfigTypes", + "azure.ai.voicelive.models.PhotoAvatarBaseModes": "VoiceLive.PhotoAvatarBaseModes", + "azure.ai.voicelive.models.AvatarOutputProtocol": "VoiceLive.AvatarOutputProtocol", "azure.ai.voicelive.models.ToolType": "VoiceLive.ToolType", "azure.ai.voicelive.models.MCPApprovalType": "VoiceLive.MCPApprovalType", - "azure.ai.voicelive.models.ReasoningEffort": "VoiceLive.ReasoningEffort", "azure.ai.voicelive.models.InterimResponseConfigType": "VoiceLive.InterimResponseConfigType", "azure.ai.voicelive.models.InterimResponseTrigger": "VoiceLive.InterimResponseTrigger", - "azure.ai.voicelive.models.AnimationOutputType": "VoiceLive.AnimationOutputType", + "azure.ai.voicelive.models.Modality": "VoiceLive.Modality", "azure.ai.voicelive.models.InputAudioFormat": "VoiceLive.InputAudioFormat", - "azure.ai.voicelive.models.TurnDetectionType": "VoiceLive.TurnDetectionType", - "azure.ai.voicelive.models.EouThresholdLevel": "VoiceLive.EouThresholdLevel", - "azure.ai.voicelive.models.AvatarConfigTypes": "VoiceLive.AvatarConfigTypes", - "azure.ai.voicelive.models.PhotoAvatarBaseModes": "VoiceLive.PhotoAvatarBaseModes", - "azure.ai.voicelive.models.AvatarOutputProtocol": "VoiceLive.AvatarOutputProtocol", + "azure.ai.voicelive.models.OutputAudioFormat": "VoiceLive.OutputAudioFormat", "azure.ai.voicelive.models.AudioTimestampType": "VoiceLive.AudioTimestampType", "azure.ai.voicelive.models.ToolChoiceLiteral": "VoiceLive.ToolChoiceLiteral", + "azure.ai.voicelive.models.ReasoningEffort": "VoiceLive.ReasoningEffort", "azure.ai.voicelive.models.SessionIncludeOption": "VoiceLive.SessionIncludeOption", + "azure.ai.voicelive.models.ClientEventType": "VoiceLive.ClientEventType", + "azure.ai.voicelive.models.ItemType": "VoiceLive.ItemType", + "azure.ai.voicelive.models.ItemParamStatus": "VoiceLive.ItemParamStatus", + "azure.ai.voicelive.models.MessageRole": "VoiceLive.MessageRole", + "azure.ai.voicelive.models.ContentPartType": "VoiceLive.ContentPartType", "azure.ai.voicelive.models.ResponseStatus": "VoiceLive.ResponseStatus", - "azure.ai.voicelive.models.ResponseItemStatus": "VoiceLive.ResponseItemStatus", "azure.ai.voicelive.models.RequestImageContentPartDetail": "VoiceLive.RequestImageContentPartDetail", + "azure.ai.voicelive.models.ResponseItemStatus": "VoiceLive.ResponseItemStatus", "azure.ai.voicelive.models.ServerEventType": "VoiceLive.ServerEventType" }, - "CrossLanguageVersion": "4f7c08a38aa5" + "CrossLanguageVersion": "d4391398f022" } \ No newline at end of file diff --git a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_types.py b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_types.py index c2ff170a25f0..ae3e380f7eaa 100644 --- a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_types.py +++ b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_types.py @@ -10,6 +10,8 @@ if TYPE_CHECKING: from . import models as _models -Voice = Union[str, "_models.OpenAIVoiceName", "_models.OpenAIVoice", "_models.AzureVoice"] -InterimResponseConfig = Union["_models.StaticInterimResponseConfig", "_models.LlmInterimResponseConfig"] +Voice = Union[ + str, "_models.OpenAIVoiceName", "_models.OpenAIVoice", "_models.AzureVoice", "_models.AzureRealtimeNativeVoice" +] ToolChoice = Union[str, "_models.ToolChoiceLiteral", "_models.ToolChoiceSelection"] +InterimResponseConfig = Union["_models.StaticInterimResponseConfig", "_models.LlmInterimResponseConfig"] diff --git a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_utils/model_base.py b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_utils/model_base.py index 4102784f9a85..d725c55906d3 100644 --- a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_utils/model_base.py +++ b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_utils/model_base.py @@ -590,6 +590,239 @@ def _create_value(rf: typing.Optional["_RestField"], value: typing.Any) -> typin return _serialize(value, rf._format) +# ============================================================================ +# Fast-path scalar deserializer functions for rest_field(deserializer=...) +# These are referenced from rest_field declarations to bypass the generic +# _deserialize -> _deserialize_with_callable chain. +# Only simple/primitive types — no models or container types. +# ============================================================================ + + +def _xml_deser_str(value): + if isinstance(value, ET.Element): + return value.text or "" + return str(value) if value is not None else None + + +def _xml_deser_int(value): + if isinstance(value, ET.Element): + return int(value.text) if value.text else None + return int(value) if value is not None else None + + +def _xml_deser_float(value): + if isinstance(value, ET.Element): + return float(value.text) if value.text else None + return float(value) if value is not None else None + + +def _xml_deser_bool(value): + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + if text in (True, False): + return text + return text.lower() == "true" + + +# pylint: disable=docstring-missing-param +def _xml_deser_bytes(value): + """Deserialize bytes from XML (base64).""" + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + return _deserialize_bytes(text) + + +def _xml_deser_bytes_base64url(value): + """Deserialize bytes from XML (base64url).""" + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + return _deserialize_bytes_base64(text) + + +def _xml_deser_datetime(value): + """Deserialize a datetime from XML (ISO 8601 / rfc3339).""" + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + return _deserialize_datetime(text) + + +def _xml_deser_datetime_rfc7231(value): + """Deserialize a datetime from XML (RFC7231 format).""" + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + return _deserialize_datetime_rfc7231(text) + + +def _xml_deser_datetime_unix_timestamp(value): + """Deserialize a datetime from XML (Unix timestamp).""" + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + return _deserialize_datetime_unix_timestamp(float(text)) + + +def _xml_deser_date(value): + """Deserialize a date from XML (ISO 8601).""" + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + return _deserialize_date(text) + + +def _xml_deser_time(value): + """Deserialize a time from XML (ISO 8601).""" + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + return _deserialize_time(text) + + +def _xml_deser_duration(value): + """Deserialize a timedelta from XML (ISO 8601 duration).""" + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + return _deserialize_duration(text) + + +def _xml_deser_decimal(value): + """Deserialize a Decimal from XML.""" + if isinstance(value, ET.Element): + text = value.text + else: + text = value + if text is None: + return None + return _deserialize_decimal(text) + + +def _xml_deser_enum_or_str(enum_cls, value): + """Deserialize a Union[EnumType, str] from XML.""" + text = value.text if isinstance(value, ET.Element) else value + if text is None: + return None + try: + return enum_cls(text) + except ValueError: + return text + + +def _extract_xml_model_type(rf_type): + """Extract the concrete Model class from a resolved rf._type partial chain. + + Unwraps ``Optional[Model]`` and ``_deserialize_model(Model, ...)`` + wrappers. Only handles Model and Optional[Model] — other composite + types (List, Dict, Union, etc.) return None and fall through to the + generic ``_deserialize`` path at runtime. + """ + if rf_type is None: + return None + if isinstance(rf_type, type) and _is_model(rf_type): + return rf_type + if not isinstance(rf_type, functools.partial): + return None + func = rf_type.func + args = rf_type.args + if func is _deserialize_with_optional and args: + return _extract_xml_model_type(args[0]) + if func is _deserialize_model and args: + cls = args[0] + return cls if isinstance(cls, type) and _is_model(cls) else None + return None + + +def _build_xml_field_plan( # pylint: disable=docstring-missing-return, docstring-missing-rtype, unused-variable + cls, attr_to_rest_field: dict +) -> list: + """Build a precomputed XML field plan for fast _init_from_xml iteration. + + Called once per model class in __new__. Returns a list of tuples: + (rest_name, xml_name, kind, deser, rf_type, is_optional, items_name) + + kind: 0=wrapped, 1=attribute, 2=unwrapped, 3=text + + For Model and Optional[Model] fields that lack a scalar + ``_deserializer``, this function precomputes the Model class as the + deserializer so ``_init_from_xml`` can call ``ModelClass(element)`` + directly instead of going through the expensive + ``_get_deserialize_callable_from_annotation`` chain at runtime. + """ + model_meta = getattr(cls, "_xml", {}) + model_ns = model_meta.get("ns") or model_meta.get("namespace") + plan = [] + + for rf in attr_to_rest_field.values(): + prop_meta = getattr(rf, "_xml", {}) + deser = rf._deserializer + + xml_name = prop_meta.get("name", rf._rest_name) + xml_ns = _resolve_xml_ns(prop_meta, model_meta) + if xml_ns: + xml_name = "{" + xml_ns + "}" + xml_name + + is_optional = rf._is_optional + + # For Model / Optional[Model] fields without a scalar deserializer, + # precompute the Model class as the deserializer. + if deser is None and rf._type is not None: + model_cls = _extract_xml_model_type(rf._type) + if model_cls is not None: + deser = model_cls + + if prop_meta.get("attribute", False): + plan.append((rf._rest_name, xml_name, 1, deser, rf._type, is_optional, None)) + elif prop_meta.get("unwrapped", False): + items_name = prop_meta.get("itemsName") + if items_name: + items_ns = prop_meta.get("itemsNs") + if items_ns is not None: + xml_ns = items_ns + if xml_ns: + items_name = "{" + xml_ns + "}" + items_name + else: + items_name = xml_name + plan.append((rf._rest_name, xml_name, 2, deser, rf._type, is_optional, items_name)) + elif prop_meta.get("text", False): + plan.append((rf._rest_name, xml_name, 3, deser, rf._type, is_optional, None)) + else: + plan.append((rf._rest_name, xml_name, 0, deser, rf._type, is_optional, None)) + + return plan + + +# pylint: enable=docstring-missing-param class Model(_MyMutableMapping): _is_model = True # label whether current class's _attr_to_rest_field has been calculated @@ -630,7 +863,9 @@ def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: dict_to_pass[rf._rest_name] = _create_value(rf, rf._default) super().__init__(dict_to_pass) - def _init_from_xml(self, element: ET.Element) -> dict[str, typing.Any]: + def _init_from_xml( # pylint: disable=too-many-branches, too-many-statements + self, element: ET.Element + ) -> dict[str, typing.Any]: """Deserialize an XML element into a dict mapping rest field names to values. :param ET.Element element: The XML element to deserialize from. @@ -638,53 +873,89 @@ def _init_from_xml(self, element: ET.Element) -> dict[str, typing.Any]: :rtype: dict """ result: dict[str, typing.Any] = {} - model_meta = getattr(self, "_xml", {}) existed_attr_keys: list[str] = [] - for rf in self._attr_to_rest_field.values(): - prop_meta = getattr(rf, "_xml", {}) - xml_name = prop_meta.get("name", rf._rest_name) - xml_ns = _resolve_xml_ns(prop_meta, model_meta) - if xml_ns: - xml_name = "{" + xml_ns + "}" + xml_name - - # attribute - if prop_meta.get("attribute", False) and element.get(xml_name) is not None: - existed_attr_keys.append(xml_name) - result[rf._rest_name] = _deserialize(rf._type, element.get(xml_name)) - continue - - # unwrapped element is array - if prop_meta.get("unwrapped", False): - # unwrapped array could either use prop items meta/prop meta - _items_name = prop_meta.get("itemsName") - if _items_name: - xml_name = _items_name - _items_ns = prop_meta.get("itemsNs") - if _items_ns is not None: - xml_ns = _items_ns - if xml_ns: - xml_name = "{" + xml_ns + "}" + xml_name - items = element.findall(xml_name) # pyright: ignore - if len(items) > 0: + field_plan = getattr(self, "_xml_field_plan", None) + if field_plan: + for rest_name, xml_name, kind, deser, rf_type, is_optional, items_name in field_plan: + if kind == 0: # wrapped element (most common) + item = element.find(xml_name) + if item is not None: + existed_attr_keys.append(xml_name) + if deser: + result[rest_name] = deser(item) + else: + result[rest_name] = _deserialize(rf_type, item) + elif kind == 1: # attribute + attr_val = element.get(xml_name) + if attr_val is not None: + existed_attr_keys.append(xml_name) + if deser: + result[rest_name] = deser(attr_val) + else: + result[rest_name] = attr_val + elif kind == 2: # unwrapped array + items = element.findall(items_name) # pyright: ignore + if len(items) > 0: + existed_attr_keys.append(items_name) + if deser: + result[rest_name] = deser(items) + else: + result[rest_name] = _deserialize(rf_type, items) + elif not is_optional: + existed_attr_keys.append(items_name) + result[rest_name] = [] + elif kind == 3: # text + if element.text is not None: + if deser: + result[rest_name] = deser(element.text) + else: + result[rest_name] = element.text + else: + model_meta = getattr(self, "_xml", {}) + for rf in self._attr_to_rest_field.values(): + prop_meta = getattr(rf, "_xml", {}) + xml_name = prop_meta.get("name", rf._rest_name) + xml_ns = _resolve_xml_ns(prop_meta, model_meta) + if xml_ns: + xml_name = "{" + xml_ns + "}" + xml_name + + # attribute + if prop_meta.get("attribute", False) and element.get(xml_name) is not None: existed_attr_keys.append(xml_name) - result[rf._rest_name] = _deserialize(rf._type, items) - elif not rf._is_optional: + result[rf._rest_name] = _deserialize(rf._type, element.get(xml_name)) + continue + + # unwrapped element is array + if prop_meta.get("unwrapped", False): + _items_name = prop_meta.get("itemsName") + if _items_name: + xml_name = _items_name + _items_ns = prop_meta.get("itemsNs") + if _items_ns is not None: + xml_ns = _items_ns + if xml_ns: + xml_name = "{" + xml_ns + "}" + xml_name + items = element.findall(xml_name) # pyright: ignore + if len(items) > 0: + existed_attr_keys.append(xml_name) + result[rf._rest_name] = _deserialize(rf._type, items) + elif not rf._is_optional: + existed_attr_keys.append(xml_name) + result[rf._rest_name] = [] + continue + + # text element is primitive type + if prop_meta.get("text", False): + if element.text is not None: + result[rf._rest_name] = _deserialize(rf._type, element.text) + continue + + # wrapped element could be normal property or array + item = element.find(xml_name) + if item is not None: existed_attr_keys.append(xml_name) - result[rf._rest_name] = [] - continue - - # text element is primitive type - if prop_meta.get("text", False): - if element.text is not None: - result[rf._rest_name] = _deserialize(rf._type, element.text) - continue - - # wrapped element could be normal property or array, it should only have one element - item = element.find(xml_name) - if item is not None: - existed_attr_keys.append(xml_name) - result[rf._rest_name] = _deserialize(rf._type, item) + result[rf._rest_name] = _deserialize(rf._type, item) # rest thing is additional properties for e in element: @@ -717,6 +988,9 @@ def __new__(cls, *args: typing.Any, **kwargs: typing.Any) -> Self: if not rf._rest_name_input: rf._rest_name_input = attr cls._attr_to_rest_field: dict[str, _RestField] = dict(attr_to_rest_field.items()) + # Build XML field plan for fast _init_from_xml (only for XML models) + if getattr(cls, "_xml", None): + cls._xml_field_plan = _build_xml_field_plan(cls, attr_to_rest_field) cls._calculated.add(f"{cls.__module__}.{cls.__qualname__}") return super().__new__(cls) @@ -1091,6 +1365,7 @@ def __init__( format: typing.Optional[str] = None, is_multipart_file_input: bool = False, xml: typing.Optional[dict[str, typing.Any]] = None, + deserializer: typing.Optional[typing.Callable] = None, ): self._type = type self._rest_name_input = name @@ -1103,6 +1378,7 @@ def __init__( self._format = format self._is_multipart_file_input = is_multipart_file_input self._xml = xml if xml is not None else {} + self._deserializer = deserializer @property def _class_type(self) -> typing.Any: @@ -1138,7 +1414,11 @@ def __get__(self, obj: Model, type=None): # pylint: disable=redefined-builtin # Return the value from _data directly (it's been deserialized in place) return obj._data.get(self._rest_name) - deserialized = _deserialize(self._type, _serialize(item, self._format), rf=self) + # Fast path: use _deserializer directly (avoids _serialize/_deserialize chain) + if self._deserializer: + deserialized = self._deserializer(item) + else: + deserialized = _deserialize(self._type, _serialize(item, self._format), rf=self) # For mutable types, store the deserialized value back in _data # so mutations directly affect _data @@ -1184,6 +1464,7 @@ def rest_field( format: typing.Optional[str] = None, is_multipart_file_input: bool = False, xml: typing.Optional[dict[str, typing.Any]] = None, + deserializer: typing.Optional[typing.Callable] = None, ) -> typing.Any: return _RestField( name=name, @@ -1193,6 +1474,7 @@ def rest_field( format=format, is_multipart_file_input=is_multipart_file_input, xml=xml, + deserializer=deserializer, ) @@ -1426,6 +1708,8 @@ def _deserialize_xml( value: str, ) -> typing.Any: element = ET.fromstring(value) # nosec + if _is_model(deserializer): + return deserializer._deserialize(element, []) return _deserialize(deserializer, element) diff --git a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_utils/serialization.py b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_utils/serialization.py index 954bf7ebffa7..a088671e9c51 100644 --- a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_utils/serialization.py +++ b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_utils/serialization.py @@ -1405,7 +1405,7 @@ def __init__(self, classes: Optional[Mapping[str, type]] = None) -> None: # Otherwise, result are unexpected self.additional_properties_detection = True - def __call__(self, target_obj, response_data, content_type=None): + def __call__(self, target_obj, response_data, content_type=None): # pylint: disable=too-many-return-statements """Call the deserializer to process a REST response. :param str target_obj: Target data type to deserialize to. @@ -1415,6 +1415,27 @@ def __call__(self, target_obj, response_data, content_type=None): :return: Deserialized object. :rtype: object """ + # Fast path for header deserialization: response_data is a plain str or None + # and target_obj is a simple scalar type. This avoids the expensive + # _unpack_content → _deserialize → _classify_target → deserialize_data chain. + if response_data is None: + return None + if target_obj == "str" and isinstance(response_data, str): + return response_data + if isinstance(response_data, str): + if target_obj == "int": + return int(response_data) + if target_obj == "bool": + if response_data in ("true", "1", "True"): + return True + if response_data in ("false", "0", "False"): + return False + return bool(response_data) + if target_obj == "rfc-1123": + return Deserializer.deserialize_rfc(response_data) + if target_obj == "bytearray": + return Deserializer.deserialize_bytearray(response_data) + data = self._unpack_content(response_data, content_type) return self._deserialize(target_obj, data) diff --git a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_version.py b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_version.py index a73f358d285a..8b42750446d5 100644 --- a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_version.py +++ b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/_version.py @@ -6,4 +6,4 @@ # Changes may cause incorrect behavior and will be lost if the code is regenerated. # -------------------------------------------------------------------------- -VERSION = "1.2.0" +VERSION = "1.3.0b1" diff --git a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/aio/_patch.py b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/aio/_patch.py index 5369ffe1eb1d..889c4a0d235c 100644 --- a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/aio/_patch.py +++ b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/aio/_patch.py @@ -1,4 +1,4 @@ -# pylint: disable=line-too-long,useless-suppression +# pylint: disable=line-too-long,useless-suppression,too-many-lines # coding=utf-8 # -------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. @@ -115,7 +115,9 @@ def _build_foundry_agent_config( return None if agent_name is None or project_name is None: - raise ValueError("Both 'agent_name' and 'project_name' are required when connecting to an Azure AI Foundry agent.") + raise ValueError( + "Both 'agent_name' and 'project_name' are required when connecting to an Azure AI Foundry agent." + ) return {key: value for key, value in agent_config.items() if value is not None} @@ -877,8 +879,7 @@ def connect( headers: Optional[Mapping[str, Any]] = None, connection_options: Optional[WebsocketConnectionOptions] = None, credential_scopes: Optional[Union[str, Sequence[str]]] = None, -) -> AbstractAsyncContextManager["VoiceLiveConnection"]: - ... +) -> AbstractAsyncContextManager["VoiceLiveConnection"]: ... @overload @@ -898,8 +899,7 @@ def connect( headers: Optional[Mapping[str, Any]] = None, connection_options: Optional[WebsocketConnectionOptions] = None, credential_scopes: Optional[Union[str, Sequence[str]]] = None, -) -> AbstractAsyncContextManager["VoiceLiveConnection"]: - ... +) -> AbstractAsyncContextManager["VoiceLiveConnection"]: ... def connect( diff --git a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/__init__.py b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/__init__.py index 9b2114496b5e..55c3a8acb5cf 100644 --- a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/__init__.py +++ b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/__init__.py @@ -28,6 +28,7 @@ AzureAvatarVoiceSyncVoice, AzureCustomVoice, AzurePersonalVoice, + AzureRealtimeNativeVoice, AzureSemanticDetection, AzureSemanticDetectionEn, AzureSemanticDetectionMultilingual, @@ -54,6 +55,7 @@ ClientEventOutputAudioBufferClear, ClientEventResponseCancel, ClientEventResponseCreate, + ClientEventRtcCallSdpCreate, ClientEventSessionAvatarConnect, ClientEventSessionUpdate, ContentPart, @@ -103,6 +105,7 @@ ResponseStatusDetails, ResponseTextContentPart, ResponseWebSearchCallItem, + RtcCallErrorDetails, Scene, ServerEvent, ServerEventConversationItemCreated, @@ -122,6 +125,8 @@ ServerEventMcpListToolsFailed, ServerEventMcpListToolsInProgress, ServerEventOutputAudioBufferCleared, + ServerEventOutputAudioBufferStarted, + ServerEventOutputAudioBufferStopped, ServerEventResponseAnimationBlendshapeDelta, ServerEventResponseAnimationBlendshapeDone, ServerEventResponseAnimationVisemeDelta, @@ -142,6 +147,7 @@ ServerEventResponseFileSearchCallSearching, ServerEventResponseFunctionCallArgumentsDelta, ServerEventResponseFunctionCallArgumentsDone, + ServerEventResponseInvocationDelta, ServerEventResponseMcpCallArgumentsDelta, ServerEventResponseMcpCallArgumentsDone, ServerEventResponseMcpCallCompleted, @@ -155,6 +161,8 @@ ServerEventResponseWebSearchCallCompleted, ServerEventResponseWebSearchCallInProgress, ServerEventResponseWebSearchCallSearching, + ServerEventRtcCallError, + ServerEventRtcCallSdpCreated, ServerEventSessionAvatarConnecting, ServerEventSessionAvatarSwitchToIdle, ServerEventSessionAvatarSwitchToSpeaking, @@ -185,9 +193,11 @@ AudioTimestampType, AvatarConfigTypes, AvatarOutputProtocol, + AzureRealtimeNativeVoiceName, AzureVoiceType, ClientEventType, ContentPartType, + EchoCancellationReferenceSource, EouThresholdLevel, InputAudioFormat, InterimResponseConfigType, @@ -230,6 +240,7 @@ "AzureAvatarVoiceSyncVoice", "AzureCustomVoice", "AzurePersonalVoice", + "AzureRealtimeNativeVoice", "AzureSemanticDetection", "AzureSemanticDetectionEn", "AzureSemanticDetectionMultilingual", @@ -256,6 +267,7 @@ "ClientEventOutputAudioBufferClear", "ClientEventResponseCancel", "ClientEventResponseCreate", + "ClientEventRtcCallSdpCreate", "ClientEventSessionAvatarConnect", "ClientEventSessionUpdate", "ContentPart", @@ -305,6 +317,7 @@ "ResponseStatusDetails", "ResponseTextContentPart", "ResponseWebSearchCallItem", + "RtcCallErrorDetails", "Scene", "ServerEvent", "ServerEventConversationItemCreated", @@ -324,6 +337,8 @@ "ServerEventMcpListToolsFailed", "ServerEventMcpListToolsInProgress", "ServerEventOutputAudioBufferCleared", + "ServerEventOutputAudioBufferStarted", + "ServerEventOutputAudioBufferStopped", "ServerEventResponseAnimationBlendshapeDelta", "ServerEventResponseAnimationBlendshapeDone", "ServerEventResponseAnimationVisemeDelta", @@ -344,6 +359,7 @@ "ServerEventResponseFileSearchCallSearching", "ServerEventResponseFunctionCallArgumentsDelta", "ServerEventResponseFunctionCallArgumentsDone", + "ServerEventResponseInvocationDelta", "ServerEventResponseMcpCallArgumentsDelta", "ServerEventResponseMcpCallArgumentsDone", "ServerEventResponseMcpCallCompleted", @@ -357,6 +373,8 @@ "ServerEventResponseWebSearchCallCompleted", "ServerEventResponseWebSearchCallInProgress", "ServerEventResponseWebSearchCallSearching", + "ServerEventRtcCallError", + "ServerEventRtcCallSdpCreated", "ServerEventSessionAvatarConnecting", "ServerEventSessionAvatarSwitchToIdle", "ServerEventSessionAvatarSwitchToSpeaking", @@ -384,9 +402,11 @@ "AudioTimestampType", "AvatarConfigTypes", "AvatarOutputProtocol", + "AzureRealtimeNativeVoiceName", "AzureVoiceType", "ClientEventType", "ContentPartType", + "EchoCancellationReferenceSource", "EouThresholdLevel", "InputAudioFormat", "InterimResponseConfigType", diff --git a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_enums.py b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_enums.py index b01b33321a0b..e21cd92d45fe 100644 --- a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_enums.py +++ b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_enums.py @@ -44,6 +44,37 @@ class AvatarOutputProtocol(str, Enum, metaclass=CaseInsensitiveEnumMeta): """WebSocket protocol, output the video frames over WebSocket.""" +class AzureRealtimeNativeVoiceName(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """Currently known voice names for the Azure realtime native voice type. This is an extensible + enum; additional voice names may be accepted by the service in the future. + """ + + AARTI = "aarti" + """Aarti voice.""" + ANDREW = "andrew" + """Andrew voice.""" + AVA = "ava" + """Ava voice.""" + DENISE = "denise" + """Denise voice.""" + DIYA = "diya" + """Diya voice.""" + ELSA = "elsa" + """Elsa voice.""" + FLORIAN = "florian" + """Florian voice.""" + FRANCISCA = "francisca" + """Francisca voice.""" + MEERA = "meera" + """Meera voice.""" + XIAOXIAO = "xiaoxiao" + """Xiaoxiao voice.""" + YUNXI = "yunxi" + """Yunxi voice.""" + XIMENA = "ximena" + """Ximena voice.""" + + class AzureVoiceType(str, Enum, metaclass=CaseInsensitiveEnumMeta): """Union of all supported Azure voice types.""" @@ -96,6 +127,8 @@ class ClientEventType(str, Enum, metaclass=CaseInsensitiveEnumMeta): """MCP_APPROVAL_RESPONSE.""" OUTPUT_AUDIO_BUFFER_CLEAR = "output_audio_buffer.clear" """Client request to clear the avatar output buffer.""" + RTC_CALL_SDP_CREATE = "rtc.call.sdp.create" + """Sent by the client to initiate a WebRTC session with an SDP offer.""" class ContentPartType(str, Enum, metaclass=CaseInsensitiveEnumMeta): @@ -113,6 +146,15 @@ class ContentPartType(str, Enum, metaclass=CaseInsensitiveEnumMeta): """AUDIO.""" +class EchoCancellationReferenceSource(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """The source of the echo cancellation reference signal.""" + + SERVER = "server" + """EC uses the internal TTS loopback as the reference signal.""" + CLIENT = "client" + """EC uses the client-supplied reference channel from the stereo input stream.""" + + class EouThresholdLevel(str, Enum, metaclass=CaseInsensitiveEnumMeta): """Threshold level settings for Azure semantic end-of-utterance detection.""" @@ -466,6 +508,16 @@ class ServerEventType(str, Enum, metaclass=CaseInsensitiveEnumMeta): """Output audio buffer has been cleared.""" RESPONSE_AUDIO_TRANSCRIPT_ANNOTATION_ADDED = "response.audio_transcript.annotation.added" """Audio transcript annotation added.""" + RESPONSE_INVOCATION_DELTA = "response.invocation.delta" + """Invocation passthrough delta from hosted agent.""" + RTC_CALL_SDP_CREATED = "rtc.call.sdp.created" + """Returned when the WebRTC SDP negotiation completes successfully.""" + RTC_CALL_ERROR = "rtc.call.error" + """Returned when a WebRTC call operation fails.""" + OUTPUT_AUDIO_BUFFER_STARTED = "output_audio_buffer.started" + """Output audio buffer playback started.""" + OUTPUT_AUDIO_BUFFER_STOPPED = "output_audio_buffer.stopped" + """Output audio buffer playback stopped.""" class SessionIncludeOption(str, Enum, metaclass=CaseInsensitiveEnumMeta): diff --git a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_models.py b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_models.py index 94d77d7bac95..3accdd4f4906 100644 --- a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_models.py +++ b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_models.py @@ -397,11 +397,58 @@ class AudioEchoCancellation(_Model): :ivar type: The type of echo cancellation model to use. Required. Default value is "server_echo_cancellation". :vartype type: str + :ivar reference_source: The source of the echo cancellation reference signal. + + * `server`: EC uses the internal TTS loopback as the reference signal (default, existing + behavior). + * `client`: EC uses the client-supplied reference channel (ch1 of stereo input). Internal + TTS loopback is skipped. Known values are: "server" and "client". + :vartype reference_source: str or ~azure.ai.voicelive.models.EchoCancellationReferenceSource + :ivar channels: Number of input audio channels. + + * `1`: Mono input (default). + * `2`: Interleaved stereo input where channel 0 is the microphone signal and channel 1 is + the echo reference signal. + When set to 2, `reference_source` must be `client` and `input_audio_format` must be + `pcm16`. + :vartype channels: int """ type: Literal["server_echo_cancellation"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The type of echo cancellation model to use. Required. Default value is \"server_echo_cancellation\".""" + reference_source: Optional[Union[str, "_models.EchoCancellationReferenceSource"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """The source of the echo cancellation reference signal. + + * `server`: EC uses the internal TTS loopback as the reference signal (default, existing + behavior). + * `client`: EC uses the client-supplied reference channel (ch1 of stereo input). Internal + TTS loopback is skipped. Known values are: \"server\" and \"client\".""" + channels: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Number of input audio channels. + + * `1`: Mono input (default). + * `2`: Interleaved stereo input where channel 0 is the microphone signal and channel 1 is + the echo reference signal. + When set to 2, `reference_source` must be `client` and `input_audio_format` must be + `pcm16`.""" + + @overload + def __init__( + self, + *, + reference_source: Optional[Union[str, "_models.EchoCancellationReferenceSource"]] = None, + channels: Optional[int] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) @@ -1047,6 +1094,47 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type = AzureVoiceType.AZURE_PERSONAL # type: ignore +class AzureRealtimeNativeVoice(_Model): + """Azure realtime native voice configuration. These voices are natively supported by the + ``azure-realtime`` model and offer higher quality speech synthesis than standard Azure voices. + Only valid when using the ``azure-realtime`` model. + + :ivar type: The type of the voice. Required. Default value is "azure-realtime-native". + :vartype type: str + :ivar name: The name of the Azure realtime native voice. Required. Known values are: "aarti", + "andrew", "ava", "denise", "diya", "elsa", "florian", "francisca", "meera", "xiaoxiao", + "yunxi", and "ximena". + :vartype name: str or ~azure.ai.voicelive.models.AzureRealtimeNativeVoiceName + """ + + type: Literal["azure-realtime-native"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The type of the voice. Required. Default value is \"azure-realtime-native\".""" + name: Union[str, "_models.AzureRealtimeNativeVoiceName"] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """The name of the Azure realtime native voice. Required. Known values are: \"aarti\", \"andrew\", + \"ava\", \"denise\", \"diya\", \"elsa\", \"florian\", \"francisca\", \"meera\", \"xiaoxiao\", + \"yunxi\", and \"ximena\".""" + + @overload + def __init__( + self, + *, + name: Union[str, "_models.AzureRealtimeNativeVoiceName"], + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type: Literal["azure-realtime-native"] = "azure-realtime-native" + + class EouDetection(_Model): """Top-level union for end-of-utterance (EOU) semantic detection configuration. @@ -1703,16 +1791,16 @@ class ClientEvent(_Model): ClientEventInputAudioTurnEnd, ClientEventInputAudioTurnStart, ClientEventInputAudioBufferAppend, ClientEventInputAudioBufferClear, ClientEventInputAudioBufferCommit, ClientEventOutputAudioBufferClear, - ClientEventResponseCancel, ClientEventResponseCreate, ClientEventSessionAvatarConnect, - ClientEventSessionUpdate + ClientEventResponseCancel, ClientEventResponseCreate, ClientEventRtcCallSdpCreate, + ClientEventSessionAvatarConnect, ClientEventSessionUpdate :ivar type: The type of event. Required. Known values are: "session.update", "input_audio_buffer.append", "input_audio_buffer.commit", "input_audio_buffer.clear", "input_audio.turn.start", "input_audio.turn.append", "input_audio.turn.end", "input_audio.turn.cancel", "input_audio.clear", "conversation.item.create", "conversation.item.retrieve", "conversation.item.truncate", "conversation.item.delete", - "response.create", "response.cancel", "session.avatar.connect", "mcp_approval_response", and - "output_audio_buffer.clear". + "response.create", "response.cancel", "session.avatar.connect", "mcp_approval_response", + "output_audio_buffer.clear", and "rtc.call.sdp.create". :vartype type: str or ~azure.ai.voicelive.models.ClientEventType :ivar event_id: :vartype event_id: str @@ -1726,7 +1814,7 @@ class ClientEvent(_Model): \"input_audio.turn.cancel\", \"input_audio.clear\", \"conversation.item.create\", \"conversation.item.retrieve\", \"conversation.item.truncate\", \"conversation.item.delete\", \"response.create\", \"response.cancel\", \"session.avatar.connect\", - \"mcp_approval_response\", and \"output_audio_buffer.clear\".""" + \"mcp_approval_response\", \"output_audio_buffer.clear\", and \"rtc.call.sdp.create\".""" event_id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @overload @@ -2364,6 +2452,50 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type = ClientEventType.RESPONSE_CREATE # type: ignore +class ClientEventRtcCallSdpCreate(ClientEvent, discriminator="rtc.call.sdp.create"): + """Sent by the client to initiate a WebRTC session with an SDP offer. + + :ivar event_id: + :vartype event_id: str + :ivar type: The event type, must be ``rtc.call.sdp.create``. Required. Sent by the client to + initiate a WebRTC session with an SDP offer. + :vartype type: str or ~azure.ai.voicelive.models.RTC_CALL_SDP_CREATE + :ivar sdp_offer: The SDP offer from the client for WebRTC negotiation. Required. + :vartype sdp_offer: str + :ivar session: Optional initial session configuration. If provided, applied before the session + is established. + :vartype session: ~azure.ai.voicelive.models.RequestSession + """ + + type: Literal[ClientEventType.RTC_CALL_SDP_CREATE] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """The event type, must be ``rtc.call.sdp.create``. Required. Sent by the client to initiate a + WebRTC session with an SDP offer.""" + sdp_offer: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The SDP offer from the client for WebRTC negotiation. Required.""" + session: Optional["_models.RequestSession"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Optional initial session configuration. If provided, applied before the session is established.""" + + @overload + def __init__( + self, + *, + sdp_offer: str, + event_id: Optional[str] = None, + session: Optional["_models.RequestSession"] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ClientEventType.RTC_CALL_SDP_CREATE # type: ignore + + class ClientEventSessionAvatarConnect(ClientEvent, discriminator="session.avatar.connect"): """Sent when the client connects and provides its SDP (Session Description Protocol) @@ -3411,9 +3543,10 @@ class RequestSession(_Model): :ivar animation: The animation configuration for the session. :vartype animation: ~azure.ai.voicelive.models.Animation :ivar voice: The voice configuration for the session. Is one of the following types: Union[str, - "_models.OpenAIVoiceName"], OpenAIVoice, AzureVoice + "_models.OpenAIVoiceName"], OpenAIVoice, AzureVoice, AzureRealtimeNativeVoice :vartype voice: str or ~azure.ai.voicelive.models.OpenAIVoiceName or - ~azure.ai.voicelive.models.OpenAIVoice or ~azure.ai.voicelive.models.AzureVoice + ~azure.ai.voicelive.models.OpenAIVoice or ~azure.ai.voicelive.models.AzureVoice or + ~azure.ai.voicelive.models.AzureRealtimeNativeVoice :ivar instructions: Optional instructions to guide the model's behavior throughout the session. :vartype instructions: str :ivar input_audio_sampling_rate: Input audio sampling rate in Hz. Available values: @@ -3484,7 +3617,7 @@ class RequestSession(_Model): """The animation configuration for the session.""" voice: Optional["_types.Voice"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The voice configuration for the session. Is one of the following types: Union[str, - \"_models.OpenAIVoiceName\"], OpenAIVoice, AzureVoice""" + \"_models.OpenAIVoiceName\"], OpenAIVoice, AzureVoice, AzureRealtimeNativeVoice""" instructions: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Optional instructions to guide the model's behavior throughout the session.""" input_audio_sampling_rate: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -3661,9 +3794,10 @@ class Response(_Model): like ``conv_1234``. :vartype conversation_id: str :ivar voice: supported voice identifiers and configurations. Is one of the following types: - Union[str, "_models.OpenAIVoiceName"], OpenAIVoice, AzureVoice + Union[str, "_models.OpenAIVoiceName"], OpenAIVoice, AzureVoice, AzureRealtimeNativeVoice :vartype voice: str or ~azure.ai.voicelive.models.OpenAIVoiceName or - ~azure.ai.voicelive.models.OpenAIVoice or ~azure.ai.voicelive.models.AzureVoice + ~azure.ai.voicelive.models.OpenAIVoice or ~azure.ai.voicelive.models.AzureVoice or + ~azure.ai.voicelive.models.AzureRealtimeNativeVoice :ivar modalities: The set of modalities the model used to respond. If there are multiple modalities, the model will pick one, for example if ``modalities`` is ``["text", "audio"]``, the model could be responding in either text or audio. @@ -3718,7 +3852,7 @@ class Response(_Model): default conversation, thus the ``conversation_id`` will be an id like ``conv_1234``.""" voice: Optional["_types.Voice"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """supported voice identifiers and configurations. Is one of the following types: Union[str, - \"_models.OpenAIVoiceName\"], OpenAIVoice, AzureVoice""" + \"_models.OpenAIVoiceName\"], OpenAIVoice, AzureVoice, AzureRealtimeNativeVoice""" modalities: Optional[list[Union[str, "_models.Modality"]]] = rest_field( visibility=["read", "create", "update", "delete", "query"] ) @@ -3906,9 +4040,10 @@ class ResponseCreateParams(_Model): start of the session. :vartype instructions: str :ivar voice: supported voice identifiers and configurations. Is one of the following types: - Union[str, "_models.OpenAIVoiceName"], OpenAIVoice, AzureVoice + Union[str, "_models.OpenAIVoiceName"], OpenAIVoice, AzureVoice, AzureRealtimeNativeVoice :vartype voice: str or ~azure.ai.voicelive.models.OpenAIVoiceName or - ~azure.ai.voicelive.models.OpenAIVoice or ~azure.ai.voicelive.models.AzureVoice + ~azure.ai.voicelive.models.OpenAIVoice or ~azure.ai.voicelive.models.AzureVoice or + ~azure.ai.voicelive.models.AzureRealtimeNativeVoice :ivar output_audio_format: The format of output audio. Options are ``pcm16``, ``g711_ulaw``, or ``g711_alaw``. Known values are: "pcm16", "pcm16_8000hz", "pcm16_16000hz", "g711_ulaw", and "g711_alaw". @@ -3942,6 +4077,8 @@ class ResponseCreateParams(_Model): calls. Is either a StaticInterimResponseConfig type or a LlmInterimResponseConfig type. :vartype interim_response: ~azure.ai.voicelive.models.StaticInterimResponseConfig or ~azure.ai.voicelive.models.LlmInterimResponseConfig + :ivar invoke_input: Input data to invoke the hosted agent. This feature is in preview. + :vartype invoke_input: dict[str, any] """ commit: Optional[bool] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -3976,7 +4113,7 @@ class ResponseCreateParams(_Model): start of the session.""" voice: Optional["_types.Voice"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """supported voice identifiers and configurations. Is one of the following types: Union[str, - \"_models.OpenAIVoiceName\"], OpenAIVoice, AzureVoice""" + \"_models.OpenAIVoiceName\"], OpenAIVoice, AzureVoice, AzureRealtimeNativeVoice""" output_audio_format: Optional[Union[str, "_models.OutputAudioFormat"]] = rest_field( visibility=["read", "create", "update", "delete", "query"] ) @@ -4017,6 +4154,8 @@ class ResponseCreateParams(_Model): ) """Configuration for interim response generation during latency or tool calls. Is either a StaticInterimResponseConfig type or a LlmInterimResponseConfig type.""" + invoke_input: Optional[dict[str, Any]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Input data to invoke the hosted agent. This feature is in preview.""" @overload def __init__( @@ -4038,6 +4177,7 @@ def __init__( reasoning_effort: Optional[Union[str, "_models.ReasoningEffort"]] = None, metadata: Optional[dict[str, str]] = None, interim_response: Optional["_types.InterimResponseConfig"] = None, + invoke_input: Optional[dict[str, Any]] = None, ) -> None: ... @overload @@ -4604,9 +4744,10 @@ class ResponseSession(_Model): :ivar animation: The animation configuration for the session. :vartype animation: ~azure.ai.voicelive.models.Animation :ivar voice: The voice configuration for the session. Is one of the following types: Union[str, - "_models.OpenAIVoiceName"], OpenAIVoice, AzureVoice + "_models.OpenAIVoiceName"], OpenAIVoice, AzureVoice, AzureRealtimeNativeVoice :vartype voice: str or ~azure.ai.voicelive.models.OpenAIVoiceName or - ~azure.ai.voicelive.models.OpenAIVoice or ~azure.ai.voicelive.models.AzureVoice + ~azure.ai.voicelive.models.OpenAIVoice or ~azure.ai.voicelive.models.AzureVoice or + ~azure.ai.voicelive.models.AzureRealtimeNativeVoice :ivar instructions: Optional instructions to guide the model's behavior throughout the session. :vartype instructions: str :ivar input_audio_sampling_rate: Input audio sampling rate in Hz. Available values: @@ -4681,7 +4822,7 @@ class ResponseSession(_Model): """The animation configuration for the session.""" voice: Optional["_types.Voice"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The voice configuration for the session. Is one of the following types: Union[str, - \"_models.OpenAIVoiceName\"], OpenAIVoice, AzureVoice""" + \"_models.OpenAIVoiceName\"], OpenAIVoice, AzureVoice, AzureRealtimeNativeVoice""" instructions: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Optional instructions to guide the model's behavior throughout the session.""" input_audio_sampling_rate: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -4878,6 +5019,44 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type = ItemType.WEB_SEARCH_CALL # type: ignore +class RtcCallErrorDetails(_Model): + """Error details for RTC call errors. + + :ivar type: The error category: ``invalid_request_error`` or ``server_error``. Required. + :vartype type: str + :ivar code: A machine-readable error code. + :vartype code: str + :ivar message: A human-readable error description. Required. + :vartype message: str + """ + + type: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The error category: ``invalid_request_error`` or ``server_error``. Required.""" + code: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """A machine-readable error code.""" + message: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """A human-readable error description. Required.""" + + @overload + def __init__( + self, + *, + type: str, + message: str, + code: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + class Scene(_Model): """Configuration for avatar's zoom level, position, rotation and movement amplitude in the video frame. @@ -4963,7 +5142,8 @@ class ServerEvent(_Model): ServerEventInputAudioBufferCommitted, ServerEventInputAudioBufferSpeechStarted, ServerEventInputAudioBufferSpeechStopped, ServerEventMcpListToolsCompleted, ServerEventMcpListToolsFailed, ServerEventMcpListToolsInProgress, - ServerEventOutputAudioBufferCleared, ServerEventResponseAnimationBlendshapeDelta, + ServerEventOutputAudioBufferCleared, ServerEventOutputAudioBufferStarted, + ServerEventOutputAudioBufferStopped, ServerEventResponseAnimationBlendshapeDelta, ServerEventResponseAnimationBlendshapeDone, ServerEventResponseAnimationVisemeDelta, ServerEventResponseAnimationVisemeDone, ServerEventResponseAudioDelta, ServerEventResponseAudioDone, ServerEventResponseAudioTimestampDelta, @@ -4973,13 +5153,14 @@ class ServerEvent(_Model): ServerEventResponseCreated, ServerEventResponseDone, ServerEventResponseFileSearchCallCompleted, ServerEventResponseFileSearchCallInProgress, ServerEventResponseFileSearchCallSearching, ServerEventResponseFunctionCallArgumentsDelta, - ServerEventResponseFunctionCallArgumentsDone, ServerEventResponseMcpCallCompleted, - ServerEventResponseMcpCallFailed, ServerEventResponseMcpCallInProgress, - ServerEventResponseMcpCallArgumentsDelta, ServerEventResponseMcpCallArgumentsDone, - ServerEventResponseOutputItemAdded, ServerEventResponseOutputItemDone, - ServerEventResponseTextDelta, ServerEventResponseTextDone, ServerEventResponseVideoDelta, - ServerEventResponseWebSearchCallCompleted, ServerEventResponseWebSearchCallInProgress, - ServerEventResponseWebSearchCallSearching, ServerEventSessionAvatarConnecting, + ServerEventResponseFunctionCallArgumentsDone, ServerEventResponseInvocationDelta, + ServerEventResponseMcpCallCompleted, ServerEventResponseMcpCallFailed, + ServerEventResponseMcpCallInProgress, ServerEventResponseMcpCallArgumentsDelta, + ServerEventResponseMcpCallArgumentsDone, ServerEventResponseOutputItemAdded, + ServerEventResponseOutputItemDone, ServerEventResponseTextDelta, ServerEventResponseTextDone, + ServerEventResponseVideoDelta, ServerEventResponseWebSearchCallCompleted, + ServerEventResponseWebSearchCallInProgress, ServerEventResponseWebSearchCallSearching, + ServerEventRtcCallError, ServerEventRtcCallSdpCreated, ServerEventSessionAvatarConnecting, ServerEventSessionAvatarSwitchToIdle, ServerEventSessionAvatarSwitchToSpeaking, ServerEventSessionCreated, ServerEventSessionUpdated, ServerEventWarning @@ -5005,8 +5186,10 @@ class ServerEvent(_Model): "session.avatar.switch_to_idle", "response.video.delta", "response.web_search_call.searching", "response.web_search_call.in_progress", "response.web_search_call.completed", "response.file_search_call.searching", "response.file_search_call.in_progress", - "response.file_search_call.completed", "output_audio_buffer.cleared", and - "response.audio_transcript.annotation.added". + "response.file_search_call.completed", "output_audio_buffer.cleared", + "response.audio_transcript.annotation.added", "response.invocation.delta", + "rtc.call.sdp.created", "rtc.call.error", "output_audio_buffer.started", and + "output_audio_buffer.stopped". :vartype type: str or ~azure.ai.voicelive.models.ServerEventType :ivar event_id: :vartype event_id: str @@ -5038,8 +5221,10 @@ class ServerEvent(_Model): \"response.video.delta\", \"response.web_search_call.searching\", \"response.web_search_call.in_progress\", \"response.web_search_call.completed\", \"response.file_search_call.searching\", \"response.file_search_call.in_progress\", - \"response.file_search_call.completed\", \"output_audio_buffer.cleared\", and - \"response.audio_transcript.annotation.added\".""" + \"response.file_search_call.completed\", \"output_audio_buffer.cleared\", + \"response.audio_transcript.annotation.added\", \"response.invocation.delta\", + \"rtc.call.sdp.created\", \"rtc.call.error\", \"output_audio_buffer.started\", and + \"output_audio_buffer.stopped\".""" event_id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @overload @@ -5832,6 +6017,82 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type = ServerEventType.OUTPUT_AUDIO_BUFFER_CLEARED # type: ignore +class ServerEventOutputAudioBufferStarted(ServerEvent, discriminator="output_audio_buffer.started"): + """Returned when model audio output starts playing. + + :ivar event_id: + :vartype event_id: str + :ivar type: The event type, must be ``output_audio_buffer.started``. Required. Output audio + buffer playback started. + :vartype type: str or ~azure.ai.voicelive.models.OUTPUT_AUDIO_BUFFER_STARTED + :ivar response_id: The ID of the response whose audio started playing. + :vartype response_id: str + """ + + type: Literal[ServerEventType.OUTPUT_AUDIO_BUFFER_STARTED] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """The event type, must be ``output_audio_buffer.started``. Required. Output audio buffer playback + started.""" + response_id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the response whose audio started playing.""" + + @overload + def __init__( + self, + *, + event_id: Optional[str] = None, + response_id: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.OUTPUT_AUDIO_BUFFER_STARTED # type: ignore + + +class ServerEventOutputAudioBufferStopped(ServerEvent, discriminator="output_audio_buffer.stopped"): + """Returned when model audio output finishes playing. + + :ivar event_id: + :vartype event_id: str + :ivar type: The event type, must be ``output_audio_buffer.stopped``. Required. Output audio + buffer playback stopped. + :vartype type: str or ~azure.ai.voicelive.models.OUTPUT_AUDIO_BUFFER_STOPPED + :ivar response_id: The ID of the response whose audio stopped playing. + :vartype response_id: str + """ + + type: Literal[ServerEventType.OUTPUT_AUDIO_BUFFER_STOPPED] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """The event type, must be ``output_audio_buffer.stopped``. Required. Output audio buffer playback + stopped.""" + response_id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the response whose audio stopped playing.""" + + @overload + def __init__( + self, + *, + event_id: Optional[str] = None, + response_id: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.OUTPUT_AUDIO_BUFFER_STOPPED # type: ignore + + class ServerEventResponseAnimationBlendshapeDelta( ServerEvent, discriminator="response.animation_blendshapes.delta" ): # pylint: disable=name-too-long @@ -6950,6 +7211,44 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type = ServerEventType.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE # type: ignore +class ServerEventResponseInvocationDelta(ServerEvent, discriminator="response.invocation.delta"): + """Returned when a hosted agent invocation produces a non-speech SSE event, passed through as-is. + + :ivar event_id: + :vartype event_id: str + :ivar type: The event type, must be ``response.invocation.delta``. Required. Invocation + passthrough delta from hosted agent. + :vartype type: str or ~azure.ai.voicelive.models.RESPONSE_INVOCATION_DELTA + :ivar delta: The raw event data from the hosted agent invocation. Required. + :vartype delta: dict[str, any] + """ + + type: Literal[ServerEventType.RESPONSE_INVOCATION_DELTA] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """The event type, must be ``response.invocation.delta``. Required. Invocation passthrough delta + from hosted agent.""" + delta: dict[str, Any] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The raw event data from the hosted agent invocation. Required.""" + + @overload + def __init__( + self, + *, + delta: dict[str, Any], + event_id: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.RESPONSE_INVOCATION_DELTA # type: ignore + + class ServerEventResponseMcpCallArgumentsDelta(ServerEvent, discriminator="response.mcp_call_arguments.delta"): """Represents a delta update of the arguments for an MCP tool call. @@ -7599,6 +7898,97 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type = ServerEventType.RESPONSE_WEB_SEARCH_CALL_SEARCHING # type: ignore +class ServerEventRtcCallError(ServerEvent, discriminator="rtc.call.error"): + """Returned when a WebRTC call operation fails. + + :ivar event_id: + :vartype event_id: str + :ivar type: The event type, must be ``rtc.call.error``. Required. Returned when a WebRTC call + operation fails. + :vartype type: str or ~azure.ai.voicelive.models.RTC_CALL_ERROR + :ivar operation: The operation that caused the error (e.g., ``rtc.call.sdp.create``). + :vartype operation: str + :ivar rtc_call_id: The RTC call identifier, if available. + :vartype rtc_call_id: str + :ivar error: The error details. Required. + :vartype error: ~azure.ai.voicelive.models.RtcCallErrorDetails + """ + + type: Literal[ServerEventType.RTC_CALL_ERROR] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """The event type, must be ``rtc.call.error``. Required. Returned when a WebRTC call operation + fails.""" + operation: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The operation that caused the error (e.g., ``rtc.call.sdp.create``).""" + rtc_call_id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The RTC call identifier, if available.""" + error: "_models.RtcCallErrorDetails" = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The error details. Required.""" + + @overload + def __init__( + self, + *, + error: "_models.RtcCallErrorDetails", + event_id: Optional[str] = None, + operation: Optional[str] = None, + rtc_call_id: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.RTC_CALL_ERROR # type: ignore + + +class ServerEventRtcCallSdpCreated(ServerEvent, discriminator="rtc.call.sdp.created"): + """Returned when the WebRTC SDP negotiation completes successfully. + + :ivar event_id: + :vartype event_id: str + :ivar type: The event type, must be ``rtc.call.sdp.created``. Required. Returned when the + WebRTC SDP negotiation completes successfully. + :vartype type: str or ~azure.ai.voicelive.models.RTC_CALL_SDP_CREATED + :ivar rtc_call_id: The unique identifier for this RTC call session. Required. + :vartype rtc_call_id: str + :ivar sdp_answer: The SDP answer from the server for WebRTC negotiation. Required. + :vartype sdp_answer: str + """ + + type: Literal[ServerEventType.RTC_CALL_SDP_CREATED] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """The event type, must be ``rtc.call.sdp.created``. Required. Returned when the WebRTC SDP + negotiation completes successfully.""" + rtc_call_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The unique identifier for this RTC call session. Required.""" + sdp_answer: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The SDP answer from the server for WebRTC negotiation. Required.""" + + @overload + def __init__( + self, + *, + rtc_call_id: str, + sdp_answer: str, + event_id: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.RTC_CALL_SDP_CREATED # type: ignore + + class ServerEventSessionAvatarConnecting(ServerEvent, discriminator="session.avatar.connecting"): """Sent when the server is in the process of establishing an avatar media connection and provides its SDP answer. diff --git a/sdk/voicelive/azure-ai-voicelive/pyproject.toml b/sdk/voicelive/azure-ai-voicelive/pyproject.toml index 63dfecc7ca70..f966545977e7 100644 --- a/sdk/voicelive/azure-ai-voicelive/pyproject.toml +++ b/sdk/voicelive/azure-ai-voicelive/pyproject.toml @@ -17,7 +17,7 @@ authors = [ description = "Microsoft Corporation Azure Ai Voicelive Client Library for Python" license = "MIT" classifiers = [ - "Development Status :: 5 - Production/Stable", + "Development Status :: 4 - Beta", "Programming Language :: Python", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3", diff --git a/sdk/voicelive/azure-ai-voicelive/samples/BASIC_VOICE_ASSISTANT.md b/sdk/voicelive/azure-ai-voicelive/samples/BASIC_VOICE_ASSISTANT.md index 04a30e660c45..de4c955173ce 100644 --- a/sdk/voicelive/azure-ai-voicelive/samples/BASIC_VOICE_ASSISTANT.md +++ b/sdk/voicelive/azure-ai-voicelive/samples/BASIC_VOICE_ASSISTANT.md @@ -14,7 +14,7 @@ This sample demonstrates a complete voice assistant implementation using the Azu ## Prerequisites -- Python 3.9+ +- Python 3.10+ - Microphone and speakers/headphones - Azure AI VoiceLive endpoint - An Entra ID identity with access to Azure AI VoiceLive, or a VoiceLive API key @@ -22,9 +22,9 @@ This sample demonstrates a complete voice assistant implementation using the Azu ## Installation 1. **Install the SDK**: - ```bash - pip install azure-ai-voicelive azure-identity python-dotenv - ``` + ```bash + python -m pip install --pre "azure-ai-voicelive[aiohttp]" azure-identity python-dotenv + ``` 2. **Install PyAudio** (required for audio capture/playback): @@ -47,12 +47,12 @@ This sample demonstrates a complete voice assistant implementation using the Azu ## Configuration -Create a `.env` file. By default, the sample uses Entra ID via `DefaultAzureCredential`: +Create a `.env` file. By default, the sample uses Entra ID via `DefaultAzureCredential`. For local development, `az login` is the easiest way to satisfy that credential chain: ```bash AZURE_VOICELIVE_ENDPOINT=your-endpoint AZURE_VOICELIVE_MODEL=gpt-realtime -AZURE_VOICELIVE_VOICE=en-US-AvaNeural +AZURE_VOICELIVE_VOICE=en-US-Ava:DragonHDLatestNeural AZURE_VOICELIVE_INSTRUCTIONS=You are a helpful AI assistant. Respond naturally and conversationally. ``` @@ -69,6 +69,8 @@ AZURE_VOICELIVE_API_KEY=your-api-key python basic_voice_assistant_async.py ``` +The sample writes logs to standard output and does not create log files. + Optional command-line arguments: ```bash @@ -177,7 +179,7 @@ Main application class that coordinates WebSocket connection, session management - **Network timeouts**: Check firewall settings and network connectivity ### Authentication Issues -- **Default auth failures**: Confirm `DefaultAzureCredential` can get a token and your identity can access the VoiceLive resource +- **Default auth failures**: Run `az login` or otherwise confirm `DefaultAzureCredential` can get a token and your identity can access the VoiceLive resource - **API key auth failures**: Set `AZURE_VOICELIVE_USE_API_KEY=true` and verify `AZURE_VOICELIVE_API_KEY` ### PyAudio Installation Issues diff --git a/sdk/voicelive/azure-ai-voicelive/samples/README.md b/sdk/voicelive/azure-ai-voicelive/samples/README.md index 84694bc4bd14..139553efd768 100644 --- a/sdk/voicelive/azure-ai-voicelive/samples/README.md +++ b/sdk/voicelive/azure-ai-voicelive/samples/README.md @@ -6,7 +6,7 @@ This directory contains sample applications demonstrating various capabilities o ## Prerequisites -- Python 3.9 or later +- Python 3.10 or later - An Azure subscription with access to Azure AI VoiceLive - An Entra ID identity with access to Azure AI VoiceLive, or a VoiceLive API key @@ -14,9 +14,9 @@ This directory contains sample applications demonstrating various capabilities o 1. **Install dependencies**: - ```bash - pip install azure-ai-voicelive[aiohttp] azure-identity python-dotenv - ``` + ```bash + python -m pip install --pre "azure-ai-voicelive[aiohttp]" azure-identity python-dotenv + ``` 2. **Install PyAudio** (required for audio samples): @@ -39,7 +39,7 @@ This directory contains sample applications demonstrating various capabilities o 3. **Configure environment variables**: - Create a `.env` file at the root of the azure-ai-voicelive directory or in the samples directory. By default, the samples use Entra ID via `DefaultAzureCredential`: + Create a `.env` file at the root of the azure-ai-voicelive directory or in the samples directory. By default, the samples use Entra ID via `DefaultAzureCredential`. For local development, run `az login` first if you want to use your Azure CLI session: ```ini AZURE_VOICELIVE_ENDPOINT=wss://api.voicelive.com/v1 @@ -116,9 +116,10 @@ python basic_voice_assistant_async.py --help ## Sample descriptions -- **basic_voice_assistant_async.py**: 🌟 **[Featured Sample]** Complete async voice assistant demonstrating real-time conversation, interruption handling, and server VAD. Supports optional OpenTelemetry tracing via `--enable-tracing`. Perfect starting point for voice applications. See "BASIC_VOICE_ASSISTANT.md" for detailed documentation. -- **agent_v2_sample.py**: Demonstrates how to connect to an Azure AI Foundry agent using flattened `connect()` keyword arguments. Shows the new pattern where agents are configured at connection time rather than as tools in the session. Features callback-based audio streaming, sequence number based interrupt handling, and standard logger output for conversation events. +- **basic_voice_assistant_async.py**: 🌟 **[Featured Sample]** Complete async voice assistant demonstrating real-time conversation, interruption handling, and server VAD. Supports optional OpenTelemetry tracing via `--enable-tracing`, defaults to Entra ID auth, and writes logs to standard output instead of creating log files. See "BASIC_VOICE_ASSISTANT.md" for detailed documentation. +- **agent_v2_sample.py**: Demonstrates how to connect to an Azure AI Foundry agent using flattened `connect()` keyword arguments. Shows the new pattern where agents are configured at connection time rather than as tools in the session. Features callback-based audio streaming, sequence number based interrupt handling, standard logger output for conversation events, and defaults the agent connection to API version `2026-04-10`. - **async_function_calling_sample.py**: Demonstrates async function calling capabilities with the VoiceLive SDK, showing how to handle function calls from the AI model. +- **async_mcp_sample.py**: Demonstrates async MCP capabilities with Entra ID-first authentication and uses API version `2026-04-10` for MCP support. ### Telemetry samples @@ -162,7 +163,7 @@ Set `AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING=true` to enable tracing. - Confirm your network allows WSS to the service - **Auth errors** - - By default: ensure `DefaultAzureCredential` can acquire a token and your identity has access to the resource + - By default: run `az login` or otherwise ensure `DefaultAzureCredential` can acquire a token and your identity has access to the resource - For API key auth: set `AZURE_VOICELIVE_USE_API_KEY=true` and confirm `AZURE_VOICELIVE_API_KEY` ## Next steps diff --git a/sdk/voicelive/azure-ai-voicelive/samples/async_mcp_sample.py b/sdk/voicelive/azure-ai-voicelive/samples/async_mcp_sample.py index 0ce7bdd446e4..8561b9144f8c 100644 --- a/sdk/voicelive/azure-ai-voicelive/samples/async_mcp_sample.py +++ b/sdk/voicelive/azure-ai-voicelive/samples/async_mcp_sample.py @@ -12,7 +12,8 @@ DESCRIPTION: This sample demonstrates how to use the Azure AI Voice Live SDK asynchronously - with MCP capabilities. It shows how to define mcp servers, handle mcp call events. + with MCP capabilities. It shows how to define mcp servers, handle mcp call events, + and connects with API version 2026-04-10 because MCP support requires that service version. USAGE: python async_mcp_sample.py @@ -753,4 +754,3 @@ async def main(): asyncio.run(main()) except KeyboardInterrupt: print("\n👋 Voice Live MCP shut down.") - diff --git a/sdk/voicelive/azure-ai-voicelive/samples/basic_voice_assistant_async.py b/sdk/voicelive/azure-ai-voicelive/samples/basic_voice_assistant_async.py index 32efeb725ae4..2f61fd3999ed 100644 --- a/sdk/voicelive/azure-ai-voicelive/samples/basic_voice_assistant_async.py +++ b/sdk/voicelive/azure-ai-voicelive/samples/basic_voice_assistant_async.py @@ -14,7 +14,8 @@ This sample demonstrates the fundamental capabilities of the VoiceLive SDK by creating a basic voice assistant that can engage in natural conversation with proper interruption handling. This serves as the foundational example that showcases the core value - proposition of unified speech-to-speech interaction. + proposition of unified speech-to-speech interaction. Logs are written to standard output + and the sample does not create log files. USAGE: python basic_voice_assistant_async.py @@ -26,7 +27,7 @@ - AZURE_VOICELIVE_USE_API_KEY - Set to "true" to use AZURE_VOICELIVE_API_KEY instead of Entra ID - AZURE_VOICELIVE_API_KEY - VoiceLive API key used when AZURE_VOICELIVE_USE_API_KEY is enabled - AZURE_VOICELIVE_MODEL - The VoiceLive model to use (default: gpt-realtime) - - AZURE_VOICELIVE_VOICE - The voice to use for synthesis + - AZURE_VOICELIVE_VOICE - The voice to use for synthesis (default: en-US-Ava:DragonHDLatestNeural) - AZURE_VOICELIVE_INSTRUCTIONS - System instructions for the assistant Or copy .env.template to .env and fill in your values. @@ -566,6 +567,7 @@ def main(): # Start the assistant try: + async def _run_assistant() -> None: try: await assistant.start() diff --git a/sdk/voicelive/azure-ai-voicelive/tests/test_live_realtime_service.py b/sdk/voicelive/azure-ai-voicelive/tests/test_live_realtime_service.py index e27895a87475..20bd8f0d0ce7 100644 --- a/sdk/voicelive/azure-ai-voicelive/tests/test_live_realtime_service.py +++ b/sdk/voicelive/azure-ai-voicelive/tests/test_live_realtime_service.py @@ -178,7 +178,7 @@ def smoke_test(self, **kwargs): @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime", "gpt-4.1", "phi4-mm-realtime", "phi4-mini"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service(self, test_data_dir: Path, model: str, api_version: str, **kwargs): voicelive_openai_endpoint = kwargs.pop("voicelive_openai_endpoint") voicelive_openai_api_key = kwargs.pop("voicelive_openai_api_key") @@ -232,7 +232,7 @@ async def test_realtime_service(self, test_data_dir: Path, model: str, api_versi @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime", "gpt-4.1"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_audio_enhancements( self, test_data_dir: Path, @@ -281,7 +281,7 @@ async def test_realtime_service_with_audio_enhancements( ), ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_turn_detection_long_tts_vad_duration( self, test_data_dir: Path, model: str, server_sd_conf: dict, api_version: str, **kwargs ): @@ -327,7 +327,7 @@ async def test_realtime_service_with_turn_detection_long_tts_vad_duration( pytest.param("gpt-4o", {"languages": ["en", "es"]}, id="cascaded-realtime"), ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_turn_detection_multilingual( self, test_data_dir: Path, model: str, semantic_vad_params: dict, api_version: str, **kwargs ): @@ -360,7 +360,7 @@ async def test_realtime_service_with_turn_detection_multilingual( "filler_word_24kHz.wav", ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_filler_word_removal( self, test_data_dir: Path, @@ -396,7 +396,7 @@ async def test_realtime_service_with_filler_word_removal( "filler_word_24kHz.wav", ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_filler_word_removal_multilingual( self, test_data_dir: Path, test_audio_file: str, api_version: str, **kwargs ): @@ -428,7 +428,7 @@ async def test_realtime_service_with_filler_word_removal_multilingual( @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime", "gpt-4o"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_tool_call(self, test_data_dir: Path, model: str, api_version: str, **kwargs): audio_file = test_data_dir / "4-1.wav" voicelive_openai_endpoint = kwargs.pop("voicelive_openai_endpoint") @@ -483,7 +483,7 @@ async def test_realtime_service_tool_call(self, test_data_dir: Path, model: str, @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime", "gpt-4o", "gpt-5-chat"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_tool_choice(self, test_data_dir: Path, model: str, api_version: str, **kwargs): if "realtime" in model: pytest.skip("Tool choice is not supported in realtime models yet") @@ -578,7 +578,7 @@ async def test_realtime_service_tool_choice(self, test_data_dir: Path, model: st @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime", "gpt-4.1", "gpt-5", "gpt-5.1", "gpt-5.2", "phi4-mm-realtime"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_tool_call_parameter( self, test_data_dir: Path, @@ -668,7 +668,7 @@ def get_weather(arguments: Union[str, Mapping[str, Any]]) -> str: @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime"]) - @pytest.mark.parametrize("api_version", ["2025-05-01-preview", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-05-01-preview", "2026-04-10"]) async def test_realtime_service_live_session_update( self, test_data_dir: Path, @@ -747,7 +747,7 @@ async def test_realtime_service_live_session_update( @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.skip() @pytest.mark.parametrize("model", ["gpt-4o", "gpt-realtime"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_tool_call_no_audio_overlap( self, test_data_dir: Path, @@ -820,7 +820,7 @@ async def test_realtime_service_tool_call_no_audio_overlap( "mai-transcribe-1", ], ) - @pytest.mark.parametrize("api_version", ["2025-05-01-preview", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-05-01-preview", "2026-04-10"]) async def test_realtime_service_input_audio_transcription( self, test_data_dir: Path, @@ -891,7 +891,7 @@ async def test_realtime_service_input_audio_transcription( ), ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_eou( self, test_data_dir: Path, @@ -927,7 +927,7 @@ async def test_realtime_service_with_eou( @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime", "gpt-4.1"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_audio_timestamp_viseme( self, test_data_dir: Path, @@ -985,7 +985,7 @@ async def test_realtime_service_with_audio_timestamp_viseme( @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime", "gpt-4o", "phi4-mm-realtime", "phi4-mini"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_wo_turn_detection( self, test_data_dir: Path, @@ -1023,7 +1023,7 @@ async def test_realtime_service_wo_turn_detection( @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime", "gpt-4.1", "phi4-mm-realtime"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_voice_properties( self, test_data_dir: Path, @@ -1059,7 +1059,7 @@ async def test_realtime_service_with_voice_properties( @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_retrieve_item(self, test_data_dir: Path, model: str, api_version: str, **kwargs): file = test_data_dir / "largest_lake.wav" voicelive_openai_endpoint = kwargs.pop("voicelive_openai_endpoint") @@ -1102,7 +1102,7 @@ async def test_realtime_service_retrieve_item(self, test_data_dir: Path, model: @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime"]) - @pytest.mark.parametrize("api_version", ["2025-05-01-preview", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-05-01-preview", "2026-04-10"]) async def test_realtime_service_truncate_item(self, test_data_dir: Path, model: str, api_version: str, **kwargs): file = test_data_dir / "largest_lake.wav" voicelive_openai_endpoint = kwargs.pop("voicelive_openai_endpoint") @@ -1195,7 +1195,7 @@ async def test_realtime_service_truncate_item(self, test_data_dir: Path, model: ), ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_input_audio_format( self, test_data_dir: Path, @@ -1270,7 +1270,7 @@ async def test_realtime_service_with_input_audio_format( pytest.param("phi4-mm-realtime", 44100, id="phi4_mm_realtime_44kHz_no_resample"), ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_input_audio_sampling_rate( self, test_data_dir: Path, model: str, sampling_rate: int, api_version: str, **kwargs ): @@ -1334,7 +1334,7 @@ async def test_realtime_service_with_input_audio_sampling_rate( "g711_alaw", ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_output_formats_with_azure_voice( self, test_data_dir: Path, model: str, audio_output_format: str, api_version: str, **kwargs ): @@ -1375,7 +1375,7 @@ async def test_output_formats_with_azure_voice( "g711_alaw", ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_output_formats_with_openai_voice( self, test_data_dir: Path, model: str, audio_output_format: str, api_version: str, **kwargs ): @@ -1408,7 +1408,7 @@ async def test_output_formats_with_openai_voice( @VoiceLivePreparer() @pytest.mark.flaky(reruns=3, reruns_delay=2) @pytest.mark.parametrize("model", ["gpt-realtime", "gpt-4.1"]) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_realtime_service_with_echo_cancellation( self, test_data_dir: Path, @@ -1458,7 +1458,7 @@ async def test_realtime_service_with_echo_cancellation( "g711_alaw", ], ) - @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-01-01-preview"]) + @pytest.mark.parametrize("api_version", ["2025-10-01", "2026-04-10"]) async def test_write_loopback_audio_echo_cancellation( self, test_data_dir: Path, model: str, audio_output_format: str, api_version: str, **kwargs ): @@ -1485,4 +1485,3 @@ async def test_write_loopback_audio_echo_cancellation( contents, audio_bytes = await _collect_event(conn, event_type=ServerEventType.RESPONSE_CONTENT_PART_ADDED) assert contents >= 1, "Response should be generated with echo cancellation" assert audio_bytes > 0, "Audio bytes should be greater than 0" - diff --git a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_client_events.py b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_client_events.py index 019ac1fde905..50911a2f9df4 100644 --- a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_client_events.py +++ b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_client_events.py @@ -18,6 +18,7 @@ ClientEventConversationItemTruncate, ClientEventResponseCreate, ClientEventResponseCancel, + ClientEventRtcCallSdpCreate, # Event Types ClientEventType, # Supporting Models @@ -247,6 +248,30 @@ def test_response_cancel_with_response_id(self): assert event.event_id == event_id +class TestClientEventRtcCall: + """Test RTC call client events.""" + + def test_rtc_call_sdp_create_basic(self): + """Test creating an RTC SDP offer event.""" + event = ClientEventRtcCallSdpCreate(sdp_offer="v=0\r\no=- 1 2 IN IP4 127.0.0.1") + + assert event.type == ClientEventType.RTC_CALL_SDP_CREATE + assert event.sdp_offer.startswith("v=0") + assert event.session is None + + def test_rtc_call_sdp_create_with_session(self): + """Test creating an RTC SDP offer event with an initial session.""" + session = RequestSession(model="gpt-4o-realtime-preview", modalities=[Modality.AUDIO]) + event = ClientEventRtcCallSdpCreate( + sdp_offer="v=0\r\no=- 1 2 IN IP4 127.0.0.1", + session=session, + event_id="rtc-evt-1", + ) + + assert event.event_id == "rtc-evt-1" + assert event.session == session + + class TestClientEventSerialization: """Test client event serialization capabilities.""" diff --git a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_connection.py b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_connection.py index e35cc719a8be..fbcc022d4e97 100644 --- a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_connection.py +++ b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_connection.py @@ -671,7 +671,7 @@ def test_url_includes_api_version(self): manager = _VoiceLiveConnectionManager( credential=self.credential, endpoint="https://test.azure.com", - api_version="2026-01-01-preview", + api_version="2026-04-10", agent_config=agent_config, extra_query={}, extra_headers={}, @@ -679,4 +679,4 @@ def test_url_includes_api_version(self): url = manager._prepare_url() - assert "api-version=2026-01-01-preview" in url + assert "api-version=2026-04-10" in url diff --git a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_enums.py b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_enums.py index 0bdaa03aee4e..aad8f3c5bb15 100644 --- a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_enums.py +++ b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_enums.py @@ -7,9 +7,11 @@ from azure.ai.voicelive.models import ( AnimationOutputType, AudioTimestampType, + AzureRealtimeNativeVoiceName, AzureVoiceType, ClientEventType, ContentPartType, + EchoCancellationReferenceSource, SessionIncludeOption, InputAudioFormat, ItemParamStatus, @@ -73,6 +75,24 @@ def test_case_insensitive(self): assert AzureVoiceType.AZURE_STANDARD.value == "azure-standard" +class TestAzureRealtimeNativeVoiceName: + """Test AzureRealtimeNativeVoiceName enum.""" + + def test_all_values(self): + """Test representative realtime native voice values are accessible.""" + assert AzureRealtimeNativeVoiceName.AVA == "ava" + assert AzureRealtimeNativeVoiceName.XIAOXIAO == "xiaoxiao" + + +class TestEchoCancellationReferenceSource: + """Test EchoCancellationReferenceSource enum.""" + + def test_all_values(self): + """Test all echo cancellation reference source values are accessible.""" + assert EchoCancellationReferenceSource.SERVER == "server" + assert EchoCancellationReferenceSource.CLIENT == "client" + + class TestClientEventType: """Test ClientEventType enum.""" @@ -99,6 +119,10 @@ def test_response_events(self): assert ClientEventType.RESPONSE_CREATE == "response.create" assert ClientEventType.RESPONSE_CANCEL == "response.cancel" + def test_rtc_call_events(self): + """Test RTC call events.""" + assert ClientEventType.RTC_CALL_SDP_CREATE == "rtc.call.sdp.create" + class TestContentPartType: """Test ContentPartType enum.""" @@ -283,6 +307,15 @@ def test_file_search_events(self): def test_output_audio_buffer_cleared(self): assert ServerEventType.OUTPUT_AUDIO_BUFFER_CLEARED == "output_audio_buffer.cleared" + def test_output_audio_buffer_lifecycle(self): + assert ServerEventType.OUTPUT_AUDIO_BUFFER_STARTED == "output_audio_buffer.started" + assert ServerEventType.OUTPUT_AUDIO_BUFFER_STOPPED == "output_audio_buffer.stopped" + + def test_invocation_and_rtc_events(self): + assert ServerEventType.RESPONSE_INVOCATION_DELTA == "response.invocation.delta" + assert ServerEventType.RTC_CALL_SDP_CREATED == "rtc.call.sdp.created" + assert ServerEventType.RTC_CALL_ERROR == "rtc.call.error" + def test_audio_transcript_annotation(self): assert ( ServerEventType.RESPONSE_AUDIO_TRANSCRIPT_ANNOTATION_ADDED == "response.audio_transcript.annotation.added" diff --git a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_models.py b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_models.py index 7413758582a5..386fef051f23 100644 --- a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_models.py +++ b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_models.py @@ -9,12 +9,16 @@ ActionOpenPage, ActionSearch, ActionSearchSource, + AudioEchoCancellation, AssistantMessageItem, AzureAvatarVoiceSyncVoice, AzureCustomVoice, AzurePersonalVoice, + AzureRealtimeNativeVoice, + AzureRealtimeNativeVoiceName, AzureStandardVoice, AzureVoiceType, + EchoCancellationReferenceSource, FileSearchResult, InputAudioContentPart, InputTextContentPart, @@ -39,11 +43,17 @@ ResponseMCPListToolItem, ResponseSession, ResponseWebSearchCallItem, + RtcCallErrorDetails, ServerEventMcpListToolsCompleted, ServerEventMcpListToolsFailed, ServerEventMcpListToolsInProgress, + ServerEventOutputAudioBufferStarted, + ServerEventOutputAudioBufferStopped, + ServerEventResponseInvocationDelta, ServerEventResponseMcpCallArgumentsDelta, ServerEventResponseMcpCallArgumentsDone, + ServerEventRtcCallError, + ServerEventRtcCallSdpCreated, ServerEventType, SystemMessageItem, ToolType, @@ -101,6 +111,13 @@ def test_azure_personal_voice_with_temperature(self): assert voice.temperature == 0.5 assert voice.model == PersonalVoiceModels.DRAGON_LATEST_NEURAL + def test_azure_realtime_native_voice(self): + """Test AzureRealtimeNativeVoice model.""" + voice = AzureRealtimeNativeVoice(name=AzureRealtimeNativeVoiceName.AVA) + + assert voice.type == "azure-realtime-native" + assert voice.name == AzureRealtimeNativeVoiceName.AVA + class TestOpenAIVoice: """Test OpenAIVoice model.""" @@ -261,6 +278,14 @@ def test_request_session_with_temperature(self): assert session.temperature == 0.7 assert session.max_response_output_tokens == 1000 + def test_request_session_with_azure_realtime_native_voice(self): + """Test request session with Azure realtime native voice configuration.""" + voice = AzureRealtimeNativeVoice(name=AzureRealtimeNativeVoiceName.XIAOXIAO) + session = RequestSession(model="azure-realtime", voice=voice) + + assert session.voice == voice + assert session.voice.type == "azure-realtime-native" + class TestResponseSession: """Test ResponseSession model.""" @@ -319,6 +344,17 @@ def test_complex_model_structure(self): assert session.voice.model == PersonalVoiceModels.PHOENIX_LATEST_NEURAL +class TestAudioEchoCancellationModel: + """Test enhanced audio echo cancellation configuration.""" + + def test_audio_echo_cancellation_with_client_reference(self): + """Test AudioEchoCancellation with client-provided stereo reference.""" + config = AudioEchoCancellation(reference_source=EchoCancellationReferenceSource.CLIENT, channels=2) + + assert config.reference_source == EchoCancellationReferenceSource.CLIENT + assert config.channels == 2 + + class TestMCPModels: """Test MCP (Model Context Protocol) related models.""" @@ -758,6 +794,63 @@ def test_server_event_response_mcp_call_arguments_done_with_full_arguments(self) assert event.arguments == full_args +class TestRealtimeAndRtcServerEvents: + """Test realtime playback and RTC server event models.""" + + def test_server_event_output_audio_buffer_started(self): + """Test output audio buffer started event.""" + event = ServerEventOutputAudioBufferStarted(event_id="evt-1", response_id="resp-123") + + assert event.type == ServerEventType.OUTPUT_AUDIO_BUFFER_STARTED + assert event.event_id == "evt-1" + assert event.response_id == "resp-123" + + def test_server_event_output_audio_buffer_stopped(self): + """Test output audio buffer stopped event.""" + event = ServerEventOutputAudioBufferStopped(event_id="evt-2", response_id="resp-456") + + assert event.type == ServerEventType.OUTPUT_AUDIO_BUFFER_STOPPED + assert event.event_id == "evt-2" + assert event.response_id == "resp-456" + + def test_server_event_response_invocation_delta(self): + """Test hosted agent invocation delta event.""" + delta = {"type": "trace", "message": "partial hosted agent event"} + event = ServerEventResponseInvocationDelta(delta=delta, event_id="evt-3") + + assert event.type == ServerEventType.RESPONSE_INVOCATION_DELTA + assert event.event_id == "evt-3" + assert event.delta == delta + + def test_server_event_rtc_call_sdp_created(self): + """Test RTC SDP created event.""" + event = ServerEventRtcCallSdpCreated( + event_id="evt-4", + rtc_call_id="rtc-123", + sdp_answer="v=0\r\no=- 1 2 IN IP4 127.0.0.1", + ) + + assert event.type == ServerEventType.RTC_CALL_SDP_CREATED + assert event.rtc_call_id == "rtc-123" + assert event.sdp_answer.startswith("v=0") + + def test_server_event_rtc_call_error(self): + """Test RTC call error event.""" + error = RtcCallErrorDetails(type="server_error", message="RTC negotiation failed", code="rtc_failed") + event = ServerEventRtcCallError( + error=error, + operation="rtc.call.sdp.create", + rtc_call_id="rtc-123", + event_id="evt-5", + ) + + assert event.type == ServerEventType.RTC_CALL_ERROR + assert event.error.code == "rtc_failed" + assert event.error.message == "RTC negotiation failed" + assert event.operation == "rtc.call.sdp.create" + assert event.rtc_call_id == "rtc-123" + + class TestMCPApprovalType: """Test MCPApprovalType enum.""" diff --git a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_models_interim_response_foundry.py b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_models_interim_response_foundry.py index e90bf34bb26d..35fb031ab8d6 100644 --- a/sdk/voicelive/azure-ai-voicelive/tests/test_unit_models_interim_response_foundry.py +++ b/sdk/voicelive/azure-ai-voicelive/tests/test_unit_models_interim_response_foundry.py @@ -21,6 +21,7 @@ Response, ResponseCreateParams, ResponseSession, + ServerEventResponseInvocationDelta, ServerEventType, ServerEventWarning, ServerEventWarningDetails, @@ -198,6 +199,26 @@ def test_response_create_params_with_metadata(self): assert params.metadata == metadata +class TestHostedAgentInvocation: + """Test hosted agent invocation models.""" + + def test_response_create_params_with_invoke_input(self): + """Test ResponseCreateParams with invoke_input.""" + invoke_input = {"thread_id": "thread-123", "input": "hello"} + params = ResponseCreateParams(invoke_input=invoke_input) + + assert params.invoke_input == invoke_input + + def test_response_invocation_delta_event(self): + """Test ServerEventResponseInvocationDelta model.""" + delta = {"type": "status", "message": "agent invoked"} + event = ServerEventResponseInvocationDelta(delta=delta, event_id="evt-invoke") + + assert event.type == ServerEventType.RESPONSE_INVOCATION_DELTA + assert event.event_id == "evt-invoke" + assert event.delta == delta + + class TestSessionWithInterimResponse: """Test session models with interim_response field.""" From f43d569c4aa3b891c5304b93f481fb08b30e18f6 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Fri, 22 May 2026 17:43:48 -0700 Subject: [PATCH 2/3] update cspell names --- sdk/voicelive/azure-ai-voicelive/cspell.json | 25 ++++++++++++++++++- .../tests/test_live_realtime_service.py | 2 +- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/sdk/voicelive/azure-ai-voicelive/cspell.json b/sdk/voicelive/azure-ai-voicelive/cspell.json index 1dc68d63932d..f2b691d03b5b 100644 --- a/sdk/voicelive/azure-ai-voicelive/cspell.json +++ b/sdk/voicelive/azure-ai-voicelive/cspell.json @@ -1,21 +1,44 @@ { "ignoreWords": [ + "AARTI", + "aarti", + "Aarti", + "DIYA", + "diya", + "Diya", + "deser", + "FLORIAN", + "florian", + "Florian", "GENAI", "genai", "HDOMNI", "libasound", "logprobs", + "MEERA", + "meera", + "Meera", "pyaudio", "PyAudio", "SSML", "ULAW", "ulaw", + "precomputes", "VISEME", "viseme", "WEBRTC", "webrtc", + "XIAOXIAO", + "xiaoxiao", + "Xiaoxiao", + "XIMENA", + "ximena", + "Ximena", "XHIGH", - "xhigh" + "xhigh", + "YUNXI", + "yunxi", + "Yunxi" ], "ignorePaths": [ "*.csv", diff --git a/sdk/voicelive/azure-ai-voicelive/tests/test_live_realtime_service.py b/sdk/voicelive/azure-ai-voicelive/tests/test_live_realtime_service.py index 20bd8f0d0ce7..3f1676eeaa8d 100644 --- a/sdk/voicelive/azure-ai-voicelive/tests/test_live_realtime_service.py +++ b/sdk/voicelive/azure-ai-voicelive/tests/test_live_realtime_service.py @@ -820,7 +820,7 @@ async def test_realtime_service_tool_call_no_audio_overlap( "mai-transcribe-1", ], ) - @pytest.mark.parametrize("api_version", ["2025-05-01-preview", "2026-04-10"]) + @pytest.mark.parametrize("api_version", ["2025-05-01-preview", "2026-01-01-preview"]) async def test_realtime_service_input_audio_transcription( self, test_data_dir: Path, From 2aff5ae090b3a973f59e2a55f0cd080338db9898 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Fri, 22 May 2026 18:11:57 -0700 Subject: [PATCH 3/3] update docs --- .../azure/ai/voicelive/models/_models.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_models.py b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_models.py index 3accdd4f4906..b775d6b7aa6c 100644 --- a/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_models.py +++ b/sdk/voicelive/azure-ai-voicelive/azure/ai/voicelive/models/_models.py @@ -403,14 +403,15 @@ class AudioEchoCancellation(_Model): behavior). * `client`: EC uses the client-supplied reference channel (ch1 of stereo input). Internal TTS loopback is skipped. Known values are: "server" and "client". + :vartype reference_source: str or ~azure.ai.voicelive.models.EchoCancellationReferenceSource :ivar channels: Number of input audio channels. * `1`: Mono input (default). * `2`: Interleaved stereo input where channel 0 is the microphone signal and channel 1 is - the echo reference signal. - When set to 2, `reference_source` must be `client` and `input_audio_format` must be - `pcm16`. + the echo reference signal. When set to 2, `reference_source` must be `client` and + `input_audio_format` must be `pcm16`. + :vartype channels: int """