Python: draft initial implementation of Realtime API #10127

Draft · wants to merge 15 commits into base: main
207 changes: 207 additions & 0 deletions docs/decisions/00XX-realtime-api-clients.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion python/.pre-commit-config.yaml
@@ -39,7 +39,7 @@ repos:
- id: ruff-format
- repo: https://github.com/astral-sh/uv-pre-commit
# uv version.
rev: 0.5.2
rev: 0.5.20
hooks:
# Update the uv lockfile
- id: uv-lock
11 changes: 8 additions & 3 deletions python/pyproject.toml
@@ -45,6 +45,7 @@ dependencies = [
"pybars4 ~= 0.9",
"jinja2 ~= 3.1",
"nest-asyncio ~= 1.6",
"taskgroup >= 0.2.2; python_version < '3.11'",
]

### Optional dependencies
@@ -61,7 +62,8 @@ chroma = [
]
google = [
"google-cloud-aiplatform ~= 1.60",
"google-generativeai ~= 0.7"
"google-generativeai ~= 0.7",
"google-genai ~= 0.4"
]
hugging_face = [
"transformers[torch] ~= 4.28",
@@ -123,6 +125,11 @@ dapr = [
"dapr-ext-fastapi>=1.14.0",
"flask-dapr>=1.14.0"
]
openai_realtime = [
"openai[realtime] ~= 1.0",
"aiortc>=1.9.0",
"sounddevice>=0.5.1",
]

[tool.uv]
prerelease = "if-necessary-or-explicit"
@@ -220,5 +227,3 @@ name = "semantic_kernel"
[build-system]
requires = ["flit-core >= 3.9,<4.0"]
build-backend = "flit_core.buildapi"


174 changes: 174 additions & 0 deletions python/samples/concepts/audio/04-chat_with_realtime_api.py
@@ -0,0 +1,174 @@
# Copyright (c) Microsoft. All rights reserved.
import asyncio
import logging
import signal
from random import randint

import sounddevice as sd

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
OpenAIRealtimeExecutionSettings,
OpenAIRealtimeWebRTC,
TurnDetection,
)
from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_base import ListenEvents
from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.connectors.ai.realtime_helpers import SKSimplePlayer
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install semantic-kernel[openai_realtime] pyaudio sounddevice pydub aiortc

# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# Device ids are set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# You can check the available devices with the check_audio_devices function below.


def check_audio_devices():
logger.info(sd.query_devices())


check_audio_devices()


class ReceivingStreamHandler:
"""This is a simple class that listens to the received buffer of the RealtimeClientBase.

It can be used to play audio and print the transcript of the conversation.

It can also be used to act on other events from the service.
"""

def __init__(self, realtime_client: RealtimeClientBase, audio_player: SKSimplePlayer | None = None):
self.audio_player = audio_player
self.realtime_client = realtime_client

async def listen(
self,
play_audio: bool = True,
print_transcript: bool = True,
) -> None:
# print the start message of the transcript
if print_transcript:
print("Mosscap (transcript): ", end="")
try:
# start listening for events
while True:
event_type, event = await self.realtime_client.receive_buffer.get()
match event_type:
case ListenEvents.RESPONSE_AUDIO_DELTA:
if play_audio and self.audio_player and isinstance(event, StreamingChatMessageContent):
await self.audio_player.add_audio(event.items[0])
case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA:
if print_transcript and isinstance(event, StreamingChatMessageContent):
print(event.content, end="")
case ListenEvents.RESPONSE_CREATED:
if print_transcript:
print("")
await asyncio.sleep(0.01)
except asyncio.CancelledError:
print("\nThanks for talking to Mosscap!")


# This function is used to cancel the running tasks when ctrl + c is pressed.
def signal_handler():
for task in asyncio.all_tasks():
task.cancel()


weather_conditions = ["sunny", "hot", "cloudy", "raining", "freezing", "snowing"]


@kernel_function
def get_weather(location: str) -> str:
"""Get the weather for a location."""
weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec
logger.warning(f"Getting weather for {location}: {weather}")
return f"The weather in {location} is {weather}."


async def main() -> None:
# set up the asyncio loop with the signal handler
loop = asyncio.get_event_loop()
loop.add_signal_handler(signal.SIGINT, signal_handler)

# create the Kernel and add a simple function for function calling.
kernel = Kernel()
kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather)

# create the realtime client, optionally passing in a callback for direct audio output
audio_player = SKSimplePlayer()
realtime_client = OpenAIRealtimeWebRTC(audio_output=audio_player.realtime_client_callback)

# create the stream receiver; it can play the audio if an audio_player is passed in,
# it allows you to print the transcript of the conversation,
# and it lets you review or act on other events from the service
stream_handler = ReceivingStreamHandler(realtime_client) # SimplePlayer(device_id=None)

# Create the settings for the session.
# The key thing to decide on is whether to enable the server_vad turn detection.
# If turn detection is turned off (by setting turn_detection=None), you will have to send
# the "input_audio_buffer.commit" and "response.create" events to the realtime API yourself
# to signal the end of the user's turn and start the response (see the commented sketch below).
# The realtime API does not use a system message, but takes instructions as a parameter for a session.
instructions = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""
# a chat history can also be added to the conversation after starting it
chat_history = ChatHistory()
chat_history.add_user_message("Hi there, who are you?")
chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

settings = OpenAIRealtimeExecutionSettings(
instructions=instructions,
voice="sage",
turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
function_choice_behavior=FunctionChoiceBehavior.Auto(),
)
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with realtime_client, audio_player:
await realtime_client.update_session(
settings=settings, chat_history=chat_history, kernel=kernel, create_response=True
)
# you can also send other events to the service, like this
# await realtime_client.send_buffer.put((
# SendEvents.CONVERSATION_ITEM_CREATE,
# {"item": ChatMessageContent(role="user", content="Hi there, who are you?")},
# ))
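# Sketch only (not a verified part of this PR's API): if turn_detection=None had been
# set in the settings above, the end of the user's turn would presumably need to be
# signalled manually, along these lines:
# await realtime_client.send_buffer.put((SendEvents.INPUT_AUDIO_BUFFER_COMMIT, {}))
# await realtime_client.send_buffer.put((SendEvents.RESPONSE_CREATE, {}))
# where the SendEvents member names are assumed to mirror the "input_audio_buffer.commit"
# and "response.create" event names mentioned earlier in this sample.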
async with asyncio.TaskGroup() as tg:
tg.create_task(realtime_client.start_streaming())
tg.create_task(stream_handler.listen())


if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
@@ -276,7 +276,9 @@ async def get_streaming_chat_message_contents(
for msg in messages:
if msg is not None:
all_messages.append(msg)
if any(isinstance(item, FunctionCallContent) for item in msg.items):
if not function_call_returned and any(
isinstance(item, FunctionCallContent) for item in msg.items
):
function_call_returned = True
yield messages

@@ -442,7 +444,10 @@ def _get_ai_model_id(self, settings: "PromptExecutionSettings") -> str:
return getattr(settings, "ai_model_id", self.ai_model_id) or self.ai_model_id

def _yield_function_result_messages(self, function_result_messages: list) -> bool:
"""Determine if the function result messages should be yielded."""
"""Determine if the function result messages should be yielded.

If there are messages and if the first message has items, then yield the messages.
"""
return len(function_result_messages) > 0 and len(function_result_messages[0].items) > 0

# endregion
50 changes: 50 additions & 0 deletions python/semantic_kernel/connectors/ai/function_calling_utils.py
@@ -1,13 +1,16 @@
# Copyright (c) Microsoft. All rights reserved.

from collections import OrderedDict
from collections.abc import Callable
from copy import deepcopy
from typing import TYPE_CHECKING, Any

from semantic_kernel.contents.chat_message_content import ChatMessageContent
from semantic_kernel.contents.function_result_content import FunctionResultContent
from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
from semantic_kernel.contents.utils.author_role import AuthorRole
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
from semantic_kernel.utils.experimental_decorator import experimental_function

if TYPE_CHECKING:
from semantic_kernel.connectors.ai.function_choice_behavior import (
@@ -16,6 +19,7 @@
)
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata
from semantic_kernel.kernel import Kernel


def update_settings_from_function_call_configuration(
@@ -129,3 +133,49 @@ def merge_streaming_function_results(
function_invoke_attempt=function_invoke_attempt,
)
]


@experimental_function
def prepare_settings_for_function_calling(
settings: "PromptExecutionSettings",
settings_class: type["PromptExecutionSettings"],
update_settings_callback: Callable[..., None],
kernel: "Kernel",
) -> "PromptExecutionSettings":
"""Prepare settings for the service.

Args:
settings: Prompt execution settings.
settings_class: The settings class.
update_settings_callback: The callback to update the settings.
kernel: Kernel instance.

Returns:
PromptExecutionSettings of type settings_class.
"""
settings = deepcopy(settings)
if not isinstance(settings, settings_class):
settings = settings_class.from_prompt_execution_settings(settings)

# For backwards compatibility we need to convert the `FunctionCallBehavior` to `FunctionChoiceBehavior`
# if this method is called with a `FunctionCallBehavior` object as part of the settings

from semantic_kernel.connectors.ai.function_call_behavior import FunctionCallBehavior
from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior

if hasattr(settings, "function_call_behavior") and isinstance(
settings.function_call_behavior, FunctionCallBehavior
):
settings.function_choice_behavior = FunctionChoiceBehavior.from_function_call_behavior(
settings.function_call_behavior
)

if settings.function_choice_behavior:
# Configure the function choice behavior into the settings object
# that will become part of the request to the AI service
settings.function_choice_behavior.configure(
kernel=kernel,
update_settings_callback=update_settings_callback,
settings=settings,
)
return settings
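
For illustration, a connector might call this helper roughly as follows. This is a sketch under assumptions: the call site, the choice of OpenAIChatPromptExecutionSettings as the target class, and the use of update_settings_from_function_call_configuration as the callback are illustrative and not confirmed by this diff.

# Hypothetical call site inside a chat completion connector (sketch only):
settings = prepare_settings_for_function_calling(
    settings,
    OpenAIChatPromptExecutionSettings,
    update_settings_callback=update_settings_from_function_call_configuration,
    kernel=kernel,
)
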
9 changes: 9 additions & 0 deletions python/semantic_kernel/connectors/ai/open_ai/__init__.py
@@ -22,6 +22,10 @@
OpenAIPromptExecutionSettings,
OpenAITextPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import (
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
@@ -36,6 +40,7 @@
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime, OpenAIRealtimeWebRTC
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio
@@ -69,6 +74,9 @@
"OpenAIChatPromptExecutionSettings",
"OpenAIEmbeddingPromptExecutionSettings",
"OpenAIPromptExecutionSettings",
"OpenAIRealtime",
"OpenAIRealtimeExecutionSettings",
"OpenAIRealtimeWebRTC",
"OpenAISettings",
"OpenAITextCompletion",
"OpenAITextEmbedding",
@@ -77,4 +85,5 @@
"OpenAITextToAudioExecutionSettings",
"OpenAITextToImage",
"OpenAITextToImageExecutionSettings",
"TurnDetection",
]