Python: draft initial implementation of Realtime API #10127

Draft · wants to merge 15 commits into base: main
207 changes: 207 additions & 0 deletions docs/decisions/00XX-realtime-api-clients.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion python/.pre-commit-config.yaml
@@ -39,7 +39,7 @@ repos:
- id: ruff-format
- repo: https://github.com/astral-sh/uv-pre-commit
# uv version.
rev: 0.5.2
rev: 0.5.20
hooks:
# Update the uv lockfile
- id: uv-lock
11 changes: 8 additions & 3 deletions python/pyproject.toml
@@ -45,6 +45,7 @@ dependencies = [
"pybars4 ~= 0.9",
"jinja2 ~= 3.1",
"nest-asyncio ~= 1.6",
"taskgroup >= 0.2.2; python_version < '3.11'",
]

### Optional dependencies
@@ -61,7 +62,8 @@ chroma = [
]
google = [
"google-cloud-aiplatform ~= 1.60",
"google-generativeai ~= 0.7"
"google-generativeai ~= 0.7",
"google-genai ~= 0.4"
]
hugging_face = [
"transformers[torch] ~= 4.28",
@@ -123,6 +125,11 @@ dapr = [
"dapr-ext-fastapi>=1.14.0",
"flask-dapr>=1.14.0"
]
openai_realtime = [
"openai[realtime] ~= 1.0",
"aiortc>=1.9.0",
"sounddevice>=0.5.1",
]

[tool.uv]
prerelease = "if-necessary-or-explicit"
@@ -220,5 +227,3 @@ name = "semantic_kernel"
[build-system]
requires = ["flit-core >= 3.9,<4.0"]
build-backend = "flit_core.buildapi"


174 changes: 174 additions & 0 deletions python/samples/concepts/audio/04-chat_with_realtime_api.py
@@ -0,0 +1,174 @@
# Copyright (c) Microsoft. All rights reserved.
import asyncio
import logging
import signal
from random import randint

import sounddevice as sd

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
OpenAIRealtimeExecutionSettings,
OpenAIRealtimeWebRTC,
TurnDetection,
)
from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime_base import ListenEvents
from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.connectors.ai.realtime_helpers import SKSimplePlayer
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install semantic-kernel[openai_realtime] pyaudio sounddevice pydub aiortc

# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# Device ids are set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# You can check the available devices with the check_audio_devices function below.


def check_audio_devices():
logger.info(sd.query_devices())


check_audio_devices()


class ReceivingStreamHandler:
"""This is a simple class that listens to the received buffer of the RealtimeClientBase.

It can be used to play audio and print the transcript of the conversation.

It can also be used to act on other events from the service.
"""

def __init__(self, realtime_client: RealtimeClientBase, audio_player: SKSimplePlayer | None = None):
self.audio_player = audio_player
self.realtime_client = realtime_client

async def listen(
self,
play_audio: bool = True,
print_transcript: bool = True,
) -> None:
# print the start message of the transcript
if print_transcript:
print("Mosscap (transcript): ", end="")
try:
# start listening for events
while True:
event_type, event = await self.realtime_client.receive_buffer.get()
match event_type:
case ListenEvents.RESPONSE_AUDIO_DELTA:
if play_audio and self.audio_player and isinstance(event, StreamingChatMessageContent):
await self.audio_player.add_audio(event.items[0])
case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA:
if print_transcript and isinstance(event, StreamingChatMessageContent):
print(event.content, end="")
case ListenEvents.RESPONSE_CREATED:
if print_transcript:
print("")
await asyncio.sleep(0.01)
except asyncio.CancelledError:
print("\nThanks for talking to Mosscap!")


# This function is used to cancel the running tasks when ctrl + c is pressed.
def signal_handler():
for task in asyncio.all_tasks():
task.cancel()


weather_conditions = ["sunny", "hot", "cloudy", "raining", "freezing", "snowing"]


@kernel_function
def get_weather(location: str) -> str:
"""Get the weather for a location."""
weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec
logger.warning(f"Getting weather for {location}: {weather}")
return f"The weather in {location} is {weather}."


async def main() -> None:
# set up the asyncio loop with the signal handler
loop = asyncio.get_event_loop()
loop.add_signal_handler(signal.SIGINT, signal_handler)

# create the Kernel and add a simple function for function calling.
kernel = Kernel()
kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather)

# create the realtime client, optionally passing in a callback for direct audio output
audio_player = SKSimplePlayer()
realtime_client = OpenAIRealtimeWebRTC(audio_output=audio_player.realtime_client_callback)

# create the stream receiver; it can play the audio if an audio_player is passed in,
# it allows you to print the transcript of the conversation,
# and it lets you review or act on other events from the service
stream_handler = ReceivingStreamHandler(realtime_client) # SimplePlayer(device_id=None)

# Create the settings for the session.
# The key thing to decide on is whether to enable the server_vad turn detection.
# If turn detection is turned off (by setting turn_detection=None), you will have to send
# the "input_audio_buffer.commit" and "response.create" events to the realtime API yourself
# to signal the end of the user's turn and start the response (see the commented sketch below).
# The realtime API does not use a system message, but takes instructions as a parameter for a session.
instructions = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""
# a chat history can also be added to the conversation after starting it
chat_history = ChatHistory()
chat_history.add_user_message("Hi there, who are you?")
chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

settings = OpenAIRealtimeExecutionSettings(
instructions=instructions,
voice="sage",
turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
function_choice_behavior=FunctionChoiceBehavior.Auto(),
)
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with realtime_client, audio_player:
await realtime_client.update_session(
settings=settings, chat_history=chat_history, kernel=kernel, create_response=True
)
# you can also send other events to the service, like this
# await realtime_client.send_buffer.put((
# SendEvents.CONVERSATION_ITEM_CREATE,
# {"item": ChatMessageContent(role="user", content="Hi there, who are you?")},
# ))
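# Sketch only (not a verified part of this PR's API): if turn_detection=None had been
# set in the settings above, the end of the user's turn would presumably need to be
# signalled manually, along these lines:
# await realtime_client.send_buffer.put((SendEvents.INPUT_AUDIO_BUFFER_COMMIT, {}))
# await realtime_client.send_buffer.put((SendEvents.RESPONSE_CREATE, {}))
# where the SendEvents member names are assumed to mirror the "input_audio_buffer.commit"
# and "response.create" event names mentioned earlier in this sample.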
async with asyncio.TaskGroup() as tg:
tg.create_task(realtime_client.start_streaming())
tg.create_task(stream_handler.listen())


if __name__ == "__main__":
print(
"Instruction: start speaking, when you stop the API should detect you finished and start responding. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
@@ -276,7 +276,9 @@ async def get_streaming_chat_message_contents(
for msg in messages:
if msg is not None:
all_messages.append(msg)
if any(isinstance(item, FunctionCallContent) for item in msg.items):
if not function_call_returned and any(
isinstance(item, FunctionCallContent) for item in msg.items
):
function_call_returned = True
yield messages

@@ -442,7 +444,10 @@ def _get_ai_model_id(self, settings: "PromptExecutionSettings") -> str:
return getattr(settings, "ai_model_id", self.ai_model_id) or self.ai_model_id

def _yield_function_result_messages(self, function_result_messages: list) -> bool:
"""Determine if the function result messages should be yielded."""
"""Determine if the function result messages should be yielded.

If there are messages and if the first message has items, then yield the messages.
"""
return len(function_result_messages) > 0 and len(function_result_messages[0].items) > 0

# endregion
50 changes: 50 additions & 0 deletions python/semantic_kernel/connectors/ai/function_calling_utils.py
@@ -1,13 +1,16 @@
# Copyright (c) Microsoft. All rights reserved.

from collections import OrderedDict
from collections.abc import Callable
from copy import deepcopy
from typing import TYPE_CHECKING, Any

from semantic_kernel.contents.chat_message_content import ChatMessageContent
from semantic_kernel.contents.function_result_content import FunctionResultContent
from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
from semantic_kernel.contents.utils.author_role import AuthorRole
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
from semantic_kernel.utils.experimental_decorator import experimental_function

if TYPE_CHECKING:
from semantic_kernel.connectors.ai.function_choice_behavior import (
@@ -16,6 +19,7 @@
)
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata
from semantic_kernel.kernel import Kernel


def update_settings_from_function_call_configuration(
@@ -129,3 +133,49 @@ def merge_streaming_function_results(
function_invoke_attempt=function_invoke_attempt,
)
]


@experimental_function
def prepare_settings_for_function_calling(
settings: "PromptExecutionSettings",
settings_class: type["PromptExecutionSettings"],
update_settings_callback: Callable[..., None],
kernel: "Kernel",
) -> "PromptExecutionSettings":
"""Prepare settings for the service.

Args:
settings: Prompt execution settings.
settings_class: The settings class.
update_settings_callback: The callback to update the settings.
kernel: Kernel instance.

Returns:
PromptExecutionSettings of type settings_class.
"""
settings = deepcopy(settings)
if not isinstance(settings, settings_class):
settings = settings_class.from_prompt_execution_settings(settings)

# For backwards compatibility we need to convert the `FunctionCallBehavior` to `FunctionChoiceBehavior`
# if this method is called with a `FunctionCallBehavior` object as part of the settings

from semantic_kernel.connectors.ai.function_call_behavior import FunctionCallBehavior
from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior

if hasattr(settings, "function_call_behavior") and isinstance(
settings.function_call_behavior, FunctionCallBehavior
):
settings.function_choice_behavior = FunctionChoiceBehavior.from_function_call_behavior(
settings.function_call_behavior
)

if settings.function_choice_behavior:
# Configure the function choice behavior into the settings object
# that will become part of the request to the AI service
settings.function_choice_behavior.configure(
kernel=kernel,
update_settings_callback=update_settings_callback,
settings=settings,
)
return settings
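
For illustration, a connector might call this helper roughly as follows. This is a sketch under assumptions: the call site, the choice of OpenAIChatPromptExecutionSettings as the target class, and the use of update_settings_from_function_call_configuration as the callback are illustrative and not confirmed by this diff.

# Hypothetical call site inside a chat completion connector (sketch only):
settings = prepare_settings_for_function_calling(
    settings,
    OpenAIChatPromptExecutionSettings,
    update_settings_callback=update_settings_from_function_call_configuration,
    kernel=kernel,
)
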
9 changes: 9 additions & 0 deletions python/semantic_kernel/connectors/ai/open_ai/__init__.py
@@ -22,6 +22,10 @@
OpenAIPromptExecutionSettings,
OpenAITextPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import (
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
@@ -36,6 +40,7 @@
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime, OpenAIRealtimeWebRTC
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio
@@ -69,6 +74,9 @@
"OpenAIChatPromptExecutionSettings",
"OpenAIEmbeddingPromptExecutionSettings",
"OpenAIPromptExecutionSettings",
"OpenAIRealtime",
"OpenAIRealtimeExecutionSettings",
"OpenAIRealtimeWebRTC",
"OpenAISettings",
"OpenAITextCompletion",
"OpenAITextEmbedding",
@@ -77,4 +85,5 @@
"OpenAITextToAudioExecutionSettings",
"OpenAITextToImage",
"OpenAITextToImageExecutionSettings",
"TurnDetection",
]