Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit b8899f7

Browse files
committed May 7, 2025
feat: Context handling in realtime
1 parent b205f83 commit b8899f7

File tree

4 files changed

+246
-26
lines changed

4 files changed

+246
-26
lines changed
 

‎docs/voice/pipeline.md

Lines changed: 124 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,14 @@ from agents.voice import (
103103
VoicePipelineConfig
104104
)
105105
from agents.voice.models.sdk_realtime import SDKRealtimeLLM
106+
from dataclasses import dataclass
107+
108+
# Define a simple context class for state management (optional)
109+
@dataclass
110+
class MyAppContext:
111+
"""Context for the voice assistant."""
112+
user_name: str = "User"
113+
interaction_count: int = 0
106114

107115
# Create the input, config, and model
108116
input_stream = StreamedAudioInput()
@@ -114,11 +122,15 @@ config = VoicePipelineConfig(
114122
)
115123
model = SDKRealtimeLLM(model_name="gpt-4o-realtime-preview")
116124

117-
# Create the pipeline with tools
125+
# Create an app context instance (optional)
126+
app_context = MyAppContext()
127+
128+
# Create the pipeline with tools and shared context
118129
pipeline = RealtimeVoicePipeline(
119130
model=model,
120131
tools=[get_weather, get_time],
121132
config=config,
133+
shared_context=app_context, # Optional: shared state for context-aware tools
122134
)
123135

124136
# Start the pipeline
@@ -147,6 +159,117 @@ while True:
147159
break
148160
```
149161

162+
### Using Shared Context with Tools
163+
164+
The `RealtimeVoicePipeline` supports passing a shared context object to tools, allowing them to access and modify shared state across multiple interactions. This is useful for building more complex voice applications that need to maintain state, such as:
165+
166+
- Tracking user preferences
167+
- Maintaining conversation history
168+
- Counting interactions
169+
- Storing user information
170+
171+
#### Setting up a shared context
172+
173+
To use shared context with tools:
174+
175+
1. Define a context class (typically a dataclass) to hold your application state
176+
2. Create an instance of this class
177+
3. Pass it to the `RealtimeVoicePipeline` using the `shared_context` parameter
178+
4. Create tools that accept a `RunContextWrapper[YourContextType]` as their first parameter
179+
180+
```python
181+
from dataclasses import dataclass
182+
from agents.run_context import RunContextWrapper
183+
from agents.tool import function_tool
184+
185+
# Define your context class
186+
@dataclass
187+
class MyAppContext:
188+
"""Context for the voice assistant."""
189+
user_name: str
190+
interaction_count: int = 0
191+
192+
# Create a context-aware tool
193+
@function_tool
194+
def greet_user_and_count(context: RunContextWrapper[MyAppContext]) -> str:
195+
"""Greets the user by name and counts interactions."""
196+
# Access and modify the context
197+
context.context.interaction_count += 1
198+
199+
return f"Hello {context.context.user_name}! This is interaction number {context.context.interaction_count}."
200+
201+
# Create another context-aware tool
202+
@function_tool
203+
def get_user_details(context: RunContextWrapper[MyAppContext]) -> dict:
204+
"""Gets user details from the context."""
205+
return {
206+
"user_name": context.context.user_name,
207+
"interaction_count": context.context.interaction_count
208+
}
209+
210+
# Create your application context
211+
app_context = MyAppContext(user_name="Alice", interaction_count=0)
212+
213+
# Create the pipeline with shared context
214+
pipeline = RealtimeVoicePipeline(
215+
model=model,
216+
tools=[get_weather, get_time, greet_user_and_count, get_user_details],
217+
config=config,
218+
shared_context=app_context, # Pass the context here
219+
)
220+
```
221+
222+
#### How it works
223+
224+
1. The `RealtimeVoicePipeline` passes the shared context to its internal `ToolExecutor`
225+
2. When the LLM calls a tool, the `ToolExecutor` checks whether the tool's first parameter is named `context`
226+
3. If it is, the executor wraps your context object in a `RunContextWrapper` and passes it to the tool
227+
4. The tool can then access and modify your context object via `context.context`
228+
5. Since all tools share the same context object, changes made by one tool are visible to other tools in future calls
229+
230+
This mechanism allows your tools to maintain shared state across turns and interactions in your voice application, without needing to set up a separate state management system.
231+
232+
#### Context-Aware vs. Standard Tools
233+
234+
You can mix both context-aware and standard tools in the same `RealtimeVoicePipeline`:
235+
236+
```python
237+
# A standard tool (no context parameter)
238+
@function_tool
239+
def get_weather(city: str) -> dict:
240+
"""Gets the weather for the specified city."""
241+
return {"temperature": 72, "condition": "sunny"}
242+
243+
# A context-aware tool (has context parameter)
244+
@function_tool
245+
def update_user_preference(context: RunContextWrapper[MyAppContext], preference: str, value: str) -> str:
246+
"""Updates a user preference in the context."""
247+
if not hasattr(context.context, "preferences"):
248+
context.context.preferences = {}
249+
context.context.preferences[preference] = value
250+
return f"Updated {preference} to {value}"
251+
```
252+
253+
**When to use standard tools:**
254+
255+
- For stateless operations that don't need to remember information between calls
256+
- For simple lookups or calculations based solely on the input parameters
257+
- When integration with external APIs or services doesn't require user-specific state
258+
259+
**When to use context-aware tools:**
260+
261+
- When tools need to access or modify shared state
262+
- For personalization features that adapt to the user
263+
- To implement features that track usage or interactions
264+
- When information gathered in one tool call needs to be available to another tool
265+
266+
**Important notes:**
267+
268+
- The first parameter of a context-aware tool must be named `context` and should have a type annotation of `RunContextWrapper[YourContextType]`
269+
- Type hints are recommended but not required; the parameter name `context` is sufficient for the tool to be detected as context-aware
270+
- The actual object inside `context.context` will be the instance you passed to `shared_context` when creating the pipeline
271+
- All context-aware tools see the same context instance, so changes are immediately visible to all tools
272+
150273
### Turn Detection Modes
151274

152275
The realtime models can operate in different turn detection modes, controlled via the `turn_detection` setting:

‎examples/voice/realtime_assistant.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,15 @@
1818
on applying for access to the realtime API.
1919
2020
Usage:
21-
python continuous_realtime_assistant.py
21+
python realtime_assistant.py
2222
"""
2323

2424
import asyncio
2525
import logging
2626
import os
2727
import time
2828
from typing import Dict, Any
29+
from dataclasses import dataclass
2930

3031
import numpy as np
3132
import sounddevice as sd # For microphone and speaker I/O
@@ -42,6 +43,7 @@
4243
)
4344
from agents.tool import function_tool, Tool
4445
from agents.voice.models.sdk_realtime import SDKRealtimeLLM
46+
from agents.run_context import RunContextWrapper
4547

4648
# Import the new event types from our SDK
4749
from agents.voice.realtime.model import (
@@ -60,6 +62,15 @@
6062
logger = logging.getLogger("realtime_assistant")
6163

6264

65+
# Define a dataclass for our application context
66+
@dataclass
67+
class MyAppContext:
68+
"""A simple context for the realtime voice assistant example."""
69+
70+
user_name: str
71+
interaction_count: int = 0
72+
73+
6374
# Define some sample tools
6475
@function_tool
6576
def get_weather(city: str) -> Dict[str, Any]:
@@ -75,6 +86,37 @@ def get_time(timezone: str = "UTC") -> Dict[str, Any]:
7586
return {"time": time.strftime("%H:%M:%S", time.gmtime()), "timezone": timezone}
7687

7788

89+
# Define a context-aware tool
90+
@function_tool
91+
def greet_user_and_count(context: RunContextWrapper[MyAppContext]) -> str:
92+
"""Greets the user by name and counts interactions."""
93+
logger.info(f"greet_user_and_count called with context: {context}")
94+
# Increment the interaction count
95+
context.context.interaction_count += 1
96+
97+
logger.info(
98+
f"Greeting user: {context.context.user_name}, "
99+
f"Interaction count: {context.context.interaction_count}"
100+
)
101+
102+
return f"Hello {context.context.user_name}! This is interaction number {context.context.interaction_count}."
103+
104+
105+
# Another context-aware tool that reads but doesn't modify the context
106+
@function_tool
107+
def get_user_details(context: RunContextWrapper[MyAppContext]) -> Dict[str, Any]:
108+
"""Gets the user's details from the context."""
109+
logger.info(f"get_user_details called with context: {context}")
110+
111+
logger.info(
112+
f"Returning user details: name={context.context.user_name}, count={context.context.interaction_count}"
113+
)
114+
return {
115+
"user_name": context.context.user_name,
116+
"interaction_count": context.context.interaction_count,
117+
}
118+
119+
78120
# Get the OpenAI API key from environment variables
79121
api_key = os.environ.get("OPENAI_API_KEY")
80122
if not api_key:
@@ -117,18 +159,22 @@ async def main():
117159
realtime_settings={
118160
"turn_detection": "server_vad", # Use server-side VAD
119161
"assistant_voice": "alloy",
120-
"system_message": "You are a helpful assistant that responds concisely.",
162+
"system_message": "You are a helpful assistant that responds concisely. You can use the greet_user_and_count tool to greet the user by name and the get_user_details tool to retrieve information about the user.",
121163
# Enable server-side noise / echo reduction
122164
"input_audio_noise_reduction": {},
123165
}
124166
)
125167
input_stream = StreamedAudioInput()
126168

127-
# Create the realtime pipeline
169+
# Create our application context
170+
app_context = MyAppContext(user_name="Anurag", interaction_count=0)
171+
172+
# Create the realtime pipeline with shared context
128173
pipeline = RealtimeVoicePipeline(
129174
model=model,
130-
tools=[get_weather, get_time],
175+
tools=[get_weather, get_time, greet_user_and_count, get_user_details],
131176
config=config,
177+
shared_context=app_context, # Pass the context to the pipeline
132178
)
133179

134180
# Track events and errors
@@ -321,6 +367,9 @@ async def toggle_push_to_talk_simulation():
321367

322368
logger.info(f"Total events processed: {event_count}")
323369

370+
# Print the final interaction count from the context
371+
logger.info(f"Final interaction count: {app_context.interaction_count}")
372+
324373
# Provide troubleshooting information if needed
325374
if error_occurred or event_count <= 1: # <=1 because turn_started is an event
326375
logger.error(f"Error occurred: {error_occurred}")

‎src/agents/voice/pipeline_realtime.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(
3737
model: RealtimeLLMModel | str | None = None,
3838
tools: Sequence[Tool] = (),
3939
config: VoicePipelineConfig | None = None,
40+
shared_context: Any | None = None,
4041
):
4142
"""Create a new real-time voice pipeline.
4243
@@ -45,6 +46,7 @@ def __init__(
4546
or a string identifier for a model from the provider.
4647
tools: A sequence of tools available to the LLM.
4748
config: The pipeline configuration. If not provided, a default will be used.
49+
shared_context: An optional context object that will be passed to tools when they are executed.
4850
"""
4951
if isinstance(model, str) or model is None:
5052
self._model_name_to_load: str | None = model
@@ -59,7 +61,8 @@ def __init__(
5961

6062
self._tools = tools
6163
self._config = config or VoicePipelineConfig()
62-
self._tool_executor = ToolExecutor(tools)
64+
self._shared_context = shared_context
65+
self._tool_executor = ToolExecutor(tools, shared_context=shared_context)
6366

6467
def _get_model(self) -> RealtimeLLMModel:
6568
"""Get the real-time LLM model to use."""
Lines changed: 65 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from __future__ import annotations
22

33
import json
4+
import inspect
45
from collections.abc import Sequence
5-
from typing import Any
6+
from typing import Any, get_type_hints, get_origin, Dict, Set
67

78
from ...exceptions import AgentsException, UserError
89
from ...logger import logger
@@ -17,11 +18,43 @@
1718
class ToolExecutor:
1819
"""Executes tools based on RealtimeEventToolCall events."""
1920

20-
def __init__(self, tools: Sequence[Tool]):
21-
self._tool_map: dict[str, FunctionTool] = {}
21+
def __init__(self, tools: Sequence[Tool], shared_context: Any | None = None):
22+
self._tool_map: Dict[str, FunctionTool] = {}
23+
self._shared_context = shared_context
24+
# Explicitly specify which tools need context - we'll set all tools with first param named "context"
25+
self._context_aware_tools: Set[str] = set()
26+
2227
for tool in tools:
2328
if isinstance(tool, FunctionTool):
2429
self._tool_map[tool.name] = tool
30+
31+
# Debug - log all attributes of the FunctionTool
32+
logger.info(f"FunctionTool {tool.name} attributes: {dir(tool)}")
33+
34+
# Get the original function if available
35+
if hasattr(tool, "function"):
36+
func = tool.function
37+
logger.info(f"Found function attribute for {tool.name}: {func}")
38+
if callable(func):
39+
# Check if first parameter is named "context" - simpler approach
40+
sig = inspect.signature(func)
41+
params = list(sig.parameters.keys())
42+
logger.info(f"Function {tool.name} params: {params}")
43+
if params and params[0] == "context":
44+
self._context_aware_tools.add(tool.name)
45+
logger.info(f"Detected context-aware tool: {tool.name}")
46+
else:
47+
# Try to inspect on_invoke_tool to see if we can find more info
48+
logger.info(
49+
f"Tool {tool.name} has no 'function' attribute. Examining on_invoke_tool: {tool.on_invoke_tool}"
50+
)
51+
52+
# Special hardcoded handling - for now, let's explicitly mark these tools as context-aware
53+
if tool.name in ["greet_user_and_count", "get_user_details"]:
54+
logger.info(
55+
f"Explicitly marking {tool.name} as context-aware based on name"
56+
)
57+
self._context_aware_tools.add(tool.name)
2558
else:
2659
# For now, only FunctionTools are supported by this simple executor.
2760
# We can extend this later if other tool types (e.g. ComputerTool) are needed
@@ -30,6 +63,8 @@ def __init__(self, tools: Sequence[Tool]):
3063
f"Tool '{tool.name}' is not a FunctionTool and will be ignored by ToolExecutor."
3164
)
3265

66+
logger.info(f"Context-aware tools: {self._context_aware_tools}")
67+
3368
async def execute(self, tool_call_event: RealtimeEventToolCall) -> str:
3469
"""Executes the specified tool and returns its string output.
3570
@@ -42,32 +77,42 @@ async def execute(self, tool_call_event: RealtimeEventToolCall) -> str:
4277
Raises:
4378
AgentsException: If the tool is not found or fails during execution.
4479
"""
45-
tool = self._tool_map.get(tool_call_event.tool_name)
80+
tool_name = tool_call_event.tool_name
81+
tool = self._tool_map.get(tool_name)
82+
4683
if not tool:
47-
err_msg = f"Tool '{tool_call_event.tool_name}' not found in ToolExecutor."
84+
err_msg = f"Tool '{tool_name}' not found in ToolExecutor."
4885
logger.error(err_msg)
4986
# Return an error string that can be sent back to the LLM
50-
return json.dumps(
51-
{"error": err_msg, "tool_name": tool_call_event.tool_name}
52-
)
87+
return json.dumps({"error": err_msg, "tool_name": tool_name})
5388

5489
# Convert arguments dict to JSON string, as expected by on_invoke_tool
5590
try:
5691
arguments_json = json.dumps(tool_call_event.arguments)
5792
except TypeError as e: # pragma: no cover
58-
err_msg = f"Failed to serialize arguments for tool '{tool.name}': {e}"
93+
err_msg = f"Failed to serialize arguments for tool '{tool_name}': {e}"
5994
logger.error(f"{err_msg} Arguments: {tool_call_event.arguments}")
60-
return json.dumps({"error": err_msg, "tool_name": tool.name})
95+
return json.dumps({"error": err_msg, "tool_name": tool_name})
6196

62-
logger.info(f"Executing tool: {tool.name} with args: {arguments_json}")
97+
logger.info(f"Executing tool: {tool_name} with args: {arguments_json}")
6398

6499
try:
65-
# Create an empty RunContextWrapper for now, as this executor is lightweight.
66-
# If context-dependent tools are needed, this might need to evolve or use a proper Runner.
67-
# The `on_invoke_tool` is expected to handle JSON string input.
68-
tool_output = await tool.on_invoke_tool(
69-
RunContextWrapper(context=None), arguments_json
70-
)
100+
# Check if this is a context-aware tool
101+
needs_context = tool_name in self._context_aware_tools
102+
103+
# Execute the tool with or without context
104+
if needs_context:
105+
logger.info(
106+
f"Tool {tool_name} is context-aware, passing RunContextWrapper"
107+
)
108+
tool_output = await tool.on_invoke_tool(
109+
RunContextWrapper(context=self._shared_context), arguments_json
110+
)
111+
else:
112+
logger.info(
113+
f"Tool {tool_name} is not context-aware, invoking without RunContextWrapper"
114+
)
115+
tool_output = await tool.on_invoke_tool(None, arguments_json)
71116

72117
# Ensure the output is a string (as expected by OpenAI tool result content)
73118
if not isinstance(tool_output, str):
@@ -80,10 +125,10 @@ async def execute(self, tool_call_event: RealtimeEventToolCall) -> str:
80125
tool_output_str = tool_output
81126

82127
logger.info(
83-
f"Tool {tool.name} executed successfully. Output length: {len(tool_output_str)}"
128+
f"Tool {tool_name} executed successfully. Output length: {len(tool_output_str)}"
84129
)
85130
return tool_output_str
86131
except Exception as e: # pragma: no cover
87-
logger.error(f"Error executing tool '{tool.name}': {e}", exc_info=True)
132+
logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True)
88133
# Return an error string that can be sent back to the LLM
89-
return json.dumps({"error": str(e), "tool_name": tool.name})
134+
return json.dumps({"error": str(e), "tool_name": tool_name})

0 commit comments

Comments
 (0)
Please sign in to comment.