diff --git a/.gitignore b/.gitignore
index 4f186239a7..d50737ed65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -177,3 +177,4 @@ docs/
# Examples for development
examples/dev/*
+.env.local
diff --git a/README.md b/README.md
index 2a09aac241..0719d2290b 100644
--- a/README.md
+++ b/README.md
@@ -1,375 +1,30 @@
-
+# Intelligent Interruption Handling
-
-
-
-
-
+## What's added
+- Backchannel-aware interruption logic for LiveKit voice agents.
+- Configurable vocab for backchannels and interrupt words.
+- Example agent demonstrating the behavior.
-
-
+## What was the challenge
+- By default, any user speech picked up by VAD pauses the agent, so filler words ("yeah", "ok", "hmm") spoken while the agent is talking cause audible hiccups.
+- We need to ignore passive acknowledgements during agent speech but still interrupt on real commands or mixed input.
-
-[](https://pepy.tech/projects/livekit-agents)
-[](https://livekit.io/join-slack)
-[](https://twitter.com/livekit)
-[](https://deepwiki.com/livekit/agents)
-[](https://github.com/livekit/livekit/blob/master/LICENSE)
+## Why it works
+- State-aware filter (`BackchannelFilter`) plus gating in `agent_activity.py`: only pause speech when STT confirms the user said something other than a pure backchannel.
+- Backchannels while speaking are ignored; commands or mixed phrases interrupt; when silent, all input (including "yeah") is processed.
+- Fast path on interim transcripts for interrupt words; otherwise defer to final STT to avoid stutters.
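+- Vocab is configurable via `AgentSession(backchannel_words=..., interrupt_words=...)` or, when those arguments are not passed, via the comma-separated env vars `LIVEKIT_BACKCHANNEL_WORDS` / `LIVEKIT_INTERRUPT_WORDS`. Pass `backchannel_filter_enabled=False` to turn the filter off.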
-
+## How to run (console)
+1. Set the API keys for your chosen STT/TTS/LLM providers and the LiveKit credentials (`LIVEKIT_URL`, `LIVEKIT_API_KEY`, `LIVEKIT_API_SECRET`), e.g. in a `.env.local` file, which the example loads on startup.
+2. `uv run python examples/voice_agents/intelligent_interruption.py console`
+3. While the agent speaks, say "yeah/ok/uh-huh" → it keeps talking. When silent, say "yeah" → it responds. Say "stop" or "yeah wait" → it interrupts.
-Looking for the JS/TS library? Check out [AgentsJS](https://github.com/livekit/agents-js)
+## Proof video
+- Demo run (ignores "yeah" while speaking, responds when silent, interrupts on "stop"): https://drive.google.com/file/d/10UVOjNTleis-KmO4hsQBLCw8g-pjdtAR/view?usp=drivesdk
-## What is Agents?
+## How to run (dev with Agents Playground)
+1. Set env: `LIVEKIT_URL`, `LIVEKIT_API_KEY`, `LIVEKIT_API_SECRET` (and provider keys for STT/TTS/LLM in the example).
+2. Start hot reload: `uv run python examples/voice_agents/intelligent_interruption.py dev` (Ctrl+C to restart if needed).
+3. Open https://agents-playground.livekit.io, enter the same URL/key/secret and a room name; connect and talk to the agent to verify the backchannel behavior.
-
-
-The Agent Framework is designed for building realtime, programmable participants
-that run on servers. Use it to create conversational, multi-modal voice
-agents that can see, hear, and understand.
-
-
-
-## Features
-
-- **Flexible integrations**: A comprehensive ecosystem to mix and match the right STT, LLM, TTS, and Realtime API to suit your use case.
-- **Integrated job scheduling**: Built-in task scheduling and distribution with [dispatch APIs](https://docs.livekit.io/agents/build/dispatch/) to connect end users to agents.
-- **Extensive WebRTC clients**: Build client applications using LiveKit's open-source SDK ecosystem, supporting all major platforms.
-- **Telephony integration**: Works seamlessly with LiveKit's [telephony stack](https://docs.livekit.io/sip/), allowing your agent to make calls to or receive calls from phones.
-- **Exchange data with clients**: Use [RPCs](https://docs.livekit.io/home/client/data/rpc/) and other [Data APIs](https://docs.livekit.io/home/client/data/) to seamlessly exchange data with clients.
-- **Semantic turn detection**: Uses a transformer model to detect when a user is done with their turn, helps to reduce interruptions.
-- **MCP support**: Native support for MCP. Integrate tools provided by MCP servers with one loc.
-- **Builtin test framework**: Write tests and use judges to ensure your agent is performing as expected.
-- **Open-source**: Fully open-source, allowing you to run the entire stack on your own servers, including [LiveKit server](https://github.com/livekit/livekit), one of the most widely used WebRTC media servers.
-
-## Installation
-
-To install the core Agents library, along with plugins for popular model providers:
-
-```bash
-pip install "livekit-agents[openai,silero,deepgram,cartesia,turn-detector]~=1.0"
-```
-
-## Docs and guides
-
-Documentation on the framework and how to use it can be found [here](https://docs.livekit.io/agents/)
-
-## Core concepts
-
-- Agent: An LLM-based application with defined instructions.
-- AgentSession: A container for agents that manages interactions with end users.
-- entrypoint: The starting point for an interactive session, similar to a request handler in a web server.
-- Worker: The main process that coordinates job scheduling and launches agents for user sessions.
-
-## Usage
-
-### Simple voice agent
-
----
-
-```python
-from livekit.agents import (
- Agent,
- AgentSession,
- JobContext,
- RunContext,
- WorkerOptions,
- cli,
- function_tool,
-)
-from livekit.plugins import deepgram, elevenlabs, openai, silero
-
-@function_tool
-async def lookup_weather(
- context: RunContext,
- location: str,
-):
- """Used to look up weather information."""
-
- return {"weather": "sunny", "temperature": 70}
-
-
-async def entrypoint(ctx: JobContext):
- await ctx.connect()
-
- agent = Agent(
- instructions="You are a friendly voice assistant built by LiveKit.",
- tools=[lookup_weather],
- )
- session = AgentSession(
- vad=silero.VAD.load(),
- # any combination of STT, LLM, TTS, or realtime API can be used
- stt=deepgram.STT(model="nova-3"),
- llm=openai.LLM(model="gpt-4o-mini"),
- tts=elevenlabs.TTS(),
- )
-
- await session.start(agent=agent, room=ctx.room)
- await session.generate_reply(instructions="greet the user and ask about their day")
-
-
-if __name__ == "__main__":
- cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
-```
-
-You'll need the following environment variables for this example:
-
-- DEEPGRAM_API_KEY
-- OPENAI_API_KEY
-- ELEVEN_API_KEY
-
-### Multi-agent handoff
-
----
-
-This code snippet is abbreviated. For the full example, see [multi_agent.py](examples/voice_agents/multi_agent.py)
-
-```python
-...
-class IntroAgent(Agent):
- def __init__(self) -> None:
- super().__init__(
- instructions=f"You are a story teller. Your goal is to gather a few pieces of information from the user to make the story personalized and engaging."
- "Ask the user for their name and where they are from"
- )
-
- async def on_enter(self):
- self.session.generate_reply(instructions="greet the user and gather information")
-
- @function_tool
- async def information_gathered(
- self,
- context: RunContext,
- name: str,
- location: str,
- ):
- """Called when the user has provided the information needed to make the story personalized and engaging.
-
- Args:
- name: The name of the user
- location: The location of the user
- """
-
- context.userdata.name = name
- context.userdata.location = location
-
- story_agent = StoryAgent(name, location)
- return story_agent, "Let's start the story!"
-
-
-class StoryAgent(Agent):
- def __init__(self, name: str, location: str) -> None:
- super().__init__(
- instructions=f"You are a storyteller. Use the user's information in order to make the story personalized."
- f"The user's name is {name}, from {location}"
- # override the default model, switching to Realtime API from standard LLMs
- llm=openai.realtime.RealtimeModel(voice="echo"),
- chat_ctx=chat_ctx,
- )
-
- async def on_enter(self):
- self.session.generate_reply()
-
-
-async def entrypoint(ctx: JobContext):
- await ctx.connect()
-
- userdata = StoryData()
- session = AgentSession[StoryData](
- vad=silero.VAD.load(),
- stt=deepgram.STT(model="nova-3"),
- llm=openai.LLM(model="gpt-4o-mini"),
- tts=openai.TTS(voice="echo"),
- userdata=userdata,
- )
-
- await session.start(
- agent=IntroAgent(),
- room=ctx.room,
- )
-...
-```
-
-### Testing
-
-Automated tests are essential for building reliable agents, especially with the non-deterministic behavior of LLMs. LiveKit Agents include native test integration to help you create dependable agents.
-
-```python
-@pytest.mark.asyncio
-async def test_no_availability() -> None:
- llm = google.LLM()
- async AgentSession(llm=llm) as sess:
- await sess.start(MyAgent())
- result = await sess.run(
- user_input="Hello, I need to place an order."
- )
- result.expect.skip_next_event_if(type="message", role="assistant")
- result.expect.next_event().is_function_call(name="start_order")
- result.expect.next_event().is_function_call_output()
- await (
- result.expect.next_event()
- .is_message(role="assistant")
- .judge(llm, intent="assistant should be asking the user what they would like")
- )
-
-```
-
-## Examples
-
-
-
-
-
🎙️ Starter Agent
-
A starter agent optimized for voice conversations.
-
-## Running your agent
-
-### Testing in terminal
-
-```shell
-python myagent.py console
-```
-
-Runs your agent in terminal mode, enabling local audio input and output for testing.
-This mode doesn't require external servers or dependencies and is useful for quickly validating behavior.
-
-### Developing with LiveKit clients
-
-```shell
-python myagent.py dev
-```
-
-Starts the agent server and enables hot reloading when files change. This mode allows each process to host multiple concurrent agents efficiently.
-
-The agent connects to LiveKit Cloud or your self-hosted server. Set the following environment variables:
-- LIVEKIT_URL
-- LIVEKIT_API_KEY
-- LIVEKIT_API_SECRET
-
-You can connect using any LiveKit client SDK or telephony integration.
-To get started quickly, try the [Agents Playground](https://agents-playground.livekit.io/).
-
-### Running for production
-
-```shell
-python myagent.py start
-```
-
-Runs the agent with production-ready optimizations.
-
-## Contributing
-
-The Agents framework is under active development in a rapidly evolving field. We welcome and appreciate contributions of any kind, be it feedback, bugfixes, features, new plugins and tools, or better documentation. You can file issues under this repo, open a PR, or chat with us in LiveKit's [Slack community](https://livekit.io/join-slack).
-
-
-
-
diff --git a/examples/.env.example b/examples/.env.example
deleted file mode 100644
index d71e0e2b12..0000000000
--- a/examples/.env.example
+++ /dev/null
@@ -1,3 +0,0 @@
-LIVEKIT_API_SECRET=""
-LIVEKIT_API_KEY=""
-LIVEKIT_URL=""
\ No newline at end of file
diff --git a/examples/voice_agents/intelligent_interruption.py b/examples/voice_agents/intelligent_interruption.py
new file mode 100644
index 0000000000..ef2e02f472
--- /dev/null
+++ b/examples/voice_agents/intelligent_interruption.py
@@ -0,0 +1,98 @@
+"""
+Example: Intelligent Interruption Handling with Backchannel Filtering
+
+This example demonstrates the backchannel filter feature that allows the agent to:
+1. Continue speaking when user says backchannel words ("yeah", "ok", "hmm")
+2. Stop immediately when user says interrupt commands ("stop", "wait", "no")
+3. Handle mixed input ("yeah wait") as an interruption
+4. Respond to backchannels when the agent is silent
+
+Usage:
+ uv run python examples/voice_agents/intelligent_interruption.py console
+"""
+
+import logging
+
+from dotenv import load_dotenv
+
+from livekit.agents import (
+ Agent,
+ AgentServer,
+ AgentSession,
+ JobContext,
+ JobProcess,
+ cli,
+ room_io,
+)
+from livekit.agents.voice import DEFAULT_BACKCHANNEL_WORDS, DEFAULT_INTERRUPT_WORDS
+from livekit.plugins import silero
+
+logger = logging.getLogger("intelligent-interruption")
+
+# Load environment from .env.local
+load_dotenv(".env.local")
+
+
+class StorytellerAgent(Agent):
+ """An agent that tells stories and demonstrates intelligent interruption handling."""
+
+ def __init__(self) -> None:
+ super().__init__(
+ instructions="""You are a storyteller assistant. When asked to tell a story,
+ provide an engaging, moderately long story (about 3-5 paragraphs).
+
+ Important behavior:
+ - If the user says "yeah", "ok", "hmm" while you're speaking, continue your story.
+ - If the user says "stop", "wait", or "actually", stop and listen.
+ - If the user is silent and says "yeah" or "ok", ask what they'd like to hear about.
+
+ Keep your responses without emojis or markdown formatting since this is voice-only.
+ """
+ )
+
+ async def on_enter(self) -> None:
+ """Called when the agent joins the session."""
+ self.session.generate_reply(
+ instructions="Greet the user briefly and offer to tell them a story."
+ )
+
+
+server = AgentServer()
+
+
+def prewarm(proc: JobProcess):
+ proc.userdata["vad"] = silero.VAD.load()
+
+
+server.setup_fnc = prewarm
+
+
+@server.rtc_session()
+async def entrypoint(ctx: JobContext):
+ """Main entry point for the agent."""
+ # Create agent session with backchannel filtering enabled (default)
+ session = AgentSession(
+ stt="deepgram/nova-3",
+ llm="openai/gpt-4.1-mini",
+ tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
+ vad=ctx.proc.userdata["vad"],
+ # Backchannel filtering is enabled by default; override lists via
+ # LIVEKIT_BACKCHANNEL_WORDS / LIVEKIT_INTERRUPT_WORDS if needed
+ # Optional: customize backchannel words (uses defaults if None)
+ # backchannel_words=frozenset({"yeah", "yep", "ok", "okay", "hmm"}),
+ # Optional: customize interrupt words (uses defaults if None)
+ # interrupt_words=frozenset({"stop", "wait", "no", "pause"}),
+ # Enable false interruption handling
+ resume_false_interruption=True,
+ false_interruption_timeout=1.5,
+ )
+
+ await session.start(
+ agent=StorytellerAgent(),
+ room=ctx.room,
+ room_options=room_io.RoomOptions(),
+ )
+
+
+if __name__ == "__main__":
+ cli.run_app(server)
diff --git a/livekit-agents/livekit/agents/__init__.py b/livekit-agents/livekit/agents/__init__.py
index 5ff089e3de..8e609c319a 100644
--- a/livekit-agents/livekit/agents/__init__.py
+++ b/livekit-agents/livekit/agents/__init__.py
@@ -63,9 +63,13 @@
AgentSession,
AgentStateChangedEvent,
AgentTask,
+ BackchannelFilter,
+ BackchannelFilterOptions,
CloseEvent,
CloseReason,
ConversationItemAddedEvent,
+ DEFAULT_BACKCHANNEL_WORDS,
+ DEFAULT_INTERRUPT_WORDS,
ErrorEvent,
FunctionToolsExecutedEvent,
MetricsCollectedEvent,
@@ -157,6 +161,10 @@ def __getattr__(name: str) -> typing.Any:
"ModelSettings",
"Agent",
"AgentTask",
+ "BackchannelFilter",
+ "BackchannelFilterOptions",
+ "DEFAULT_BACKCHANNEL_WORDS",
+ "DEFAULT_INTERRUPT_WORDS",
"AssignmentTimeoutError",
"APIConnectionError",
"APIError",
diff --git a/livekit-agents/livekit/agents/voice/__init__.py b/livekit-agents/livekit/agents/voice/__init__.py
index cfcc9ca5db..e52c9a4c6e 100644
--- a/livekit-agents/livekit/agents/voice/__init__.py
+++ b/livekit-agents/livekit/agents/voice/__init__.py
@@ -1,6 +1,12 @@
from . import io, run_result
from .agent import Agent, AgentTask, ModelSettings
from .agent_session import AgentSession, VoiceActivityVideoSampler
+from .backchannel_filter import (
+ BackchannelFilter,
+ BackchannelFilterOptions,
+ DEFAULT_BACKCHANNEL_WORDS,
+ DEFAULT_INTERRUPT_WORDS,
+)
from .events import (
AgentEvent,
AgentFalseInterruptionEvent,
@@ -45,6 +51,10 @@
"FunctionToolsExecutedEvent",
"AgentFalseInterruptionEvent",
"TranscriptSynchronizer",
+ "BackchannelFilter",
+ "BackchannelFilterOptions",
+ "DEFAULT_BACKCHANNEL_WORDS",
+ "DEFAULT_INTERRUPT_WORDS",
"io",
"room_io",
"run_result",
diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py
index 0c3f7c743d..5d11cd82d2 100644
--- a/livekit-agents/livekit/agents/voice/agent_activity.py
+++ b/livekit-agents/livekit/agents/voice/agent_activity.py
@@ -53,6 +53,7 @@
_EndOfTurnInfo,
_PreemptiveGenerationInfo,
)
+from .backchannel_filter import BackchannelFilter
from .events import (
AgentFalseInterruptionEvent,
ErrorEvent,
@@ -125,6 +126,7 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:
self._paused_speech: SpeechHandle | None = None
self._false_interruption_timer: asyncio.TimerHandle | None = None
self._interrupt_paused_speech_task: asyncio.Task[None] | None = None
+ self._pending_backchannel_check: bool = False
# fired when a speech_task finishes or when a new speech_handle is scheduled
# this is used to wake up the main task when the scheduling state changes
@@ -142,6 +144,15 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:
self._on_enter_task: asyncio.Task | None = None
self._on_exit_task: asyncio.Task | None = None
+ # Initialize backchannel filter for intelligent interruption handling
+ bcf_opts = sess.options.backchannel_filter
+ self._backchannel_filter: BackchannelFilter | None = None
+ if bcf_opts.enabled:
+ self._backchannel_filter = BackchannelFilter(
+ backchannel_words=bcf_opts.backchannel_words,
+ interrupt_words=bcf_opts.interrupt_words,
+ )
+
if (
isinstance(self.llm, llm.RealtimeModel)
and self.llm.capabilities.turn_detection
@@ -1174,6 +1185,12 @@ def _interrupt_by_audio_activity(self) -> None:
# ignore if realtime model has turn detection enabled
return
+ # If we're waiting to confirm whether the utterance is just a backchannel,
+ # do not pause/interrupt yet.
+ if self._pending_backchannel_check and self._session.agent_state == "speaking":
+ logger.debug("backchannel pending, suppressing interrupt-by-audio")
+ return
+
if (
self.stt is not None
and opt.min_interruption_words > 0
@@ -1185,6 +1202,37 @@ def _interrupt_by_audio_activity(self) -> None:
if len(split_words(text, split_character=True)) < opt.min_interruption_words:
return
+ # Check backchannel filter: skip interrupt if it's a pure backchannel while agent is speaking
+ # Only apply the filter if we have an actual transcript - empty transcript means VAD triggered
+ # before STT, so we should let the normal flow continue
+ if (
+ self._backchannel_filter is not None
+ and self._audio_recognition is not None
+ and self._current_speech is not None
+ and not self._current_speech.interrupted
+ ):
+ text = self._audio_recognition.current_transcript
+ agent_speaking = self._session.agent_state == "speaking"
+
+ # If agent is speaking and we don't have text yet, defer until STT tells us more.
+ if agent_speaking and (not text or not text.strip()):
+ # mark pending and bail without pausing audio
+ self._pending_backchannel_check = True
+ logger.debug("backchannel pending, waiting for transcript before interrupting")
+ return
+
+ # Only filter if we have transcript content to analyze
+ if text and text.strip():
+ if not self._backchannel_filter.should_interrupt(text, agent_speaking=agent_speaking):
+ logger.debug(
+ "backchannel filtered, ignoring interruption",
+ extra={"transcript": text, "agent_speaking": agent_speaking},
+ )
+ self._pending_backchannel_check = False
+ return
+ # It should interrupt; clear pending gate and continue to interrupt below
+ self._pending_backchannel_check = False
+
if self._rt_session is not None:
self._rt_session.start_user_activity()
@@ -1214,6 +1262,9 @@ def _interrupt_by_audio_activity(self) -> None:
def on_start_of_speech(self, ev: vad.VADEvent | None) -> None:
self._session._update_user_state("speaking")
+ # reset pending gate for a fresh utterance
+ self._pending_backchannel_check = False
+
if self._false_interruption_timer:
# cancel the timer when user starts speaking but leave the paused state unchanged
self._false_interruption_timer.cancel()
@@ -1228,6 +1279,9 @@ def on_end_of_speech(self, ev: vad.VADEvent | None) -> None:
last_speaking_time=speech_end_time,
)
+ # clear any pending backchannel gate when user stops
+ self._pending_backchannel_check = False
+
if (
self._paused_speech
and (timeout := self._session.options.false_interruption_timeout) is not None
@@ -1241,6 +1295,36 @@ def on_vad_inference_done(self, ev: vad.VADEvent) -> None:
return
if ev.speech_duration >= self._session.options.min_interruption_duration:
+ # When backchannel filter is enabled and agent is speaking,
+ # don't interrupt on VAD alone - wait for STT to confirm what was said.
+ # Exception: If we already have an interrupt word, interrupt immediately (fast path).
+ if (
+ self._backchannel_filter is not None
+ and self._current_speech is not None
+ and not self._current_speech.interrupted
+ and self._current_speech.allow_interruptions
+ ):
+ # Get current transcript from STT (may be partial or empty)
+ transcript = ""
+ if self._audio_recognition is not None:
+ transcript = self._audio_recognition.current_transcript or ""
+
+ # Fast path: If we have a confirmed interrupt word, interrupt immediately
+ if transcript.strip() and self._backchannel_filter.contains_interrupt_word(transcript):
+ logger.debug(
+ "interrupt word detected in VAD, interrupting immediately",
+ extra={"transcript": transcript},
+ )
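+                    # Fall through to interrupt.
+                    # NOTE(review): the gate is not cleared here. If an earlier VAD pass set
+                    # _pending_backchannel_check, _interrupt_by_audio_activity() returns early
+                    # while agent_state == "speaking", so this fast path may be suppressed.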
+ else:
+ # No interrupt word yet - defer to STT; keep gate and continue playing
+ self._pending_backchannel_check = True
+ logger.debug(
+ "VAD deferring to STT for backchannel decision",
+ extra={"transcript": transcript, "speech_duration": ev.speech_duration},
+ )
+ return
+
self._interrupt_by_audio_activity()
def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) -> None:
@@ -1257,10 +1341,36 @@ def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) -
),
)
- if ev.alternatives[0].text and self._turn_detection not in (
+ transcript = ev.alternatives[0].text
+ if transcript and self._turn_detection not in (
"manual",
"realtime_llm",
):
+ # When backchannel filter is enabled and agent is speaking,
+ # only interrupt early if we detect an interrupt word (fast path for "stop").
+ # Everything else defers to final transcript for the backchannel decision.
+ if (
+ self._backchannel_filter is not None
+ and self._current_speech is not None
+ and not self._current_speech.interrupted
+ and self._session.agent_state == "speaking"
+ ):
+ # Fast path: If we have an interrupt word, interrupt immediately
+ if self._backchannel_filter.contains_interrupt_word(transcript):
+ logger.debug(
+ "interrupt word detected in interim, interrupting",
+ extra={"transcript": transcript},
+ )
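+                    # Fall through to interrupt.
+                    # NOTE(review): the gate is not cleared here. If VAD or an earlier interim
+                    # set _pending_backchannel_check, _interrupt_by_audio_activity() returns
+                    # early while agent_state == "speaking", so this fast path may be suppressed.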
+ else:
+ # No interrupt word - defer to final transcript, keep gate if pending
+ self._pending_backchannel_check = True
+ logger.debug(
+ "interim deferring to final transcript for backchannel decision",
+ extra={"transcript": transcript},
+ )
+ return
+
self._interrupt_by_audio_activity()
if (
@@ -1284,6 +1394,39 @@ def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = No
speaker_id=ev.alternatives[0].speaker_id,
),
)
+
+ transcript = ev.alternatives[0].text
+
+ # When backchannel filter is enabled and agent is speaking,
+ # this is the decision point:
+ # - Pure backchannel → IGNORE (continue speaking)
+ # - NOT a backchannel → INTERRUPT (stop and listen to user)
+ if (
+ self._backchannel_filter is not None
+ and self._current_speech is not None
+ and not self._current_speech.interrupted
+ and transcript
+ and transcript.strip()
+ and self._session.agent_state == "speaking"
+ ):
+ # Decision: Is this a pure backchannel (like "yeah", "ok", "hmm")?
+ if self._backchannel_filter.is_pure_backchannel(transcript):
+ # Pure backchannel → IGNORE, continue speaking
+ logger.debug(
+ "pure backchannel confirmed, continuing speech",
+ extra={"transcript": transcript},
+ )
+ self._pending_backchannel_check = False
+ return # Ignore this input, don't interrupt
+ else:
+ # NOT a backchannel → INTERRUPT (could be real content or interrupt word)
+ logger.debug(
+ "non-backchannel content detected, stopping to listen",
+ extra={"transcript": transcript},
+ )
+ self._pending_backchannel_check = False
+ # Fall through to interrupt
+
# agent speech might not be interrupted if VAD failed and a final transcript is received
# we call _interrupt_by_audio_activity (idempotent) to pause the speech, if possible
# which will also be immediately interrupted
@@ -1345,6 +1488,22 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool:
# IMPORTANT: This method is sync to avoid it being cancelled by the AudioRecognition
# We explicitly create a new task here
+ # If agent is speaking and this is just a backchannel, skip committing/interrupting
+ if (
+ self._backchannel_filter is not None
+ and self._session.agent_state == "speaking"
+ and info.new_transcript
+ and info.new_transcript.strip()
+ and self._backchannel_filter.is_pure_backchannel(info.new_transcript)
+ ):
+ logger.debug(
+ "end_of_turn ignored (pure backchannel while agent speaking)",
+ extra={"transcript": info.new_transcript},
+ )
+ self._pending_backchannel_check = False
+ self._cancel_preemptive_generation()
+ return False
+
if self._scheduling_paused:
self._cancel_preemptive_generation()
logger.warning(
@@ -1377,8 +1536,12 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool:
):
self._cancel_preemptive_generation()
# avoid interruption if the new_transcript is too short
+ self._pending_backchannel_check = False
return False
+ # If a backchannel was pending and we ended up here, clear the gate now
+ self._pending_backchannel_check = False
+
old_task = self._user_turn_completed_atask
self._user_turn_completed_atask = self._create_speech_task(
self._user_turn_completed_task(old_task, info),
diff --git a/livekit-agents/livekit/agents/voice/agent_session.py b/livekit-agents/livekit/agents/voice/agent_session.py
index 628718a6b2..447af9a5b9 100644
--- a/livekit-agents/livekit/agents/voice/agent_session.py
+++ b/livekit-agents/livekit/agents/voice/agent_session.py
@@ -2,6 +2,7 @@
import asyncio
import copy
+import os
import time
from collections.abc import AsyncIterable, Sequence
from contextlib import AbstractContextManager, nullcontext
@@ -52,6 +53,7 @@
UserState,
UserStateChangedEvent,
)
+from .backchannel_filter import BackchannelFilter, BackchannelFilterOptions
from .ivr import IVRActivity
from .recorder_io import RecorderIO
from .run_result import RunResult
@@ -89,6 +91,7 @@ class AgentSessionOptions:
preemptive_generation: bool
tts_text_transforms: Sequence[TextTransforms] | None
ivr_detection: bool
+ backchannel_filter: BackchannelFilterOptions
Userdata_T = TypeVar("Userdata_T")
@@ -159,6 +162,9 @@ def __init__(
tts_text_transforms: NotGivenOr[Sequence[TextTransforms] | None] = NOT_GIVEN,
preemptive_generation: bool = False,
ivr_detection: bool = False,
+ backchannel_filter_enabled: bool = True,
+ backchannel_words: frozenset[str] | None = None,
+ interrupt_words: frozenset[str] | None = None,
conn_options: NotGivenOr[SessionConnectOptions] = NOT_GIVEN,
loop: asyncio.AbstractEventLoop | None = None,
# deprecated
@@ -245,6 +251,16 @@ def __init__(
Defaults to ``False``.
ivr_detection (bool): Whether to detect if the agent is interacting with an IVR system.
Default ``False``.
+ backchannel_filter_enabled (bool): Whether to filter out backchannel words
+ (like "yeah", "ok", "hmm") when the agent is speaking. Enabled by
+ default so the agent keeps talking through passive acknowledgements.
+ backchannel_words (frozenset[str], optional): Custom set of words to treat as
+ backchannels. When ``None``, uses default list including "yeah", "ok", "hmm", etc.
+ interrupt_words (frozenset[str], optional): Custom set of words that force
+ interruption even if mixed with backchannels (e.g., "yeah but wait").
+ When ``None``, uses default list including "stop", "wait", "no", etc.
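+                When the corresponding argument is ``None``, both lists can instead be
+                supplied through the ``LIVEKIT_BACKCHANNEL_WORDS`` and
+                ``LIVEKIT_INTERRUPT_WORDS`` environment variables (comma separated);
+                explicitly passed arguments take precedence.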
conn_options (SessionConnectOptions, optional): Connection options for
stt, llm, and tts.
loop (asyncio.AbstractEventLoop, optional): Event loop to bind the
@@ -264,6 +280,21 @@ def __init__(
self._video_sampler = video_sampler
+ # Allow env vars to tweak backchannel/interrupt vocab without code edits
+ if backchannel_words is None:
+ env_words = os.getenv("LIVEKIT_BACKCHANNEL_WORDS")
+ if env_words:
+ backchannel_words = frozenset(
+ w.strip().lower() for w in env_words.split(",") if w.strip()
+ )
+
+ if interrupt_words is None:
+ env_words = os.getenv("LIVEKIT_INTERRUPT_WORDS")
+ if env_words:
+ interrupt_words = frozenset(
+ w.strip().lower() for w in env_words.split(",") if w.strip()
+ )
+
# This is the "global" chat_context, it holds the entire conversation history
self._chat_ctx = ChatContext.empty()
self._opts = AgentSessionOptions(
@@ -288,6 +319,11 @@ def __init__(
use_tts_aligned_transcript=use_tts_aligned_transcript
if is_given(use_tts_aligned_transcript)
else None,
+ backchannel_filter=BackchannelFilterOptions(
+ enabled=backchannel_filter_enabled,
+ backchannel_words=backchannel_words,
+ interrupt_words=interrupt_words,
+ ),
)
self._conn_options = conn_options or SessionConnectOptions()
self._started = False
diff --git a/livekit-agents/livekit/agents/voice/backchannel_filter.py b/livekit-agents/livekit/agents/voice/backchannel_filter.py
new file mode 100644
index 0000000000..2ed5f1cacd
--- /dev/null
+++ b/livekit-agents/livekit/agents/voice/backchannel_filter.py
@@ -0,0 +1,286 @@
+"""
+Backchannel filter for intelligent interruption handling.
+
+This module provides context-aware filtering of user inputs to distinguish
+between passive acknowledgements (backchannels like "yeah", "ok", "hmm") and
+active interruptions ("stop", "wait", "no").
+
+The filter is state-aware:
+- When agent is speaking: backchannels are ignored, commands trigger interrupts
+- When agent is silent: all inputs (including backchannels) are processed normally
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Sequence
+
+# Default backchannel vocabulary (lowercase): ignored while the agent speaks unless overridden
+DEFAULT_BACKCHANNEL_WORDS: frozenset[str] = frozenset({
+ # Affirmatives
+ "yeah",
+ "yep",
+ "yes",
+ "ok",
+ "okay",
+ "aha",
+ "yup",
+ "sure",
+ "right",
+ "alright",
+ # Partial words, fillers and hesitations
+ "ye",
+ "yea",
+ "oh",
+ "ah",
+ "uh",
+ "um",
+ "uhm",
+ "uhh",
+ "er",
+ # Sounds/vocalizations, listed in several spellings (hyphenated, joined, spaced)
+ "hmm",
+ "mhm",
+ "mhmm",
+ "uh-huh",
+ "uhuh",
+ "uh huh",
+ "mm",
+ "mm-hmm",
+ "mmhmm",
+ "hm",
+ "oops",
+ # Acknowledgements (includes multi-word phrases)
+ "got it",
+ "i see",
+ "interesting",
+ # Encouragements for the agent to continue speaking
+ "go on",
+ "go ahead",
+ "continue",
+ "keep going",
+})
+
+# Default interrupt vocabulary (lowercase): any match interrupts, even amid backchannels
+DEFAULT_INTERRUPT_WORDS: frozenset[str] = frozenset({
+ "stop",
+ "wait",
+ "hold on",
+ "hold",  # the bare word is listed in addition to the phrase "hold on"
+ "no",
+ "pause",
+ "actually",
+ "but",
+ "however",
+ "hang on",
+ "one second",
+ "second",  # the bare word is listed in addition to the phrase "one second"
+ "minute",
+ "question",
+ "what",  # question words from here to the end of the set
+ "why",
+ "how",
+ "when",
+ "where",
+ "who",
+})
+
+
+@dataclass
+class BackchannelFilterOptions:
+    """Configuration bundle for backchannel filtering.
+
+    Attributes:
+        enabled: Turns backchannel filtering on or off (defaults to ``True``).
+        backchannel_words: Words treated as backchannels; ``None`` selects the defaults.
+        interrupt_words: Words that force an interruption; ``None`` selects the defaults.
+    """
+    enabled: bool = True
+    backchannel_words: frozenset[str] | None = None
+    interrupt_words: frozenset[str] | None = None
+
+    def __post_init__(self) -> None:
+        # Swap each unset (``None``) vocabulary for the module-level default set.
+        bc_words, int_words = self.backchannel_words, self.interrupt_words
+        self.backchannel_words = DEFAULT_BACKCHANNEL_WORDS if bc_words is None else bc_words
+        self.interrupt_words = DEFAULT_INTERRUPT_WORDS if int_words is None else int_words
+
+
+class BackchannelFilter:
+    """
+    Filters user inputs to distinguish between backchannels and interruptions.
+
+    This filter implements state-based logic:
+    - When the agent is speaking, pure backchannels are ignored
+    - When the agent is silent, all inputs are treated as valid
+    - Interrupt words/phrases always interrupt, even mixed with backchannels
+
+    Matching is case-insensitive, ignores punctuation and works on whole words;
+    multi-word vocabulary entries ("hold on", "got it") match as phrases.
+
+    Example usage:
+        bc_filter = BackchannelFilter()
+
+        # Agent speaking, user says "yeah"
+        bc_filter.should_interrupt("yeah", agent_speaking=True)  # Returns False
+
+        # Agent speaking, user says "stop"
+        bc_filter.should_interrupt("stop", agent_speaking=True)  # Returns True
+
+        # Agent silent, user says "yeah"
+        bc_filter.should_interrupt("yeah", agent_speaking=False)  # Returns True
+    """
+
+    def __init__(
+        self,
+        *,
+        backchannel_words: frozenset[str] | Sequence[str] | None = None,
+        interrupt_words: frozenset[str] | Sequence[str] | None = None,
+    ) -> None:
+        """Initialize the BackchannelFilter.
+
+        Custom vocabularies are normalized the same way as user input, whatever
+        container they arrive in, so an entry such as "Hold On" still matches.
+
+        Args:
+            backchannel_words: Custom backchannel words/phrases. Defaults to
+                DEFAULT_BACKCHANNEL_WORDS if None.
+            interrupt_words: Custom interrupt words/phrases. Defaults to
+                DEFAULT_INTERRUPT_WORDS if None.
+        """
+        if backchannel_words is None:
+            self._backchannel_words = DEFAULT_BACKCHANNEL_WORDS
+        else:
+            self._backchannel_words = self._normalize_vocab(backchannel_words)
+
+        if interrupt_words is None:
+            self._interrupt_words = DEFAULT_INTERRUPT_WORDS
+        else:
+            self._interrupt_words = self._normalize_vocab(interrupt_words)
+        # Longest backchannel entry, in words; bounds the phrase search.
+        self._max_phrase_len = max((w.count(" ") + 1 for w in self._backchannel_words), default=1)
+
+    def _normalize_vocab(self, words: frozenset[str] | Sequence[str]) -> frozenset[str]:
+        """Normalize vocabulary entries like user input, dropping empty results."""
+        # A bare string is one entry; iterating it would yield single characters.
+        entries = (words,) if isinstance(words, str) else words
+        normalized = (self._normalize_text(entry) for entry in entries)
+        return frozenset(entry for entry in normalized if entry)
+
+    @property
+    def backchannel_words(self) -> frozenset[str]:
+        """Get the set of backchannel words."""
+        return self._backchannel_words
+
+    @property
+    def interrupt_words(self) -> frozenset[str]:
+        """Get the set of interrupt words."""
+        return self._interrupt_words
+
+    def _normalize_text(self, text: str) -> str:
+        """Lowercase text, strip punctuation and collapse whitespace for matching."""
+        # Separating punctuation becomes a space so "yeah,ok" is not fused into
+        # "yeahok"; lone periods are excluded so "O.K." still collapses to "ok".
+        spaced = re.sub(r"[,;:!?\u2026\u2013\u2014]|\.{2,}", " ", text)
+        # Other non-word characters are deleted, keeping "don't" whole ("dont").
+        cleaned = re.sub(r"[^\w\s\-]", "", spaced).lower()
+        # Hyphens only matter inside a word ("uh-huh"); dangling ones are dropped.
+        tokens = (token.strip("-") for token in cleaned.split())
+        return " ".join(token for token in tokens if token)
+
+    def _split_words(self, text: str) -> list[str]:
+        """Split text into individual normalized words."""
+        return self._normalize_text(text).split()
+
+    def contains_interrupt_word(self, text: str) -> bool:
+        """Check if the text contains any interrupt word or phrase.
+
+        Both single words and multi-word phrases like "hold on" are matched on
+        whole-word boundaries, so "hold on" does not fire inside "threshold on".
+
+        Args:
+            text: The user input text.
+
+        Returns:
+            True if the text contains an interrupt word/phrase.
+        """
+        # Padding with spaces turns a substring test into a whole-word test.
+        padded = f" {self._normalize_text(text)} "
+        return any(f" {entry} " in padded for entry in self._interrupt_words)
+
+    def is_pure_backchannel(self, text: str) -> bool:
+        """Check if the text consists entirely of backchannel words/phrases.
+
+        Args:
+            text: The user input text.
+
+        Returns:
+            True if every word belongs to a backchannel entry and the text
+            contains no interrupt word. False for text without any words.
+        """
+        words = self._split_words(text)
+        if not words or self.contains_interrupt_word(text):
+            return False
+
+        # tiled[i] is True when words[:i] splits exactly into backchannel entries;
+        # trying every entry length lets phrases mix with words ("yeah got it").
+        tiled = [True] + [False] * len(words)
+        for end in range(1, len(words) + 1):
+            first = max(0, end - self._max_phrase_len)
+            tiled[end] = any(
+                tiled[start] and " ".join(words[start:end]) in self._backchannel_words
+                for start in range(first, end)
+            )
+        return tiled[-1]
+
+    def should_interrupt(self, text: str, *, agent_speaking: bool) -> bool:
+        """Determine if the given text should trigger an interruption.
+
+        The logic follows this matrix:
+
+        | User Input      | Agent Speaking | Result                |
+        |-----------------|----------------|-----------------------|
+        | Backchannel     | Yes            | No interrupt (ignore) |
+        | No words at all | Yes            | No interrupt (ignore) |
+        | Interrupt word  | Yes            | Interrupt             |
+        | Mixed input     | Yes            | Interrupt             |
+        | Any input       | No             | Process (return True) |
+
+        Args:
+            text: The user input text (from STT).
+            agent_speaking: Whether the agent is currently speaking.
+
+        Returns:
+            True if the input should trigger an interrupt/be processed.
+            False if the input should be ignored (blank text, or a pure
+            backchannel / punctuation-only text while the agent is speaking).
+        """
+        if not text or not text.strip():
+            return False
+
+        # When agent is not speaking, always process all inputs
+        if not agent_speaking:
+            return True
+
+        # Text with no words (e.g. "...") carries nothing worth interrupting for.
+        if not self._normalize_text(text):
+            return False
+
+        # is_pure_backchannel already rejects anything containing an interrupt
+        # word, so every remaining non-backchannel input interrupts.
+        return not self.is_pure_backchannel(text)
+
+    def is_valid_input_when_silent(self, text: str) -> bool:
+        """Check if text is valid user input when agent is silent.
+
+        When the agent is silent, all inputs including backchannels
+        should be treated as valid user input.
+
+        Args:
+            text: The user input text.
+
+        Returns:
+            True if there is any content to process.
+        """
+        return bool(text and text.strip())