diff --git a/litellm/proxy/anthropic_endpoints/endpoints.py b/litellm/proxy/anthropic_endpoints/endpoints.py index 900386f3d7b..1717e7d88b5 100644 --- a/litellm/proxy/anthropic_endpoints/endpoints.py +++ b/litellm/proxy/anthropic_endpoints/endpoints.py @@ -23,6 +23,36 @@ router = APIRouter() +def _get_blocked_response_usage( + model: str, + messages: Optional[list], + block_message: str, +) -> dict: + """ + Compute token usage for a synthetic guardrail-blocked response. + + The original request still consumed input tokens and the synthetic block + message has real content, so both counts are computed rather than reported + as zero. Token counting is best-effort: any failure falls back to 0 so a + blocked response is always returned to the caller. + """ + import litellm + + input_tokens = 0 + output_tokens = 0 + try: + if messages: + input_tokens = litellm.token_counter(model=model, messages=messages) + if block_message: + output_tokens = litellm.token_counter(model=model, text=block_message) + except Exception as token_count_error: + verbose_proxy_logger.debug( + "Failed to count tokens for blocked response: %s", token_count_error + ) + + return {"input_tokens": input_tokens, "output_tokens": output_tokens} + + @router.post( "/v1/messages", tags=["[beta] Anthropic `/v1/messages`"], @@ -87,6 +117,16 @@ async def anthropic_response( # noqa: PLR0915 from litellm.types.utils import AnthropicMessagesResponse + # The original request was processed upstream and the block message is + # synthesized here, so both token counts are reachable: input_tokens + # from the original request messages (carried on the exception) and + # output_tokens from the block message text. + _usage = _get_blocked_response_usage( + model=e.model, + messages=_data.get("messages"), + block_message=e.message, + ) + _anthropic_response = AnthropicMessagesResponse( id=f"msg_{str(uuid.uuid4())}", type="message", @@ -94,7 +134,7 @@ async def anthropic_response( # noqa: PLR0915 content=[{"type": "text", "text": e.message}], model=e.model, stop_reason="end_turn", - usage={"input_tokens": 0, "output_tokens": 0}, + usage=_usage, ) if data.get("stream", None) is not None and data["stream"] is True: diff --git a/tests/test_litellm/proxy/anthropic_endpoints/test_endpoints.py b/tests/test_litellm/proxy/anthropic_endpoints/test_endpoints.py index f6189382d74..9772a257c5d 100644 --- a/tests/test_litellm/proxy/anthropic_endpoints/test_endpoints.py +++ b/tests/test_litellm/proxy/anthropic_endpoints/test_endpoints.py @@ -69,6 +69,60 @@ async def test_async_data_generator_anthropic_dict_handling(self, mock_safe_dump ) # Called twice, once for each dict object +class TestBlockedResponseUsage: + """Token usage on synthetic guardrail-blocked responses.""" + + def test_blocked_response_reports_nonzero_token_counts(self): + from litellm.proxy.anthropic_endpoints.endpoints import ( + _get_blocked_response_usage, + ) + + usage = _get_blocked_response_usage( + model="claude-3-5-sonnet-20240620", + messages=[{"role": "user", "content": "Tell me something harmful"}], + block_message="This request was blocked by a content guardrail.", + ) + + assert usage["input_tokens"] > 0 + assert usage["output_tokens"] > 0 + + def test_blocked_response_output_tokens_match_block_message(self): + import litellm + from litellm.proxy.anthropic_endpoints.endpoints import ( + _get_blocked_response_usage, + ) + + model = "claude-3-5-sonnet-20240620" + block_message = "Blocked: this content violates policy." + + usage = _get_blocked_response_usage( + model=model, + messages=[{"role": "user", "content": "hi"}], + block_message=block_message, + ) + + expected_output = litellm.token_counter(model=model, text=block_message) + assert usage["output_tokens"] == expected_output + + def test_blocked_response_usage_falls_back_to_zero_on_error(self): + from unittest.mock import patch + + from litellm.proxy.anthropic_endpoints.endpoints import ( + _get_blocked_response_usage, + ) + + with patch( + "litellm.token_counter", side_effect=RuntimeError("boom") + ): + usage = _get_blocked_response_usage( + model="claude-3-5-sonnet-20240620", + messages=[{"role": "user", "content": "hi"}], + block_message="blocked", + ) + + assert usage == {"input_tokens": 0, "output_tokens": 0} + + class TestEventLoggingBatchEndpoint: """Test the stubbed event logging batch endpoint"""