Skip to content

Commit e91da86

Browse files
authored
feat(openrouter): add streaming token usage support (#35559)
Streaming token usage was silently dropped for `ChatOpenRouter`. Both `_stream` and `_astream` skipped any SSE chunk without a `choices` array — which is exactly the shape OpenRouter uses for the final usage-reporting chunk. This meant `usage_metadata` was never populated on streamed responses, causing downstream consumers (like the Deep Agents CLI) to show "unknown" model with 0 tokens. ## Changes - Add `stream_usage: bool = True` field to `ChatOpenRouter`, which passes `stream_options: {"include_usage": True}` to the OpenRouter API when streaming — matching the pattern already established in `langchain-openai`'s `BaseChatOpenAI` - Handle usage-only chunks (no `choices`, just `usage`) in both `_stream` and `_astream` by emitting a `ChatGenerationChunk` with `usage_metadata` via `_create_usage_metadata`, instead of silently `continue`-ing past them
1 parent e50625e commit e91da86

File tree

4 files changed

+208
-29
lines changed

4 files changed

+208
-29
lines changed

libs/partners/openai/langchain_openai/chat_models/base.py

Lines changed: 38 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -577,8 +577,8 @@ class BaseChatOpenAI(BaseChatModel):
577577
)
578578
"""API key to use.
579579
580-
Can be inferred from the `OPENAI_API_KEY` environment variable, or specified as a
581-
string, or sync or async callable that returns a string.
580+
Can be inferred from the `OPENAI_API_KEY` environment variable, or specified
581+
as a string, or sync or async callable that returns a string.
582582
583583
??? example "Specify with environment variable"
584584
@@ -600,6 +600,7 @@ class BaseChatOpenAI(BaseChatModel):
600600
```
601601
602602
??? example "Specify with a sync callable"
603+
603604
```python
604605
from langchain_openai import ChatOpenAI
605606
@@ -611,6 +612,7 @@ def get_api_key() -> str:
611612
```
612613
613614
??? example "Specify with an async callable"
615+
614616
```python
615617
from langchain_openai import ChatOpenAI
616618
@@ -636,16 +638,20 @@ async def get_api_key() -> str:
636638
request_timeout: float | tuple[float, float] | Any | None = Field(
637639
default=None, alias="timeout"
638640
)
639-
"""Timeout for requests to OpenAI completion API. Can be float, `httpx.Timeout` or
640-
`None`."""
641+
"""Timeout for requests to OpenAI completion API.
642+
643+
Can be float, `httpx.Timeout` or `None`.
644+
"""
641645

642646
stream_usage: bool | None = None
643-
"""Whether to include usage metadata in streaming output. If enabled, an additional
644-
message chunk will be generated during the stream including usage metadata.
647+
"""Whether to include usage metadata in streaming output.
648+
649+
If enabled, an additional message chunk will be generated during the stream
650+
including usage metadata.
645651
646652
This parameter is enabled unless `openai_api_base` is set or the model is
647-
initialized with a custom client, as many chat completions APIs do not support
648-
streaming token usage.
653+
initialized with a custom client, as many chat completions APIs do not
654+
support streaming token usage.
649655
650656
!!! version-added "Added in `langchain-openai` 0.3.9"
651657
@@ -671,8 +677,10 @@ async def get_api_key() -> str:
671677

672678
top_logprobs: int | None = None
673679
"""Number of most likely tokens to return at each token position, each with an
674-
associated log probability. `logprobs` must be set to true if this parameter is
675-
used."""
680+
associated log probability.
681+
682+
`logprobs` must be set to true if this parameter is used.
683+
"""
676684

677685
logit_bias: dict[int, int] | None = None
678686
"""Modify the likelihood of specified tokens appearing in the completion."""
@@ -690,18 +698,19 @@ async def get_api_key() -> str:
690698
"""Maximum number of tokens to generate."""
691699

692700
reasoning_effort: str | None = None
693-
"""Constrains effort on reasoning for reasoning models. For use with the Chat
694-
Completions API.
701+
"""Constrains effort on reasoning for reasoning models.
695702
696-
Reasoning models only.
703+
For use with the Chat Completions API. Reasoning models only.
697704
698705
Currently supported values are `'minimal'`, `'low'`, `'medium'`, and
699706
`'high'`. Reducing reasoning effort can result in faster responses and fewer
700707
tokens used on reasoning in a response.
701708
"""
702709

703710
reasoning: dict[str, Any] | None = None
704-
"""Reasoning parameters for reasoning models. For use with the Responses API.
711+
"""Reasoning parameters for reasoning models.
712+
713+
For use with the Responses API.
705714
706715
```python
707716
reasoning={
@@ -714,8 +723,9 @@ async def get_api_key() -> str:
714723
"""
715724

716725
verbosity: str | None = None
717-
"""Controls the verbosity level of responses for reasoning models. For use with the
718-
Responses API.
726+
"""Controls the verbosity level of responses for reasoning models.
727+
728+
For use with the Responses API.
719729
720730
Currently supported values are `'low'`, `'medium'`, and `'high'`.
721731
@@ -745,35 +755,36 @@ async def get_api_key() -> str:
745755
http_client: Any | None = Field(default=None, exclude=True)
746756
"""Optional `httpx.Client`.
747757
748-
Only used for sync invocations. Must specify `http_async_client` as well if you'd
749-
like a custom client for async invocations.
758+
Only used for sync invocations. Must specify `http_async_client` as well if
759+
you'd like a custom client for async invocations.
750760
"""
751761

752762
http_async_client: Any | None = Field(default=None, exclude=True)
753763
"""Optional `httpx.AsyncClient`.
754764
755-
Only used for async invocations. Must specify `http_client` as well if you'd like a
756-
custom client for sync invocations.
765+
Only used for async invocations. Must specify `http_client` as well if you'd
766+
like a custom client for sync invocations.
757767
"""
758768

759769
stop: list[str] | str | None = Field(default=None, alias="stop_sequences")
760770
"""Default stop sequences."""
761771

762772
extra_body: Mapping[str, Any] | None = None
763-
"""Optional additional JSON properties to include in the request parameters when
764-
making requests to OpenAI compatible APIs, such as vLLM, LM Studio, or other
765-
providers.
773+
"""Optional additional JSON properties to include in the request parameters
774+
when making requests to OpenAI compatible APIs, such as vLLM, LM Studio, or
775+
other providers.
766776
767777
This is the recommended way to pass custom parameters that are specific to your
768778
OpenAI-compatible API provider but not part of the standard OpenAI API.
769779
770780
Examples:
771-
- [LM Studio](https://lmstudio.ai/) TTL parameter: `extra_body={"ttl": 300}`
772-
- [vLLM](https://github.com/vllm-project/vllm) custom parameters:
773-
`extra_body={"use_beam_search": True}`
774-
- Any other provider-specific parameters
781+
- [LM Studio](https://lmstudio.ai/) TTL parameter: `extra_body={"ttl": 300}`
782+
- [vLLM](https://github.com/vllm-project/vllm) custom parameters:
783+
`extra_body={"use_beam_search": True}`
784+
- Any other provider-specific parameters
775785
776786
!!! warning
787+
777788
Do not use `model_kwargs` for custom parameters that are not part of the
778789
standard OpenAI API, as this will cause errors when making API calls. Use
779790
`extra_body` instead.

libs/partners/openrouter/langchain_openrouter/chat_models.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,13 @@ def model(self) -> str:
223223
streaming: bool = False
224224
"""Whether to stream the results or not."""
225225

226+
stream_usage: bool = True
227+
"""Whether to include usage metadata in streaming output.
228+
229+
If `True`, additional message chunks will be generated during the stream including
230+
usage metadata.
231+
"""
232+
226233
model_kwargs: dict[str, Any] = Field(default_factory=dict)
227234
"""Any extra model parameters for the OpenRouter API."""
228235

@@ -443,7 +450,7 @@ async def _agenerate(
443450
response = await self.client.chat.send_async(messages=sdk_messages, **params)
444451
return self._create_chat_result(response)
445452

446-
def _stream( # noqa: C901
453+
def _stream( # noqa: C901, PLR0912
447454
self,
448455
messages: list[BaseMessage],
449456
stop: list[str] | None = None,
@@ -452,6 +459,8 @@ def _stream( # noqa: C901
452459
) -> Iterator[ChatGenerationChunk]:
453460
message_dicts, params = self._create_message_dicts(messages, stop)
454461
params = {**params, **kwargs, "stream": True}
462+
if self.stream_usage:
463+
params["stream_options"] = {"include_usage": True}
455464
_strip_internal_kwargs(params)
456465
sdk_messages = _wrap_messages_for_sdk(message_dicts)
457466

@@ -466,6 +475,18 @@ def _stream( # noqa: C901
466475
f"(code: {error.get('code', 'unknown')})"
467476
)
468477
raise ValueError(msg)
478+
# Usage-only chunk (no choices) — emit with usage_metadata
479+
if usage := chunk_dict.get("usage"):
480+
usage_metadata = _create_usage_metadata(usage)
481+
usage_chunk = AIMessageChunk(
482+
content="", usage_metadata=usage_metadata
483+
)
484+
generation_chunk = ChatGenerationChunk(message=usage_chunk)
485+
if run_manager:
486+
run_manager.on_llm_new_token(
487+
generation_chunk.text, chunk=generation_chunk
488+
)
489+
yield generation_chunk
469490
continue
470491
choice = chunk_dict["choices"][0]
471492
message_chunk = _convert_chunk_to_message_chunk(
@@ -515,7 +536,7 @@ def _stream( # noqa: C901
515536
)
516537
yield generation_chunk
517538

518-
async def _astream( # noqa: C901
539+
async def _astream( # noqa: C901, PLR0912
519540
self,
520541
messages: list[BaseMessage],
521542
stop: list[str] | None = None,
@@ -524,6 +545,8 @@ async def _astream( # noqa: C901
524545
) -> AsyncIterator[ChatGenerationChunk]:
525546
message_dicts, params = self._create_message_dicts(messages, stop)
526547
params = {**params, **kwargs, "stream": True}
548+
if self.stream_usage:
549+
params["stream_options"] = {"include_usage": True}
527550
_strip_internal_kwargs(params)
528551
sdk_messages = _wrap_messages_for_sdk(message_dicts)
529552

@@ -540,6 +563,18 @@ async def _astream( # noqa: C901
540563
f"(code: {error.get('code', 'unknown')})"
541564
)
542565
raise ValueError(msg)
566+
# Usage-only chunk (no choices) — emit with usage_metadata
567+
if usage := chunk_dict.get("usage"):
568+
usage_metadata = _create_usage_metadata(usage)
569+
usage_chunk = AIMessageChunk(
570+
content="", usage_metadata=usage_metadata
571+
)
572+
generation_chunk = ChatGenerationChunk(message=usage_chunk)
573+
if run_manager:
574+
await run_manager.on_llm_new_token(
575+
token=generation_chunk.text, chunk=generation_chunk
576+
)
577+
yield generation_chunk
543578
continue
544579
choice = chunk_dict["choices"][0]
545580
message_chunk = _convert_chunk_to_message_chunk(

libs/partners/openrouter/tests/unit_tests/__snapshots__/test_standard.ambr

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
'request_timeout': 60,
2424
'stop': list([
2525
]),
26+
'stream_usage': True,
2627
'temperature': 0.0,
2728
}),
2829
'lc': 1,

libs/partners/openrouter/tests/unit_tests/test_chat_models.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2831,3 +2831,135 @@ def test_stream_malformed_tool_call_with_null_function(self) -> None:
28312831
assert any(
28322832
"malformed tool call chunk" in str(warning.message) for warning in w
28332833
)
2834+
2835+
2836+
class TestStreamUsage:
2837+
"""Tests for stream_usage and usage-only chunk handling."""
2838+
2839+
def test_stream_options_passed_by_default(self) -> None:
2840+
"""Test that stream_options with include_usage is sent by default."""
2841+
model = _make_model()
2842+
model.client = MagicMock()
2843+
model.client.chat.send.return_value = _MockSyncStream(
2844+
[dict(c) for c in _STREAM_CHUNKS]
2845+
)
2846+
list(model.stream("Hello"))
2847+
call_kwargs = model.client.chat.send.call_args[1]
2848+
assert call_kwargs["stream_options"] == {"include_usage": True}
2849+
2850+
def test_stream_options_not_passed_when_disabled(self) -> None:
2851+
"""Test that stream_options is omitted when stream_usage=False."""
2852+
model = _make_model(stream_usage=False)
2853+
model.client = MagicMock()
2854+
model.client.chat.send.return_value = _MockSyncStream(
2855+
[dict(c) for c in _STREAM_CHUNKS]
2856+
)
2857+
list(model.stream("Hello"))
2858+
call_kwargs = model.client.chat.send.call_args[1]
2859+
assert "stream_options" not in call_kwargs
2860+
2861+
def test_usage_only_chunk_emitted(self) -> None:
2862+
"""Test that a usage-only chunk (no choices) emits usage_metadata."""
2863+
model = _make_model()
2864+
model.client = MagicMock()
2865+
# Content chunks followed by a usage-only chunk (no choices key)
2866+
chunks_with_separate_usage: list[dict[str, Any]] = [
2867+
{
2868+
"choices": [
2869+
{"delta": {"role": "assistant", "content": "Hi"}, "index": 0}
2870+
],
2871+
"model": MODEL_NAME,
2872+
"object": "chat.completion.chunk",
2873+
"created": 1700000000.0,
2874+
"id": "gen-1",
2875+
},
2876+
{
2877+
"choices": [{"delta": {}, "finish_reason": "stop", "index": 0}],
2878+
"model": MODEL_NAME,
2879+
"object": "chat.completion.chunk",
2880+
"created": 1700000000.0,
2881+
"id": "gen-1",
2882+
},
2883+
# Usage-only final chunk — no choices
2884+
{
2885+
"usage": {
2886+
"prompt_tokens": 10,
2887+
"completion_tokens": 5,
2888+
"total_tokens": 15,
2889+
},
2890+
"model": MODEL_NAME,
2891+
"object": "chat.completion.chunk",
2892+
"created": 1700000000.0,
2893+
"id": "gen-1",
2894+
},
2895+
]
2896+
model.client.chat.send.return_value = _MockSyncStream(
2897+
chunks_with_separate_usage
2898+
)
2899+
chunks = list(model.stream("Hello"))
2900+
2901+
# Last chunk should carry usage_metadata
2902+
usage_chunks = [c for c in chunks if c.usage_metadata]
2903+
assert len(usage_chunks) >= 1
2904+
usage = usage_chunks[-1].usage_metadata
2905+
assert usage is not None
2906+
assert usage["input_tokens"] == 10
2907+
assert usage["output_tokens"] == 5
2908+
assert usage["total_tokens"] == 15
2909+
2910+
async def test_astream_options_passed_by_default(self) -> None:
2911+
"""Test that async stream sends stream_options by default."""
2912+
model = _make_model()
2913+
model.client = MagicMock()
2914+
model.client.chat.send_async = AsyncMock(
2915+
return_value=_MockAsyncStream([dict(c) for c in _STREAM_CHUNKS])
2916+
)
2917+
chunks = [c async for c in model.astream("Hello")] # noqa: F841
2918+
call_kwargs = model.client.chat.send_async.call_args[1]
2919+
assert call_kwargs["stream_options"] == {"include_usage": True}
2920+
2921+
async def test_astream_usage_only_chunk_emitted(self) -> None:
2922+
"""Test that an async usage-only chunk emits usage_metadata."""
2923+
model = _make_model()
2924+
model.client = MagicMock()
2925+
chunks_with_separate_usage: list[dict[str, Any]] = [
2926+
{
2927+
"choices": [
2928+
{"delta": {"role": "assistant", "content": "Hi"}, "index": 0}
2929+
],
2930+
"model": MODEL_NAME,
2931+
"object": "chat.completion.chunk",
2932+
"created": 1700000000.0,
2933+
"id": "gen-1",
2934+
},
2935+
{
2936+
"choices": [{"delta": {}, "finish_reason": "stop", "index": 0}],
2937+
"model": MODEL_NAME,
2938+
"object": "chat.completion.chunk",
2939+
"created": 1700000000.0,
2940+
"id": "gen-1",
2941+
},
2942+
{
2943+
"usage": {
2944+
"prompt_tokens": 10,
2945+
"completion_tokens": 5,
2946+
"total_tokens": 15,
2947+
},
2948+
"model": MODEL_NAME,
2949+
"object": "chat.completion.chunk",
2950+
"created": 1700000000.0,
2951+
"id": "gen-1",
2952+
},
2953+
]
2954+
model.client.chat.send_async = AsyncMock(
2955+
return_value=_MockAsyncStream(chunks_with_separate_usage)
2956+
)
2957+
chunks = [c async for c in model.astream("Hello")]
2958+
2959+
usage_chunks = [c for c in chunks if c.usage_metadata]
2960+
assert len(usage_chunks) >= 1
2961+
usage = usage_chunks[-1].usage_metadata
2962+
assert usage is not None
2963+
assert usage["input_tokens"] == 10
2964+
assert usage["output_tokens"] == 5
2965+
assert usage["total_tokens"] == 15

0 commit comments

Comments
 (0)