Skip to content

Commit 370ed3f

Browse files
authored
Merge branch 'master' into core-no-return-any
2 parents 5d5e90f + 78b2d51 commit 370ed3f

File tree

66 files changed

+1958
-295
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+1958
-295
lines changed

libs/core/langchain_core/caches.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
Distinct from provider-based [prompt caching](https://docs.langchain.com/oss/python/langchain/models#prompt-caching).
44
55
!!! warning "Beta feature"
6+
67
This is a beta feature. Please be wary of deploying experimental code to production
78
unless you've taken appropriate precautions.
89

libs/core/langchain_core/callbacks/manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from contextlib import asynccontextmanager, contextmanager
1313
from contextvars import copy_context
1414
from typing import TYPE_CHECKING, Any, TypeVar, cast
15-
from uuid import UUID
1615

1716
from langsmith.run_helpers import get_tracing_context
1817
from typing_extensions import Self, override
@@ -44,6 +43,7 @@
4443

4544
if TYPE_CHECKING:
4645
from collections.abc import AsyncGenerator, Coroutine, Generator, Sequence
46+
from uuid import UUID
4747

4848
from tenacity import RetryCallState
4949

libs/core/langchain_core/language_models/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
from pydantic import BaseModel, ConfigDict, Field, field_validator
1919
from typing_extensions import TypedDict, override
2020

21-
from langchain_core.caches import BaseCache
22-
from langchain_core.callbacks import Callbacks
21+
from langchain_core.caches import BaseCache # noqa: TC001
22+
from langchain_core.callbacks import Callbacks # noqa: TC001
2323
from langchain_core.globals import get_verbose
2424
from langchain_core.messages import (
2525
AIMessage,

libs/core/langchain_core/language_models/chat_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,7 @@ class BaseChatModel(BaseLanguageModel[AIMessage], ABC):
341341
"""Profile detailing model capabilities.
342342
343343
!!! warning "Beta feature"
344+
344345
This is a beta feature. The format of model profiles is subject to change.
345346
346347
If not specified, automatically loaded from the provider package on initialization

libs/core/langchain_core/language_models/model_profile.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ class ModelProfile(TypedDict, total=False):
77
"""Model profile.
88
99
!!! warning "Beta feature"
10+
1011
This is a beta feature. The format of model profiles is subject to change.
1112
1213
Provides information about chat model capabilities, such as context window sizes

libs/core/langchain_core/load/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
if TYPE_CHECKING:
88
from langchain_core.load.dump import dumpd, dumps
9-
from langchain_core.load.load import loads
9+
from langchain_core.load.load import InitValidator, loads
1010
from langchain_core.load.serializable import Serializable
1111

1212
# Unfortunately, we have to eagerly import load from langchain_core/load/load.py
@@ -15,11 +15,19 @@
1515
# the `from langchain_core.load.load import load` absolute import should also work.
1616
from langchain_core.load.load import load
1717

18-
__all__ = ("Serializable", "dumpd", "dumps", "load", "loads")
18+
__all__ = (
19+
"InitValidator",
20+
"Serializable",
21+
"dumpd",
22+
"dumps",
23+
"load",
24+
"loads",
25+
)
1926

2027
_dynamic_imports = {
2128
"dumpd": "dump",
2229
"dumps": "dump",
30+
"InitValidator": "load",
2331
"loads": "load",
2432
"Serializable": "serializable",
2533
}
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
"""Validation utilities for LangChain serialization.
2+
3+
Provides escape-based protection against injection attacks in serialized objects. The
4+
approach uses an allowlist design: only dicts explicitly produced by
5+
`Serializable.to_json()` are treated as LC objects during deserialization.
6+
7+
## How escaping works
8+
9+
During serialization, plain dicts (user data) that contain an `'lc'` key are wrapped:
10+
11+
```python
12+
{"lc": 1, ...} # user data that looks like LC object
13+
# becomes:
14+
{"__lc_escaped__": {"lc": 1, ...}}
15+
```
16+
17+
During deserialization, escaped dicts are unwrapped and returned as plain dicts,
18+
NOT instantiated as LC objects.
19+
"""
20+
21+
from typing import Any
22+
23+
_LC_ESCAPED_KEY = "__lc_escaped__"
24+
"""Sentinel key used to mark escaped user dicts during serialization.
25+
26+
When a plain dict contains an 'lc' key (which could be confused with LC objects),
27+
we wrap it as {"__lc_escaped__": {...original...}}.
28+
"""
29+
30+
31+
def _needs_escaping(obj: dict[str, Any]) -> bool:
32+
"""Check if a dict needs escaping to prevent confusion with LC objects.
33+
34+
A dict needs escaping if:
35+
36+
1. It has an `'lc'` key (could be confused with LC serialization format)
37+
2. It has only the escape key (would be mistaken for an escaped dict)
38+
"""
39+
return "lc" in obj or (len(obj) == 1 and _LC_ESCAPED_KEY in obj)
40+
41+
42+
def _escape_dict(obj: dict[str, Any]) -> dict[str, Any]:
    """Return a new single-key dict that holds `obj` under the escape marker.

    The input dict is stored by reference; it is neither copied nor modified.

    Example:
        ```python
        {"key": "value"}  # becomes {"__lc_escaped__": {"key": "value"}}
        ```

    Args:
        obj: The dict to wrap.

    Returns:
        A dict whose only key is the escape marker and whose value is `obj`.
    """
    wrapper: dict[str, Any] = {}
    wrapper[_LC_ESCAPED_KEY] = obj
    return wrapper
51+
52+
53+
def _is_escaped_dict(obj: dict[str, Any]) -> bool:
54+
"""Check if a dict is an escaped user dict.
55+
56+
Example:
57+
```python
58+
{"__lc_escaped__": {...}} # is an escaped dict
59+
```
60+
"""
61+
return len(obj) == 1 and _LC_ESCAPED_KEY in obj
62+
63+
64+
def _serialize_value(obj: Any) -> Any:
    """Serialize a value, escaping user dicts that resemble LC objects.

    The function recurses through dict values and list/tuple items. Tuples
    are emitted as lists.

    Args:
        obj: The value to serialize.

    Returns:
        The serialized value with user dicts escaped as needed.
    """
    from langchain_core.load.serializable import (  # noqa: PLC0415
        Serializable,
        to_json_not_implemented,
    )

    json_scalar_types = (str, int, float, bool, type(None))

    # LC objects go through their own serializer and are never escaped.
    if isinstance(obj, Serializable):
        return _serialize_lc_object(obj)

    if isinstance(obj, dict):
        # A dict with any key that is not a JSON scalar is handed off whole.
        if any(not isinstance(key, json_scalar_types) for key in obj):
            return to_json_not_implemented(obj)
        # Decide on escaping BEFORE descending: an escaped dict is wrapped
        # untouched, so its contents are never processed (or re-escaped) here.
        if _needs_escaping(obj):
            return _escape_dict(obj)
        # No 'lc' key, so the dict itself is safe; process each value.
        processed: dict[Any, Any] = {}
        for key, value in obj.items():
            processed[key] = _serialize_value(value)
        return processed

    if isinstance(obj, (list, tuple)):
        return list(map(_serialize_value, obj))

    if isinstance(obj, json_scalar_types):
        return obj

    # Anything else (datetime, custom objects, etc.) is delegated.
    return to_json_not_implemented(obj)
103+
104+
105+
def _is_lc_secret(obj: Any) -> bool:
106+
"""Check if an object is a LangChain secret marker."""
107+
expected_num_keys = 3
108+
return (
109+
isinstance(obj, dict)
110+
and obj.get("lc") == 1
111+
and obj.get("type") == "secret"
112+
and "id" in obj
113+
and len(obj) == expected_num_keys
114+
)
115+
116+
117+
def _serialize_lc_object(obj: Any) -> dict[str, Any]:
    """Serialize a `Serializable` object, escaping user data found in its kwargs.

    Args:
        obj: The `Serializable` object to serialize.

    Returns:
        A dict built from `obj.to_json()`, with the `kwargs` values escaped as
        needed when the result is of type `'constructor'`.

    Raises:
        TypeError: If `obj` is not a `Serializable` instance.

    Note:
        Each kwargs value is passed through `_serialize_value` so that user data
        (like metadata) containing `'lc'` keys gets escaped. Values that already
        look like secret markers are kept untouched.
    """
    from langchain_core.load.serializable import Serializable  # noqa: PLC0415

    if not isinstance(obj, Serializable):
        msg = f"Expected Serializable, got {type(obj)}"
        raise TypeError(msg)

    serialized: dict[str, Any] = dict(obj.to_json())

    # Only constructor-type payloads that carry kwargs need post-processing.
    is_constructor = serialized.get("type") == "constructor"
    if not (is_constructor and "kwargs" in serialized):
        return serialized

    escaped_kwargs: dict[Any, Any] = {}
    for name, value in serialized["kwargs"].items():
        # Secret markers pass through unchanged; everything else is escaped.
        escaped_kwargs[name] = value if _is_lc_secret(value) else _serialize_value(value)
    serialized["kwargs"] = escaped_kwargs

    return serialized
148+
149+
150+
def _unescape_value(obj: Any) -> Any:
151+
"""Unescape a value, processing escape markers in dict values and lists.
152+
153+
When an escaped dict is encountered (`{"__lc_escaped__": ...}`), it's
154+
unwrapped and the contents are returned AS-IS (no further processing).
155+
The contents represent user data that should not be modified.
156+
157+
For regular dicts and lists, we recurse to find any nested escape markers.
158+
159+
Args:
160+
obj: The value to unescape.
161+
162+
Returns:
163+
The unescaped value.
164+
"""
165+
if isinstance(obj, dict):
166+
if _is_escaped_dict(obj):
167+
# Unwrap and return the user data as-is (no further unescaping).
168+
# The contents are user data that may contain more escape keys,
169+
# but those are part of the user's actual data.
170+
return obj[_LC_ESCAPED_KEY]
171+
172+
# Regular dict - recurse into values to find nested escape markers
173+
return {k: _unescape_value(v) for k, v in obj.items()}
174+
if isinstance(obj, list):
175+
return [_unescape_value(item) for item in obj]
176+
return obj

libs/core/langchain_core/load/dump.py

Lines changed: 55 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,26 @@
1-
"""Dump objects to json."""
1+
"""Serialize LangChain objects to JSON.
2+
3+
Provides `dumps` (to JSON string) and `dumpd` (to dict) for serializing
4+
`Serializable` objects.
5+
6+
## Escaping
7+
8+
During serialization, plain dicts (user data) that contain an `'lc'` key are escaped
9+
by wrapping them: `{"__lc_escaped__": {...original...}}`. This prevents injection
10+
attacks where malicious data could trick the deserializer into instantiating
11+
arbitrary classes. The escape marker is removed during deserialization.
12+
13+
This is an allowlist approach: only dicts explicitly produced by
14+
`Serializable.to_json()` are treated as LC objects; everything else is escaped if it
15+
could be confused with the LC format.
16+
"""
217

318
import json
419
from typing import Any
520

621
from pydantic import BaseModel
722

23+
from langchain_core.load._validation import _serialize_value
824
from langchain_core.load.serializable import Serializable, to_json_not_implemented
925
from langchain_core.messages import AIMessage
1026
from langchain_core.outputs import ChatGeneration
@@ -25,6 +41,20 @@ def default(obj: Any) -> Any:
2541

2642

2743
def _dump_pydantic_models(obj: Any) -> Any:
44+
"""Convert nested Pydantic models to dicts for JSON serialization.
45+
46+
Handles the special case where a `ChatGeneration` contains an `AIMessage`
47+
with a parsed Pydantic model in `additional_kwargs["parsed"]`. Since
48+
Pydantic models aren't directly JSON serializable, this converts them to
49+
dicts.
50+
51+
Args:
52+
obj: The object to process.
53+
54+
Returns:
55+
A copy of the object with nested Pydantic models converted to dicts, or
56+
the original object unchanged if no conversion was needed.
57+
"""
2858
if (
2959
isinstance(obj, ChatGeneration)
3060
and isinstance(obj.message, AIMessage)
@@ -40,10 +70,17 @@ def _dump_pydantic_models(obj: Any) -> Any:
4070
def dumps(obj: Any, *, pretty: bool = False, **kwargs: Any) -> str:
4171
"""Return a JSON string representation of an object.
4272
73+
Note:
74+
Plain dicts containing an `'lc'` key are automatically escaped to prevent
75+
confusion with LC serialization format. The escape marker is removed during
76+
deserialization.
77+
4378
Args:
4479
obj: The object to dump.
45-
pretty: Whether to pretty print the json. If `True`, the json will be
46-
indented with 2 spaces (if no indent is provided as part of `kwargs`).
80+
pretty: Whether to pretty print the json.
81+
82+
If `True`, the json will be indented by either 2 spaces or the amount
83+
provided in the `indent` kwarg.
4784
**kwargs: Additional arguments to pass to `json.dumps`
4885
4986
Returns:
@@ -55,28 +92,29 @@ def dumps(obj: Any, *, pretty: bool = False, **kwargs: Any) -> str:
5592
if "default" in kwargs:
5693
msg = "`default` should not be passed to dumps"
5794
raise ValueError(msg)
58-
try:
59-
obj = _dump_pydantic_models(obj)
60-
if pretty:
61-
indent = kwargs.pop("indent", 2)
62-
return json.dumps(obj, default=default, indent=indent, **kwargs)
63-
return json.dumps(obj, default=default, **kwargs)
64-
except TypeError:
65-
if pretty:
66-
indent = kwargs.pop("indent", 2)
67-
return json.dumps(to_json_not_implemented(obj), indent=indent, **kwargs)
68-
return json.dumps(to_json_not_implemented(obj), **kwargs)
95+
96+
obj = _dump_pydantic_models(obj)
97+
serialized = _serialize_value(obj)
98+
99+
if pretty:
100+
indent = kwargs.pop("indent", 2)
101+
return json.dumps(serialized, indent=indent, **kwargs)
102+
return json.dumps(serialized, **kwargs)
69103

70104

71105
def dumpd(obj: Any) -> Any:
72106
"""Return a dict representation of an object.
73107
108+
Note:
109+
Plain dicts containing an `'lc'` key are automatically escaped to prevent
110+
confusion with LC serialization format. The escape marker is removed during
111+
deserialization.
112+
74113
Args:
75114
obj: The object to dump.
76115
77116
Returns:
78117
Dictionary that can be serialized to json using `json.dumps`.
79118
"""
80-
# Unfortunately this function is not as efficient as it could be because it first
81-
# dumps the object to a json string and then loads it back into a dictionary.
82-
return json.loads(dumps(obj))
119+
obj = _dump_pydantic_models(obj)
120+
return _serialize_value(obj)

0 commit comments

Comments
 (0)