Skip to content

Commit 53614e2

Browse files
authored
Prevent SimpleDirectoryReader from excessive memory consumption (#18983)
1 parent 5d8280c commit 53614e2

File tree

3 files changed

+25
-16
lines changed
  • llama-index-core
  • llama-index-integrations/readers/llama-index-readers-file/tests

3 files changed

+25
-16
lines changed

llama-index-core/llama_index/core/readers/file/base.py

Lines changed: 23 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
from itertools import repeat
1515
from pathlib import Path, PurePosixPath
1616
from typing import (
17+
Optional,
1718
Any,
1819
Callable,
1920
Generator,
@@ -247,19 +248,19 @@ class SimpleDirectoryReader(BaseReader, ResourcesReaderMixin, FileSystemReaderMi
247248

248249
def __init__(
249250
self,
250-
input_dir: Path | str | None = None,
251-
input_files: list | None = None,
252-
exclude: list | None = None,
251+
input_dir: Optional[Union[Path, str]] = None,
252+
input_files: Optional[list] = None,
253+
exclude: Optional[list] = None,
253254
exclude_hidden: bool = True,
254255
exclude_empty: bool = False,
255256
errors: str = "ignore",
256257
recursive: bool = False,
257258
encoding: str = "utf-8",
258259
filename_as_id: bool = False,
259-
required_exts: list[str] | None = None,
260-
file_extractor: dict[str, BaseReader] | None = None,
261-
num_files_limit: int | None = None,
262-
file_metadata: Callable[[str], dict] | None = None,
260+
required_exts: Optional[list[str]] = None,
261+
file_extractor: Optional[dict[str, BaseReader]] = None,
262+
num_files_limit: Optional[int] = None,
263+
file_metadata: Optional[Callable[[str], dict]] = None,
263264
raise_on_error: bool = False,
264265
fs: fsspec.AbstractFileSystem | None = None,
265266
) -> None:
@@ -333,10 +334,21 @@ def _add_files(self, input_dir: Path | PurePosixPath) -> list[Path | PurePosixPa
333334
rejected_files.add(_Path(str(file)))
334335

335336
file_refs: list[str] = []
336-
if self.recursive:
337-
file_refs = cast(list[str], self.fs.glob(str(input_dir) + "/**/*"))
338-
else:
339-
file_refs = cast(list[str], self.fs.glob(str(input_dir) + "/*"))
337+
limit = (
338+
self.num_files_limit
339+
if self.num_files_limit is not None and self.num_files_limit > 0
340+
else None
341+
)
342+
c = 0
343+
depth = 1000 if self.recursive else 1
344+
for root, _, files in self.fs.walk(
345+
str(input_dir), topdown=True, maxdepth=depth
346+
):
347+
for file in files:
348+
c += 1
349+
if limit and c > limit:
350+
break
351+
file_refs.append(os.path.join(root, file))
340352

341353
for _ref in file_refs:
342354
# Manually check if file is hidden or directory instead of
@@ -381,9 +393,6 @@ def _add_files(self, input_dir: Path | PurePosixPath) -> list[Path | PurePosixPa
381393
if len(new_input_files) == 0:
382394
raise ValueError(f"No files found in {input_dir}.")
383395

384-
if self.num_files_limit is not None and self.num_files_limit > 0:
385-
new_input_files = new_input_files[0 : self.num_files_limit]
386-
387396
# print total number of files added
388397
logger.debug(
389398
f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}"

llama-index-core/tests/readers/file/test_base.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ def test_SimpleDirectoryReader_empty(data_path):
9999

100100
def test_SimpleDirectoryReader_file_limit(data_path):
101101
r = SimpleDirectoryReader(input_dir=data_path, recursive=True, num_files_limit=2)
102-
assert [f.name for f in r.input_files] == ["excluded_1.txt", "excluded_0.txt"]
102+
assert [f.name for f in r.input_files] == ["excluded_0.txt", "file_0.md"]
103103

104104

105105
def test_SimpleDirectoryReader_list_resources(data_path):

llama-index-integrations/readers/llama-index-readers-file/tests/test_file.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -179,7 +179,7 @@ def test_num_files_limit(tmp_dir_type: Type[Union[Path, str]]) -> None:
179179
assert len(reader.input_files) == 2
180180
assert set(input_file_names) == {
181181
"test1.txt",
182-
"test2.txt",
182+
"test3.txt",
183183
}
184184

185185
reader = SimpleDirectoryReader(

0 commit comments

Comments
 (0)