|
14 | 14 | from itertools import repeat
|
15 | 15 | from pathlib import Path, PurePosixPath
|
16 | 16 | from typing import (
|
| 17 | + Optional, |
17 | 18 | Any,
|
18 | 19 | Callable,
|
19 | 20 | Generator,
|
@@ -247,19 +248,19 @@ class SimpleDirectoryReader(BaseReader, ResourcesReaderMixin, FileSystemReaderMi
|
247 | 248 |
|
248 | 249 | def __init__(
|
249 | 250 | self,
|
250 |
| - input_dir: Path | str | None = None, |
251 |
| - input_files: list | None = None, |
252 |
| - exclude: list | None = None, |
| 251 | + input_dir: Optional[Union[Path, str]] = None, |
| 252 | + input_files: Optional[list] = None, |
| 253 | + exclude: Optional[list] = None, |
253 | 254 | exclude_hidden: bool = True,
|
254 | 255 | exclude_empty: bool = False,
|
255 | 256 | errors: str = "ignore",
|
256 | 257 | recursive: bool = False,
|
257 | 258 | encoding: str = "utf-8",
|
258 | 259 | filename_as_id: bool = False,
|
259 |
| - required_exts: list[str] | None = None, |
260 |
| - file_extractor: dict[str, BaseReader] | None = None, |
261 |
| - num_files_limit: int | None = None, |
262 |
| - file_metadata: Callable[[str], dict] | None = None, |
| 260 | + required_exts: Optional[list[str]] = None, |
| 261 | + file_extractor: Optional[dict[str, BaseReader]] = None, |
| 262 | + num_files_limit: Optional[int] = None, |
| 263 | + file_metadata: Optional[Callable[[str], dict]] = None, |
263 | 264 | raise_on_error: bool = False,
|
264 | 265 | fs: fsspec.AbstractFileSystem | None = None,
|
265 | 266 | ) -> None:
|
@@ -333,10 +334,21 @@ def _add_files(self, input_dir: Path | PurePosixPath) -> list[Path | PurePosixPa
|
333 | 334 | rejected_files.add(_Path(str(file)))
|
334 | 335 |
|
335 | 336 | file_refs: list[str] = []
|
336 |
| - if self.recursive: |
337 |
| - file_refs = cast(list[str], self.fs.glob(str(input_dir) + "/**/*")) |
338 |
| - else: |
339 |
| - file_refs = cast(list[str], self.fs.glob(str(input_dir) + "/*")) |
| 337 | + limit = ( |
| 338 | + self.num_files_limit |
| 339 | + if self.num_files_limit is not None and self.num_files_limit > 0 |
| 340 | + else None |
| 341 | + ) |
| 342 | + c = 0 |
| 343 | + depth = 1000 if self.recursive else 1 |
| 344 | + for root, _, files in self.fs.walk( |
| 345 | + str(input_dir), topdown=True, maxdepth=depth |
| 346 | + ): |
| 347 | + for file in files: |
| 348 | + c += 1 |
| 349 | + if limit and c > limit: |
| 350 | + break |
| 351 | + file_refs.append(os.path.join(root, file)) |
340 | 352 |
|
341 | 353 | for _ref in file_refs:
|
342 | 354 | # Manually check if file is hidden or directory instead of
|
@@ -381,9 +393,6 @@ def _add_files(self, input_dir: Path | PurePosixPath) -> list[Path | PurePosixPa
|
381 | 393 | if len(new_input_files) == 0:
|
382 | 394 | raise ValueError(f"No files found in {input_dir}.")
|
383 | 395 |
|
384 |
| - if self.num_files_limit is not None and self.num_files_limit > 0: |
385 |
| - new_input_files = new_input_files[0 : self.num_files_limit] |
386 |
| - |
387 | 396 | # print total number of files added
|
388 | 397 | logger.debug(
|
389 | 398 | f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}"
|
|
0 commit comments