feat(file_processors): add inline::auto composite file processor (#5673)

leseb · claude · web-flow · commit 8158a1c13e16 · 2026-05-04T08:07:59.000Z
# What does this PR do? Adds a new `inline::auto` composite file processor that dispatches to the appropriate backend based on file MIME type. Currently routes PDF and text files to the built-in PyPDF processor and rejects unsupported formats with a clear 422 error listing supported types. The architecture is extensible for adding additional format backends (e.g. docling) in the future. Switches the starter and ci-tests distributions from `inline::pypdf` to `inline::auto` as the default file processor. Admins who want direct control over which formats are processed can still configure a specific provider (`inline::pypdf`, `inline::docling`, `remote::docling-serve`) directly. Builds on #5670 which fixed pypdf's silent fallback for unsupported formats. ## Test Plan Run the file processor unit tests: ```bash uv run pytest tests/unit/providers/file_processor/ -v ``` Output: ``` tests/unit/providers/file_processor/test_auto.py::test_routes_pdf_to_pypdf PASSED tests/unit/providers/file_processor/test_auto.py::test_routes_text_to_pypdf PASSED tests/unit/providers/file_processor/test_auto.py::test_routes_csv_to_pypdf PASSED tests/unit/providers/file_processor/test_auto.py::test_routes_markdown_to_pypdf PASSED tests/unit/providers/file_processor/test_auto.py::test_rejects_docx_with_422 PASSED tests/unit/providers/file_processor/test_auto.py::test_rejects_pptx_with_422 PASSED tests/unit/providers/file_processor/test_auto.py::test_rejects_xlsx_with_422 PASSED tests/unit/providers/file_processor/test_auto.py::test_error_message_lists_supported_types PASSED tests/unit/providers/file_processor/test_auto.py::test_error_message_includes_mime_type PASSED tests/unit/providers/file_processor/test_pypdf_validation.py::test_rejects_docx_with_422 PASSED tests/unit/providers/file_processor/test_pypdf_validation.py::test_rejects_pptx_with_422 PASSED tests/unit/providers/file_processor/test_pypdf_validation.py::test_rejects_xlsx_with_422 PASSED 
tests/unit/providers/file_processor/test_pypdf_validation.py::test_allows_pdf PASSED tests/unit/providers/file_processor/test_pypdf_validation.py::test_allows_text_files PASSED tests/unit/providers/file_processor/test_pypdf_validation.py::test_allows_csv_files PASSED tests/unit/providers/file_processor/test_pypdf_validation.py::test_allows_markdown_files PASSED 35 passed in 0.64s ``` --------- Signed-off-by: Sébastien Han <seb@redhat.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/docs/docs/providers/file_processors/inline_auto.mdx b/docs/docs/providers/file_processors/inline_auto.mdx
@@ -0,0 +1,26 @@
+---
+description: "Composite file processor that automatically dispatches to the appropriate backend based on file MIME type. Routes PDF and text files to PyPDF. Unsupported formats are rejected with a clear error listing the supported types."
+sidebar_label: Auto
+title: inline::auto
+---
+
+# inline::auto
+
+## Description
+
+Composite file processor that automatically dispatches to the appropriate backend based on file MIME type. Routes PDF and text files to PyPDF. Unsupported formats are rejected with a clear error listing the supported types.
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `default_chunk_size_tokens` | `int` | No | 800 | Default chunk size in tokens when chunking_strategy type is 'auto' |
+| `default_chunk_overlap_tokens` | `int` | No | 400 | Default chunk overlap in tokens when chunking_strategy type is 'auto' |
+| `extract_metadata` | `bool` | No | True | Whether to extract PDF metadata (title, author, etc.) |
+| `clean_text` | `bool` | No | True | Whether to clean extracted text (remove extra whitespace, normalize line breaks) |
+
+## Sample Configuration
+
+```yaml
+{}
+```
diff --git a/src/ogx/distributions/ci-tests/build.yaml b/src/ogx/distributions/ci-tests/build.yaml
@@ -31,7 +31,7 @@ distribution_spec:
     files:
     - provider_type: inline::localfs
     file_processors:
-    - provider_type: inline::pypdf
+    - provider_type: inline::auto
     safety:
     - provider_type: inline::llama-guard
     - provider_type: inline::code-scanner
diff --git a/src/ogx/distributions/ci-tests/config.yaml b/src/ogx/distributions/ci-tests/config.yaml
@@ -191,8 +191,8 @@ providers:
         table_name: files_metadata
         backend: sql_default
   file_processors:
-  - provider_id: pypdf
-    provider_type: inline::pypdf
+  - provider_id: auto
+    provider_type: inline::auto
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
diff --git a/src/ogx/distributions/ci-tests/run-with-postgres-store.yaml b/src/ogx/distributions/ci-tests/run-with-postgres-store.yaml
@@ -191,8 +191,8 @@ providers:
         table_name: files_metadata
         backend: sql_default
   file_processors:
-  - provider_id: pypdf
-    provider_type: inline::pypdf
+  - provider_id: auto
+    provider_type: inline::auto
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
diff --git a/src/ogx/distributions/starter/build.yaml b/src/ogx/distributions/starter/build.yaml
@@ -32,7 +32,7 @@ distribution_spec:
     files:
     - provider_type: inline::localfs
     file_processors:
-    - provider_type: inline::pypdf
+    - provider_type: inline::auto
     safety:
     - provider_type: inline::llama-guard
     - provider_type: inline::code-scanner
diff --git a/src/ogx/distributions/starter/config.yaml b/src/ogx/distributions/starter/config.yaml
@@ -185,8 +185,8 @@ providers:
         table_name: files_metadata
         backend: sql_default
   file_processors:
-  - provider_id: pypdf
-    provider_type: inline::pypdf
+  - provider_id: auto
+    provider_type: inline::auto
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
diff --git a/src/ogx/distributions/starter/run-with-postgres-store.yaml b/src/ogx/distributions/starter/run-with-postgres-store.yaml
@@ -185,8 +185,8 @@ providers:
         table_name: files_metadata
         backend: sql_default
   file_processors:
-  - provider_id: pypdf
-    provider_type: inline::pypdf
+  - provider_id: auto
+    provider_type: inline::auto
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
diff --git a/src/ogx/distributions/starter/starter.py b/src/ogx/distributions/starter/starter.py
@@ -22,7 +22,7 @@
 from ogx.core.storage.sqlstore.sqlstore import PostgresSqlStoreConfig
 from ogx.core.utils.dynamic import instantiate_class_type
 from ogx.distributions.template import DistributionTemplate, RunConfigSettings
-from ogx.providers.inline.file_processor.pypdf.config import PyPDFFileProcessorConfig
+from ogx.providers.inline.file_processor.auto.config import AutoFileProcessorConfig
 from ogx.providers.inline.files.localfs.config import LocalfsFilesImplConfig
 from ogx.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
@@ -148,7 +148,7 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
             BuildProvider(provider_type="remote::infinispan"),
         ],
         "files": [BuildProvider(provider_type="inline::localfs")],
-        "file_processors": [BuildProvider(provider_type="inline::pypdf")],
+        "file_processors": [BuildProvider(provider_type="inline::auto")],
         "safety": [
             BuildProvider(provider_type="inline::llama-guard"),
             BuildProvider(provider_type="inline::code-scanner"),
@@ -267,9 +267,9 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
         "files": [files_provider],
         "file_processors": [
             Provider(
-                provider_id="pypdf",
-                provider_type="inline::pypdf",
-                config=PyPDFFileProcessorConfig.sample_run_config(),
+                provider_id="auto",
+                provider_type="inline::auto",
+                config=AutoFileProcessorConfig.sample_run_config(),
             ),
         ],
         "tool_runtime": [
diff --git a/src/ogx/providers/inline/file_processor/auto/__init__.py b/src/ogx/providers/inline/file_processor/auto/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) The OGX Contributors.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from ogx_api import Api
+
+from .config import AutoFileProcessorConfig
+
+
+async def get_provider_impl(config: AutoFileProcessorConfig, deps: dict[Api, Any]):
+    """Get the auto file processor implementation."""
+    from .auto import AutoFileProcessor
+
+    assert isinstance(config, AutoFileProcessorConfig), f"Unexpected config type: {type(config)}"
+
+    files_api = deps[Api.files]
+
+    impl = AutoFileProcessor(config, files_api)
+    return impl
+
+
+__all__ = ["AutoFileProcessorConfig", "get_provider_impl"]
diff --git a/src/ogx/providers/inline/file_processor/auto/auto.py b/src/ogx/providers/inline/file_processor/auto/auto.py
@@ -0,0 +1,79 @@
+# Copyright (c) The OGX Contributors.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import mimetypes
+
+from fastapi import HTTPException, UploadFile
+
+from ogx.log import get_logger
+from ogx.providers.inline.file_processor.pypdf.config import PyPDFFileProcessorConfig
+from ogx.providers.inline.file_processor.pypdf.pypdf import PyPDFFileProcessor
+from ogx_api.file_processors import ProcessFileRequest, ProcessFileResponse
+from ogx_api.files import RetrieveFileRequest
+
+from .config import AutoFileProcessorConfig
+
+log = get_logger(name=__name__, category="providers::file_processors")
+# Human-readable summary of the accepted formats; interpolated into the 422 error detail below.
+SUPPORTED_TEXT_DESCRIPTION = "PDF and text files (txt, csv, md, etc.)"
+
+
+class AutoFileProcessor:
+    """Composite file processor that dispatches to backends based on MIME type.
+
+    Routes PDF and text files to the built-in PyPDF processor. Unsupported
+    formats are rejected with a 422 error listing the supported types.
+    """
+
+    def __init__(self, config: AutoFileProcessorConfig, files_api) -> None:
+        self.config = config
+        self.files_api = files_api
+
+        pypdf_config = PyPDFFileProcessorConfig(
+            default_chunk_size_tokens=config.default_chunk_size_tokens,
+            default_chunk_overlap_tokens=config.default_chunk_overlap_tokens,
+            extract_metadata=config.extract_metadata,
+            clean_text=config.clean_text,
+        )
+        self.pypdf = PyPDFFileProcessor(pypdf_config, files_api)
+
+    async def process_file(
+        self,
+        request: ProcessFileRequest,
+        file: UploadFile | None = None,
+    ) -> ProcessFileResponse:
+        filename = await self._resolve_filename(request, file)
+        mime_type, _ = mimetypes.guess_type(filename)
+        mime_category = mime_type.split("/")[0] if (mime_type and "/" in mime_type) else None
+
+        if mime_type == "application/pdf" or mime_category == "text":
+            return await self.pypdf.process_file(
+                file=file,
+                file_id=request.file_id,
+                options=request.options,
+                chunking_strategy=request.chunking_strategy,
+            )
+
+        raise HTTPException(
+            status_code=422,
+            detail=(
+                f"File type '{mime_type or 'unknown'}' is not supported. Supported types: {SUPPORTED_TEXT_DESCRIPTION}."
+            ),
+        )
+
+    async def _resolve_filename(self, request: ProcessFileRequest, file: UploadFile | None) -> str:
+        if file is not None:
+            name: str | None = file.filename
+            if name is not None:
+                return name
+        if request.file_id is not None:
+            file_info = await self.files_api.openai_retrieve_file(RetrieveFileRequest(file_id=request.file_id))
+            resolved: str = file_info.filename
+            return resolved
+        return "unknown"
+
+    async def shutdown(self) -> None:
+        pass
diff --git a/src/ogx/providers/inline/file_processor/auto/config.py b/src/ogx/providers/inline/file_processor/auto/config.py
@@ -0,0 +1,44 @@
+# Copyright (c) The OGX Contributors.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from ogx_api.vector_io import VectorStoreChunkingStrategyStaticConfig
+
+
+class AutoFileProcessorConfig(BaseModel):
+    """Configuration for the auto file processor.
+
+    The auto file processor dispatches to the appropriate backend based on file
+    MIME type. PDF and text files are handled by PyPDF, currently the only
+    backend; the settings below are passed through to it. Other formats (e.g.
+    DOCX, PPTX, XLSX) are rejected with a 422 error.
+    """
+
+    default_chunk_size_tokens: int = Field(
+        default=VectorStoreChunkingStrategyStaticConfig.model_fields["max_chunk_size_tokens"].default,
+        ge=100,
+        le=4096,
+        description="Default chunk size in tokens when chunking_strategy type is 'auto'",
+    )
+    default_chunk_overlap_tokens: int = Field(
+        default=VectorStoreChunkingStrategyStaticConfig.model_fields["chunk_overlap_tokens"].default,
+        ge=0,
+        le=2048,
+        description="Default chunk overlap in tokens when chunking_strategy type is 'auto'",
+    )
+
+    extract_metadata: bool = Field(default=True, description="Whether to extract PDF metadata (title, author, etc.)")
+
+    clean_text: bool = Field(
+        default=True, description="Whether to clean extracted text (remove extra whitespace, normalize line breaks)"
+    )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
+        return {}
diff --git a/src/ogx/providers/registry/file_processors.py b/src/ogx/providers/registry/file_processors.py
@@ -19,6 +19,19 @@ def available_providers() -> list[ProviderSpec]:
         List of ProviderSpec objects describing available providers
     """
     return [
+        InlineProviderSpec(
+            api=Api.file_processors,
+            provider_type="inline::auto",
+            pip_packages=["pypdf>=6.7.2"],
+            module="ogx.providers.inline.file_processor.auto",
+            config_class="ogx.providers.inline.file_processor.auto.AutoFileProcessorConfig",
+            api_dependencies=[Api.files],
+            description=(
+                "Composite file processor that automatically dispatches to the appropriate backend "
+                "based on file MIME type. Routes PDF and text files to PyPDF. Unsupported formats "
+                "are rejected with a clear error listing the supported types."
+            ),
+        ),
         InlineProviderSpec(
             api=Api.file_processors,
             provider_type="inline::pypdf",
diff --git a/tests/unit/providers/file_processor/test_auto.py b/tests/unit/providers/file_processor/test_auto.py
diff --git a/tests/unit/providers/file_processor/test_pypdf_validation.py b/tests/unit/providers/file_processor/test_pypdf_validation.py