Adding filters to paper-qa Docs (#707)

whitead · web-flow · commit d5c468828b83 · 2024-11-19T15:37:50.000-08:00
This adds a filter mechanism, configured via settings, that excludes matching papers from being added to the `Docs` object.

For example, to exclude a specific DOI:

```py
settings = Settings()
settings.parsing.doc_filters = [{"!doi": "xxxx/xxxxxx"}]
```

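Or to only consider the year 2020:
```py
settings.parsing.doc_filters = [{"year": 2020}]
```

Note that a document must pass *every* filter in the list (see the loop added to `aadd_texts`), so listing `{"year": 2020}` and `{"year": 2018}` together would reject all documents rather than accept either year. Values are compared with `==`, so use the field's stored type (an integer year such as `2020`, not the string `"2020"`).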

Description:

Optional filters to only allow documents that match this filter. This is a dictionary where the keys are the fields from DocDetails or Docs to filter on, and the values are the values to filter for. To invert a filter, prefix the key with a '!'. If the key is not found, by default the Doc is rejected. To change this behavior, prefix the key with a '?' to allow the Doc to pass if the key is not found. For example, {'!title': 'bad title', '?year': '2022'} would only allow Docs with a title that is not 'bad title' and a year of 2022 or no year at all.
diff --git a/paperqa/docs.py b/paperqa/docs.py
@@ -436,7 +436,7 @@ async def aadd_texts(
         """
         Add chunked texts to the collection.
 
-        NOTE: this is useful if you have already chunked the texts yourself.
+        This is useful to use if you have already chunked the texts yourself.
 
         Returns:
             True if the doc was added, otherwise False if already in the collection.
@@ -451,6 +451,11 @@ async def aadd_texts(
             # want to embed now!
             embedding_model = all_settings.get_embedding_model()
 
+        # 0. Short-circuit if it is caught by a filter
+        for doc_filter in all_settings.parsing.doc_filters or []:
+            if not doc.matches_filter_criteria(doc_filter):
+                return False
+
         # 1. Calculate text embeddings if not already present
         if embedding_model and texts[0].embedding is None:
             for t, t_embedding in zip(
diff --git a/paperqa/settings.py b/paperqa/settings.py
@@ -196,6 +196,17 @@ class ParsingSettings(BaseModel):
         ),
     )
     chunking_algorithm: ChunkingOptions = ChunkingOptions.SIMPLE_OVERLAP
+    doc_filters: list[dict] | None = Field(
+        default=None,
+        description=(
+            "Optional filters to only allow documents that match this filter. This is a dictionary where the keys"
+            " are the fields from DocDetails or Docs to filter on, and the values are the values to filter for."
+            " To invert filter, prefix the key with a '!'. If the key is not found, by default the Doc is rejected."
+            " To change this behavior, prefix the key with a '?' to allow the Doc to pass if the key is not found."
+            " For example, {'!title': 'bad title', '?year': '2022'} would only allow Docs with a title that is not"
+            " 'bad title' and a year of 2022 or no year at all."
+        ),
+    )
 
     def chunk_type(self, chunking_selection: ChunkingOptions | None = None) -> str:
         """Future chunking implementations (i.e. by section) will get an elif clause here."""
diff --git a/paperqa/types.py b/paperqa/types.py
@@ -134,6 +134,24 @@ def __hash__(self) -> int:
     def formatted_citation(self) -> str:
         return self.citation
 
+    def matches_filter_criteria(self, filter_criteria: dict) -> bool:
+        """Return True if the doc passes every criterion ('!' inverts, '?' allows missing)."""
+        data_dict = self.model_dump()
+        for raw_key, value in filter_criteria.items():
+            key = raw_key.lstrip("!?")
+            # Read both flags from the stripped prefix so "!?key" and "?!key" each work
+            prefix = raw_key[: len(raw_key) - len(key)]
+            invert, relaxed = "!" in prefix, "?" in prefix
+            if relaxed and data_dict.get(key) is None:  # field missing or unset (None)
+                continue
+            if key not in data_dict:
+                return False
+            if invert and data_dict[key] == value:
+                return False
+            if not invert and data_dict[key] != value:
+                return False
+        return True
+
 
 class Text(Embeddable):
     text: str
diff --git a/tests/test_configs.py b/tests/test_configs.py
@@ -14,6 +14,7 @@
     get_formatted_variables,
     get_settings,
 )
+from paperqa.types import Doc, DocDetails
 
 
 def test_prompt_settings_validation() -> None:
@@ -90,3 +91,79 @@ def test_o1_requires_temp_equals_1() -> None:
         warnings.simplefilter("always")
         _ = Settings(llm="o1-thismodeldoesnotexist", temperature=1)
         assert not w
+
+
+@pytest.mark.parametrize(
+    ("doc_class", "doc_data", "filter_criteria", "expected_result"),
+    [
+        pytest.param(
+            Doc,
+            {
+                "docname": "Test Paper",
+                "citation": "Test Citation",
+                "dockey": "key1",
+            },
+            {"docname": "Test Paper"},
+            True,
+            id="Doc-matching-docname",
+        ),
+        pytest.param(
+            Doc,
+            {
+                "docname": "Test Paper",
+                "citation": "Test Citation",
+                "dockey": "key1",
+            },
+            {"docname": "Another Paper"},
+            False,
+            id="Doc-nonmatching-docname",
+        ),
+        pytest.param(
+            DocDetails,
+            {
+                "title": "Test Paper",
+                "authors": ["Alice", "Bob"],
+                "year": 2020,
+            },
+            {"title": "Test Paper"},
+            True,
+            id="DocDetails-matching-title",
+        ),
+        pytest.param(
+            DocDetails,
+            {
+                "title": "Test Paper",
+                "authors": ["Alice", "Bob"],
+                "year": 2020,
+            },
+            {"!year": 2020, "?foo": "bar"},  # "!year" equals the doc's year: reject
+            False,
+            id="DocDetails-inverted-matching-year",
+        ),
+        pytest.param(
+            DocDetails,
+            {
+                "title": "Test Paper",
+                "authors": ["Alice", "Bob"],
+                "year": 2020,
+            },
+            {"year": 2020, "foo": "bar"},  # strict "foo" key is not found: reject
+            False,
+            id="DocDetails-missing-param-fail",
+        ),
+        pytest.param(
+            DocDetails,
+            {
+                "title": "Test Paper",
+                "authors": ["Alice", "Bob"],
+                "year": 2020,
+            },
+            {"?volume": "10", "!title": "Another Paper"},  # unset volume tolerated
+            True,
+            id="DocDetails-relaxed-missing-volume",
+        ),
+    ],
+)
+def test_matches_filter_criteria(doc_class, doc_data, filter_criteria, expected_result):
+    doc = doc_class(**doc_data)
+    assert doc.matches_filter_criteria(filter_criteria) == expected_result