feat: allow sentence hits to be returned directly (quickwit-oss#153)

cjrh · web-flow · commit 132afc2526b4 · 2023-11-06T12:35:25.000+01:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/src/searcher_frame_document.rs b/src/searcher_frame_document.rs
@@ -44,6 +44,13 @@ impl SearchResult {
         Ok(s)
     }
 
+    /// Returns the set of distinct sentence values found across all hits.
+    #[getter]
+    fn unique_sentences(&self, py: Python) -> PyResult<BTreeSet<u64>> {
+        // Only the third element of each hit tuple is kept; the set dedups.
+        Ok(self.hits.iter().map(|(_d, _f, s, _score)| *s).collect())
+    }
+
     #[getter]
     fn unique_docs_frames(&self, py: Python) -> PyResult<BTreeSet<(u64, u64)>> {
         let s = BTreeSet::from_iter(
@@ -70,6 +77,27 @@ impl SearchResult {
         }
         Ok((v1, v2))
     }
+
+    /// Returns the distinct `(doc, frame, sentence)` triples from the hits as
+    /// three parallel vectors, sparing Python callers the unzip loop.
+    #[getter]
+    fn unique_docs_frames_sentences_unzipped(
+        &self,
+        py: Python,
+    ) -> PyResult<(Vec<u64>, Vec<u64>, Vec<u64>)> {
+        // The BTreeSet drops duplicate triples and yields them in sorted order.
+        let triples: BTreeSet<(u64, u64, u64)> =
+            self.hits.iter().map(|(d, f, s, _score)| (*d, *f, *s)).collect();
+        let mut docs = Vec::with_capacity(triples.len());
+        let mut frames = Vec::with_capacity(triples.len());
+        let mut sentences = Vec::with_capacity(triples.len());
+        for (doc, frame, sentence) in triples {
+            docs.push(doc);
+            frames.push(frame);
+            sentences.push(sentence);
+        }
+        Ok((docs, frames, sentences))
+    }
 }
 
 #[pymethods]
diff --git a/tests/tantivy_kapiche_test.py b/tests/tantivy_kapiche_test.py
@@ -34,7 +34,7 @@ def create_index(dir=None):
     # assume all tests will use the same documents for now
     # other methods may set up function-local indexes
     index = Index(schema(), dir)
-    writer = index.writer()
+    writer = index.writer(15_000_000, 1)
 
     # 2 ways of adding documents
     # 1
@@ -179,7 +179,7 @@ def create_kapiche_index(dir=None):
     # assume all tests will use the same documents for now
     # other methods may set up function-local indexes
     index = Index(kapiche_schema(), dir)
-    writer = index.writer()
+    writer = index.writer(15_000_000, 1)
 
     # 2 ways of adding documents
     # 1
@@ -513,7 +513,7 @@ def test_create_readers(self):
 class TestSearcher(object):
     def test_searcher_repr(self, ram_index, ram_index_numeric_fields):
         assert ram_index.searcher().num_docs == 3
-        assert ram_index.searcher().num_segments == 2
+        assert ram_index.searcher().num_segments == 1
         assert (
             repr(ram_index_numeric_fields.searcher())
             == "Searcher(num_docs=2, num_segments=2)"
diff --git a/tests/test_stat_collector.py b/tests/test_stat_collector.py
@@ -78,8 +78,10 @@ def test_stat_searcher_filter(self, ram_kapiche_index):
         assert sorted(result.unique_docs_frames) == [(1, 1), (3, 5)]
         assert list(result.unique_docs) == [1, 3]
         assert list(result.unique_frames) == [1, 5]
+        assert list(result.unique_sentences) == [1, 2, 7]
         print(f"{result.hits}")
         print(f"{result.unique_docs_frames}")
+        print(f"{result.unique_sentences}")
 
     def test_stat_searcher_filter_unzipped(self, ram_kapiche_index):
         index = ram_kapiche_index
@@ -92,6 +94,9 @@ def test_stat_searcher_filter_unzipped(self, ram_kapiche_index):
         assert list(result.unique_docs) == [1, 3]
         assert list(result.unique_frames) == [1, 5]
 
+        d, f, s = result.unique_docs_frames_sentences_unzipped
+        assert sorted(s) == [1, 2, 7]
+
 
 def test_stat_searcher_memory():
     # Create index