Implement support to BatchAPIs to gather evidence #687
Changes from 30 commits
```diff
@@ -22,9 +22,10 @@
 )

 from paperqa.clients import DEFAULT_CLIENTS, DocMetadataClient
-from paperqa.core import llm_parse_json, map_fxn_summary
+from paperqa.core import gather_with_batch, llm_parse_json, map_fxn_summary
 from paperqa.llms import (
     EmbeddingModel,
+    LLMBatchModel,
     LLMModel,
     NumpyVectorStore,
     PromptRunner,
```
```diff
@@ -559,14 +560,14 @@ def get_evidence(
             )
         )

-    async def aget_evidence(
+    async def aget_evidence(  # noqa: PLR0912
         self,
         query: PQASession | str,
         exclude_text_filter: set[str] | None = None,
         settings: MaybeSettings = None,
         callbacks: list[Callable] | None = None,
         embedding_model: EmbeddingModel | None = None,
-        summary_llm_model: LLMModel | None = None,
+        summary_llm_model: LLMModel | LLMBatchModel | None = None,
     ) -> PQASession:

         evidence_settings = get_settings(settings)
```
```diff
@@ -629,28 +630,40 @@ async def aget_evidence(
             )

         with set_llm_session_ids(session.id):
-            results = await gather_with_concurrency(
-                answer_config.max_concurrent_requests,
-                [
-                    map_fxn_summary(
-                        text=m,
-                        question=session.question,
-                        prompt_runner=prompt_runner,
-                        extra_prompt_data={
-                            "summary_length": answer_config.evidence_summary_length,
-                            "citation": f"{m.name}: {m.doc.formatted_citation}",
-                        },
-                        parser=llm_parse_json if prompt_config.use_json else None,
-                        callbacks=callbacks,
-                    )
-                    for m in matches
-                ],
-            )
+            if evidence_settings.use_batch_in_summary:
+                results = await gather_with_batch(
+                    matches=matches,
+                    question=session.question,
+                    prompt_runner=prompt_runner,
+                    extra_prompt_data={
+                        "summary_length": answer_config.evidence_summary_length,
+                    },
+                    parser=llm_parse_json if prompt_config.use_json else None,
+                    callbacks=callbacks,
+                )
+            else:
+                results = await gather_with_concurrency(
+                    answer_config.max_concurrent_requests,
+                    [
+                        map_fxn_summary(
+                            text=m,
+                            question=session.question,
+                            prompt_runner=prompt_runner,
+                            extra_prompt_data={
+                                "summary_length": answer_config.evidence_summary_length,
+                                "citation": f"{m.name}: {m.doc.formatted_citation}",
+                            },
+                            parser=llm_parse_json if prompt_config.use_json else None,
+                            callbacks=callbacks,
+                        )
+                        for m in matches
+                    ],
+                )

         for _, llm_result in results:
             session.add_tokens(llm_result)

-        session.contexts += [r for r, _ in results if r is not None]
+        session.contexts += [r for r, _ in results]
```
Review comments on the `session.contexts += [r for r, _ in results]` change:

> why did we cut the `if r is not None`?

> This gets the […] Maybe that's an edge case that I didn't see?

> If we correctly type hinted `gather_with_concurrency`:
>
> ```python
> T = TypeVar("T")
>
> async def gather_with_concurrency(n: int, coros: Iterable[Awaitable[T]]) -> list[T]:
>     ...
> ```
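For context, a fuller version of the typed helper the reviewer is sketching might look like the following. This is an illustrative sketch only, not necessarily how paperqa's actual `gather_with_concurrency` is implemented:

```python
# Illustrative sketch of a typed, semaphore-bounded gather helper in the spirit
# of the reviewer's comment; the real paperqa utility may differ in details.
import asyncio
from collections.abc import Awaitable, Iterable
from typing import TypeVar

T = TypeVar("T")


async def gather_with_concurrency(n: int, coros: Iterable[Awaitable[T]]) -> list[T]:
    """Await all coroutines with at most n running at once, preserving result order."""
    semaphore = asyncio.Semaphore(n)

    async def bounded(coro: Awaitable[T]) -> T:
        async with semaphore:
            return await coro

    return await asyncio.gather(*(bounded(c) for c in coros))
```

With `T` bound at the call site, a type checker can infer from `map_fxn_summary`'s return annotation whether the gathered results can contain `None`, which makes dropping (or keeping) the `if r is not None` guard a checkable decision rather than a guess.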
The hunk then continues:

```diff
         return session

     def query(
```
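For orientation, here is a rough approximation of what a batch-style evidence gatherer along these lines could look like, inferred only from the call site in the hunk above. It is not the actual `paperqa.core.gather_with_batch`; the `Context`/`LLMResult` stand-ins and the `prompt_runner` contract are assumptions made for the sketch:

```python
# Illustrative approximation of a batch evidence gatherer (NOT the real
# paperqa.core.gather_with_batch). Types and the prompt_runner contract below
# are simplified stand-ins assumed for this sketch.
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from typing import Any


@dataclass
class LLMResult:  # stand-in for paperqa's LLMResult
    text: str


@dataclass
class Context:  # stand-in for paperqa's Context
    text: Any
    context: str
    score: int = 5


async def gather_with_batch(
    matches: list[Any],
    question: str,
    prompt_runner: Callable[[list[dict[str, Any]]], Awaitable[list[LLMResult]]],
    extra_prompt_data: dict[str, Any] | None = None,
    parser: Callable[[str], dict[str, Any]] | None = None,
    callbacks: list[Callable[[str], None]] | None = None,
) -> list[tuple[Context, LLMResult]]:
    """Summarize all matches via one batched request instead of N concurrent calls."""
    # Build one prompt payload per match, then submit them together so a
    # provider batch API can process the whole set as a single job.
    payloads = [
        {"question": question, "text": m.text, **(extra_prompt_data or {})}
        for m in matches
    ]
    llm_results = await prompt_runner(payloads)

    results: list[tuple[Context, LLMResult]] = []
    for match, llm_result in zip(matches, llm_results):
        for callback in callbacks or []:
            callback(llm_result.text)
        fields = parser(llm_result.text) if parser else {"summary": llm_result.text}
        results.append(
            (
                Context(
                    text=match,
                    context=fields.get("summary", llm_result.text),
                    score=int(fields.get("relevance_score", 5)),
                ),
                llm_result,
            )
        )
    return results
```

One difference visible in the diff itself: the batch path's `extra_prompt_data` carries only `summary_length` and no longer includes the per-match `citation`, since that value differs per item rather than being shared across the batch.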
```diff
@@ -659,7 +672,7 @@ def query(
         settings: MaybeSettings = None,
         callbacks: list[Callable] | None = None,
         llm_model: LLMModel | None = None,
-        summary_llm_model: LLMModel | None = None,
+        summary_llm_model: LLMModel | LLMBatchModel | None = None,
         embedding_model: EmbeddingModel | None = None,
     ) -> PQASession:
         return get_loop().run_until_complete(
```
```diff
@@ -679,7 +692,7 @@ async def aquery(  # noqa: PLR0912
         settings: MaybeSettings = None,
         callbacks: list[Callable] | None = None,
         llm_model: LLMModel | None = None,
-        summary_llm_model: LLMModel | None = None,
+        summary_llm_model: LLMModel | LLMBatchModel | None = None,
         embedding_model: EmbeddingModel | None = None,
     ) -> PQASession:
```
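Downstream, opting into the batch path would presumably look something like the sketch below. The exact location of the `use_batch_in_summary` flag is an assumption, inferred from `evidence_settings.use_batch_in_summary` in the hunk above; verify against the PR before relying on it:

```python
# Hypothetical usage sketch: enable batch summarization when gathering evidence.
# Assumes use_batch_in_summary is a top-level Settings field, as suggested by
# evidence_settings.use_batch_in_summary in the diff.
from paperqa import Docs, Settings

docs = Docs()
docs.add("my_paper.pdf")  # hypothetical local file

settings = Settings(use_batch_in_summary=True)
session = docs.query(
    "How does batching affect evidence gathering?",
    settings=settings,
)
print(session.answer)
```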