Fixed sonnet json formatting issue (#293)

whitead · web-flow · commit f16240a85c73 · 2024-06-25T13:17:49.000-07:00
* Fixed sonnet json formatting issue

* PR comments - added notes and types
diff --git a/paperqa/utils.py b/paperqa/utils.py
@@ -184,4 +184,15 @@ def llm_read_json(text: str) -> dict:
     text = "{" + text.split("{", 1)[-1]
     # split anything after the last }
     text = text.rsplit("}", 1)[0] + "}"
+
+    # escape new lines within strings
+    def replace_newlines(match: re.Match) -> str:
+        return match.group(0).replace("\n", "\\n")
+
+    # Match anything between double quotes
+    # including escaped quotes and other escaped characters.
+    # https://regex101.com/r/VFcDmB/1
+    pattern = r'"(?:[^"\\]|\\.)*"'
+    text = re.sub(pattern, replace_newlines, text)
+
     return json.loads(text)
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,7 +40,7 @@ name = "paper-qa"
 readme = "README.md"
 requires-python = ">=3.8"
 urls = {repository = "https://github.com/whitead/paper-qa"}
-version = "4.8.0"
+version = "4.8.1"
 
 [tool.codespell]
 check-filenames = true
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -3,6 +3,7 @@
 import os
 import pickle
 import tempfile
+import textwrap
 from io import BytesIO
 from pathlib import Path
 
@@ -457,6 +458,23 @@ def test_llm_read_json(example: str):
     assert llm_read_json(example) == {"example": "json"}
 
 
+def test_llm_read_json_newlines():
+    """Make sure that newlines in json are preserved and escaped."""
+    example = textwrap.dedent(
+        """
+        {
+        "summary": "A line
+
+        Another line",
+        "relevance_score": 7
+        }"""
+    )
+    assert llm_read_json(example) == {
+        "summary": "A line\n\nAnother line",
+        "relevance_score": 7,
+    }
+
+
 @pytest.mark.asyncio()
 async def test_chain_completion():
     client = AsyncOpenAI()