def llm_parse_json(text: str) -> dict:
    """Read LLM output and extract JSON data from it.

    The text is narrowed in three steps before parsing:

    1. If a markdown ```json fence is present, only the content of the last
       such fence is kept.
    2. Everything before the first ``{`` and after the last ``}`` is dropped
       (the braces are re-added, so brace-less input still yields a
       candidate object).
    3. Literal newline, carriage-return and tab characters found inside
       double-quoted strings are replaced by their JSON escape sequences,
       since strict JSON forbids raw control characters in strings.

    Args:
        text: Raw LLM output that is expected to contain one JSON object.

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If the narrowed text still cannot be parsed as JSON. The
            original ``json.JSONDecodeError`` is chained as the cause.
    """
    # fetch from markdown ```json if present
    ptext = text.strip().split("```json")[-1].split("```")[0]
    # split anything before the first { after the last }
    ptext = ("{" + ptext.split("{", 1)[-1]).rsplit("}", 1)[0] + "}"

    def escape_control_chars(match: re.Match) -> str:
        # Only the matched string literal is touched, so whitespace between
        # JSON tokens is left alone.
        return (
            match.group(0)
            .replace("\n", "\\n")
            .replace("\r", "\\r")
            .replace("\t", "\\t")
        )

    # Match anything between double quotes
    # including escaped quotes and other escaped characters.
    # https://regex101.com/r/VFcDmB/1
    pattern = r'"(?:[^"\\]|\\.)*"'
    ptext = re.sub(pattern, escape_control_chars, ptext)
    try:
        return json.loads(ptext)
    except json.JSONDecodeError as e:
        # Report the untouched input (not ptext) so the caller can see
        # exactly what the model produced.
        raise ValueError(
            f"Failed to parse JSON from text {text!r}. Your model may not be capable of"
            " supporting JSON output or our parsing technique could use some work. Try"
            " a different model or specify `Settings(prompts={'use_json': False})`"
        ) from e