Skip to content

Commit 3ee8646

Browse files
committed
Reverted changes to parse_text that forced it to work only with str
1 parent 3b89e85 commit 3ee8646

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

paperqa/readers.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -105,18 +105,24 @@ def parse_text(
105105
"""
106106
try:
107107
with path.open() as f:
108-
text: str = "".join(list(f)) if split_lines else f.read()
108+
text = list(f) if split_lines else f.read()
109109
except UnicodeDecodeError:
110110
with path.open(encoding="utf-8", errors="ignore") as f:
111111
text = f.read()
112112

113113
if html:
114+
if not isinstance(text, str):
115+
raise NotImplementedError(
116+
"HTML parsing is not yet set up to work with split_lines."
117+
)
114118
text = html2text.html2text(text)
115119

116120
metadata = {
117121
"parsing_libraries": ["tiktoken (cl100k_base)"] if use_tiktoken else [],
118122
"paperqa_version": pqa_version,
119-
"total_parsed_text_length": len(text),
123+
"total_parsed_text_length": (
124+
len(text) if isinstance(text, str) else sum(len(t) for t in text)
125+
),
120126
"parse_type": "txt" if not html else "html",
121127
}
122128
if html:

0 commit comments

Comments
 (0)