Skip to content

Commit 2d0a14a

Browse files
committed
allow BeautifulSoup configuration kwargs to be specified
Signed-off-by: chrispy <[email protected]>
1 parent 016251e commit 2d0a14a

File tree

3 files changed

+22
-8
lines changed

3 files changed

+22
-8
lines changed

README.rst

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -157,12 +157,16 @@ strip_document
157157
within the document are unaffected.
158158
Defaults to ``STRIP``.
159159

160-
beautiful_soup_parser
161-
Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
162-
as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution
163-
environment. Defaults to ``html.parser``.
164-
165-
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
160+
bs4_options
161+
Specify additional configuration options for the ``BeautifulSoup`` object
162+
used to interpret the HTML markup. String and list values (such as ``lxml``)
163+
are treated as ``features`` parameter arguments to control parser
164+
selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
165+
are treated as full kwargs to be used for the BeautifulSoup constructor,
166+
allowing specification of any parameter. For parameter details, see the
167+
`BeautifulSoup`_ documentation.
168+
169+
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
166170

167171
Options may be specified as kwargs to the ``markdownify`` function, or as a
168172
nested ``Options`` class in ``MarkdownConverter`` subclasses.

markdownify/__init__.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def _next_block_content_sibling(el):
154154
class MarkdownConverter(object):
155155
class DefaultOptions:
156156
autolinks = True
157-
beautiful_soup_parser = 'html.parser'
157+
bs4_options = 'html.parser'
158158
bullets = '*+-' # An iterable of bullet types.
159159
code_language = ''
160160
code_language_callback = None
@@ -188,11 +188,15 @@ def __init__(self, **options):
188188
raise ValueError('You may specify either tags to strip or tags to'
189189
' convert, but not both.')
190190

191+
# Any non-dict value passed to bs4_options (e.g. a string or list) is assumed to be a 'features' specification
192+
if not isinstance(self.options['bs4_options'], dict):
193+
self.options['bs4_options'] = {'features': self.options['bs4_options']}
194+
191195
# Initialize the conversion function cache
192196
self.convert_fn_cache = {}
193197

194198
def convert(self, html):
195-
soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
199+
soup = BeautifulSoup(html, **self.options['bs4_options'])
196200
return self.convert_soup(soup)
197201

198202
def convert_soup(self, soup):

tests/test_args.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,9 @@ def test_strip_document():
3232
assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
3333
assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
3434
assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
35+
36+
37+
def test_bs4_options():
    """Check that ``bs4_options`` accepts a string, a list, or a kwargs dict.

    The function name carries the ``test_`` prefix so that pytest's default
    discovery collects it, matching the other tests in this module.
    """
    # A bare string is treated as the Beautiful Soup ``features`` argument.
    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
    # A list is likewise treated as a ``features`` specification.
    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
    # A dict is used as the full set of BeautifulSoup constructor kwargs.
    assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"

0 commit comments

Comments
 (0)