allow BeautifulSoup configuration kwargs to be specified

chrispy-snps · chrispy-snps · commit 2d0a14a2a796 · 2025-06-14T07:24:19.000-04:00
Signed-off-by: chrispy <chrispy@synopsys.com>
diff --git a/README.rst b/README.rst
@@ -157,12 +157,16 @@ strip_document
   within the document are unaffected.
   Defaults to ``STRIP``.
 
-beautiful_soup_parser
-  Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
-  as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution
-  environment. Defaults to ``html.parser``.
-
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
+bs4_options
+  Specify additional configuration options for the ``BeautifulSoup`` object
+  used to interpret the HTML markup. String and list values (such as ``lxml``)
+  are treated as ``features`` parameter arguments to control parser
+  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
+  are treated as full kwargs to be used for the BeautifulSoup constructor,
+  allowing specification of any parameter. For parameter details, see the
+  `BeautifulSoup`_ documentation.
+
+.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
 
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -154,7 +154,7 @@ def _next_block_content_sibling(el):
 class MarkdownConverter(object):
     class DefaultOptions:
         autolinks = True
-        beautiful_soup_parser = 'html.parser'
+        bs4_options = 'html.parser'
         bullets = '*+-'  # An iterable of bullet types.
         code_language = ''
         code_language_callback = None
@@ -188,11 +188,15 @@ def __init__(self, **options):
             raise ValueError('You may specify either tags to strip or tags to'
                              ' convert, but not both.')
 
+        # If a string or list is passed to bs4_options, assume it is a 'features' specification
+        if not isinstance(self.options['bs4_options'], dict):
+            self.options['bs4_options'] = {'features': self.options['bs4_options']}
+
         # Initialize the conversion function cache
         self.convert_fn_cache = {}
 
     def convert(self, html):
-        soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
+        soup = BeautifulSoup(html, **self.options['bs4_options'])
         return self.convert_soup(soup)
 
     def convert_soup(self, soup):
diff --git a/tests/test_args.py b/tests/test_args.py
@@ -32,3 +32,9 @@ def test_strip_document():
     assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
     assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
     assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
+
+
+def test_bs4_options():  # "test_" prefix so pytest collects it; covers the str, list and dict forms
+    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"