Skip to content

Commit a369e07

Browse files
committed
More selective escaping of -#.) (alternative approach)
This is a partial alternative to #122 (open since April) for more selective escaping of some special characters. Here, we fix the test function naming (as noted in that PR) so that the tests are actually run, and fix some incorrect test assertions so that they pass. We also make escaping of `-#.)` (the most common cases of unnecessary escaping in my use case) more selective, while remaining conservatively safe: every occurrence of those characters that might have Markdown significance is still escaped, including in the presence of wrapping (unlike in #122). (Being conservatively safe here does not cover the cases where `.` or `)` starts a fragment; the existing code was already not conservatively safe in those cases.) There are certainly more cases where the code could be made more selective while remaining conservatively safe (including in the presence of wrapping), so this is not a complete replacement for #122. However, by fixing some of the most common cases in a safe way and getting the tests actually running, I hope this allows progress to be made where the previous attempt appears to have stalled, while still allowing further incremental progress with appropriately safe logic for other characters where useful.
1 parent 964d89f commit a369e07

File tree

2 files changed

+41
-6
lines changed

2 files changed

+41
-6
lines changed

markdownify/__init__.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,20 @@ def escape(self, text):
208208
if not text:
209209
return ''
210210
if self.options['escape_misc']:
211-
text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
212-
text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
211+
text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text)
212+
# A sequence of one or more consecutive '-', preceded and
213+
# followed by whitespace or start/end of fragment, might
214+
# be confused with an underline of a header, or with a
215+
# list marker.
216+
text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
217+
# A sequence of up to six consecutive '#', preceded and
218+
# followed by whitespace or start/end of fragment, might
219+
# be confused with an ATX heading.
220+
text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
221+
# '.' or ')' preceded by up to nine digits might be
222+
# confused with a list item.
223+
text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
224+
text)
213225
if self.options['escape_asterisks']:
214226
text = text.replace('*', r'\*')
215227
if self.options['escape_underscores']:

tests/test_escaping.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,43 @@ def test_single_escaping_entities():
2828
assert md('&amp;amp;') == r'\&amp;'
2929

3030

31-
def text_misc():
31+
def test_misc():
3232
assert md('\\*') == r'\\\*'
33-
assert md('<foo>') == r'\<foo\>'
33+
assert md('&lt;foo>') == r'\<foo\>'
3434
assert md('# foo') == r'\# foo'
35+
assert md('#5') == r'#5'
36+
assert md('5#') == '5#'
37+
assert md('####### foo') == r'####### foo'
3538
assert md('> foo') == r'\> foo'
3639
assert md('~~foo~~') == r'\~\~foo\~\~'
3740
assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n'
38-
assert md('---\n') == '\\-\\-\\-\n'
41+
assert md('---\n') == '\\---\n'
42+
assert md('- test') == r'\- test'
43+
assert md('x - y') == r'x \- y'
44+
assert md('test-case') == 'test-case'
45+
assert md('x-') == 'x-'
46+
assert md('-y') == '-y'
3947
assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n'
4048
assert md('`x`') == r'\`x\`'
4149
assert md('[text](link)') == r'\[text](link)'
4250
assert md('1. x') == r'1\. x'
51+
# assert md('1<span>.</span> x') == r'1\. x'
52+
assert md('<span>1.</span> x') == r'1\. x'
53+
assert md(' 1. x') == r' 1\. x'
54+
assert md('123456789. x') == r'123456789\. x'
55+
assert md('1234567890. x') == r'1234567890. x'
56+
assert md('A1. x') == r'A1. x'
57+
assert md('1.2') == r'1.2'
4358
assert md('not a number. x') == r'not a number. x'
4459
assert md('1) x') == r'1\) x'
60+
# assert md('1<span>)</span> x') == r'1\) x'
61+
assert md('<span>1)</span> x') == r'1\) x'
62+
assert md(' 1) x') == r' 1\) x'
63+
assert md('123456789) x') == r'123456789\) x'
64+
assert md('1234567890) x') == r'1234567890) x'
65+
assert md('(1) x') == r'(1) x'
66+
assert md('A1) x') == r'A1) x'
67+
assert md('1)x') == r'1)x'
4568
assert md('not a number) x') == r'not a number) x'
4669
assert md('|not table|') == r'\|not table\|'
47-
assert md(r'\ <foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'
70+
assert md(r'\ &lt;foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'

0 commit comments

Comments
 (0)