Skip to content

Commit f3c6f88

Browse files
[3.11] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135484)
End-of-file errors are now handled according to the HTML5 specification: unterminated comments and declarations are automatically closed, and unterminated tags are ignored. (cherry picked from commit 6eb6c5d)
1 parent 313544e commit f3c6f88

File tree

3 files changed

+117
-23
lines changed

3 files changed

+117
-23
lines changed

Lib/html/parser.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
2626

2727
starttagopen = re.compile('<[a-zA-Z]')
28+
endtagopen = re.compile('</[a-zA-Z]')
2829
piclose = re.compile('>')
2930
commentclose = re.compile(r'--\s*>')
3031
# Note:
@@ -176,25 +177,43 @@ def goahead(self, end):
176177
k = self.parse_pi(i)
177178
elif startswith("<!", i):
178179
k = self.parse_html_declaration(i)
179-
elif (i + 1) < n:
180+
elif (i + 1) < n or end:
180181
self.handle_data("<")
181182
k = i + 1
182183
else:
183184
break
184185
if k < 0:
185186
if not end:
186187
break
187-
k = rawdata.find('>', i + 1)
188-
if k < 0:
189-
k = rawdata.find('<', i + 1)
190-
if k < 0:
191-
k = i + 1
192-
else:
193-
k += 1
194-
if self.convert_charrefs and not self.cdata_elem:
195-
self.handle_data(unescape(rawdata[i:k]))
188+
if starttagopen.match(rawdata, i): # < + letter
189+
pass
190+
elif startswith("</", i):
191+
if i + 2 == n:
192+
self.handle_data("</")
193+
elif endtagopen.match(rawdata, i): # </ + letter
194+
pass
195+
else:
196+
# bogus comment
197+
self.handle_comment(rawdata[i+2:])
198+
elif startswith("<!--", i):
199+
j = n
200+
for suffix in ("--!", "--", "-"):
201+
if rawdata.endswith(suffix, i+4):
202+
j -= len(suffix)
203+
break
204+
self.handle_comment(rawdata[i+4:j])
205+
elif startswith("<![CDATA[", i):
206+
self.unknown_decl(rawdata[i+3:])
207+
elif rawdata[i:i+9].lower() == '<!doctype':
208+
self.handle_decl(rawdata[i+2:])
209+
elif startswith("<!", i):
210+
# bogus comment
211+
self.handle_comment(rawdata[i+2:])
212+
elif startswith("<?", i):
213+
self.handle_pi(rawdata[i+2:])
196214
else:
197-
self.handle_data(rawdata[i:k])
215+
raise AssertionError("we should not get here!")
216+
k = n
198217
i = self.updatepos(i, k)
199218
elif startswith("&#", i):
200219
match = charref.match(rawdata, i)

Lib/test/test_htmlparser.py

Lines changed: 83 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import pprint
55
import unittest
66

7+
from test import support
8+
79

810
class EventCollector(html.parser.HTMLParser):
911

@@ -391,28 +393,34 @@ def test_tolerant_parsing(self):
391393
('data', '<'),
392394
('starttag', 'bc<', [('a', None)]),
393395
('endtag', 'html'),
394-
('data', '\n<img src="URL>'),
395-
('comment', '/img'),
396-
('endtag', 'html<')])
396+
('data', '\n')])
397397

398398
def test_starttag_junk_chars(self):
399+
self._run_check("<", [('data', '<')])
400+
self._run_check("<>", [('data', '<>')])
401+
self._run_check("< >", [('data', '< >')])
402+
self._run_check("< ", [('data', '< ')])
399403
self._run_check("</>", [])
404+
self._run_check("<$>", [('data', '<$>')])
400405
self._run_check("</$>", [('comment', '$')])
401406
self._run_check("</", [('data', '</')])
402-
self._run_check("</a", [('data', '</a')])
407+
self._run_check("</a", [])
408+
self._run_check("</ a>", [('endtag', 'a')])
409+
self._run_check("</ a", [('comment', ' a')])
403410
self._run_check("<a<a>", [('starttag', 'a<a', [])])
404411
self._run_check("</a<a>", [('endtag', 'a<a')])
405-
self._run_check("<!", [('data', '<!')])
406-
self._run_check("<a", [('data', '<a')])
407-
self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
408-
self._run_check("<a foo='bar", [('data', "<a foo='bar")])
409-
self._run_check("<a foo='>'", [('data', "<a foo='>'")])
410-
self._run_check("<a foo='>", [('data', "<a foo='>")])
412+
self._run_check("<!", [('comment', '')])
413+
self._run_check("<a", [])
414+
self._run_check("<a foo='bar'", [])
415+
self._run_check("<a foo='bar", [])
416+
self._run_check("<a foo='>'", [])
417+
self._run_check("<a foo='>", [])
411418
self._run_check("<a$>", [('starttag', 'a$', [])])
412419
self._run_check("<a$b>", [('starttag', 'a$b', [])])
413420
self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
414421
self._run_check("<a$b >", [('starttag', 'a$b', [])])
415422
self._run_check("<a$b />", [('startendtag', 'a$b', [])])
423+
self._run_check("</a$b>", [('endtag', 'a$b')])
416424

417425
def test_slashes_in_starttag(self):
418426
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
@@ -537,13 +545,56 @@ def test_EOF_in_charref(self):
537545
for html, expected in data:
538546
self._run_check(html, expected)
539547

540-
def test_broken_comments(self):
541-
html = ('<! not really a comment >'
548+
def test_eof_in_comments(self):
549+
data = [
550+
('<!--', [('comment', '')]),
551+
('<!---', [('comment', '')]),
552+
('<!----', [('comment', '')]),
553+
('<!-----', [('comment', '-')]),
554+
('<!------', [('comment', '--')]),
555+
('<!----!', [('comment', '')]),
556+
('<!---!', [('comment', '-!')]),
557+
('<!---!>', [('comment', '-!>')]),
558+
('<!--foo', [('comment', 'foo')]),
559+
('<!--foo-', [('comment', 'foo')]),
560+
('<!--foo--', [('comment', 'foo')]),
561+
('<!--foo--!', [('comment', 'foo')]),
562+
('<!--<!--', [('comment', '<!')]),
563+
('<!--<!--!', [('comment', '<!')]),
564+
]
565+
for html, expected in data:
566+
self._run_check(html, expected)
567+
568+
def test_eof_in_declarations(self):
569+
data = [
570+
('<!', [('comment', '')]),
571+
('<!-', [('comment', '-')]),
572+
('<![', [('comment', '[')]),
573+
('<![CDATA[', [('unknown decl', 'CDATA[')]),
574+
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
575+
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
576+
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
577+
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
578+
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
579+
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
580+
('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
581+
('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
582+
('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
583+
('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
584+
[('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
585+
]
586+
for html, expected in data:
587+
self._run_check(html, expected)
588+
589+
def test_bogus_comments(self):
590+
html = ('<!ELEMENT br EMPTY>'
591+
'<! not really a comment >'
542592
'<! not a comment either -->'
543593
'<! -- close enough -->'
544594
'<!><!<-- this was an empty comment>'
545595
'<!!! another bogus comment !!!>')
546596
expected = [
597+
('comment', 'ELEMENT br EMPTY'),
547598
('comment', ' not really a comment '),
548599
('comment', ' not a comment either --'),
549600
('comment', ' -- close enough --'),
@@ -598,6 +649,26 @@ def test_convert_charrefs_dropped_text(self):
598649
('endtag', 'a'), ('data', ' bar & baz')]
599650
)
600651

652+
@support.requires_resource('cpu')
653+
def test_eof_no_quadratic_complexity(self):
654+
# Each of these examples used to take about an hour.
655+
# Now they take a fraction of a second.
656+
def check(source):
657+
parser = html.parser.HTMLParser()
658+
parser.feed(source)
659+
parser.close()
660+
n = 120_000
661+
check("<a " * n)
662+
check("<a a=" * n)
663+
check("</a " * 14 * n)
664+
check("</a a=" * 11 * n)
665+
check("<!--" * 4 * n)
666+
check("<!" * 60 * n)
667+
check("<?" * 19 * n)
668+
check("</$" * 15 * n)
669+
check("<![CDATA[" * 9 * n)
670+
check("<!doctype" * 35 * n)
671+
601672

602673
class AttributesTestCase(TestCaseBase):
603674

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix quadratic complexity in processing specially crafted input in
2+
:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
3+
to the HTML5 specs -- comments and declarations are automatically closed,
4+
tags are ignored.

0 commit comments

Comments
 (0)