|
5 | 5 | import unittest
|
6 | 6 |
|
7 | 7 | from unittest.mock import patch
|
| 8 | +from test import support |
8 | 9 |
|
9 | 10 |
|
10 | 11 | class EventCollector(html.parser.HTMLParser):
|
@@ -393,28 +394,34 @@ def test_tolerant_parsing(self):
|
393 | 394 | ('data', '<'),
|
394 | 395 | ('starttag', 'bc<', [('a', None)]),
|
395 | 396 | ('endtag', 'html'),
|
396 |
| - ('data', '\n<img src="URL>'), |
397 |
| - ('comment', '/img'), |
398 |
| - ('endtag', 'html<')]) |
| 397 | + ('data', '\n')]) |
399 | 398 |
|
400 | 399 | def test_starttag_junk_chars(self):
|
| 400 | + self._run_check("<", [('data', '<')]) |
| 401 | + self._run_check("<>", [('data', '<>')]) |
| 402 | + self._run_check("< >", [('data', '< >')]) |
| 403 | + self._run_check("< ", [('data', '< ')]) |
401 | 404 | self._run_check("</>", [])
|
| 405 | + self._run_check("<$>", [('data', '<$>')]) |
402 | 406 | self._run_check("</$>", [('comment', '$')])
|
403 | 407 | self._run_check("</", [('data', '</')])
|
404 |
| - self._run_check("</a", [('data', '</a')]) |
| 408 | + self._run_check("</a", []) |
| 409 | + self._run_check("</ a>", [('endtag', 'a')]) |
| 410 | + self._run_check("</ a", [('comment', ' a')]) |
405 | 411 | self._run_check("<a<a>", [('starttag', 'a<a', [])])
|
406 | 412 | self._run_check("</a<a>", [('endtag', 'a<a')])
|
407 |
| - self._run_check("<!", [('data', '<!')]) |
408 |
| - self._run_check("<a", [('data', '<a')]) |
409 |
| - self._run_check("<a foo='bar'", [('data', "<a foo='bar'")]) |
410 |
| - self._run_check("<a foo='bar", [('data', "<a foo='bar")]) |
411 |
| - self._run_check("<a foo='>'", [('data', "<a foo='>'")]) |
412 |
| - self._run_check("<a foo='>", [('data', "<a foo='>")]) |
| 413 | + self._run_check("<!", [('comment', '')]) |
| 414 | + self._run_check("<a", []) |
| 415 | + self._run_check("<a foo='bar'", []) |
| 416 | + self._run_check("<a foo='bar", []) |
| 417 | + self._run_check("<a foo='>'", []) |
| 418 | + self._run_check("<a foo='>", []) |
413 | 419 | self._run_check("<a$>", [('starttag', 'a$', [])])
|
414 | 420 | self._run_check("<a$b>", [('starttag', 'a$b', [])])
|
415 | 421 | self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
|
416 | 422 | self._run_check("<a$b >", [('starttag', 'a$b', [])])
|
417 | 423 | self._run_check("<a$b />", [('startendtag', 'a$b', [])])
|
| 424 | + self._run_check("</a$b>", [('endtag', 'a$b')]) |
418 | 425 |
|
419 | 426 | def test_slashes_in_starttag(self):
|
420 | 427 | self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
|
@@ -539,13 +546,56 @@ def test_EOF_in_charref(self):
|
539 | 546 | for html, expected in data:
|
540 | 547 | self._run_check(html, expected)
|
541 | 548 |
|
542 |
| - def test_broken_comments(self): |
543 |
| - html = ('<! not really a comment >' |
| 549 | + def test_eof_in_comments(self): |
| 550 | + data = [ |
| 551 | + ('<!--', [('comment', '')]), |
| 552 | + ('<!---', [('comment', '')]), |
| 553 | + ('<!----', [('comment', '')]), |
| 554 | + ('<!-----', [('comment', '-')]), |
| 555 | + ('<!------', [('comment', '--')]), |
| 556 | + ('<!----!', [('comment', '')]), |
| 557 | + ('<!---!', [('comment', '-!')]), |
| 558 | + ('<!---!>', [('comment', '-!>')]), |
| 559 | + ('<!--foo', [('comment', 'foo')]), |
| 560 | + ('<!--foo-', [('comment', 'foo')]), |
| 561 | + ('<!--foo--', [('comment', 'foo')]), |
| 562 | + ('<!--foo--!', [('comment', 'foo')]), |
| 563 | + ('<!--<!--', [('comment', '<!')]), |
| 564 | + ('<!--<!--!', [('comment', '<!')]), |
| 565 | + ] |
| 566 | + for html, expected in data: |
| 567 | + self._run_check(html, expected) |
| 568 | + |
| 569 | + def test_eof_in_declarations(self): |
| 570 | + data = [ |
| 571 | + ('<!', [('comment', '')]), |
| 572 | + ('<!-', [('comment', '-')]), |
| 573 | + ('<![', [('comment', '[')]), |
| 574 | + ('<![CDATA[', [('unknown decl', 'CDATA[')]), |
| 575 | + ('<![CDATA[x', [('unknown decl', 'CDATA[x')]), |
| 576 | + ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]), |
| 577 | + ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]), |
| 578 | + ('<!DOCTYPE', [('decl', 'DOCTYPE')]), |
| 579 | + ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]), |
| 580 | + ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]), |
| 581 | + ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]), |
| 582 | + ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]), |
| 583 | + ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]), |
| 584 | + ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo', |
| 585 | + [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]), |
| 586 | + ] |
| 587 | + for html, expected in data: |
| 588 | + self._run_check(html, expected) |
| 589 | + |
| 590 | + def test_bogus_comments(self): |
| 591 | + html = ('<!ELEMENT br EMPTY>' |
| 592 | + '<! not really a comment >' |
544 | 593 | '<! not a comment either -->'
|
545 | 594 | '<! -- close enough -->'
|
546 | 595 | '<!><!<-- this was an empty comment>'
|
547 | 596 | '<!!! another bogus comment !!!>')
|
548 | 597 | expected = [
|
| 598 | + ('comment', 'ELEMENT br EMPTY'), |
549 | 599 | ('comment', ' not really a comment '),
|
550 | 600 | ('comment', ' not a comment either --'),
|
551 | 601 | ('comment', ' -- close enough --'),
|
@@ -600,6 +650,26 @@ def test_convert_charrefs_dropped_text(self):
|
600 | 650 | ('endtag', 'a'), ('data', ' bar & baz')]
|
601 | 651 | )
|
602 | 652 |
|
| 653 | + @support.requires_resource('cpu') |
| 654 | + def test_eof_no_quadratic_complexity(self): |
| 655 | + # Each of these examples used to take about an hour. |
| 656 | + # Now they take a fraction of a second. |
| 657 | + def check(source): |
| 658 | + parser = html.parser.HTMLParser() |
| 659 | + parser.feed(source) |
| 660 | + parser.close() |
| 661 | + n = 120_000 |
| 662 | + check("<a " * n) |
| 663 | + check("<a a=" * n) |
| 664 | + check("</a " * 14 * n) |
| 665 | + check("</a a=" * 11 * n) |
| 666 | + check("<!--" * 4 * n) |
| 667 | + check("<!" * 60 * n) |
| 668 | + check("<?" * 19 * n) |
| 669 | + check("</$" * 15 * n) |
| 670 | + check("<![CDATA[" * 9 * n) |
| 671 | + check("<!doctype" * 35 * n) |
| 672 | + |
603 | 673 |
|
604 | 674 | class AttributesTestCase(TestCaseBase):
|
605 | 675 |
|
|
0 commit comments