|
4 | 4 | import pprint
|
5 | 5 | import unittest
|
6 | 6 |
|
| 7 | +from test import support |
| 8 | + |
7 | 9 |
|
8 | 10 | class EventCollector(html.parser.HTMLParser):
|
9 | 11 |
|
@@ -391,28 +393,34 @@ def test_tolerant_parsing(self):
|
391 | 393 | ('data', '<'),
|
392 | 394 | ('starttag', 'bc<', [('a', None)]),
|
393 | 395 | ('endtag', 'html'),
|
394 |
| - ('data', '\n<img src="URL>'), |
395 |
| - ('comment', '/img'), |
396 |
| - ('endtag', 'html<')]) |
| 396 | + ('data', '\n')]) |
397 | 397 |
|
398 | 398 | def test_starttag_junk_chars(self):
|
| 399 | + self._run_check("<", [('data', '<')]) |
| 400 | + self._run_check("<>", [('data', '<>')]) |
| 401 | + self._run_check("< >", [('data', '< >')]) |
| 402 | + self._run_check("< ", [('data', '< ')]) |
399 | 403 | self._run_check("</>", [])
|
| 404 | + self._run_check("<$>", [('data', '<$>')]) |
400 | 405 | self._run_check("</$>", [('comment', '$')])
|
401 | 406 | self._run_check("</", [('data', '</')])
|
402 |
| - self._run_check("</a", [('data', '</a')]) |
| 407 | + self._run_check("</a", []) |
| 408 | + self._run_check("</ a>", [('endtag', 'a')]) |
| 409 | + self._run_check("</ a", [('comment', ' a')]) |
403 | 410 | self._run_check("<a<a>", [('starttag', 'a<a', [])])
|
404 | 411 | self._run_check("</a<a>", [('endtag', 'a<a')])
|
405 |
| - self._run_check("<!", [('data', '<!')]) |
406 |
| - self._run_check("<a", [('data', '<a')]) |
407 |
| - self._run_check("<a foo='bar'", [('data', "<a foo='bar'")]) |
408 |
| - self._run_check("<a foo='bar", [('data', "<a foo='bar")]) |
409 |
| - self._run_check("<a foo='>'", [('data', "<a foo='>'")]) |
410 |
| - self._run_check("<a foo='>", [('data', "<a foo='>")]) |
| 412 | + self._run_check("<!", [('comment', '')]) |
| 413 | + self._run_check("<a", []) |
| 414 | + self._run_check("<a foo='bar'", []) |
| 415 | + self._run_check("<a foo='bar", []) |
| 416 | + self._run_check("<a foo='>'", []) |
| 417 | + self._run_check("<a foo='>", []) |
411 | 418 | self._run_check("<a$>", [('starttag', 'a$', [])])
|
412 | 419 | self._run_check("<a$b>", [('starttag', 'a$b', [])])
|
413 | 420 | self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
|
414 | 421 | self._run_check("<a$b >", [('starttag', 'a$b', [])])
|
415 | 422 | self._run_check("<a$b />", [('startendtag', 'a$b', [])])
|
| 423 | + self._run_check("</a$b>", [('endtag', 'a$b')]) |
416 | 424 |
|
417 | 425 | def test_slashes_in_starttag(self):
|
418 | 426 | self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
|
@@ -537,13 +545,56 @@ def test_EOF_in_charref(self):
|
537 | 545 | for html, expected in data:
|
538 | 546 | self._run_check(html, expected)
|
539 | 547 |
|
540 |
| - def test_broken_comments(self): |
541 |
| - html = ('<! not really a comment >' |
| 548 | + def test_eof_in_comments(self): |
| 549 | + data = [ |
| 550 | + ('<!--', [('comment', '')]), |
| 551 | + ('<!---', [('comment', '')]), |
| 552 | + ('<!----', [('comment', '')]), |
| 553 | + ('<!-----', [('comment', '-')]), |
| 554 | + ('<!------', [('comment', '--')]), |
| 555 | + ('<!----!', [('comment', '')]), |
| 556 | + ('<!---!', [('comment', '-!')]), |
| 557 | + ('<!---!>', [('comment', '-!>')]), |
| 558 | + ('<!--foo', [('comment', 'foo')]), |
| 559 | + ('<!--foo-', [('comment', 'foo')]), |
| 560 | + ('<!--foo--', [('comment', 'foo')]), |
| 561 | + ('<!--foo--!', [('comment', 'foo')]), |
| 562 | + ('<!--<!--', [('comment', '<!')]), |
| 563 | + ('<!--<!--!', [('comment', '<!')]), |
| 564 | + ] |
| 565 | + for html, expected in data: |
| 566 | + self._run_check(html, expected) |
| 567 | + |
| 568 | + def test_eof_in_declarations(self): |
| 569 | + data = [ |
| 570 | + ('<!', [('comment', '')]), |
| 571 | + ('<!-', [('comment', '-')]), |
| 572 | + ('<![', [('comment', '[')]), |
| 573 | + ('<![CDATA[', [('unknown decl', 'CDATA[')]), |
| 574 | + ('<![CDATA[x', [('unknown decl', 'CDATA[x')]), |
| 575 | + ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]), |
| 576 | + ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]), |
| 577 | + ('<!DOCTYPE', [('decl', 'DOCTYPE')]), |
| 578 | + ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]), |
| 579 | + ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]), |
| 580 | + ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]), |
| 581 | + ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]), |
| 582 | + ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]), |
| 583 | + ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo', |
| 584 | + [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]), |
| 585 | + ] |
| 586 | + for html, expected in data: |
| 587 | + self._run_check(html, expected) |
| 588 | + |
| 589 | + def test_bogus_comments(self): |
| 590 | + html = ('<!ELEMENT br EMPTY>' |
| 591 | + '<! not really a comment >' |
542 | 592 | '<! not a comment either -->'
|
543 | 593 | '<! -- close enough -->'
|
544 | 594 | '<!><!<-- this was an empty comment>'
|
545 | 595 | '<!!! another bogus comment !!!>')
|
546 | 596 | expected = [
|
| 597 | + ('comment', 'ELEMENT br EMPTY'), |
547 | 598 | ('comment', ' not really a comment '),
|
548 | 599 | ('comment', ' not a comment either --'),
|
549 | 600 | ('comment', ' -- close enough --'),
|
@@ -598,6 +649,26 @@ def test_convert_charrefs_dropped_text(self):
|
598 | 649 | ('endtag', 'a'), ('data', ' bar & baz')]
|
599 | 650 | )
|
600 | 651 |
|
@support.requires_resource('cpu')
def test_eof_no_quadratic_complexity(self):
    # Each of these examples used to take about an hour.
    # Now they take a fraction of a second.
    #
    # Every row is (fragment, factor): the fragment is repeated
    # factor * n times, fed to a fresh parser in one call, and the
    # parser is then closed.  Nothing is asserted about the output.
    n = 120_000
    unterminated = (
        ("<a ", 1),
        ("<a a=", 1),
        ("</a ", 14),
        ("</a a=", 11),
        ("<!--", 4),
        ("<!", 60),
        ("<?", 19),
        ("</$", 15),
        ("<![CDATA[", 9),
        ("<!doctype", 35),
    )
    for fragment, factor in unterminated:
        parser = html.parser.HTMLParser()
        parser.feed(fragment * factor * n)
        parser.close()
| 671 | + |
601 | 672 |
|
602 | 673 | class AttributesTestCase(TestCaseBase):
|
603 | 674 |
|
|
0 commit comments