|
6 | 6 |
|
7 | 7 | import automata.base.exceptions as exceptions
|
8 | 8 | import automata.regex.regex as re
|
9 |
| -from automata.fa.nfa import NFA, RESERVED_CHARACTERS |
| 9 | +from automata.fa.nfa import NFA |
10 | 10 | from automata.regex.parser import StringToken, WildcardToken
|
11 | 11 |
|
12 | 12 |
|
@@ -114,13 +114,13 @@ def test_intersection(self) -> None:
|
114 | 114 | # Test intersection subset
|
115 | 115 | regex_3 = "bcdaaa"
|
116 | 116 | nfa_5 = NFA.from_regex(regex_3)
|
117 |
| - nfa_6 = NFA.from_regex(f"({regex_3}) & (bcda*)") |
| 117 | + nfa_6 = NFA.from_regex(f"({regex_3})&(bcda*)") |
118 | 118 |
|
119 | 119 | self.assertEqual(nfa_5, nfa_6)
|
120 | 120 |
|
121 | 121 | # Test distributive law
|
122 |
| - regex_4 = f"{regex_1} & (({regex_2}) | ({regex_3}))" |
123 |
| - regex_5 = f"(({regex_1}) & ({regex_2})) | (({regex_1}) & ({regex_3}))" |
| 122 | + regex_4 = f"{regex_1}&(({regex_2})|({regex_3}))" |
| 123 | + regex_5 = f"(({regex_1})&({regex_2}))|(({regex_1})&({regex_3}))" |
124 | 124 | nfa_7 = NFA.from_regex(regex_4)
|
125 | 125 | nfa_8 = NFA.from_regex(regex_5)
|
126 | 126 |
|
@@ -159,18 +159,18 @@ def test_shuffle(self) -> None:
|
159 | 159 | self.assertTrue(
|
160 | 160 | re.isequal(
|
161 | 161 | "ab^cd",
|
162 |
| - "abcd | acbd | cabd | acdb | cadb | cdab", |
| 162 | + "abcd|acbd|cabd|acdb|cadb|cdab", |
163 | 163 | input_symbols=input_symbols,
|
164 | 164 | )
|
165 | 165 | )
|
166 | 166 | self.assertTrue(
|
167 | 167 | re.isequal("(a*)^(b*)^(c*)^(d*)", ".*", input_symbols=input_symbols)
|
168 | 168 | )
|
169 | 169 | self.assertTrue(
|
170 |
| - re.isequal("ca^db", "(c^db)a | (ca^d)b", input_symbols=input_symbols) |
| 170 | + re.isequal("ca^db", "(c^db)a|(ca^d)b", input_symbols=input_symbols) |
171 | 171 | )
|
172 | 172 | self.assertTrue(
|
173 |
| - re.isequal("a^(b|c)", "ab | ac | ba | ca", input_symbols=input_symbols) |
| 173 | + re.isequal("a^(b|c)", "ab|ac|ba|ca", input_symbols=input_symbols) |
174 | 174 | )
|
175 | 175 |
|
176 | 176 | reference_nfa = NFA.from_regex("a*^ba")
|
@@ -229,10 +229,14 @@ def test_blank(self) -> None:
|
229 | 229 | self.assertTrue(re.isequal("a()", "a"))
|
230 | 230 | self.assertTrue(re.isequal("a()b()()c()", "abc"))
|
231 | 231 |
|
232 |
| - def test_invalid_symbols(self) -> None: |
| 232 | + def test_reserved_characters_handled_correctly(self) -> None: |
233 | 233 | """Should parse reserved characters as operators even when they are in input symbols"""
|
234 |
| - with self.assertRaises(exceptions.InvalidSymbolError): |
235 |
| - NFA.from_regex("a+", input_symbols={"a", "+"}) |
| 234 | + nfa = NFA.from_regex("a+", input_symbols={"a", "+"}) |
| 235 | + self.assertTrue(nfa.accepts_input("a")) |
| 236 | + self.assertTrue(nfa.accepts_input("aa")) |
| 237 | + self.assertFalse(nfa.accepts_input("a+")) |
| 238 | + self.assertFalse(nfa.accepts_input("")) |
| 239 | + self.assertFalse(nfa.accepts_input("+")) |
236 | 240 |
|
237 | 241 | def test_character_class(self) -> None:
|
238 | 242 | """Should correctly handle character classes"""
|
@@ -344,7 +348,7 @@ def test_character_class(self) -> None:
|
344 | 348 | self.assertFalse(nfa1.accepts_input("b"))
|
345 | 349 |
|
346 | 350 | # One more complex test with and without input symbols
|
347 |
| - input_symbols = set(string.printable) - RESERVED_CHARACTERS |
| 351 | + input_symbols = set(string.printable) |
348 | 352 | nfa1 = NFA.from_regex("[a-zA-Z0-9._%+-]+", input_symbols=input_symbols)
|
349 | 353 | self.assertTrue(nfa1.accepts_input("a"))
|
350 | 354 | self.assertTrue(nfa1.accepts_input("1"))
|
@@ -382,8 +386,6 @@ def create_range(start_char: str, end_char: str) -> set[str]:
|
382 | 386 | ascii_chars = set(string.printable)
|
383 | 387 | input_symbols.update(ascii_chars)
|
384 | 388 |
|
385 |
| - input_symbols = input_symbols - RESERVED_CHARACTERS |
386 |
| - |
387 | 389 | latin_nfa = NFA.from_regex("[¡-ƿ]+", input_symbols=input_symbols)
|
388 | 390 | greek_nfa = NFA.from_regex("[Ͱ-Ͽ]+", input_symbols=input_symbols)
|
389 | 391 | cyrillic_nfa = NFA.from_regex("[Ѐ-ӿ]+", input_symbols=input_symbols)
|
@@ -437,7 +439,7 @@ def create_range(start_char: str, end_char: str) -> set[str]:
|
437 | 439 | self.assertFalse(non_latin_nfa.accepts_input("a¡"))
|
438 | 440 |
|
439 | 441 | alphabet = set("abcdefghijklmnopqrstuvwxyz")
|
440 |
| - alphabet = alphabet - RESERVED_CHARACTERS |
| 442 | + alphabet = alphabet |
441 | 443 | safe_input_symbols = input_symbols.union(alphabet)
|
442 | 444 |
|
443 | 445 | ascii_range_nfa = NFA.from_regex("[i-p]+", input_symbols=safe_input_symbols)
|
@@ -625,3 +627,224 @@ def test_shorthand_character_classes(self) -> None:
|
625 | 627 | self.assertTrue(complex_nfa.accepts_input("_\t0\n"))
|
626 | 628 | self.assertFalse(complex_nfa.accepts_input("abc 123\n")) # space instead of tab
|
627 | 629 | self.assertFalse(complex_nfa.accepts_input("abc\t123")) # missing newline
|
| 630 | + |
| 631 | + def test_negated_class_with_period(self) -> None: |
| 632 | + """Test that negated character classes can match the period character""" |
| 633 | + |
| 634 | + # A period inside a character class is a literal "."; the bare period after it is a wildcard |
| 635 | + nfa = NFA.from_regex(r"[.]+.", input_symbols={"a"}) |
| 636 | + self.assertTrue(nfa.accepts_input(".a")) |
| 637 | + self.assertFalse(nfa.accepts_input("<a")) |
| 638 | + |
| 639 | + # Create an NFA with a negated character class |
| 640 | + nfa = NFA.from_regex(r"[^<>]+", input_symbols={"a", "."}) |
| 641 | + self.assertTrue(nfa.accepts_input(".")) |
| 642 | + self.assertTrue(nfa.accepts_input("...")) |
| 643 | + |
| 644 | + nfa = NFA.from_regex(r"[^<>]+", input_symbols=set(string.printable)) |
| 645 | + # This should match any character except < and > |
| 646 | + self.assertTrue(nfa.accepts_input("abc")) |
| 647 | + self.assertTrue(nfa.accepts_input("123")) |
| 648 | + self.assertTrue(nfa.accepts_input('!@#$%^&*()_+{}|:",./?`~')) |
| 649 | + |
| 650 | + # These should not match |
| 651 | + self.assertFalse(nfa.accepts_input("<")) |
| 652 | + self.assertFalse(nfa.accepts_input(">")) |
| 653 | + self.assertFalse(nfa.accepts_input("a<b")) # contains < |
| 654 | + self.assertFalse(nfa.accepts_input("a>b")) # contains > |
| 655 | + |
| 656 | + def test_slash_character(self) -> None: |
| 657 | + """Should correctly handle the slash character""" |
| 658 | + nfa = NFA.from_regex(r"/", input_symbols=set(string.printable)) |
| 659 | + self.assertTrue(nfa.accepts_input("/")) |
| 660 | + self.assertFalse(nfa.accepts_input("a/b")) |
| 661 | + |
| 662 | + def test_email_like_regexes(self) -> None: |
| 663 | + """Should correctly handle email-like regexes""" |
| 664 | + input_symbols = set(string.printable) |
| 665 | + |
| 666 | + # Pattern for bracketed email content: ">content<something" |
| 667 | + bracketed_nfa = NFA.from_regex(r">[^<>]+<.*", input_symbols=input_symbols) |
| 668 | + self. assertTrue( bracketed_nfa. accepts_input( ">[email protected]<")) |
| 669 | + self. assertTrue( bracketed_nfa. accepts_input( ">John Doe<[email protected]")) |
| 670 | + self. assertFalse( bracketed_nfa. accepts_input( "[email protected]")) # missing > |
| 671 | + self.assertFalse(bracketed_nfa.accepts_input("><")) # empty content |
| 672 | + |
| 673 | + # Pattern for "To:" header field |
| 674 | + to_header_nfa = NFA.from_regex(r"to:[^\r\n]+\r\n", input_symbols=input_symbols) |
| 675 | + self. assertTrue( to_header_nfa. accepts_input( "to:[email protected]\r\n")) |
| 676 | + self.assertTrue( |
| 677 | + to_header_nfa.accepts_input( |
| 678 | + "to:Multiple Recipients <[email protected]>\r\n" |
| 679 | + ) |
| 680 | + ) |
| 681 | + self.assertFalse( |
| 682 | + to_header_nfa. accepts_input( "to:[email protected]") |
| 683 | + ) # missing newline |
| 684 | + self.assertFalse( |
| 685 | + to_header_nfa. accepts_input( "from:[email protected]\r\n") |
| 686 | + ) # wrong header |
| 687 | + |
| 688 | + # Pattern for "Subject:" header field |
| 689 | + subject_nfa = NFA.from_regex( |
| 690 | + r"\)subject:[^\r\n]+\r\n", input_symbols=input_symbols |
| 691 | + ) |
| 692 | + self.assertTrue(subject_nfa.accepts_input(")subject:Hello World\r\n")) |
| 693 | + self.assertTrue( |
| 694 | + subject_nfa.accepts_input(")subject:Re: Meeting Tomorrow at 10AM\r\n") |
| 695 | + ) |
| 696 | + self.assertFalse( |
| 697 | + subject_nfa.accepts_input("subject:Hello World\r\n") |
| 698 | + ) # missing ) |
| 699 | + |
| 700 | + # Pattern for standard email address |
| 701 | + email_nfa = NFA.from_regex( |
| 702 | + r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/]+@[A-Za-z0-9.\-@]+", |
| 703 | + input_symbols=input_symbols, |
| 704 | + ) |
| 705 | + self. assertTrue( email_nfa. accepts_input( "[email protected]")) |
| 706 | + self. assertTrue( email_nfa. accepts_input( "[email protected]")) |
| 707 | + self. assertTrue( email_nfa. accepts_input( "unusual!#$%&'*[email protected]")) |
| 708 | + self.assertFalse(email_nfa.accepts_input("@example.com")) # missing local part |
| 709 | + self.assertFalse(email_nfa.accepts_input("user@")) # missing domain |
| 710 | + |
| 711 | + # Pattern for DKIM signature with Base64 hash |
| 712 | + dkim_bh_nfa = NFA.from_regex( |
| 713 | + r"dkim-signature:([a-z]+=[^;]+; )+bh=[a-zA-Z0-9+/=]+;", |
| 714 | + input_symbols=input_symbols, |
| 715 | + ) |
| 716 | + self.assertTrue( |
| 717 | + dkim_bh_nfa.accepts_input( |
| 718 | + "dkim-signature:v=1; a=rsa-sha256; bh=47DEQpj8HBSa+/TImW+5JCeuQeR;" |
| 719 | + ) |
| 720 | + ) |
| 721 | + self.assertTrue( |
| 722 | + dkim_bh_nfa.accepts_input( |
| 723 | + "dkim-signature:v=1; a=rsa-sha256; d=example.org; bh=base64+/hash=;" |
| 724 | + ) |
| 725 | + ) |
| 726 | + self.assertFalse( |
| 727 | + dkim_bh_nfa.accepts_input("dkim-signature:v=1; bh=;") |
| 728 | + ) # empty hash |
| 729 | + |
| 730 | + # Pattern for alternative email address format |
| 731 | + alt_email_nfa = NFA.from_regex( |
| 732 | + r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/@]+@[A-Za-z0-9.\-]+", |
| 733 | + input_symbols=input_symbols, |
| 734 | + ) |
| 735 | + self. assertTrue( alt_email_nfa. accepts_input( "[email protected]")) |
| 736 | + self.assertTrue( |
| 737 | + alt_email_nfa. accepts_input( "user/[email protected]") |
| 738 | + ) # with slash |
| 739 | + self.assertFalse(alt_email_nfa.accepts_input("user@")) # missing domain |
| 740 | + |
| 741 | + # Pattern for "From:" header field |
| 742 | + from_header_nfa = NFA.from_regex( |
| 743 | + r"from:[^\r\n]+\r\n", input_symbols=input_symbols |
| 744 | + ) |
| 745 | + self. assertTrue( from_header_nfa. accepts_input( "from:[email protected]\r\n")) |
| 746 | + self.assertTrue( |
| 747 | + from_header_nfa. accepts_input( "from:John Doe <[email protected]>\r\n") |
| 748 | + ) |
| 749 | + self.assertFalse( |
| 750 | + from_header_nfa. accepts_input( "from:[email protected]") |
| 751 | + ) # missing newline |
| 752 | + |
| 753 | + # Pattern for DKIM signature with timestamp |
| 754 | + dkim_time_nfa = NFA.from_regex( |
| 755 | + r"dkim-signature:([a-z]+=[^;]+; )+t=[0-9]+;", input_symbols=input_symbols |
| 756 | + ) |
| 757 | + self.assertTrue( |
| 758 | + dkim_time_nfa.accepts_input( |
| 759 | + "dkim-signature:v=1; a=rsa-sha256; t=1623456789;" |
| 760 | + ) |
| 761 | + ) |
| 762 | + self.assertTrue( |
| 763 | + dkim_time_nfa.accepts_input( |
| 764 | + "dkim-signature:v=1; a=rsa-sha256; s=selector; t=1623456789;" |
| 765 | + ) |
| 766 | + ) |
| 767 | + self.assertFalse( |
| 768 | + dkim_time_nfa.accepts_input("dkim-signature:v=1; t=;") |
| 769 | + ) # empty timestamp |
| 770 | + |
| 771 | + # Pattern for Message-ID header |
| 772 | + msgid_nfa = NFA.from_regex( |
| 773 | + r"message-id:<[A-Za-z0-9=@\.\+_-]+>\r\n", input_symbols=input_symbols |
| 774 | + ) |
| 775 | + self. assertTrue( msgid_nfa. accepts_input( "message-id:<[email protected]>\r\n")) |
| 776 | + self.assertTrue( |
| 777 | + msgid_nfa. accepts_input( "message-id:<[email protected]>\r\n") |
| 778 | + ) |
| 779 | + self.assertFalse( |
| 780 | + msgid_nfa.accepts_input("message-id:<invalid chars!>\r\n") |
| 781 | + ) # invalid chars |
| 782 | + self.assertFalse( |
| 783 | + msgid_nfa. accepts_input( "message-id:<[email protected]>") |
| 784 | + ) # missing newline |
| 785 | + |
| 786 | + def test_repeating_group_with_space(self) -> None: |
| 787 | + """Test a simpler version of the DKIM signature pattern to isolate the issue""" |
| 788 | + input_symbols = set(string.printable) |
| 789 | + |
| 790 | + # Try another variation without the space in the pattern |
| 791 | + no_space = NFA.from_regex(r"([a-z]+=[^;]+;)+", input_symbols=input_symbols) |
| 792 | + self.assertTrue(no_space.accepts_input("v=1;")) |
| 793 | + self.assertTrue(no_space.accepts_input("v=1;a=2;")) |
| 794 | + |
| 795 | + # Test with explicit space character instead of relying on character class |
| 796 | + explicit_space = NFA.from_regex( |
| 797 | + r"([a-z]+=[^;]+; )+", input_symbols=input_symbols |
| 798 | + ) |
| 799 | + self.assertTrue(explicit_space.accepts_input("v=1; ")) |
| 800 | + |
| 801 | + # Simplified version of the problematic pattern |
| 802 | + simple_repeat = NFA.from_regex( |
| 803 | + r"([a-z]+=[^;]+; )+", input_symbols=input_symbols |
| 804 | + ) |
| 805 | + self.assertTrue(simple_repeat.accepts_input("v=1; ")) |
| 806 | + self.assertTrue(simple_repeat.accepts_input("v=1; a=2; ")) |
| 807 | + |
| 808 | + # Test the full pattern but simplified |
| 809 | + full_simple = NFA.from_regex( |
| 810 | + r"header:([a-z]+=[^;]+; )+value;", input_symbols=input_symbols |
| 811 | + ) |
| 812 | + self.assertTrue(full_simple.accepts_input("header:v=1; value;")) |
| 813 | + self.assertTrue(full_simple.accepts_input("header:v=1; a=2; value;")) |
| 814 | + |
| 815 | + def test_space_in_patterns(self) -> None: |
| 816 | + """Test different patterns with spaces to isolate the issue""" |
| 817 | + input_symbols = set(string.printable) |
| 818 | + |
| 819 | + # Test 1: Basic pattern with space at the end |
| 820 | + basic = NFA.from_regex(r"a ", input_symbols=input_symbols) |
| 821 | + self.assertTrue(basic.accepts_input("a ")) |
| 822 | + |
| 823 | + # Test 2: Character class with space |
| 824 | + with_class = NFA.from_regex(r"a[b ]", input_symbols=input_symbols) |
| 825 | + self.assertTrue(with_class.accepts_input("a ")) |
| 826 | + self.assertTrue(with_class.accepts_input("ab")) |
| 827 | + |
| 828 | + # Test 3: Simple repetition with space |
| 829 | + simple_repeat = NFA.from_regex(r"(a )+", input_symbols=input_symbols) |
| 830 | + self.assertTrue(simple_repeat.accepts_input("a ")) |
| 831 | + self.assertTrue(simple_repeat.accepts_input("a a ")) |
| 832 | + |
| 833 | + # Test 4: Specific repeating pattern without the semicolon |
| 834 | + no_semicolon = NFA.from_regex(r"([a-z]+=. )+", input_symbols=input_symbols) |
| 835 | + self.assertTrue(no_semicolon.accepts_input("v=1 ")) |
| 836 | + self.assertTrue(no_semicolon.accepts_input("v=1 a=2 ")) |
| 837 | + |
| 838 | + # Test 5: With semicolon but space before |
| 839 | + space_before = NFA.from_regex(r"([a-z]+=[^;]+ ;)+", input_symbols=input_symbols) |
| 840 | + self.assertTrue(space_before.accepts_input("v=1 ;")) |
| 841 | + self.assertTrue(space_before.accepts_input("v=1 ;a=2 ;")) |
| 842 | + |
| 843 | + # Test 6: Space as part of negated class |
| 844 | + space_in_neg = NFA.from_regex(r"([a-z]+=[^; ]+;)+", input_symbols=input_symbols) |
| 845 | + self.assertTrue(space_in_neg.accepts_input("v=1;")) |
| 846 | + |
| 847 | + # Test 7: Bare minimum to reproduce |
| 848 | + minimal = NFA.from_regex(r"(a; )+", input_symbols=input_symbols) |
| 849 | + self.assertTrue(minimal.accepts_input("a; ")) |
| 850 | + self.assertTrue(minimal.accepts_input("a; a; ")) |
0 commit comments