Skip to content

Commit df53edc

Browse files
Allow reserved chars in input symbols and tokenize spaces
We also added more complex tests
1 parent f400c34 commit df53edc

File tree

4 files changed

+248
-31
lines changed

4 files changed

+248
-31
lines changed

automata/fa/nfa.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -217,18 +217,6 @@ def from_regex(
217217
Self
218218
The NFA accepting the language of the input regex.
219219
"""
220-
if input_symbols is not None:
221-
# Create a modified set of reserved characters that doesn't include
222-
# whitespace
223-
whitespace_chars = {" ", "\t", "\n", "\r", "\f", "\v"}
224-
non_whitespace_reserved = RESERVED_CHARACTERS - whitespace_chars
225-
226-
conflicting_symbols = non_whitespace_reserved & input_symbols
227-
if conflicting_symbols:
228-
raise exceptions.InvalidSymbolError(
229-
f"Invalid input symbols: {conflicting_symbols}"
230-
)
231-
232220
# Import the shorthand character classes
233221
from automata.regex.parser import (
234222
DIGIT_CHARS,

automata/regex/parser.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -648,7 +648,7 @@ def get_regex_lexer(
648648
input_symbols: AbstractSet[str], state_name_counter: count
649649
) -> Lexer[NFARegexBuilder]:
650650
"""Get lexer for parsing regular expressions."""
651-
lexer: Lexer[NFARegexBuilder] = Lexer()
651+
lexer: Lexer[NFARegexBuilder] = Lexer(blank_chars=set())
652652

653653
# Register all token types
654654
lexer.register_token(LeftParen.from_match, r"\(")
@@ -723,6 +723,12 @@ def character_class_factory(match: re.Match) -> CharacterClassToken:
723723
r"\\.", # Match any escaped character
724724
)
725725

726+
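# Register an explicit token for the space character: the lexer no longer skips blanks (blank_chars=set()) and the regular-character pattern \S below does not match a space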
727+
lexer.register_token(
728+
lambda match: StringToken(match.group(), state_name_counter),
729+
r" ", # Match a space character
730+
)
731+
726732
# Handle regular characters
727733
lexer.register_token(
728734
lambda match: StringToken(match.group(), state_name_counter), r"\S"

tests/test_nfa.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ def test_nfa_equality(self) -> None:
785785
self.assertEqual(
786786
nfa2,
787787
NFA.from_regex(
788-
"(((01) | 1)*)((0*1) | (1*0))(((10) | 0)*)", input_symbols=input_symbols
788+
"(((01)|1)*)((0*1)|(1*0))(((10)|0)*)", input_symbols=input_symbols
789789
),
790790
)
791791

@@ -807,7 +807,7 @@ def test_nfa_equality(self) -> None:
807807

808808
self.assertEqual(
809809
nfa3,
810-
NFA.from_regex("(0(0 | 1)*0) | (1(0 | 1)*1)", input_symbols=input_symbols),
810+
NFA.from_regex("(0(0|1)*0)|(1(0|1)*1)", input_symbols=input_symbols),
811811
)
812812

813813
nfa4 = NFA(
@@ -828,7 +828,7 @@ def test_nfa_equality(self) -> None:
828828

829829
self.assertEqual(
830830
nfa4,
831-
NFA.from_regex("((0 | 1)*00) | ((0 | 1)*11)", input_symbols=input_symbols),
831+
NFA.from_regex("((0|1)*00)|((0|1)*11)", input_symbols=input_symbols),
832832
)
833833

834834
input_symbols_2 = {"0", "1", "2"}
@@ -853,7 +853,7 @@ def test_nfa_equality(self) -> None:
853853
self.assertEqual(
854854
nfa5,
855855
NFA.from_regex(
856-
"((((01)*0) | 2)(100)*1)*(1* | (0*2*))", input_symbols=input_symbols_2
856+
"((((01)*0)|2)(100)*1)*(1*|(0*2*))", input_symbols=input_symbols_2
857857
),
858858
)
859859

tests/test_regex.py

Lines changed: 237 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import automata.base.exceptions as exceptions
88
import automata.regex.regex as re
9-
from automata.fa.nfa import NFA, RESERVED_CHARACTERS
9+
from automata.fa.nfa import NFA
1010
from automata.regex.parser import StringToken, WildcardToken
1111

1212

@@ -114,13 +114,13 @@ def test_intersection(self) -> None:
114114
# Test intersection subset
115115
regex_3 = "bcdaaa"
116116
nfa_5 = NFA.from_regex(regex_3)
117-
nfa_6 = NFA.from_regex(f"({regex_3}) & (bcda*)")
117+
nfa_6 = NFA.from_regex(f"({regex_3})&(bcda*)")
118118

119119
self.assertEqual(nfa_5, nfa_6)
120120

121121
# Test distributive law
122-
regex_4 = f"{regex_1} & (({regex_2}) | ({regex_3}))"
123-
regex_5 = f"(({regex_1}) & ({regex_2})) | (({regex_1}) & ({regex_3}))"
122+
regex_4 = f"{regex_1}&(({regex_2})|({regex_3}))"
123+
regex_5 = f"(({regex_1})&({regex_2}))|(({regex_1})&({regex_3}))"
124124
nfa_7 = NFA.from_regex(regex_4)
125125
nfa_8 = NFA.from_regex(regex_5)
126126

@@ -159,18 +159,18 @@ def test_shuffle(self) -> None:
159159
self.assertTrue(
160160
re.isequal(
161161
"ab^cd",
162-
"abcd | acbd | cabd | acdb | cadb | cdab",
162+
"abcd|acbd|cabd|acdb|cadb|cdab",
163163
input_symbols=input_symbols,
164164
)
165165
)
166166
self.assertTrue(
167167
re.isequal("(a*)^(b*)^(c*)^(d*)", ".*", input_symbols=input_symbols)
168168
)
169169
self.assertTrue(
170-
re.isequal("ca^db", "(c^db)a | (ca^d)b", input_symbols=input_symbols)
170+
re.isequal("ca^db", "(c^db)a|(ca^d)b", input_symbols=input_symbols)
171171
)
172172
self.assertTrue(
173-
re.isequal("a^(b|c)", "ab | ac | ba | ca", input_symbols=input_symbols)
173+
re.isequal("a^(b|c)", "ab|ac|ba|ca", input_symbols=input_symbols)
174174
)
175175

176176
reference_nfa = NFA.from_regex("a*^ba")
@@ -229,10 +229,14 @@ def test_blank(self) -> None:
229229
self.assertTrue(re.isequal("a()", "a"))
230230
self.assertTrue(re.isequal("a()b()()c()", "abc"))
231231

232-
def test_invalid_symbols(self) -> None:
232+
def test_reserved_characters_handled_correctly(self) -> None:
233233
"""Should accept reserved characters in input symbols while still treating them as regex operators"""
234-
with self.assertRaises(exceptions.InvalidSymbolError):
235-
NFA.from_regex("a+", input_symbols={"a", "+"})
234+
nfa = NFA.from_regex("a+", input_symbols={"a", "+"})
235+
self.assertTrue(nfa.accepts_input("a"))
236+
self.assertTrue(nfa.accepts_input("aa"))
237+
self.assertFalse(nfa.accepts_input("a+"))
238+
self.assertFalse(nfa.accepts_input(""))
239+
self.assertFalse(nfa.accepts_input("+"))
236240

237241
def test_character_class(self) -> None:
238242
"""Should correctly handle character classes"""
@@ -344,7 +348,7 @@ def test_character_class(self) -> None:
344348
self.assertFalse(nfa1.accepts_input("b"))
345349

346350
# One more complex test, with and without input symbols
347-
input_symbols = set(string.printable) - RESERVED_CHARACTERS
351+
input_symbols = set(string.printable)
348352
nfa1 = NFA.from_regex("[a-zA-Z0-9._%+-]+", input_symbols=input_symbols)
349353
self.assertTrue(nfa1.accepts_input("a"))
350354
self.assertTrue(nfa1.accepts_input("1"))
@@ -382,8 +386,6 @@ def create_range(start_char: str, end_char: str) -> set[str]:
382386
ascii_chars = set(string.printable)
383387
input_symbols.update(ascii_chars)
384388

385-
input_symbols = input_symbols - RESERVED_CHARACTERS
386-
387389
latin_nfa = NFA.from_regex("[¡-ƿ]+", input_symbols=input_symbols)
388390
greek_nfa = NFA.from_regex("[Ͱ-Ͽ]+", input_symbols=input_symbols)
389391
cyrillic_nfa = NFA.from_regex("[Ѐ-ӿ]+", input_symbols=input_symbols)
@@ -437,7 +439,7 @@ def create_range(start_char: str, end_char: str) -> set[str]:
437439
self.assertFalse(non_latin_nfa.accepts_input("a¡"))
438440

439441
alphabet = set("abcdefghijklmnopqrstuvwxyz")
440-
alphabet = alphabet - RESERVED_CHARACTERS
442+
alphabet = alphabet
441443
safe_input_symbols = input_symbols.union(alphabet)
442444

443445
ascii_range_nfa = NFA.from_regex("[i-p]+", input_symbols=safe_input_symbols)
@@ -625,3 +627,224 @@ def test_shorthand_character_classes(self) -> None:
625627
self.assertTrue(complex_nfa.accepts_input("_\t0\n"))
626628
self.assertFalse(complex_nfa.accepts_input("abc 123\n")) # space instead of tab
627629
self.assertFalse(complex_nfa.accepts_input("abc\t123")) # missing newline
630+
631+
def test_negated_class_with_period(self) -> None:
632+
"""Test that negated character classes can match the period character"""
633+
634+
# Create an NFA with a (non-negated) character class containing a period, followed by a wildcard
635+
nfa = NFA.from_regex(r"[.]+.", input_symbols={"a"})
636+
self.assertTrue(nfa.accepts_input(".a"))
637+
self.assertFalse(nfa.accepts_input("<a"))
638+
639+
# Create an NFA with a negated character class
640+
nfa = NFA.from_regex(r"[^<>]+", input_symbols={"a", "."})
641+
self.assertTrue(nfa.accepts_input("."))
642+
self.assertTrue(nfa.accepts_input("..."))
643+
644+
nfa = NFA.from_regex(r"[^<>]+", input_symbols=set(string.printable))
645+
# This should match any character except < and >
646+
self.assertTrue(nfa.accepts_input("abc"))
647+
self.assertTrue(nfa.accepts_input("123"))
648+
self.assertTrue(nfa.accepts_input('!@#$%^&*()_+{}|:",./?`~'))
649+
650+
# These should not match
651+
self.assertFalse(nfa.accepts_input("<"))
652+
self.assertFalse(nfa.accepts_input(">"))
653+
self.assertFalse(nfa.accepts_input("a<b")) # contains <
654+
self.assertFalse(nfa.accepts_input("a>b")) # contains >
655+
656+
def test_slash_character(self) -> None:
657+
"""Should correctly handle the slash character"""
658+
nfa = NFA.from_regex(r"/", input_symbols=set(string.printable))
659+
self.assertTrue(nfa.accepts_input("/"))
660+
self.assertFalse(nfa.accepts_input("a/b"))
661+
662+
def test_email_like_regexes(self) -> None:
663+
"""Should correctly handle email-like regexes"""
664+
input_symbols = set(string.printable)
665+
666+
# Pattern for bracketed email content: ">content<something"
667+
bracketed_nfa = NFA.from_regex(r">[^<>]+<.*", input_symbols=input_symbols)
668+
self.assertTrue(bracketed_nfa.accepts_input(">[email protected]<"))
669+
self.assertTrue(bracketed_nfa.accepts_input(">John Doe<[email protected]"))
670+
self.assertFalse(bracketed_nfa.accepts_input("[email protected]")) # missing >
671+
self.assertFalse(bracketed_nfa.accepts_input("><")) # empty content
672+
673+
# Pattern for "To:" header field
674+
to_header_nfa = NFA.from_regex(r"to:[^\r\n]+\r\n", input_symbols=input_symbols)
675+
self.assertTrue(to_header_nfa.accepts_input("to:[email protected]\r\n"))
676+
self.assertTrue(
677+
to_header_nfa.accepts_input(
678+
"to:Multiple Recipients <[email protected]>\r\n"
679+
)
680+
)
681+
self.assertFalse(
682+
to_header_nfa.accepts_input("to:[email protected]")
683+
) # missing newline
684+
self.assertFalse(
685+
to_header_nfa.accepts_input("from:[email protected]\r\n")
686+
) # wrong header
687+
688+
# Pattern for "Subject:" header field
689+
subject_nfa = NFA.from_regex(
690+
r"\)subject:[^\r\n]+\r\n", input_symbols=input_symbols
691+
)
692+
self.assertTrue(subject_nfa.accepts_input(")subject:Hello World\r\n"))
693+
self.assertTrue(
694+
subject_nfa.accepts_input(")subject:Re: Meeting Tomorrow at 10AM\r\n")
695+
)
696+
self.assertFalse(
697+
subject_nfa.accepts_input("subject:Hello World\r\n")
698+
) # missing )
699+
700+
# Pattern for standard email address
701+
email_nfa = NFA.from_regex(
702+
r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/]+@[A-Za-z0-9.\-@]+",
703+
input_symbols=input_symbols,
704+
)
705+
self.assertTrue(email_nfa.accepts_input("[email protected]"))
706+
self.assertTrue(email_nfa.accepts_input("[email protected]"))
707+
self.assertTrue(email_nfa.accepts_input("unusual!#$%&'*[email protected]"))
708+
self.assertFalse(email_nfa.accepts_input("@example.com")) # missing local part
709+
self.assertFalse(email_nfa.accepts_input("user@")) # missing domain
710+
711+
# Pattern for DKIM signature with Base64 hash
712+
dkim_bh_nfa = NFA.from_regex(
713+
r"dkim-signature:([a-z]+=[^;]+; )+bh=[a-zA-Z0-9+/=]+;",
714+
input_symbols=input_symbols,
715+
)
716+
self.assertTrue(
717+
dkim_bh_nfa.accepts_input(
718+
"dkim-signature:v=1; a=rsa-sha256; bh=47DEQpj8HBSa+/TImW+5JCeuQeR;"
719+
)
720+
)
721+
self.assertTrue(
722+
dkim_bh_nfa.accepts_input(
723+
"dkim-signature:v=1; a=rsa-sha256; d=example.org; bh=base64+/hash=;"
724+
)
725+
)
726+
self.assertFalse(
727+
dkim_bh_nfa.accepts_input("dkim-signature:v=1; bh=;")
728+
) # empty hash
729+
730+
# Pattern for alternative email address format
731+
alt_email_nfa = NFA.from_regex(
732+
r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/@]+@[A-Za-z0-9.\-]+",
733+
input_symbols=input_symbols,
734+
)
735+
self.assertTrue(alt_email_nfa.accepts_input("[email protected]"))
736+
self.assertTrue(
737+
alt_email_nfa.accepts_input("user/[email protected]")
738+
) # with slash
739+
self.assertFalse(alt_email_nfa.accepts_input("user@")) # missing domain
740+
741+
# Pattern for "From:" header field
742+
from_header_nfa = NFA.from_regex(
743+
r"from:[^\r\n]+\r\n", input_symbols=input_symbols
744+
)
745+
self.assertTrue(from_header_nfa.accepts_input("from:[email protected]\r\n"))
746+
self.assertTrue(
747+
from_header_nfa.accepts_input("from:John Doe <[email protected]>\r\n")
748+
)
749+
self.assertFalse(
750+
from_header_nfa.accepts_input("from:[email protected]")
751+
) # missing newline
752+
753+
# Pattern for DKIM signature with timestamp
754+
dkim_time_nfa = NFA.from_regex(
755+
r"dkim-signature:([a-z]+=[^;]+; )+t=[0-9]+;", input_symbols=input_symbols
756+
)
757+
self.assertTrue(
758+
dkim_time_nfa.accepts_input(
759+
"dkim-signature:v=1; a=rsa-sha256; t=1623456789;"
760+
)
761+
)
762+
self.assertTrue(
763+
dkim_time_nfa.accepts_input(
764+
"dkim-signature:v=1; a=rsa-sha256; s=selector; t=1623456789;"
765+
)
766+
)
767+
self.assertFalse(
768+
dkim_time_nfa.accepts_input("dkim-signature:v=1; t=;")
769+
) # empty timestamp
770+
771+
# Pattern for Message-ID header
772+
msgid_nfa = NFA.from_regex(
773+
r"message-id:<[A-Za-z0-9=@\.\+_-]+>\r\n", input_symbols=input_symbols
774+
)
775+
self.assertTrue(msgid_nfa.accepts_input("message-id:<[email protected]>\r\n"))
776+
self.assertTrue(
777+
msgid_nfa.accepts_input("message-id:<[email protected]>\r\n")
778+
)
779+
self.assertFalse(
780+
msgid_nfa.accepts_input("message-id:<invalid chars!>\r\n")
781+
) # invalid chars
782+
self.assertFalse(
783+
msgid_nfa.accepts_input("message-id:<[email protected]>")
784+
) # missing newline
785+
786+
def test_repeating_group_with_space(self) -> None:
787+
"""Test a simpler version of the DKIM signature pattern to isolate the issue"""
788+
input_symbols = set(string.printable)
789+
790+
# Baseline: the same repeating group without the space in the pattern
791+
no_space = NFA.from_regex(r"([a-z]+=[^;]+;)+", input_symbols=input_symbols)
792+
self.assertTrue(no_space.accepts_input("v=1;"))
793+
self.assertTrue(no_space.accepts_input("v=1;a=2;"))
794+
795+
# Test with explicit space character instead of relying on character class
796+
explicit_space = NFA.from_regex(
797+
r"([a-z]+=[^;]+; )+", input_symbols=input_symbols
798+
)
799+
self.assertTrue(explicit_space.accepts_input("v=1; "))
800+
801+
# Simplified version of the problematic pattern
802+
simple_repeat = NFA.from_regex(
803+
r"([a-z]+=[^;]+; )+", input_symbols=input_symbols
804+
)
805+
self.assertTrue(simple_repeat.accepts_input("v=1; "))
806+
self.assertTrue(simple_repeat.accepts_input("v=1; a=2; "))
807+
808+
# Test the full pattern but simplified
809+
full_simple = NFA.from_regex(
810+
r"header:([a-z]+=[^;]+; )+value;", input_symbols=input_symbols
811+
)
812+
self.assertTrue(full_simple.accepts_input("header:v=1; value;"))
813+
self.assertTrue(full_simple.accepts_input("header:v=1; a=2; value;"))
814+
815+
def test_space_in_patterns(self) -> None:
816+
"""Test different patterns with spaces to isolate the issue"""
817+
input_symbols = set(string.printable)
818+
819+
# Test 1: Basic pattern with space at the end
820+
basic = NFA.from_regex(r"a ", input_symbols=input_symbols)
821+
self.assertTrue(basic.accepts_input("a "))
822+
823+
# Test 2: Character class with space
824+
with_class = NFA.from_regex(r"a[b ]", input_symbols=input_symbols)
825+
self.assertTrue(with_class.accepts_input("a "))
826+
self.assertTrue(with_class.accepts_input("ab"))
827+
828+
# Test 3: Simple repetition with space
829+
simple_repeat = NFA.from_regex(r"(a )+", input_symbols=input_symbols)
830+
self.assertTrue(simple_repeat.accepts_input("a "))
831+
self.assertTrue(simple_repeat.accepts_input("a a "))
832+
833+
# Test 4: Specific repeating pattern without the semicolon
834+
no_semicolon = NFA.from_regex(r"([a-z]+=. )+", input_symbols=input_symbols)
835+
self.assertTrue(no_semicolon.accepts_input("v=1 "))
836+
self.assertTrue(no_semicolon.accepts_input("v=1 a=2 "))
837+
838+
# Test 5: With semicolon but space before
839+
space_before = NFA.from_regex(r"([a-z]+=[^;]+ ;)+", input_symbols=input_symbols)
840+
self.assertTrue(space_before.accepts_input("v=1 ;"))
841+
self.assertTrue(space_before.accepts_input("v=1 ;a=2 ;"))
842+
843+
# Test 6: Space as part of negated class
844+
space_in_neg = NFA.from_regex(r"([a-z]+=[^; ]+;)+", input_symbols=input_symbols)
845+
self.assertTrue(space_in_neg.accepts_input("v=1;"))
846+
847+
# Test 7: Bare minimum to reproduce
848+
minimal = NFA.from_regex(r"(a; )+", input_symbols=input_symbols)
849+
self.assertTrue(minimal.accepts_input("a; "))
850+
self.assertTrue(minimal.accepts_input("a; a; "))

0 commit comments

Comments
 (0)