@@ -1,7 +1,8 @@
from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
from io import BytesIO
import unittest
from unittest import TestCase, mock
@@ -11,27 +12,51 @@
import token


+# Converts a source string into a list of textual representation
+# of the tokens such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Ignore the new line on the last line if the input lacks one
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+
+    return result
+
class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
        f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
+
        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
    def test_basic(self):
        self.check_tokenize("1 + 1", """\
    NUMBER     '1'           (1, 0) (1, 1)
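For context, a minimal sketch of how the new helper behaves, assuming it and the patched tokenize module are in scope (an illustration alongside the diff, not part of it). Because "1 + 1" lacks a trailing newline, the helper skips the implicit NEWLINE and stops at ENDMARKER, while the ENCODING row is kept:

    from io import BytesIO
    from tokenize import tokenize

    # "1 + 1" has no trailing newline, so the implicit NEWLINE the
    # tokenizer now emits is dropped by the helper.
    rows = stringify_tokens_from_source(tokenize(BytesIO(b"1 + 1").readline), "1 + 1")
    # rows[0] is the ENCODING line; the remaining rows are roughly:
    #     NUMBER     '1'           (1, 0) (1, 1)
    #     OP         '+'           (1, 2) (1, 3)
    #     NUMBER     '1'           (1, 4) (1, 5)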
@@ -1009,8 +1034,8 @@ def readline():
            else:
                return b''

-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")
@@ -1026,8 +1051,8 @@ def readline():
            else:
                return b''

-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")
@@ -1338,18 +1363,21 @@ def test_oneline_defs(self):

        # Test that 500 consequent, one-line defs is OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
        self.assertEqual(tok_name[tokens[0].exact_type],
                         tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(tok_name[tokens[i + 1].exact_type],
                             tok_name[optypes[i]])
        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.NEWLINE])
+        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                         tok_name[token.ENDMARKER])

    def test_exact_type(self):
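As a rough illustration, based on the behaviour the tests above assert rather than on the diff itself: a token stream now ends with an implicit NEWLINE followed by ENDMARKER, which is why assertExactTypeEqual expects 3 + num_optypes tokens and the _tokenize tests slice off two trailing tokens instead of one.

    from io import BytesIO
    from tokenize import tokenize, tok_name

    # Tokenize a bare operator with no trailing newline.
    toks = list(tokenize(BytesIO(b"+").readline))
    print([tok_name[t.exact_type] for t in toks])
    # expected with the implicit NEWLINE: ['ENCODING', 'PLUS', 'NEWLINE', 'ENDMARKER']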
@@ -1502,7 +1530,7 @@ def test_roundtrip(self):
        self.check_roundtrip("if x == 1:\n"
                             "    print(x)\n")
        self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")

        # Some people use different formatting conventions, which makes
        # untokenize a little trickier. Note that this test involves trailing
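For orientation, a minimal sketch of the roundtrip property this hunk exercises, assuming only the public tokenize/untokenize API; it mirrors the idea behind check_roundtrip, not its exact implementation.

    from io import BytesIO
    from tokenize import tokenize, untokenize

    source = b"# This is a comment\n# This also\n"
    tokens = list(tokenize(BytesIO(source).readline))
    # untokenize() returns bytes here because the stream starts with an
    # ENCODING token; re-tokenizing the output should yield equivalent
    # (type, string) pairs.
    regenerated = untokenize(tokens)
    assert ([t[:2] for t in tokenize(BytesIO(regenerated).readline)]
            == [t[:2] for t in tokens])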