@@ -1,7 +1,8 @@
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
 from io import BytesIO
 import unittest
 from unittest import TestCase, mock
@@ -11,27 +12,51 @@
 import token


+# Converts a source string into a list of textual representation
+# of the tokens such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Ignore the new line on the last line if the input lacks one
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+
+    return result
+
 class TokenizeTest(TestCase):
     # Tests for the tokenize module.

     # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.

     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
         f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
+
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())

+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)
@@ -993,8 +1018,8 @@ def readline():
             else:
                 return b''

-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1010,8 +1035,8 @@ def readline():
             else:
                 return b''

-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
@@ -1322,18 +1347,21 @@ def test_oneline_defs(self):

         # Test that 500 consequent, one-line defs is OK
         toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE

     def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
         self.assertEqual(token.tok_name[tokens[0].exact_type],
                          token.tok_name[ENCODING])
         for i in range(num_optypes):
             self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                              token.tok_name[optypes[i]])
         self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
+                         token.tok_name[token.NEWLINE])
+        self.assertEqual(token.tok_name[tokens[2 + num_optypes].exact_type],
                          token.tok_name[token.ENDMARKER])

     def test_exact_type(self):
@@ -1484,7 +1512,7 @@ def test_roundtrip(self):
         self.check_roundtrip("if x == 1:\n"
                              "    print(x)\n")
         self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")

         # Some people use different formatting conventions, which makes
         # untokenize a little trickier. Note that this test involves trailing
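
For reference, a minimal sketch (not part of the patch) of the end-of-file behavior these tests exercise, assuming a Python version that includes this change so tokenize emits the implicit NEWLINE:

    from io import BytesIO
    from tokenize import tokenize, tok_name

    # One-line source with no trailing newline; the tokenizer now appends
    # an implicit NEWLINE token before ENDMARKER.
    tokens = list(tokenize(BytesIO(b"x").readline))
    print([tok_name[t.type] for t in tokens])
    # ['ENCODING', 'NAME', 'NEWLINE', 'ENDMARKER']

This mirrors test_implicit_newline above: the last two tokens are always NEWLINE and ENDMARKER, which is why the index arithmetic in the other tests shifts by one.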