Skip to content

Commit 421c9b3

Browse files
peterkmurphy authored and adamchainz committed
Support unicode literals over codepoint 0xffff
Fixes yaml#25. Rebase and tidy up of yaml#63.
1 parent c5b135f commit 421c9b3

File tree

8 files changed

+41
-48
lines changed

8 files changed

+41
-48
lines changed

lib/yaml/emitter.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,13 @@
88

99
__all__ = ['Emitter', 'EmitterError']
1010

11+
import sys
12+
1113
from error import YAMLError
1214
from events import *
1315

16+
has_ucs4 = sys.maxunicode > 0xffff
17+
1418
class EmitterError(YAMLError):
1519
pass
1620

@@ -674,7 +678,7 @@ def analyze_scalar(self, scalar):
674678
# Check for indicators.
675679
if index == 0:
676680
# Leading indicators are special characters.
677-
if ch in u'#,[]{}&*!|>\'\"%@`':
681+
if ch in u'#,[]{}&*!|>\'\"%@`':
678682
flow_indicators = True
679683
block_indicators = True
680684
if ch in u'?:':
@@ -701,7 +705,8 @@ def analyze_scalar(self, scalar):
701705
line_breaks = True
702706
if not (ch == u'\n' or u'\x20' <= ch <= u'\x7E'):
703707
if (ch == u'\x85' or u'\xA0' <= ch <= u'\uD7FF'
704-
or u'\uE000' <= ch <= u'\uFFFD') and ch != u'\uFEFF':
708+
or u'\uE000' <= ch <= u'\uFFFD'
709+
or ((not has_ucs4) or (u'\U00010000' <= ch < u'\U0010ffff'))) and ch != u'\uFEFF':
705710
unicode_characters = True
706711
if not self.allow_unicode:
707712
special_characters = True
@@ -1137,4 +1142,3 @@ def write_plain(self, text, split=True):
11371142
spaces = (ch == u' ')
11381143
breaks = (ch in u'\n\x85\u2028\u2029')
11391144
end += 1
1140-

lib/yaml/reader.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919

2020
from error import YAMLError, Mark
2121

22-
import codecs, re
22+
import codecs, re, sys
23+
24+
has_ucs4 = sys.maxunicode > 0xffff
2325

2426
class ReaderError(YAMLError):
2527

@@ -134,7 +136,10 @@ def determine_encoding(self):
134136
self.encoding = 'utf-8'
135137
self.update(1)
136138

137-
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
139+
if has_ucs4:
140+
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
141+
else:
142+
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
138143
def check_printable(self, data):
139144
match = self.NON_PRINTABLE.search(data)
140145
if match:
@@ -187,4 +192,3 @@ def update_raw(self, size=1024):
187192
# psyco.bind(Reader)
188193
#except ImportError:
189194
# pass
190-

lib3/yaml/emitter.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,7 @@ def analyze_scalar(self, scalar):
671671
# Check for indicators.
672672
if index == 0:
673673
# Leading indicators are special characters.
674-
if ch in '#,[]{}&*!|>\'\"%@`':
674+
if ch in '#,[]{}&*!|>\'\"%@`':
675675
flow_indicators = True
676676
block_indicators = True
677677
if ch in '?:':
@@ -698,7 +698,8 @@ def analyze_scalar(self, scalar):
698698
line_breaks = True
699699
if not (ch == '\n' or '\x20' <= ch <= '\x7E'):
700700
if (ch == '\x85' or '\xA0' <= ch <= '\uD7FF'
701-
or '\uE000' <= ch <= '\uFFFD') and ch != '\uFEFF':
701+
or '\uE000' <= ch <= '\uFFFD'
702+
or '\U00010000' <= ch < '\U0010ffff') and ch != '\uFEFF':
702703
unicode_characters = True
703704
if not self.allow_unicode:
704705
special_characters = True
@@ -1134,4 +1135,3 @@ def write_plain(self, text, split=True):
11341135
spaces = (ch == ' ')
11351136
breaks = (ch in '\n\x85\u2028\u2029')
11361137
end += 1
1137-

lib3/yaml/reader.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ def determine_encoding(self):
134134
self.encoding = 'utf-8'
135135
self.update(1)
136136

137-
NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
137+
NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
138138
def check_printable(self, data):
139139
match = self.NON_PRINTABLE.search(data)
140140
if match:
@@ -189,4 +189,3 @@ def update_raw(self, size=4096):
189189
# psyco.bind(Reader)
190190
#except ImportError:
191191
# pass
192-

tests/data/emoticons.unicode

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
😀😁😂😃😄😅😆😇
2+
😈😉😊😋😌😍😎😏
3+
😐😑😒😓😔😕😖😗
4+
😘😙😚😛😜😝😞😟
5+
😠😡😢😣😤😥😦😧
6+
😨😩😪😫😬😭😮😯
7+
😰😱😲😳😴😵😶😷
8+
😸😹😺😻😼😽😾😿
9+
🙀🙁🙂🙃🙄🙅🙆🙇
10+
🙈🙉🙊🙋🙌🙍🙎🙏

tests/data/emoticons2.unicode

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
😀

tests/lib/test_input_output.py

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@ def test_unicode_input(unicode_filename, verbose=False):
3434

3535
def test_unicode_input_errors(unicode_filename, verbose=False):
3636
data = open(unicode_filename, 'rb').read().decode('utf-8')
37-
for input in [data.encode('latin1', 'ignore'),
38-
data.encode('utf-16-be'), data.encode('utf-16-le'),
39-
codecs.BOM_UTF8+data.encode('utf-16-be'),
40-
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
41-
codecs.BOM_UTF16_LE+data.encode('utf-8')+'!']:
37+
for input in [data.encode('utf-16-be'),
38+
data.encode('utf-16-le'),
39+
codecs.BOM_UTF8+data.encode('utf-16-be'),
40+
codecs.BOM_UTF8+data.encode('utf-16-le')]:
41+
4242
try:
4343
yaml.load(input)
4444
except yaml.YAMLError, exc:
@@ -69,17 +69,7 @@ def test_unicode_output(unicode_filename, verbose=False):
6969
stream = StringIO.StringIO()
7070
yaml.dump(value, stream, encoding=encoding, allow_unicode=allow_unicode)
7171
data4 = stream.getvalue()
72-
for copy in [data1, data2, data3, data4]:
73-
if allow_unicode:
74-
try:
75-
copy[4:].encode('ascii')
76-
except (UnicodeDecodeError, UnicodeEncodeError), exc:
77-
if verbose:
78-
print exc
79-
else:
80-
raise AssertionError("expected an exception")
81-
else:
82-
copy[4:].encode('ascii')
72+
8373
assert isinstance(data1, str), (type(data1), encoding)
8474
data1.decode('utf-8')
8575
assert isinstance(data2, str), (type(data2), encoding)
@@ -148,4 +138,3 @@ def test_unicode_transfer(unicode_filename, verbose=False):
148138
if __name__ == '__main__':
149139
import test_appliance
150140
test_appliance.run(globals())
151-

tests/lib3/test_input_output.py

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ def test_unicode_input(unicode_filename, verbose=False):
2424

2525
def test_unicode_input_errors(unicode_filename, verbose=False):
2626
data = open(unicode_filename, 'rb').read().decode('utf-8')
27-
for input in [data.encode('latin1', 'ignore'),
28-
data.encode('utf-16-be'), data.encode('utf-16-le'),
29-
codecs.BOM_UTF8+data.encode('utf-16-be'),
30-
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
31-
codecs.BOM_UTF16_LE+data.encode('utf-8')+b'!']:
27+
for input in [data.encode('utf-16-be'),
28+
data.encode('utf-16-le'),
29+
codecs.BOM_UTF8+data.encode('utf-16-be'),
30+
codecs.BOM_UTF8+data.encode('utf-16-le')]:
31+
3232
try:
3333
yaml.load(input)
3434
except yaml.YAMLError as exc:
@@ -75,20 +75,7 @@ def test_unicode_output(unicode_filename, verbose=False):
7575
if verbose:
7676
print("BYTES:", data4[:50])
7777
data4 = data4.decode(encoding)
78-
for copy in [data1, data2, data3, data4]:
79-
if copy is None:
80-
continue
81-
assert isinstance(copy, str)
82-
if allow_unicode:
83-
try:
84-
copy[4:].encode('ascii')
85-
except UnicodeEncodeError as exc:
86-
if verbose:
87-
print(exc)
88-
else:
89-
raise AssertionError("expected an exception")
90-
else:
91-
copy[4:].encode('ascii')
78+
9279
assert isinstance(data1, str), (type(data1), encoding)
9380
assert isinstance(data2, str), (type(data2), encoding)
9481

@@ -147,4 +134,3 @@ def test_unicode_transfer(unicode_filename, verbose=False):
147134
if __name__ == '__main__':
148135
import test_appliance
149136
test_appliance.run(globals())
150-

0 commit comments

Comments (0)