Skip to content

Commit 421c9b3

Browse files
peterkmurphy authored and adamchainz committed
Support unicode literals over codepoint 0xffff
Fixes yaml#25. Rebase and tidy up of yaml#63.
1 parent c5b135f commit 421c9b3

File tree

8 files changed

+41
-48
lines changed

8 files changed

+41
-48
lines changed

lib/yaml/emitter.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,13 @@
88

99
__all__ = ['Emitter', 'EmitterError']
1010

11+
import sys
12+
1113
from error import YAMLError
1214
from events import *
1315

16+
has_ucs4 = sys.maxunicode > 0xffff
17+
1418
class EmitterError(YAMLError):
1519
pass
1620

@@ -674,7 +678,7 @@ def analyze_scalar(self, scalar):
674678
# Check for indicators.
675679
if index == 0:
676680
# Leading indicators are special characters.
677-
if ch in u'#,[]{}&*!|>\'\"%@`':
681+
if ch in u'#,[]{}&*!|>\'\"%@`':
678682
flow_indicators = True
679683
block_indicators = True
680684
if ch in u'?:':
@@ -701,7 +705,8 @@ def analyze_scalar(self, scalar):
701705
line_breaks = True
702706
if not (ch == u'\n' or u'\x20' <= ch <= u'\x7E'):
703707
if (ch == u'\x85' or u'\xA0' <= ch <= u'\uD7FF'
704-
or u'\uE000' <= ch <= u'\uFFFD') and ch != u'\uFEFF':
708+
or u'\uE000' <= ch <= u'\uFFFD'
709+
or ((not has_ucs4) or (u'\U00010000' <= ch < u'\U0010ffff'))) and ch != u'\uFEFF':
705710
unicode_characters = True
706711
if not self.allow_unicode:
707712
special_characters = True
@@ -1137,4 +1142,3 @@ def write_plain(self, text, split=True):
11371142
spaces = (ch == u' ')
11381143
breaks = (ch in u'\n\x85\u2028\u2029')
11391144
end += 1
1140-

lib/yaml/reader.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919

2020
from error import YAMLError, Mark
2121

22-
import codecs, re
22+
import codecs, re, sys
23+
24+
has_ucs4 = sys.maxunicode > 0xffff
2325

2426
class ReaderError(YAMLError):
2527

@@ -134,7 +136,10 @@ def determine_encoding(self):
134136
self.encoding = 'utf-8'
135137
self.update(1)
136138

137-
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
139+
if has_ucs4:
140+
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
141+
else:
142+
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
138143
def check_printable(self, data):
139144
match = self.NON_PRINTABLE.search(data)
140145
if match:
@@ -187,4 +192,3 @@ def update_raw(self, size=1024):
187192
# psyco.bind(Reader)
188193
#except ImportError:
189194
# pass
190-

lib3/yaml/emitter.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,7 @@ def analyze_scalar(self, scalar):
671671
# Check for indicators.
672672
if index == 0:
673673
# Leading indicators are special characters.
674-
if ch in '#,[]{}&*!|>\'\"%@`':
674+
if ch in '#,[]{}&*!|>\'\"%@`':
675675
flow_indicators = True
676676
block_indicators = True
677677
if ch in '?:':
@@ -698,7 +698,8 @@ def analyze_scalar(self, scalar):
698698
line_breaks = True
699699
if not (ch == '\n' or '\x20' <= ch <= '\x7E'):
700700
if (ch == '\x85' or '\xA0' <= ch <= '\uD7FF'
701-
or '\uE000' <= ch <= '\uFFFD') and ch != '\uFEFF':
701+
or '\uE000' <= ch <= '\uFFFD'
702+
or '\U00010000' <= ch < '\U0010ffff') and ch != '\uFEFF':
702703
unicode_characters = True
703704
if not self.allow_unicode:
704705
special_characters = True
@@ -1134,4 +1135,3 @@ def write_plain(self, text, split=True):
11341135
spaces = (ch == ' ')
11351136
breaks = (ch in '\n\x85\u2028\u2029')
11361137
end += 1
1137-

lib3/yaml/reader.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ def determine_encoding(self):
134134
self.encoding = 'utf-8'
135135
self.update(1)
136136

137-
NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
137+
NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
138138
def check_printable(self, data):
139139
match = self.NON_PRINTABLE.search(data)
140140
if match:
@@ -189,4 +189,3 @@ def update_raw(self, size=4096):
189189
# psyco.bind(Reader)
190190
#except ImportError:
191191
# pass
192-

tests/data/emoticons.unicode

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
😀😁😂😃😄😅😆😇
2+
😈😉😊😋😌😍😎😏
3+
😐😑😒😓😔😕😖😗
4+
😘😙😚😛😜😝😞😟
5+
😠😡😢😣😤😥😦😧
6+
😨😩😪😫😬😭😮😯
7+
😰😱😲😳😴😵😶😷
8+
😸😹😺😻😼😽😾😿
9+
🙀🙁🙂🙃🙄🙅🙆🙇
10+
🙈🙉🙊🙋🙌🙍🙎🙏

tests/data/emoticons2.unicode

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
😀

tests/lib/test_input_output.py

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@ def test_unicode_input(unicode_filename, verbose=False):
3434

3535
def test_unicode_input_errors(unicode_filename, verbose=False):
3636
data = open(unicode_filename, 'rb').read().decode('utf-8')
37-
for input in [data.encode('latin1', 'ignore'),
38-
data.encode('utf-16-be'), data.encode('utf-16-le'),
39-
codecs.BOM_UTF8+data.encode('utf-16-be'),
40-
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
41-
codecs.BOM_UTF16_LE+data.encode('utf-8')+'!']:
37+
for input in [data.encode('utf-16-be'),
38+
data.encode('utf-16-le'),
39+
codecs.BOM_UTF8+data.encode('utf-16-be'),
40+
codecs.BOM_UTF8+data.encode('utf-16-le')]:
41+
4242
try:
4343
yaml.load(input)
4444
except yaml.YAMLError, exc:
@@ -69,17 +69,7 @@ def test_unicode_output(unicode_filename, verbose=False):
6969
stream = StringIO.StringIO()
7070
yaml.dump(value, stream, encoding=encoding, allow_unicode=allow_unicode)
7171
data4 = stream.getvalue()
72-
for copy in [data1, data2, data3, data4]:
73-
if allow_unicode:
74-
try:
75-
copy[4:].encode('ascii')
76-
except (UnicodeDecodeError, UnicodeEncodeError), exc:
77-
if verbose:
78-
print exc
79-
else:
80-
raise AssertionError("expected an exception")
81-
else:
82-
copy[4:].encode('ascii')
72+
8373
assert isinstance(data1, str), (type(data1), encoding)
8474
data1.decode('utf-8')
8575
assert isinstance(data2, str), (type(data2), encoding)
@@ -148,4 +138,3 @@ def test_unicode_transfer(unicode_filename, verbose=False):
148138
if __name__ == '__main__':
149139
import test_appliance
150140
test_appliance.run(globals())
151-

tests/lib3/test_input_output.py

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ def test_unicode_input(unicode_filename, verbose=False):
2424

2525
def test_unicode_input_errors(unicode_filename, verbose=False):
2626
data = open(unicode_filename, 'rb').read().decode('utf-8')
27-
for input in [data.encode('latin1', 'ignore'),
28-
data.encode('utf-16-be'), data.encode('utf-16-le'),
29-
codecs.BOM_UTF8+data.encode('utf-16-be'),
30-
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
31-
codecs.BOM_UTF16_LE+data.encode('utf-8')+b'!']:
27+
for input in [data.encode('utf-16-be'),
28+
data.encode('utf-16-le'),
29+
codecs.BOM_UTF8+data.encode('utf-16-be'),
30+
codecs.BOM_UTF8+data.encode('utf-16-le')]:
31+
3232
try:
3333
yaml.load(input)
3434
except yaml.YAMLError as exc:
@@ -75,20 +75,7 @@ def test_unicode_output(unicode_filename, verbose=False):
7575
if verbose:
7676
print("BYTES:", data4[:50])
7777
data4 = data4.decode(encoding)
78-
for copy in [data1, data2, data3, data4]:
79-
if copy is None:
80-
continue
81-
assert isinstance(copy, str)
82-
if allow_unicode:
83-
try:
84-
copy[4:].encode('ascii')
85-
except UnicodeEncodeError as exc:
86-
if verbose:
87-
print(exc)
88-
else:
89-
raise AssertionError("expected an exception")
90-
else:
91-
copy[4:].encode('ascii')
78+
9279
assert isinstance(data1, str), (type(data1), encoding)
9380
assert isinstance(data2, str), (type(data2), encoding)
9481

@@ -147,4 +134,3 @@ def test_unicode_transfer(unicode_filename, verbose=False):
147134
if __name__ == '__main__':
148135
import test_appliance
149136
test_appliance.run(globals())
150-

0 commit comments

Comments (0)