upgrade unicode db to 6.3.0 (closes #19221)

bendmorris · Oct 10, 2013 · 7da8059 · 7da8059
1 parent f7102c1
commit 7da8059
Show file tree

Hide file tree

Showing 8 changed files with 17,195 additions and 17,153 deletions.
diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst
@@ -15,8 +15,8 @@
 
 This module provides access to the Unicode Character Database (UCD) which
 defines character properties for all Unicode characters. The data contained in
-this database is compiled from the `UCD version 6.2.0
-<http://www.unicode.org/Public/6.2.0/ucd>`_.
+this database is compiled from the `UCD version 6.3.0
+<http://www.unicode.org/Public/6.3.0/ucd>`_.
 
 The module uses the same names and symbols as defined by Unicode
 Standard Annex #44, `"Unicode Character Database"
@@ -166,6 +166,6 @@ Examples:
 
 .. rubric:: Footnotes
 
-.. [#] http://www.unicode.org/Public/6.2.0/ucd/NameAliases.txt
+.. [#] http://www.unicode.org/Public/6.3.0/ucd/NameAliases.txt
 
-.. [#] http://www.unicode.org/Public/6.2.0/ucd/NamedSequences.txt
+.. [#] http://www.unicode.org/Public/6.3.0/ucd/NamedSequences.txt
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -21,7 +21,7 @@
 class UnicodeMethodsTest(unittest.TestCase):
 
     # update this, if the database changes
-    expectedchecksum = 'bf7a78f1a532421b5033600102e23a92044dbba9'
+    expectedchecksum = 'e74e878de71b6e780ffac271785c3cb58f6251f3'
 
     def test_method_checksum(self):
         h = hashlib.sha1()

diff --git a/Misc/NEWS b/Misc/NEWS
@@ -10,6 +10,8 @@ Projected release date: 2013-10-20
 Core and Builtins
 -----------------
 
+- Issue #19221: Upgrade Unicode database to version 6.3.0.
+
 - Issue #16742: The result of the C callback PyOS_ReadlineFunctionPointer must
   now be a string allocated by PyMem_RawMalloc() or PyMem_RawRealloc() (or NULL
   if an error occurred), instead of a string allocated by PyMem_Malloc() or

diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
@@ -1322,10 +1322,10 @@ PyDoc_STRVAR(unicodedata_docstring,
 "This module provides access to the Unicode Character Database which\n\
 defines character properties for all Unicode characters. The data in\n\
 this database is based on the UnicodeData.txt file version\n\
-6.0.0 which is publically available from ftp://ftp.unicode.org/.\n\
+6.3.0 which is publicly available from ftp://ftp.unicode.org/.\n\
 \n\
 The module uses the same names and symbols as defined by the\n\
-UnicodeData File Format 6.0.0 (see\n\
+UnicodeData File Format 6.3.0 (see\n\
 http://www.unicode.org/reports/tr44/tr44-6.html).");
 
 

diff --git a/Modules/unicodedata_db.h b/Modules/unicodedata_db.h
diff --git a/Modules/unicodename_db.h b/Modules/unicodename_db.h
diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h
@@ -1589,7 +1589,7 @@ static unsigned short index2[] = {
     55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 
     0, 0, 0, 55, 55, 55, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 21, 21, 
     21, 21, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 25, 25, 25, 25, 25, 25, 25, 25, 
-    25, 25, 25, 5, 0, 0, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
+    25, 25, 25, 5, 21, 0, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
     55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
     55, 55, 55, 96, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 25, 25, 
     25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 7, 8, 
@@ -1801,7 +1801,7 @@ static unsigned short index2[] = {
     25, 25, 25, 25, 25, 25, 25, 25, 5, 5, 5, 96, 5, 5, 5, 5, 55, 25, 0, 0, 7, 
     8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, 
     27, 27, 27, 27, 27, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
-    25, 25, 25, 2, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 
+    25, 25, 25, 21, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 
     55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
     55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 96, 
     55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
@@ -1828,7 +1828,7 @@ static unsigned short index2[] = {
     7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 132, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 
     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
     5, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
-    55, 55, 55, 55, 55, 55, 55, 25, 25, 18, 18, 18, 0, 0, 5, 5, 55, 55, 55, 
+    55, 55, 55, 55, 55, 55, 55, 25, 25, 18, 18, 25, 0, 0, 5, 5, 55, 55, 55, 
     55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
     55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
     55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 25, 18, 25, 
@@ -1915,7 +1915,7 @@ static unsigned short index2[] = {
     5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 6, 3, 3, 21, 21, 21, 21, 21, 2, 5, 5, 
     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 18, 18, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 18, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 21, 
-    21, 21, 21, 21, 0, 0, 0, 0, 0, 21, 21, 21, 21, 21, 21, 245, 95, 0, 0, 
+    21, 21, 21, 21, 0, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 245, 95, 0, 0, 
     246, 247, 248, 249, 250, 251, 5, 5, 5, 5, 5, 95, 245, 26, 22, 23, 246, 
     247, 248, 249, 250, 251, 5, 5, 5, 5, 5, 0, 95, 95, 95, 95, 95, 95, 95, 
     95, 95, 95, 95, 95, 95, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
@@ -2925,9 +2925,6 @@ static unsigned short index2[] = {
 double _PyUnicode_ToNumeric(Py_UCS4 ch)
 {
     switch (ch) {
-    case 0x12456:
-    case 0x12457:
-        return (double) -1.0;
     case 0x0F33:
         return (double) -1.0/2.0;
     case 0x0030:
@@ -3383,6 +3380,7 @@ double _PyUnicode_ToNumeric(Py_UCS4 ch)
     case 0x12435:
     case 0x1244A:
     case 0x12450:
+    case 0x12456:
     case 0x12459:
     case 0x1D361:
     case 0x1D7D0:
@@ -3539,6 +3537,7 @@ double _PyUnicode_ToNumeric(Py_UCS4 ch)
     case 0x1243B:
     case 0x1244B:
     case 0x12451:
+    case 0x12457:
     case 0x1D362:
     case 0x1D7D1:
     case 0x1D7DB:
@@ -4294,7 +4293,6 @@ int _PyUnicode_IsWhitespace(const Py_UCS4 ch)
     case 0x0085:
     case 0x00A0:
     case 0x1680:
-    case 0x180E:
     case 0x2000:
     case 0x2001:
     case 0x2002:

diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
@@ -37,7 +37,7 @@
 VERSION = "3.2"
 
 # The Unicode Database
-UNIDATA_VERSION = "6.2.0"
+UNIDATA_VERSION = "6.3.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@@ -68,7 +68,7 @@
 
 BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
     "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
-    "ON" ]
+    "ON", "LRI", "RLI", "FSI", "PDI" ]
 
 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]