Skip to content

Commit

Permalink
unicode range stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
yosato committed Mar 18, 2013
1 parent 73fcd0b commit 8f38935
Show file tree
Hide file tree
Showing 5 changed files with 297 additions and 0 deletions.
7 changes: 7 additions & 0 deletions python/#clean_korean.py#
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env python3
"""Load the lines of the file named by the first command-line argument.

Work in progress: the script currently only reads the file into ``F``.
"""

import sys

# Path of the input file (first command-line argument).
FN = sys.argv[1]

# Read every line up front; the context manager closes the handle promptly
# instead of leaving it to the garbage collector.
with open(FN) as InFile:
    F = InFile.readlines()
9 changes: 9 additions & 0 deletions python/combined_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env python3
"""Print every "surname given-name" combination from two name-list files.

Usage: combined_names.py SURNAME_FILE GIVEN_NAME_FILE

Each input file holds one name per line. One combination is printed per
line, surnames varying slowest.
"""

import sys

# argv[1:3] yields exactly the two expected paths (argv[1:2] yields only one,
# so the two-name unpacking could never succeed).
[SurnameFN, GivenNameFN] = sys.argv[1:3]

# Read each file once; splitlines() avoids the empty trailing entry that
# split('\n') produces for a newline-terminated file.
with open(SurnameFN) as SurnameFile:
    Surnames = SurnameFile.read().splitlines()
with open(GivenNameFN) as GivenNameFile:
    GivenNames = GivenNameFile.read().splitlines()

for Surname in Surnames:
    for GivenName in GivenNames:
        print(Surname + ' ' + GivenName)
135 changes: 135 additions & 0 deletions python/unicodeRanges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# UCRs maps a (first, last) pair of code points to the name of the character
# range it delimits. Both bounds are hexadecimal strings WITHOUT a prefix
# (four digits for the BMP, five for the supplementary planes), so convert
# with int(bound, 16) before comparing numerically -- plain string comparison
# would mis-order e.g. '10000' and '2000'.
# NOTE(review): bounds are presumably inclusive on both ends -- confirm
# against the code that consumes this table.
UCRs={
('0020','007F'): 'Basic Latin',
('00A0','00FF'): 'Latin',
('0100','017F'): 'Latin Extended A',
('0180','024F'): 'Latin Extended B',
('0250','02AF'): 'IPA Extensions',
('02B0','02FF'): 'Spacing Modifier Letters',
('0300','036F'): 'Combining Diacritical Marks',
('0370','03FF'): 'Greek and Coptic',
('0400','04FF'): 'Cyrillic',
('0500','052F'): 'Cyrillic Supplementary',
('0530','058F'): 'Armenian',
('0590','05FF'): 'Hebrew',
('0600','06FF'): 'Arabic',
('0700','074F'): 'Syriac',
('0780','07BF'): 'Thaana',
('0900','097F'): 'Devanagari',
('0980','09FF'): 'Bengali',
('0A00','0A7F'): 'Gurmukhi',
('0A80','0AFF'): 'Gujarati',
('0B00','0B7F'): 'Oriya',
('0B80','0BFF'): 'Tamil',
('0C00','0C7F'): 'Telugu',
('0C80','0CFF'): 'Kannada',
('0D00','0D7F'): 'Malayalam',
('0D80','0DFF'): 'Sinhala',
('0E00','0E7F'): 'Thai',
('0E80','0EFF'): 'Lao',
('0F00','0FFF'): 'Tibetan',
('1000','109F'): 'Myanmar',
('10A0','10FF'): 'Georgian',
('1100','11FF'): 'Hangul Jamo',
('1200','137F'): 'Ethiopic',
('13A0','13FF'): 'Cherokee',
('1400','167F'): 'Unified Canadian Aboriginal Syllabics',
('1680','169F'): 'Ogham',
('16A0','16FF'): 'Runic',
('1700','171F'): 'Tagalog',
('1720','173F'): 'Hanunoo',
('1740','175F'): 'Buhid',
('1760','177F'): 'Tagbanwa',
('1780','17FF'): 'Khmer',
('1800','18AF'): 'Mongolian',
('1900','194F'): 'Limbu',
('1950','197F'): 'Tai Le',
('19E0','19FF'): 'Khmer Symbols',
('1D00','1D7F'): 'Phonetic Extensions',
('1E00','1EFF'): 'Latin Extended Additional',
('1F00','1FFF'): 'Greek Extended',
('2000','206F'): 'General Punctuation',
('2070','209F'): 'Superscripts and Subscripts',
('20A0','20CF'): 'Currency Symbols',
('20D0','20FF'): 'Combining Diacritical Marks for Symbols',
('2100','214F'): 'Letterlike Symbols',
('2150','218F'): 'Number Forms',
('2190','21FF'): 'Arrows',
('2200','22FF'): 'Mathematical Operators',
('2300','23FF'): 'Miscellaneous Technical',
('2400','243F'): 'Control Pictures',
('2440','245F'): 'Optical Character Recognition',
('2460','24FF'): 'Enclosed Alphanumerics',
('2500','257F'): 'Box Drawing',
('2580','259F'): 'Block Elements',
('25A0','25FF'): 'Geometric Shapes',
('2600','26FF'): 'Miscellaneous Symbols',
('2700','27BF'): 'Dingbats',
('27C0','27EF'): 'Miscellaneous Mathematical Symbols A',
('27F0','27FF'): 'Supplemental Arrows A',
('2800','28FF'): 'Braille Patterns',
('2900','297F'): 'Supplemental Arrows B',
('2980','29FF'): 'Miscellaneous Mathematical Symbols B',
('2A00','2AFF'): 'Supplemental Mathematical Operators',
('2B00','2BFF'): 'Miscellaneous Symbols and Arrows',
('2E80','2EFF'): 'CJK Radicals Supplement',
('2F00','2FDF'): 'Kangxi Radicals',
('2FF0','2FFF'): 'Ideographic Description Characters',
('3000','303F'): 'CJK Symbols and Punctuation',
('3040','309F'): 'Hiragana',
('30A0','30FF'): 'Katakana',
('3100','312F'): 'Bopomofo',
('3130','318F'): 'Hangul Compatibility Jamo',
('3190','319F'): 'Kanbun',
('31A0','31BF'): 'Bopomofo Extended',
('31F0','31FF'): 'Katakana Phonetic Extensions',
('3200','32FF'): 'Enclosed CJK Letters and Months',
('3300','33FF'): 'CJK Compatibility',
('3400','4DBF'): 'CJK Unified Ideographs Extension A',
('4DC0','4DFF'): 'Yijing Hexagram Symbols',
('4E00','9FFF'): 'CJK Unified Ideographs',
('A000','A48F'): 'Yi Syllables',
('A490','A4CF'): 'Yi Radicals',
('AC00','D7AF'): 'Hangul Syllables',
('D800','DB7F'): 'High Surrogates',
('DB80','DBFF'): 'High Private Use Surrogates',
('DC00','DFFF'): 'Low Surrogates',
('E000','F8FF'): 'Private Use Area',
('F900','FAFF'): 'CJK Compatibility Ideographs',
('FB00','FB4F'): 'Alphabetic Presentation Forms',
('FB50','FDFF'): 'Arabic Presentation Forms A',
('FE00','FE0F'): 'Variation Selectors',
('FE20','FE2F'): 'Combining Half Marks',
('FE30','FE4F'): 'CJK Compatibility Forms',
('FE50','FE6F'): 'Small Form Variants',
('FE70','FEFF'): 'Arabic Presentation Forms B',
('FF00','FFEF'): 'Halfwidth and Fullwidth Forms',
('FFF0','FFFF'): 'Specials',
('10000','1007F'): 'Linear B Syllabary',
('10080','100FF'): 'Linear B Ideograms',
('10100','1013F'): 'Aegean Numbers',
('10300','1032F'): 'Old Italic',
('10330','1034F'): 'Gothic',
('10380','1039F'): 'Ugaritic',
('10400','1044F'): 'Deseret',
('10450','1047F'): 'Shavian',
('10480','104AF'): 'Osmanya',
('10800','1083F'): 'Cypriot Syllabary',
('1D000','1D0FF'): 'Byzantine Musical Symbols',
('1D100','1D1FF'): 'Musical Symbols',
('1D300','1D35F'): 'Tai Xuan Jing Symbols',
('1D400','1D7FF'): 'Mathematical Alphanumeric Symbols',
('20000','2A6DF'): 'CJK Unified Ideographs Extension B',
('2F800','2FA1F'): 'CJK Compatibility Ideographs Supplement',
('E0000','E007F'): 'Tags',
}

# CrossCats groups code point ranges into coarse character categories.
# Each category maps to a list of (first, last) pairs. A pair is written
# EITHER as two decimal ints OR as two hexadecimal strings, so a consumer
# must accept both spellings (e.g. int(x, 16) for the string form).
# NOTE(review): bounds are presumably inclusive -- confirm against consumers.
CrossCats = {
    # ASCII digits and fullwidth digits (U+FF10-U+FF19).
    'num': [(48, 57,), (65296, 65305,)],
    # ASCII A-Z / a-z and fullwidth A-Z / a-z. The lowercase range must end
    # at '007a' ('z'); ending at '0077' ('w') would exclude x, y and z.
    'roman': [('0041', '005a',), ('0061', '007a',), ('ff21', 'ff3a',), ('ff41', 'ff5a',)],
    # ASCII punctuation runs, then U+2000-206F, U+25A0-26FF, U+3000-303F and
    # U+FF00-FFEF.
    # NOTE(review): (8591, 8597) is U+218F-U+2195, an oddly small window --
    # verify the intended bounds. The last range also overlaps the fullwidth
    # entries of 'num' and 'roman'.
    'sym': [(33, 47,), (58, 64,), (91, 96,), (123, 126,), (8192, 8303,), (8591, 8597,), (9632, 9983,), (12288, 12351,), (65280, 65519,)],
    # U+4E00-U+9FFF plus U+F900-U+FAFF.
    'han': [(19968, 40959,), ('f900', 'faff')],
    # U+3040-U+30FF (covers the Hiragana and Katakana rows of UCRs).
    'kana': [(12352, 12543,)],
    'hangul': [('AC00', 'D7AF',)],
    'jamo': [('1100', '11FF',), ('3130', '318f',)],
    # Tab, LF through CR, ASCII space, and U+3000.
    'ws': [('0009', '0009',), ('000A', '000D',), ('0020', '0020',), ('3000', '3000',)],
}
122 changes: 122 additions & 0 deletions python/unicodeRanges.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
0020 - 007F Basic Latin
00A0 - 00FF Latin-1 Supplement
0100 - 017F Latin Extended-A
0180 - 024F Latin Extended-B
0250 - 02AF IPA Extensions
02B0 - 02FF Spacing Modifier Letters
0300 - 036F Combining Diacritical Marks
0370 - 03FF Greek and Coptic
0400 - 04FF Cyrillic
0500 - 052F Cyrillic Supplementary
0530 - 058F Armenian
0590 - 05FF Hebrew
0600 - 06FF Arabic
0700 - 074F Syriac
0780 - 07BF Thaana
0900 - 097F Devanagari
0980 - 09FF Bengali
0A00 - 0A7F Gurmukhi
0A80 - 0AFF Gujarati
0B00 - 0B7F Oriya
0B80 - 0BFF Tamil
0C00 - 0C7F Telugu
0C80 - 0CFF Kannada
0D00 - 0D7F Malayalam
0D80 - 0DFF Sinhala
0E00 - 0E7F Thai
0E80 - 0EFF Lao
0F00 - 0FFF Tibetan
1000 - 109F Myanmar
10A0 - 10FF Georgian
1100 - 11FF Hangul Jamo
1200 - 137F Ethiopic
13A0 - 13FF Cherokee
1400 - 167F Unified Canadian Aboriginal Syllabics
1680 - 169F Ogham
16A0 - 16FF Runic
1700 - 171F Tagalog
1720 - 173F Hanunoo
1740 - 175F Buhid
1760 - 177F Tagbanwa
1780 - 17FF Khmer
1800 - 18AF Mongolian
1900 - 194F Limbu
1950 - 197F Tai Le
19E0 - 19FF Khmer Symbols
1D00 - 1D7F Phonetic Extensions
1E00 - 1EFF Latin Extended Additional
1F00 - 1FFF Greek Extended
2000 - 206F General Punctuation
2070 - 209F Superscripts and Subscripts
20A0 - 20CF Currency Symbols
20D0 - 20FF Combining Diacritical Marks for Symbols
2100 - 214F Letterlike Symbols
2150 - 218F Number Forms
2190 - 21FF Arrows
2200 - 22FF Mathematical Operators
2300 - 23FF Miscellaneous Technical
2400 - 243F Control Pictures
2440 - 245F Optical Character Recognition
2460 - 24FF Enclosed Alphanumerics
2500 - 257F Box Drawing
2580 - 259F Block Elements
25A0 - 25FF Geometric Shapes
2600 - 26FF Miscellaneous Symbols
2700 - 27BF Dingbats
27C0 - 27EF Miscellaneous Mathematical Symbols-A
27F0 - 27FF Supplemental Arrows-A
2800 - 28FF Braille Patterns
2900 - 297F Supplemental Arrows-B
2980 - 29FF Miscellaneous Mathematical Symbols-B
2A00 - 2AFF Supplemental Mathematical Operators
2B00 - 2BFF Miscellaneous Symbols and Arrows
2E80 - 2EFF CJK Radicals Supplement
2F00 - 2FDF Kangxi Radicals
2FF0 - 2FFF Ideographic Description Characters
3000 - 303F CJK Symbols and Punctuation
3040 - 309F Hiragana
30A0 - 30FF Katakana
3100 - 312F Bopomofo
3130 - 318F Hangul Compatibility Jamo
3190 - 319F Kanbun
31A0 - 31BF Bopomofo Extended
31F0 - 31FF Katakana Phonetic Extensions
3200 - 32FF Enclosed CJK Letters and Months
3300 - 33FF CJK Compatibility
3400 - 4DBF CJK Unified Ideographs Extension A
4DC0 - 4DFF Yijing Hexagram Symbols
4E00 - 9FFF CJK Unified Ideographs
A000 - A48F Yi Syllables
A490 - A4CF Yi Radicals
AC00 - D7AF Hangul Syllables
D800 - DB7F High Surrogates
DB80 - DBFF High Private Use Surrogates
DC00 - DFFF Low Surrogates
E000 - F8FF Private Use Area
F900 - FAFF CJK Compatibility Ideographs
FB00 - FB4F Alphabetic Presentation Forms
FB50 - FDFF Arabic Presentation Forms-A
FE00 - FE0F Variation Selectors
FE20 - FE2F Combining Half Marks
FE30 - FE4F CJK Compatibility Forms
FE50 - FE6F Small Form Variants
FE70 - FEFF Arabic Presentation Forms-B
FF00 - FFEF Halfwidth and Fullwidth Forms
FFF0 - FFFF Specials
10000 - 1007F Linear B Syllabary
10080 - 100FF Linear B Ideograms
10100 - 1013F Aegean Numbers
10300 - 1032F Old Italic
10330 - 1034F Gothic
10380 - 1039F Ugaritic
10400 - 1044F Deseret
10450 - 1047F Shavian
10480 - 104AF Osmanya
10800 - 1083F Cypriot Syllabary
1D000 - 1D0FF Byzantine Musical Symbols
1D100 - 1D1FF Musical Symbols
1D300 - 1D35F Tai Xuan Jing Symbols
1D400 - 1D7FF Mathematical Alphanumeric Symbols
20000 - 2A6DF CJK Unified Ideographs Extension B
2F800 - 2FA1F CJK Compatibility Ideographs Supplement
E0000 - E007F Tags
24 changes: 24 additions & 0 deletions python/uniq2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# A Python rendering of Unix 'uniq': order-preserving removal of adjacent duplicates

def uniq(List):
    """Collapse runs of equal adjacent elements, like Unix ``uniq``.

    Only consecutive duplicates are removed; an element that reappears
    after a different one is kept, and the original order is preserved.

    Args:
        List: iterable of elements comparable with ``!=``.

    Returns:
        A new list with every run of equal neighbours reduced to one element.
    """
    Uniqs = []
    for El in List:
        # Compare with the last element kept rather than a '' sentinel, so
        # that a leading empty string (or any falsy value) is not dropped.
        if not Uniqs or El != Uniqs[-1]:
            Uniqs.append(El)

    return Uniqs

import sys

if __name__ == '__main__':
    # argv[0] is the script name, so a filename is present only when there
    # are at least two entries. (Checking len(sys.argv[1]) measured the
    # filename's length and raised IndexError when no argument was given.)
    if len(sys.argv) < 2:
        print('Give an argument, the filename!', file=sys.stderr)
        sys.exit(1)

    FN = sys.argv[1]
    with open(FN) as InFile:
        # splitlines() avoids the empty trailing entry that split('\n')
        # yields for a newline-terminated file (it showed up as an extra
        # blank output line).
        List = InFile.read().splitlines()

    # Reuse uniq() instead of an inline comparison against List[Cntr-2],
    # which looked two elements back and wrapped around to the end of the
    # list for the first elements.
    Uniqs = uniq(List)
    for Line in Uniqs:
        sys.stdout.write(Line + '\n')

0 comments on commit 8f38935

Please sign in to comment.