-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutf8.h
154 lines (146 loc) · 4.71 KB
/
utf8.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* Functions for converting between wide characters (UCS-2) and multibyte characters (UTF-8). */
/* References:
 * - http://ja.m.wikipedia.org/wiki/UTF-8
 * - http://en.m.wikipedia.org/wiki/UTF-8
 * - http://www.azillionmonkeys.com/qed/unicode.html
 */
/* Constants used by the UTF-8 conversion and kana toggling code. */
enum {
	/*
	 * NOTE(review): "UTF8_LEN_MAX = 4" is disabled here, yet utf8_encode()
	 * still uses UTF8_LEN_MAX; presumably it is defined elsewhere -- confirm.
	 */
	/* distance between a hiragana code point and its katakana counterpart */
	CURSIVE_SQUARE_OFFSET = 0x60,
	/* U+FFFD, stored by utf8_decode() when the input cannot be decoded */
	MALFORMED_CHARACTER = 0xFFFD
};
/*
 * Encode one UCS-2 code point as a NUL-terminated UTF-8 sequence.
 *
 * ucs:      code point to encode; must lie in U+0000 - U+FFFF
 * utf8_buf: output buffer; receives 1 to 3 encoded bytes followed by '\0'
 *
 * Returns the number of encoded bytes (terminator not counted), or -1 when
 * ucs is rejected. Nothing is written to utf8_buf in that case.
 */
int utf8_encode(uint32_t ucs, char utf8_buf[UTF8_LEN_MAX + 1])
{
	/* marker bits of the first byte, indexed by sequence length */
	static const uint8_t lead_mark[] = {0x00, 0x00, 0xC0, 0xE0};
	int len;

	/* code points that must never be encoded */
	if (ucs > 0xFFFF) {                       /* outside UCS-2 (Unicode BMP) */
		return -1;
	}
	if (ucs >= 0xD800 && ucs <= 0xDFFF) {     /* UTF-16 surrogate halves */
		return -1;
	}
	if (ucs >= 0xFDD0 && ucs <= 0xFDEF) {     /* non-characters */
		return -1;
	}
	if (ucs == 0xFFFE || ucs == 0xFFFF) {     /* reversed BOM / sentinel non-character */
		return -1;
	}

	/* 7 bits fit in one byte, 11 bits in two, everything else in three */
	len = (ucs <= 0x7F) ? 1 : (ucs <= 0x7FF) ? 2 : 3;

	/* emit trailing bytes back to front, six payload bits each */
	utf8_buf[len] = '\0';
	for (int pos = len - 1; pos > 0; pos--) {
		utf8_buf[pos] = (char) (0x80 | (ucs & 0x3F));
		ucs >>= 6;
	}
	/* the remaining high bits go into the first byte under its length marker */
	utf8_buf[0] = (char) (lead_mark[len] | ucs);

	return len;
}
/*
 * Decode one UTF-8 sequence from the start of a NUL-terminated string.
 *
 * utf8_str: NUL-terminated input; decoding starts at utf8_str[0]
 * ucs:      always written; receives the decoded code point (U+0000 -
 *           U+10FFFF, not limited to UCS-2), or MALFORMED_CHARACTER when
 *           the bytes read do not form an acceptable sequence
 *
 * Returns the number of bytes consumed, so utf8_str + (return value) is where
 * the next call should start. The result is 0 only for an empty string, and
 * the terminating NUL is never stepped over.
 *
 * A sequence is reported as malformed when:
 *   - it starts with a continuation byte (0x80 - 0xBF) or with 0xFE / 0xFF
 *     (one byte consumed)
 *   - it is cut short by NUL, an ASCII byte or a new lead byte (only the bytes
 *     before the interrupting byte are consumed, so that byte is decoded by
 *     the next call)
 *   - it is an overlong encoding, or decodes to a surrogate (U+D800 - U+DFFF),
 *     a non-character (U+FDD0 - U+FDEF, U+nFFFE, U+nFFFF) or a value above
 *     U+10FFFF (the whole sequence is consumed)
 */
int utf8_decode(const char *utf8_str, uint32_t *ucs)
{
	/*
	 * Smallest code point that legitimately needs N following bytes;
	 * a decoded value below it means the encoding is overlong.
	 */
	static const uint32_t min_code[] = {
		0x00, 0x80, 0x800, 0x10000, 0x200000, 0x4000000,
	};
	const uint8_t lead = (uint8_t) utf8_str[0];
	uint32_t code;
	int following_byte;

	/* every early return below reports a malformed character */
	*ucs = MALFORMED_CHARACTER;

	if (lead == 0x00) {
		/* empty string: nothing to consume */
		return 0;
	}
	if (lead <= 0x7F) {
		/* ASCII character */
		*ucs = lead;
		return 1;
	}

	/* classify the first byte: payload bits and number of following bytes */
	if (lead <= 0xBF) {
		/* continuation byte without a preceding lead byte */
		return 1;
	} else if (lead <= 0xDF) {
		code = lead & 0x1F;
		following_byte = 1;
	} else if (lead <= 0xEF) {
		code = lead & 0x0F;
		following_byte = 2;
	} else if (lead <= 0xF7) {
		code = lead & 0x07;
		following_byte = 3;
	} else if (lead <= 0xFB) {
		/* legacy 5 byte form: consumed as a unit, rejected by the range check */
		code = lead & 0x03;
		following_byte = 4;
	} else if (lead <= 0xFD) {
		/* legacy 6 byte form: consumed as a unit, rejected by the range check */
		code = lead & 0x01;
		following_byte = 5;
	} else {
		/* 0xFE - 0xFF: not used in UTF-8 */
		return 1;
	}

	for (int i = 1; i <= following_byte; i++) {
		const uint8_t ch = (uint8_t) utf8_str[i];

		if (ch < 0x80 || ch > 0xBF) {
			/*
			 * Truncated sequence. Stop in front of the interrupting byte
			 * (which may be the terminating NUL) so it is neither skipped
			 * nor read twice.
			 */
			return i;
		}
		code = (code << 6) | (ch & 0x3F);
	}

	/*
	 * illegal code point (ref: http://www.unicode.org/reports/tr27/tr27-4.html)
	 *   overlong encoding
	 *   0xD800 ~ 0xDFFF  : surrogate pair
	 *   0xFDD0 ~ 0xFDEF  : noncharacter
	 *   0xnFFFE ~ 0xnFFFF: noncharacter (n: 0x00 ~ 0x10)
	 *   0x110000 ~       : invalid (unicode U+0000 ~ U+10FFFF)
	 */
	if (code >= min_code[following_byte]
		&& !(0xD800 <= code && code <= 0xDFFF)
		&& !(0xFDD0 <= code && code <= 0xFDEF)
		&& (code & 0xFFFF) != 0xFFFE
		&& (code & 0xFFFF) != 0xFFFF
		&& code <= 0x10FFFF) {
		*ucs = code;
	}

	return following_byte + 1;
}
/*
 * Swap hiragana ("cursive") and katakana ("square") in a line.
 *
 * Every cell from index 1 up to (not including) line->cursor.insert is
 * examined: code points in the hiragana ranges are moved up by
 * CURSIVE_SQUARE_OFFSET, code points in the katakana ranges are moved down by
 * the same amount, and all other cells are left unchanged.
 *
 * ref: http://ja.m.wikipedia.org/wiki/Unicode%E4%B8%80%E8%A6%A7_3000-3FFF
 * ref: http://www.mwsoft.jp/programming/other/convert_hiragana_katakana.html
 */
void toggle_cursive_square(struct line_t *line)
{
	/* line->cells[0] is mark_cook, so start from the cell after it */
	int pos = 1;

	logging(DEBUG, "toggle cursive <---> square\n");

	while (pos < line->cursor.insert) {
		/* U+3041 - U+3096 and U+309D - U+309F */
		const int is_cursive =
			(line->cells[pos] >= 0x3041 && line->cells[pos] <= 0x3096)
			|| (line->cells[pos] >= 0x309D && line->cells[pos] <= 0x309F);
		/* U+30A1 - U+30F6 and U+30FD - U+30FF */
		const int is_square =
			(line->cells[pos] >= 0x30A1 && line->cells[pos] <= 0x30F6)
			|| (line->cells[pos] >= 0x30FD && line->cells[pos] <= 0x30FF);

		if (is_cursive) {
			line->cells[pos] += CURSIVE_SQUARE_OFFSET;
		} else if (is_square) {
			line->cells[pos] -= CURSIVE_SQUARE_OFFSET;
		}
		pos++;
	}
}