Skip to content

Commit

Permalink
NormalizerNFKC: add unify_katakana_z_sounds option (groonga#1502)
Browse files Browse the repository at this point in the history
When `unify_katakana_z_sounds` is enabled, `NormalizerNFKC*` normalizes
the following katakana sequences:

ズァ -> ザ
ズィ -> ジ
ズェ -> ゼ
ズォ -> ゾ


Usage: 

```
normalize \
  'NormalizerNFKC130("unify_katakana_z_sounds", true, \
                     "report_source_offset", true)' \
  "ズァズィズェズォ" \
  WITH_CHECKS|WITH_TYPES
```
  • Loading branch information
HashidaTKS authored Feb 3, 2023
1 parent f84cf01 commit 441d800
Show file tree
Hide file tree
Showing 11 changed files with 291 additions and 0 deletions.
1 change: 1 addition & 0 deletions lib/grn_nfkc.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ typedef struct {
grn_bool unify_middle_dot;
grn_bool unify_katakana_v_sounds;
grn_bool unify_katakana_bu_sound;
grn_bool unify_katakana_z_sounds;
grn_bool unify_katakana_wo_sound;
grn_bool unify_katakana_di_sound;
grn_bool unify_katakana_g_sounds;
Expand Down
7 changes: 7 additions & 0 deletions lib/nfkc.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ grn_nfkc_normalize_options_init(grn_ctx *ctx,
options->unify_middle_dot = GRN_FALSE;
options->unify_katakana_v_sounds = GRN_FALSE;
options->unify_katakana_bu_sound = GRN_FALSE;
options->unify_katakana_z_sounds = GRN_FALSE;
options->unify_katakana_wo_sound = GRN_FALSE;
options->unify_katakana_di_sound = GRN_FALSE;
options->unify_katakana_g_sounds = GRN_FALSE;
Expand Down Expand Up @@ -195,6 +196,12 @@ grn_nfkc_normalize_options_apply(grn_ctx *ctx,
raw_options,
i,
options->unify_katakana_bu_sound);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_katakana_z_sounds")) {
options->unify_katakana_z_sounds =
grn_vector_get_element_bool(ctx,
raw_options,
i,
options->unify_katakana_z_sounds);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_katakana_wo_sound")) {
options->unify_katakana_wo_sound =
grn_vector_get_element_bool(ctx,
Expand Down
95 changes: 95 additions & 0 deletions lib/normalizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1476,6 +1476,83 @@ grn_nfkc_normalize_unify_katakana_bu_sound(grn_ctx *ctx,
return current;
}

/*
 * Unify a katakana "ZU + small vowel" pair into a single Z-row katakana:
 *
 *   ZU (U+30BA) + SMALL A (U+30A1) -> ZA (U+30B6)
 *   ZU (U+30BA) + SMALL I (U+30A3) -> ZI (U+30B8)
 *   ZU (U+30BA) + SMALL E (U+30A7) -> ZE (U+30BC)
 *   ZU (U+30BA) + SMALL O (U+30A9) -> ZO (U+30BE)
 *
 * The character at `current` is measured with grn_charlen_() and always
 * counts as one used character. When it is ZU and the following character
 * is one of the small vowels above, the 3-byte unified character is
 * appended to `unified_buffer` at index `*n_unified_bytes`, the counters
 * are advanced to cover both source characters and `unified_buffer` is
 * returned.
 *
 * Otherwise nothing is written: `*n_unified_bytes` and
 * `*n_unified_characters` are set to the used counts and `current` is
 * returned.
 *
 * NOTE(review): the merge path increments `*n_unified_bytes` and
 * `*n_unified_characters` instead of assigning them, so the caller
 * presumably zeroes both before the call -- confirm against
 * grn_nfkc_normalize_unify_stateful().
 *
 * `start` and `user_data` are not used by this function.
 */
static const unsigned char *
grn_nfkc_normalize_unify_katakana_z_sounds(grn_ctx *ctx,
                                           const unsigned char *start,
                                           const unsigned char *current,
                                           const unsigned char *end,
                                           size_t *n_used_bytes,
                                           size_t *n_used_characters,
                                           unsigned char *unified_buffer,
                                           size_t *n_unified_bytes,
                                           size_t *n_unified_characters,
                                           void *user_data)
{
  size_t head_length = (size_t)grn_charlen_(ctx, current, end, GRN_ENC_UTF8);
  /* Last UTF-8 byte of the unified character; 0 means "no pair matched". */
  unsigned char unified_tail = 0;

  *n_used_characters = 1;
  *n_used_bytes = head_length;

  /* U+30BA KATAKANA LETTER ZU is encoded as E3 82 BA. */
  if (head_length == 3 &&
      current[0] == 0xe3 &&
      current[1] == 0x82 &&
      current[2] == 0xba) {
    const unsigned char *follower = current + head_length;
    size_t follower_length =
      (size_t)grn_charlen_(ctx, follower, end, GRN_ENC_UTF8);

    /* All small vowels of interest share the E3 82 prefix. */
    if (follower_length == 3 &&
        follower[0] == 0xe3 &&
        follower[1] == 0x82) {
      switch (follower[2]) {
      case 0xa1: /* U+30A1 KATAKANA LETTER SMALL A */
        unified_tail = 0xb6; /* U+30B6 KATAKANA LETTER ZA */
        break;
      case 0xa3: /* U+30A3 KATAKANA LETTER SMALL I */
        unified_tail = 0xb8; /* U+30B8 KATAKANA LETTER ZI */
        break;
      case 0xa7: /* U+30A7 KATAKANA LETTER SMALL E */
        unified_tail = 0xbc; /* U+30BC KATAKANA LETTER ZE */
        break;
      case 0xa9: /* U+30A9 KATAKANA LETTER SMALL O */
        unified_tail = 0xbe; /* U+30BE KATAKANA LETTER ZO */
        break;
      default:
        break;
      }

      if (unified_tail != 0) {
        /* The unified character keeps ZU's E3 82 prefix. */
        unified_buffer[(*n_unified_bytes)++] = current[0];
        unified_buffer[(*n_unified_bytes)++] = 0x82;
        unified_buffer[(*n_unified_bytes)++] = unified_tail;
        (*n_unified_characters)++;
        /* The small vowel was consumed together with ZU. */
        (*n_used_bytes) += follower_length;
        (*n_used_characters)++;
        return unified_buffer;
      }
    }
  }

  /* No pair matched: report the single source character as-is. */
  *n_unified_bytes = *n_used_bytes;
  *n_unified_characters = *n_used_characters;

  return current;
}

static const unsigned char *
grn_nfkc_normalize_unify_katakana_wo_sound(grn_ctx *ctx,
const unsigned char *start,
Expand Down Expand Up @@ -1821,6 +1898,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx,
data->options->unify_middle_dot ||
data->options->unify_katakana_v_sounds ||
data->options->unify_katakana_bu_sound ||
data->options->unify_katakana_z_sounds ||
data->options->unify_katakana_wo_sound ||
data->options->unify_katakana_di_sound ||
data->options->unify_katakana_g_sounds ||
Expand Down Expand Up @@ -1891,6 +1969,23 @@ grn_nfkc_normalize_unify(grn_ctx *ctx,
need_swap = GRN_TRUE;
}

if (data->options->unify_katakana_z_sounds) {
if (need_swap) {
grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify);
grn_nfkc_normalize_context_rewind(ctx, &unify);
}
grn_nfkc_normalize_unify_stateful(ctx,
data,
&unify,
grn_nfkc_normalize_unify_katakana_z_sounds,
NULL,
"[unify][katakana-z-sounds]");
if (ctx->rc != GRN_SUCCESS) {
goto exit;
}
need_swap = GRN_TRUE;
}

if (data->options->unify_katakana_wo_sound) {
if (need_swap) {
grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
normalize 'NormalizerNFKC100("unify_katakana_z_sounds", true, "report_source_offset", true)' "ズァズィズズェズォ" WITH_CHECKS|WITH_TYPES
[
[
0,
0.0,
0.0
],
{
"normalized": "ザジズゼゾ",
"types": [
"katakana",
"katakana",
"katakana",
"katakana",
"katakana"
],
"checks": [
6,
0,
0,
6,
0,
0,
3,
0,
0,
6,
0,
0,
6,
0,
0
],
"offsets": [
0,
6,
12,
15,
21
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
normalize \
'NormalizerNFKC100("unify_katakana_z_sounds", true, \
"report_source_offset", true)' \
"ズァズィズズェズォ" \
WITH_CHECKS|WITH_TYPES
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
normalize 'NormalizerNFKC121("unify_katakana_z_sounds", true, "report_source_offset", true)' "ズァズィズズェズォ" WITH_CHECKS|WITH_TYPES
[
[
0,
0.0,
0.0
],
{
"normalized": "ザジズゼゾ",
"types": [
"katakana",
"katakana",
"katakana",
"katakana",
"katakana"
],
"checks": [
6,
0,
0,
6,
0,
0,
3,
0,
0,
6,
0,
0,
6,
0,
0
],
"offsets": [
0,
6,
12,
15,
21
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
normalize \
'NormalizerNFKC121("unify_katakana_z_sounds", true, \
"report_source_offset", true)' \
"ズァズィズズェズォ" \
WITH_CHECKS|WITH_TYPES
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
normalize 'NormalizerNFKC130("unify_katakana_z_sounds", true, "report_source_offset", true)' "ズァズィズズェズォ" WITH_CHECKS|WITH_TYPES
[
[
0,
0.0,
0.0
],
{
"normalized": "ザジズゼゾ",
"types": [
"katakana",
"katakana",
"katakana",
"katakana",
"katakana"
],
"checks": [
6,
0,
0,
6,
0,
0,
3,
0,
0,
6,
0,
0,
6,
0,
0
],
"offsets": [
0,
6,
12,
15,
21
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
normalize \
'NormalizerNFKC130("unify_katakana_z_sounds", true, \
"report_source_offset", true)' \
"ズァズィズズェズォ" \
WITH_CHECKS|WITH_TYPES
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
normalize 'NormalizerNFKC150("unify_katakana_z_sounds", true, "report_source_offset", true)' "ズァズィズズェズォ" WITH_CHECKS|WITH_TYPES
[
[
0,
0.0,
0.0
],
{
"normalized": "ザジズゼゾ",
"types": [
"katakana",
"katakana",
"katakana",
"katakana",
"katakana"
],
"checks": [
6,
0,
0,
6,
0,
0,
3,
0,
0,
6,
0,
0,
6,
0,
0
],
"offsets": [
0,
6,
12,
15,
21
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
normalize \
'NormalizerNFKC150("unify_katakana_z_sounds", true, \
"report_source_offset", true)' \
"ズァズィズズェズォ" \
WITH_CHECKS|WITH_TYPES

0 comments on commit 441d800

Please sign in to comment.