From 441d8002fb3734365e81284c59e6fa677a231d17 Mon Sep 17 00:00:00 2001 From: Takashi Hashida Date: Fri, 3 Feb 2023 14:00:36 +0900 Subject: [PATCH] NormalizerNFKC: add `unify_katakana_z_sounds` option (#1502) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `unify_katakana_z_sounds` is specified, `NormalizerNFKC*` normalize characters as below. ズァ -> ザ ズィ -> ジ ズェ -> ゼ ズォ -> ゾ Usage: ``` normalize \ 'NormalizerNFKC130("unify_katakana_z_sounds", true, \ "report_source_offset", true)' \ "ズァズィズェズォ" \ WITH_CHECKS|WITH_TYPES ``` --- lib/grn_nfkc.h | 1 + lib/nfkc.c | 7 ++ lib/normalizer.c | 95 +++++++++++++++++++ .../nfkc100/unify_katakana_z_sounds.expected | 42 ++++++++ .../nfkc100/unify_katakana_z_sounds.test | 5 + .../nfkc121/unify_katakana_z_sounds.expected | 42 ++++++++ .../nfkc121/unify_katakana_z_sounds.test | 5 + .../nfkc130/unify_katakana_z_sounds.expected | 42 ++++++++ .../nfkc130/unify_katakana_z_sounds.test | 5 + .../nfkc150/unify_katakana_z_sounds.expected | 42 ++++++++ .../nfkc150/unify_katakana_z_sounds.test | 5 + 11 files changed, 291 insertions(+) create mode 100644 test/command/suite/normalizers/nfkc100/unify_katakana_z_sounds.expected create mode 100644 test/command/suite/normalizers/nfkc100/unify_katakana_z_sounds.test create mode 100644 test/command/suite/normalizers/nfkc121/unify_katakana_z_sounds.expected create mode 100644 test/command/suite/normalizers/nfkc121/unify_katakana_z_sounds.test create mode 100644 test/command/suite/normalizers/nfkc130/unify_katakana_z_sounds.expected create mode 100644 test/command/suite/normalizers/nfkc130/unify_katakana_z_sounds.test create mode 100644 test/command/suite/normalizers/nfkc150/unify_katakana_z_sounds.expected create mode 100644 test/command/suite/normalizers/nfkc150/unify_katakana_z_sounds.test diff --git a/lib/grn_nfkc.h b/lib/grn_nfkc.h index a3f577032d..95acefd2fb 100644 --- a/lib/grn_nfkc.h +++ b/lib/grn_nfkc.h @@ -44,6 +44,7 @@ typedef struct { grn_bool 
unify_middle_dot; grn_bool unify_katakana_v_sounds; grn_bool unify_katakana_bu_sound; + grn_bool unify_katakana_z_sounds; grn_bool unify_katakana_wo_sound; grn_bool unify_katakana_di_sound; grn_bool unify_katakana_g_sounds; diff --git a/lib/nfkc.c b/lib/nfkc.c index 2b1e255e9e..aac03ca3b5 100644 --- a/lib/nfkc.c +++ b/lib/nfkc.c @@ -62,6 +62,7 @@ grn_nfkc_normalize_options_init(grn_ctx *ctx, options->unify_middle_dot = GRN_FALSE; options->unify_katakana_v_sounds = GRN_FALSE; options->unify_katakana_bu_sound = GRN_FALSE; + options->unify_katakana_z_sounds = GRN_FALSE; options->unify_katakana_wo_sound = GRN_FALSE; options->unify_katakana_di_sound = GRN_FALSE; options->unify_katakana_g_sounds = GRN_FALSE; @@ -195,6 +196,12 @@ grn_nfkc_normalize_options_apply(grn_ctx *ctx, raw_options, i, options->unify_katakana_bu_sound); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_katakana_z_sounds")) { + options->unify_katakana_z_sounds = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->unify_katakana_z_sounds); } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_katakana_wo_sound")) { options->unify_katakana_wo_sound = grn_vector_get_element_bool(ctx, diff --git a/lib/normalizer.c b/lib/normalizer.c index 7ba9c29275..38ed4d8165 100644 --- a/lib/normalizer.c +++ b/lib/normalizer.c @@ -1476,6 +1476,83 @@ grn_nfkc_normalize_unify_katakana_bu_sound(grn_ctx *ctx, return current; } +static const unsigned char * +grn_nfkc_normalize_unify_katakana_z_sounds(grn_ctx *ctx, + const unsigned char *start, + const unsigned char *current, + const unsigned char *end, + size_t *n_used_bytes, + size_t *n_used_characters, + unsigned char *unified_buffer, + size_t *n_unified_bytes, + size_t *n_unified_characters, + void *user_data) +{ + size_t char_length; + + char_length = (size_t)grn_charlen_(ctx, current, end, GRN_ENC_UTF8); + + *n_used_bytes = char_length; + *n_used_characters = 1; + + if (char_length == 3 && + /* U+30BA KATAKANA LETTER ZU */ + current[0] == 
0xe3 && + current[1] == 0x82 && + current[2] == 0xba) { + const unsigned char *next = current + char_length; + size_t next_char_length; + + next_char_length = (size_t)grn_charlen_(ctx, next, end, GRN_ENC_UTF8); + if (next_char_length == 3 && + next[0] == 0xe3 && + next[1] == 0x82) { + if (next[2] == 0xa1) { /* U+30A1 KATAKANA LETTER SMALL A */ + /* U+30B6 KATAKANA LETTER ZA */ + unified_buffer[(*n_unified_bytes)++] = current[0]; + unified_buffer[(*n_unified_bytes)++] = 0x82; + unified_buffer[(*n_unified_bytes)++] = 0xb6; + (*n_unified_characters)++; + (*n_used_bytes) += next_char_length; + (*n_used_characters)++; + return unified_buffer; + } else if (next[2] == 0xa3) { /* U+30A3 KATAKANA LETTER SMALL I */ + /* U+30B8 KATAKANA LETTER ZI */ + unified_buffer[(*n_unified_bytes)++] = current[0]; + unified_buffer[(*n_unified_bytes)++] = 0x82; + unified_buffer[(*n_unified_bytes)++] = 0xb8; + (*n_unified_characters)++; + (*n_used_bytes) += next_char_length; + (*n_used_characters)++; + return unified_buffer; + } else if (next[2] == 0xa7) { /* U+30A7 KATAKANA LETTER SMALL E */ + /* U+30BC KATAKANA LETTER ZE */ + unified_buffer[(*n_unified_bytes)++] = current[0]; + unified_buffer[(*n_unified_bytes)++] = 0x82; + unified_buffer[(*n_unified_bytes)++] = 0xbC; + (*n_unified_characters)++; + (*n_used_bytes) += next_char_length; + (*n_used_characters)++; + return unified_buffer; + } else if (next[2] == 0xa9) { /* U+30A9 KATAKANA LETTER SMALL O */ + /* U+30BE KATAKANA LETTER ZO */ + unified_buffer[(*n_unified_bytes)++] = current[0]; + unified_buffer[(*n_unified_bytes)++] = 0x82; + unified_buffer[(*n_unified_bytes)++] = 0xbe; + (*n_unified_characters)++; + (*n_used_bytes) += next_char_length; + (*n_used_characters)++; + return unified_buffer; + } + } + } + + *n_unified_bytes = *n_used_bytes; + *n_unified_characters = *n_used_characters; + + return current; +} + static const unsigned char * grn_nfkc_normalize_unify_katakana_wo_sound(grn_ctx *ctx, const unsigned char *start, @@ -1821,6 
+1898,7 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, data->options->unify_middle_dot || data->options->unify_katakana_v_sounds || data->options->unify_katakana_bu_sound || + data->options->unify_katakana_z_sounds || data->options->unify_katakana_wo_sound || data->options->unify_katakana_di_sound || data->options->unify_katakana_g_sounds || @@ -1891,6 +1969,23 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, need_swap = GRN_TRUE; } + if (data->options->unify_katakana_z_sounds) { + if (need_swap) { + grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify); + grn_nfkc_normalize_context_rewind(ctx, &unify); + } + grn_nfkc_normalize_unify_stateful(ctx, + data, + &unify, + grn_nfkc_normalize_unify_katakana_z_sounds, + NULL, + "[unify][katakana-z-sounds]"); + if (ctx->rc != GRN_SUCCESS) { + goto exit; + } + need_swap = GRN_TRUE; + } + if (data->options->unify_katakana_wo_sound) { if (need_swap) { grn_nfkc_normalize_context_swap(ctx, &(data->context), &unify); diff --git a/test/command/suite/normalizers/nfkc100/unify_katakana_z_sounds.expected b/test/command/suite/normalizers/nfkc100/unify_katakana_z_sounds.expected new file mode 100644 index 0000000000..65da75646d --- /dev/null +++ b/test/command/suite/normalizers/nfkc100/unify_katakana_z_sounds.expected @@ -0,0 +1,42 @@ +normalize 'NormalizerNFKC100("unify_katakana_z_sounds", true, "report_source_offset", true)' "ズァズィズズェズォ" WITH_CHECKS|WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "ザジズゼゾ", + "types": [ + "katakana", + "katakana", + "katakana", + "katakana", + "katakana" + ], + "checks": [ + 6, + 0, + 0, + 6, + 0, + 0, + 3, + 0, + 0, + 6, + 0, + 0, + 6, + 0, + 0 + ], + "offsets": [ + 0, + 6, + 12, + 15, + 21 + ] + } +] diff --git a/test/command/suite/normalizers/nfkc100/unify_katakana_z_sounds.test b/test/command/suite/normalizers/nfkc100/unify_katakana_z_sounds.test new file mode 100644 index 0000000000..03e7dfd276 --- /dev/null +++ b/test/command/suite/normalizers/nfkc100/unify_katakana_z_sounds.test @@ -0,0 
+1,5 @@ +normalize \ + 'NormalizerNFKC100("unify_katakana_z_sounds", true, \ + "report_source_offset", true)' \ + "ズァズィズズェズォ" \ + WITH_CHECKS|WITH_TYPES diff --git a/test/command/suite/normalizers/nfkc121/unify_katakana_z_sounds.expected b/test/command/suite/normalizers/nfkc121/unify_katakana_z_sounds.expected new file mode 100644 index 0000000000..cbe55af979 --- /dev/null +++ b/test/command/suite/normalizers/nfkc121/unify_katakana_z_sounds.expected @@ -0,0 +1,42 @@ +normalize 'NormalizerNFKC121("unify_katakana_z_sounds", true, "report_source_offset", true)' "ズァズィズズェズォ" WITH_CHECKS|WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "ザジズゼゾ", + "types": [ + "katakana", + "katakana", + "katakana", + "katakana", + "katakana" + ], + "checks": [ + 6, + 0, + 0, + 6, + 0, + 0, + 3, + 0, + 0, + 6, + 0, + 0, + 6, + 0, + 0 + ], + "offsets": [ + 0, + 6, + 12, + 15, + 21 + ] + } +] diff --git a/test/command/suite/normalizers/nfkc121/unify_katakana_z_sounds.test b/test/command/suite/normalizers/nfkc121/unify_katakana_z_sounds.test new file mode 100644 index 0000000000..114fb58c40 --- /dev/null +++ b/test/command/suite/normalizers/nfkc121/unify_katakana_z_sounds.test @@ -0,0 +1,5 @@ +normalize \ + 'NormalizerNFKC121("unify_katakana_z_sounds", true, \ + "report_source_offset", true)' \ + "ズァズィズズェズォ" \ + WITH_CHECKS|WITH_TYPES diff --git a/test/command/suite/normalizers/nfkc130/unify_katakana_z_sounds.expected b/test/command/suite/normalizers/nfkc130/unify_katakana_z_sounds.expected new file mode 100644 index 0000000000..d572c43c0c --- /dev/null +++ b/test/command/suite/normalizers/nfkc130/unify_katakana_z_sounds.expected @@ -0,0 +1,42 @@ +normalize 'NormalizerNFKC130("unify_katakana_z_sounds", true, "report_source_offset", true)' "ズァズィズズェズォ" WITH_CHECKS|WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "ザジズゼゾ", + "types": [ + "katakana", + "katakana", + "katakana", + "katakana", + "katakana" + ], + "checks": [ + 6, + 0, + 0, + 6, + 0, + 0, + 3, + 0, + 0, + 6, 
+ 0, + 0, + 6, + 0, + 0 + ], + "offsets": [ + 0, + 6, + 12, + 15, + 21 + ] + } +] diff --git a/test/command/suite/normalizers/nfkc130/unify_katakana_z_sounds.test b/test/command/suite/normalizers/nfkc130/unify_katakana_z_sounds.test new file mode 100644 index 0000000000..4fd458e01d --- /dev/null +++ b/test/command/suite/normalizers/nfkc130/unify_katakana_z_sounds.test @@ -0,0 +1,5 @@ +normalize \ + 'NormalizerNFKC130("unify_katakana_z_sounds", true, \ + "report_source_offset", true)' \ + "ズァズィズズェズォ" \ + WITH_CHECKS|WITH_TYPES diff --git a/test/command/suite/normalizers/nfkc150/unify_katakana_z_sounds.expected b/test/command/suite/normalizers/nfkc150/unify_katakana_z_sounds.expected new file mode 100644 index 0000000000..a233f39fb0 --- /dev/null +++ b/test/command/suite/normalizers/nfkc150/unify_katakana_z_sounds.expected @@ -0,0 +1,42 @@ +normalize 'NormalizerNFKC150("unify_katakana_z_sounds", true, "report_source_offset", true)' "ズァズィズズェズォ" WITH_CHECKS|WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "ザジズゼゾ", + "types": [ + "katakana", + "katakana", + "katakana", + "katakana", + "katakana" + ], + "checks": [ + 6, + 0, + 0, + 6, + 0, + 0, + 3, + 0, + 0, + 6, + 0, + 0, + 6, + 0, + 0 + ], + "offsets": [ + 0, + 6, + 12, + 15, + 21 + ] + } +] diff --git a/test/command/suite/normalizers/nfkc150/unify_katakana_z_sounds.test b/test/command/suite/normalizers/nfkc150/unify_katakana_z_sounds.test new file mode 100644 index 0000000000..eae358f8e4 --- /dev/null +++ b/test/command/suite/normalizers/nfkc150/unify_katakana_z_sounds.test @@ -0,0 +1,5 @@ +normalize \ + 'NormalizerNFKC150("unify_katakana_z_sounds", true, \ + "report_source_offset", true)' \ + "ズァズィズズェズォ" \ + WITH_CHECKS|WITH_TYPES