Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: custom 和 match api 对于双 unicode 编码字符的适配 #242

Merged
merged 1 commit into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions lib/common/utils.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,24 @@
import { DoubleUnicodeReg } from './constant';
import { DoubleUnicodePrefixReg, DoubleUnicodeSuffixReg, DoubleUnicodeReg } from './constant';

export function getStringLength(string: string) {
return string.replace(DoubleUnicodeReg, '_').length;
/**
 * Returns the length of `text` after every match of `DoubleUnicodeReg`
 * has been replaced by a single `_` placeholder.
 *
 * NOTE(review): `DoubleUnicodeReg` is imported from `./constant` and is
 * presumably a pattern for one surrogate pair, which would make this a
 * count of visible characters. Confirm against the constant's definition.
 *
 * @param text - The string to measure.
 * @returns The length of the string with each match collapsed to one unit.
 */
export function stringLength(text: string) {
  const collapsed = text.replace(DoubleUnicodeReg, '_');
  return collapsed.length;
}

/**
 * Splits `text` into an array of characters, keeping two-unit characters
 * together as a single element.
 *
 * Two adjacent UTF-16 code units are emitted as one element when the first
 * matches `DoubleUnicodePrefixReg` and the second matches
 * `DoubleUnicodeSuffixReg`; every other code unit becomes its own element.
 *
 * NOTE(review): both patterns come from `./constant`; presumably they match
 * a high and a low surrogate respectively. Confirm against their definitions.
 *
 * @param text - The string to split.
 * @returns The elements in their original order; each has length 1 or 2.
 */
export function splitString(text: string): string[] {
  const chars: string[] = [];
  for (let pos = 0; pos < text.length; ) {
    const head = text.charAt(pos);
    // The suffix test only runs when the prefix test succeeds. Past the end
    // of the string charAt yields '', so the suffix test sees an empty string.
    const isPair =
      DoubleUnicodePrefixReg.test(head) &&
      DoubleUnicodeSuffixReg.test(text.charAt(pos + 1));
    const width = isPair ? 2 : 1;
    chars.push(text.slice(pos, pos + width));
    pos += width;
  }
  return chars;
}

export function isZhChar(char: string) {
Expand Down
10 changes: 5 additions & 5 deletions lib/core/custom/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { acTree } from '@/common/segmentit';
import { Probability, Priority } from '@/common/constant';
import { getStringLength } from '@/common/utils';
import { splitString, stringLength } from '@/common/utils';
import DICT1 from '@/data/dict1';
let customDict: { [key: string]: string } = {};
let customMultipleDict: string[] = [];
Expand Down Expand Up @@ -33,15 +33,15 @@ export function customPinyin(
options?: CustomPinyinOptions
) {
const keys = Object.keys(config).sort(
(key1, key2) => getStringLength(key2) - getStringLength(key1)
(key1, key2) => stringLength(key2) - stringLength(key1)
);
keys.forEach((key) => {
customDict[key] = config[key];
});
const customPatterns = Object.keys(customDict).map((key) => ({
zh: key,
pinyin: customDict[key],
probability: Probability.Custom + getStringLength(key),
probability: Probability.Custom + stringLength(key),
length: key.length,
priority: Priority.Custom,
dict: CustomDictName,
Expand All @@ -63,10 +63,10 @@ function addCustomConfigToDict(
) {
for (let key in config) {
const pinyins = config[key];
key.split('').forEach((word, index) => {
splitString(key).forEach((word, index) => {
const pinyin = pinyins.split(' ')[index] || '';
const wordCode = word.charCodeAt(0);
if (handleType === 'replace') {
if (handleType === 'replace' || (handleType === 'add' && !dict[wordCode] && !DICT1[wordCode])) {
// 直接覆盖原词典
dict[wordCode] = pinyin;
} else {
Expand Down
4 changes: 2 additions & 2 deletions lib/core/dict/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { Priority, Probability } from "@/common/constant";
import { Pattern, acTree } from "@/common/segmentit";
import { getStringLength } from "@/common/utils";
import { stringLength } from "@/common/utils";
import DICT1 from "@/data/dict1";

const DefaultName = Symbol("default");
Expand Down Expand Up @@ -29,7 +29,7 @@ export function addDict(dict: DICT | {}, options?: string | DictOptions) {
for (let key in dict as DICT) {
const value = (dict as DICT)[key];
const pinyin = Array.isArray(value) ? value[0] : value;
if (getStringLength(key) === 1) {
if (stringLength(key) === 1) {
addToOriginDict(
dictName,
key,
Expand Down
98 changes: 65 additions & 33 deletions lib/core/match/index.ts
Original file line number Diff line number Diff line change
@@ -1,33 +1,34 @@
import { pinyin as _pinyin } from '@/core/pinyin';
import { splitString } from "@/common/utils";
import { pinyin as _pinyin } from "@/core/pinyin";

interface MatchOptions {
/**
* @description 每个汉字和拼音需要遵从的匹配精度
*/
precision?: 'first' | 'start' | 'every' | 'any';
precision?: "first" | "start" | "every" | "any";
/**
* @description 匹配的汉字下标是否为连续的才算匹配成功
*/
continuous?: boolean;
/**
* @description 匹配时对于空格的处理
*/
space?: 'ignore' | 'preserve';
space?: "ignore" | "preserve";
/**
* @description 最后一个字的匹配精度
*/
lastPrecision?: 'first' | 'start' | 'every' | 'any';
lastPrecision?: "first" | "start" | "every" | "any";
/**
* @description 是否大小写不敏感
*/
insensitive?: boolean;
}

const DefaultMatchOptions: MatchOptions = {
precision: 'first',
precision: "first",
continuous: false,
space: 'ignore',
lastPrecision: 'start',
space: "ignore",
lastPrecision: "start",
insensitive: true,
};

Expand All @@ -41,8 +42,8 @@ const MAX_PINYIN_LENGTH = 6;
* @return {Array | null} 若匹配成功,返回 text 中匹配成功的下标数组;若匹配失败,返回 null
*/
export const match = (text: string, pinyin: string, options?: MatchOptions) => {
if (options?.precision === 'any') {
options.lastPrecision = 'any';
if (options?.precision === "any") {
options.lastPrecision = "any";
}
const completeOptions = {
...DefaultMatchOptions,
Expand All @@ -54,14 +55,14 @@ export const match = (text: string, pinyin: string, options?: MatchOptions) => {
pinyin = pinyin.toLowerCase();
}
// 移除空格
if (completeOptions.space === 'ignore') {
pinyin = pinyin.replace(/\s/g, '');
if (completeOptions.space === "ignore") {
pinyin = pinyin.replace(/\s/g, "");
}
const result =
options?.precision === 'any'
options?.precision === "any"
? matchAny(text, pinyin, completeOptions)
: matchAboveStart(text, pinyin, completeOptions);
return result;
return processDoubleUnicodeIndex(text, result);
};

// 检测两个拼音最大的匹配长度
Expand All @@ -81,23 +82,24 @@ const matchAny = (
options: Required<MatchOptions>
) => {
let result = [];
for (let i = 0; i < text.length; i++) {
const words = splitString(text);
for (let i = 0; i < words.length; i++) {
// 空格字符
if (options.space === 'ignore' && text[i] === ' ') {
if (options.space === "ignore" && words[i] === " ") {
result.push(i);
continue;
}
// 是否为中文匹配
if (text[i] === pinyin[0]) {
if (words[i] === pinyin[0]) {
pinyin = pinyin.slice(1);
result.push(i);
continue;
}
// 当前字的多音字拼音
const ps = _pinyin(text[i], {
toneType: 'none',
const ps = _pinyin(words[i], {
toneType: "none",
multiple: true,
type: 'array',
type: "array",
});
let currentLength = 0;
ps.forEach((p) => {
Expand Down Expand Up @@ -128,8 +130,8 @@ const matchAny = (
return null;
}
}
if (options.space === 'ignore') {
result = result.filter((i) => text[i] !== ' ');
if (options.space === "ignore") {
result = result.filter((i) => words[i] !== " ");
}
return result.length ? result : null;
};
Expand All @@ -139,7 +141,7 @@ const matchAboveStart = (
pinyin: string,
options: Required<MatchOptions>
) => {
const words = text.split('');
const words = splitString(text);

// 二维数组 dp[i][j],i 表示遍历到的 text 索引+1, j 表示遍历到的 pinyin 的索引+1
const dp = Array(words.length + 1);
Expand All @@ -157,7 +159,7 @@ const matchAboveStart = (
// options.continuous 为 false 或 options.space 为 ignore 且当前为空格时,第 i 个字可以不参与匹配
if (
!options.continuous ||
(options.space == 'ignore' && text[i - 1] === ' ')
(options.space == "ignore" && words[i - 1] === " ")
) {
for (let j = 1; j <= pinyin.length; j++) {
dp[i][j - 1] = dp[i - 1][j - 1];
Expand All @@ -172,14 +174,14 @@ const matchAboveStart = (
// 非开头且前面的字符未匹配完成,停止向后匹配
continue;
} else {
const muls = _pinyin(text[i - 1], {
type: 'array',
toneType: 'none',
const muls = _pinyin(words[i - 1], {
type: "array",
toneType: "none",
multiple: true,
});

// 非中文匹配
if (text[i - 1] === pinyin[j - 1]) {
if (words[i - 1] === pinyin[j - 1]) {
const matches = [...dp[i - 1][j - 1], i - 1];
// 记录最长的可匹配下标数组
if (!dp[i][j] || matches.length > dp[i][j].length) {
Expand All @@ -195,16 +197,16 @@ const matchAboveStart = (
if (pinyin.length - j <= MAX_PINYIN_LENGTH) {
// lastPrecision 参数处理
const last = muls.some((py) => {
if (options.lastPrecision === 'any') {
if (options.lastPrecision === "any") {
return py.includes(pinyin.slice(j - 1, pinyin.length));
}
if (options.lastPrecision === 'start') {
if (options.lastPrecision === "start") {
return py.startsWith(pinyin.slice(j - 1, pinyin.length));
}
if (options.lastPrecision === 'first') {
if (options.lastPrecision === "first") {
return py[0] === pinyin.slice(j - 1, pinyin.length);
}
if (options.lastPrecision === 'every') {
if (options.lastPrecision === "every") {
return py === pinyin.slice(j - 1, pinyin.length);
}
return false;
Expand All @@ -217,7 +219,7 @@ const matchAboveStart = (
const precision = options.precision;

// precision 为 start 时,匹配开头
if (precision === 'start') {
if (precision === "start") {
muls.forEach((py) => {
let end = j;
const matches = [...dp[i - 1][j - 1], i - 1];
Expand All @@ -234,7 +236,7 @@ const matchAboveStart = (
}

// precision 为 first 时,匹配首字母
if (precision === 'first') {
if (precision === "first") {
if (muls.some((py) => py[0] === pinyin[j - 1])) {
const matches = [...dp[i - 1][j - 1], i - 1];
// 记录最长的可匹配下标数组
Expand All @@ -261,3 +263,33 @@ const matchAboveStart = (
}
return null;
};

/**
 * Converts indices that refer to elements of `splitString(text)` into
 * indices that refer to positions in `text` itself.
 *
 * Every two-unit element located at or before a matched index shifts the
 * resulting position by one. A matched element that is itself two units long
 * contributes both of its positions to the output.
 *
 * NOTE(review): the scan cursor only moves forward, so `indexArray` is
 * assumed to be in ascending order. Confirm with the callers.
 *
 * @param text - The original text that was matched.
 * @param indexArray - Matched indices into `splitString(text)`, or `null`.
 * @returns The corresponding positions in `text`, or `null` when the input
 * is `null`.
 */
function processDoubleUnicodeIndex(
  text: string,
  indexArray: number[] | null
): number[] | null {
  if (!indexArray) {
    return null;
  }
  const mapped: number[] = [];
  let pairsSeen = 0;
  const words = splitString(text);
  let cursor = 0;
  for (const wordIndex of indexArray) {
    // Count the two-unit elements between the previous stop and this index,
    // inclusive of the element at this index.
    for (; cursor <= wordIndex; cursor++) {
      if (words[cursor].length === 2) {
        pairsSeen += 1;
      }
    }
    // Position of the last code unit of the matched element.
    const lastUnit = wordIndex + pairsSeen;
    if (words[wordIndex].length === 2) {
      mapped.push(lastUnit - 1, lastUnit);
    } else {
      mapped.push(lastUnit);
    }
  }
  return mapped;
}
6 changes: 3 additions & 3 deletions lib/core/pinyin/middlewares.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { getStringLength, isZhChar } from "@/common/utils";
import { stringLength, isZhChar } from "@/common/utils";
import type { SingleWordResult } from "../../common/type";
import {
DoubleUnicodePrefixReg,
Expand Down Expand Up @@ -59,7 +59,7 @@ export const middlewareMultiple = (
word: string,
options: CompleteOptions
): SingleWordResult[] | false => {
if (getStringLength(word) === 1 && options.multiple) {
if (stringLength(word) === 1 && options.multiple) {
return getMultiplePinyin(word, options.surname);
} else {
return false;
Expand Down Expand Up @@ -166,7 +166,7 @@ export const middlewareType = (
options: CompleteOptions,
word: string
) => {
if (options.multiple && getStringLength(word) === 1) {
if (options.multiple && stringLength(word) === 1) {
let last = "";
list = list.filter((item) => {
const res = item.result !== last;
Expand Down
4 changes: 2 additions & 2 deletions lib/core/polyphonic/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import {
getFinalParts,
} from '@/core/pinyin/handle';
import { getCustomPolyphonicDict } from '../custom';
import { isZhChar } from '@/common/utils';
import { isZhChar, splitString } from '@/common/utils';

interface BasicOptions {
/**
Expand Down Expand Up @@ -206,7 +206,7 @@ function polyphonic(

// 获取每个字多音字的数组
const getPolyphonicList = (text: string): SingleWordResult[] => {
return text.split('').map((word) => {
return splitString(text).map((word) => {
const wordCode = word.charCodeAt(0);
const customPolyphonicDict = getCustomPolyphonicDict();
const pinyin = customPolyphonicDict[wordCode] || DICT1[wordCode] || word;
Expand Down
13 changes: 12 additions & 1 deletion test/match.test.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { match } from '../lib/index';
import { match, customPinyin, clearCustomDict } from '../lib/index';
import { expect, describe, it } from 'vitest';

describe('match', () => {
Expand Down Expand Up @@ -90,6 +90,17 @@ describe('match', () => {
expect(result).to.deep.equal([2, 4]);
});

// Checks match() on a text whose first character ('𧒽') takes two positions
// in the string: the expected result lists both of them (0 and 1) and skips
// position 3, which holds the space.
it('[match]first&space', () => {
// Register a custom reading for '𧒽' so that the leading 'l' of the query can match it.
customPinyin({
𧒽: 'lei'
}, {
multiple: 'replace'
})
const result = match('𧒽测 试', 'l c s');
expect(result).to.deep.equal([0, 1, 2, 4]);
// Presumably resets the custom dictionaries so the entry does not affect later tests.
// NOTE(review): this line is not reached if the assertion above throws.
clearCustomDict(['pinyin', 'multiple', 'polyphonic']);
});

it('[match]nonZh match', () => {
const result = match('测uuuuuuuuuu试', 'cuuuuuu');
expect(result).to.deep.equal([0, 1, 2, 3, 4, 5, 6]);
Expand Down
3 changes: 2 additions & 1 deletion types/common/utils.d.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
export declare function getStringLength(string: string): number;
export declare function stringLength(text: string): number;
export declare function splitString(text: string): string[];
export declare function isZhChar(char: string): boolean;
Loading
Loading