Skip to content

Commit

Permalink
Merge pull request #243 from zh-lx/feature-2-unicode
Browse files: browse the repository at this point in the history
Feature 2 unicode
  • Loading branch information
zh-lx authored Jun 8, 2024
2 parents 38306a5 + e2d2011 commit a064200
Show file tree
Hide file tree
Showing 19 changed files with 18,121 additions and 5,524 deletions.
1 change: 1 addition & 0 deletions lib/common/constant.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/** Matches a string that is exactly one UTF-16 high (leading) surrogate code unit. */
export const DoubleUnicodePrefixReg = /^[\uD800-\uDBFF]$/;
/** Matches a string that is exactly one UTF-16 low (trailing) surrogate code unit. */
export const DoubleUnicodeSuffixReg = /^[\uDC00-\uDFFF]$/;
/** Finds every surrogate pair inside a longer string (global, for use with `replace`). */
export const DoubleUnicodeReg = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g;
/**
 * Matches a string that is exactly one surrogate pair, i.e. a single character
 * outside the Basic Multilingual Plane.
 *
 * Deliberately NOT global: a `g` regex keeps `lastIndex` between `test()` calls,
 * so an anchored whole-string check would succeed on one call and fail on the
 * next for the very same input.
 */
export const DoubleUnicodeCharReg = /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/;
export const enum Probability {
Unknown = 1e-13,
Rule = 1e-12,
Expand Down
18 changes: 10 additions & 8 deletions lib/common/segmentit/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import { minTokenization } from "./min-tokenization";
import { reverseMaxMatch } from "./reverse-max-match";
import { Priority } from "@/common/constant";
import type { SurnameMode } from "../type";
import { splitString, stringLength } from "../utils";

export const enum TokenizationAlgorithm {
ReverseMaxMatch = 1,
Expand Down Expand Up @@ -69,12 +70,12 @@ export class AC {
// 构建 trie 树
buildTrie(patternList: Pattern[]) {
for (let pattern of patternList) {
const { zh } = pattern;
const zhChars = splitString(pattern.zh);
let cur = this.root;
for (let i = 0; i < zh.length; i++) {
let c = zh.charAt(i);
for (let i = 0; i < zhChars.length; i++) {
let c = zhChars[i];
if (!cur.children.has(c)) {
const trieNode = new TrieNode(cur, zh.slice(0, i), c);
const trieNode = new TrieNode(cur, zhChars.slice(0, i).join(''), c);
cur.children.set(c, trieNode);
this.addNodeToQueues(trieNode);
}
Expand Down Expand Up @@ -161,8 +162,9 @@ export class AC {
match(text: string, surname: SurnameMode) {
let cur = this.root;
let result: MatchPattern[] = [];
for (let i = 0; i < text.length; i++) {
let c = text.charAt(i);
const zhChars = splitString(text);
for (let i = 0; i < zhChars.length; i++) {
let c = zhChars[i];

while (cur !== null && !cur.children.has(c)) {
cur = cur.fail as TrieNode;
Expand Down Expand Up @@ -220,9 +222,9 @@ export class AC {
if (algorithm === TokenizationAlgorithm.ReverseMaxMatch) {
return reverseMaxMatch(patterns);
} else if (algorithm === TokenizationAlgorithm.MinTokenization) {
return minTokenization(patterns, text.length);
return minTokenization(patterns, stringLength(text));
}
return maxProbability(patterns, text.length);
return maxProbability(patterns, stringLength(text));
}
}

Expand Down
50 changes: 41 additions & 9 deletions lib/common/utils.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import { DoubleUnicodePrefixReg, DoubleUnicodeSuffixReg, DoubleUnicodeReg } from './constant';
import {
DoubleUnicodePrefixReg,
DoubleUnicodeSuffixReg,
DoubleUnicodeReg,
} from "./constant";

/**
 * Returns the length of `text` with each UTF-16 surrogate pair counted as a
 * single character instead of two code units.
 *
 * Every match of `DoubleUnicodeReg` (imported from ./constant) is replaced by a
 * one-unit placeholder ("_") and the length of the result is returned.
 *
 * @param text The string to measure.
 * @returns The length of `text` after collapsing each matched pair to one unit.
 */
export function stringLength(text: string) {
// NOTE(review): the two return lines below look like the before/after sides of
// this diff hunk; they differ only in quote style — confirm against the repo.
return text.replace(DoubleUnicodeReg, '_').length;
return text.replace(DoubleUnicodeReg, "_").length;
}

// Handle characters encoded as two UTF-16 code units (surrogate pairs)
Expand All @@ -10,7 +14,10 @@ export function splitString(text: string): string[] {
let i = 0;
while (i < text.length) {
const char = text.charAt(i);
if (DoubleUnicodePrefixReg.test(char) && DoubleUnicodeSuffixReg.test(text.charAt(i + 1))) {
if (
DoubleUnicodePrefixReg.test(char) &&
DoubleUnicodeSuffixReg.test(text.charAt(i + 1))
) {
result.push(text.substring(i, i + 2));
i += 2;
} else {
Expand All @@ -21,10 +28,35 @@ export function splitString(text: string): string[] {
return result;
}

export function isZhChar(char: string) {
if (typeof char !== 'string') {
return false;
export class FastDictFactory {
NumberDICT: string[];
StringDICT: Map<string, string>;

constructor() {
this.NumberDICT = [];
this.StringDICT = new Map();
}
let code = char.charCodeAt(0);
return code >= 19968 && code <= 40869;
}

get(word: string): string {
if (word.length > 1) {
return this.StringDICT.get(word) as string;
} else {
const code = word.charCodeAt(0);
return this.NumberDICT[code];
}
}

set(word: string, pinyin: string) {
if (word.length > 1) {
this.StringDICT.set(word, pinyin);
} else {
const code = word.charCodeAt(0);
this.NumberDICT[code] = pinyin;
}
}

clear() {
this.NumberDICT = [];
this.StringDICT.clear();
}
}
54 changes: 26 additions & 28 deletions lib/core/custom/index.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import { acTree } from '@/common/segmentit';
import { Probability, Priority } from '@/common/constant';
import { splitString, stringLength } from '@/common/utils';
import { FastDictFactory, splitString, stringLength } from '@/common/utils';
import DICT1 from '@/data/dict1';
let customDict: { [key: string]: string } = {};
let customMultipleDict: string[] = [];
let customPolyphonicDict: string[] = [];
const customMultipleDict = new FastDictFactory();
const customPolyphonicDict = new FastDictFactory();

type CustomHandleType = 'add' | 'replace';

Expand All @@ -29,20 +29,20 @@ const CustomDictName = Symbol('custom');
* @param {CustomPinyinOptions} options multiple/polyphonic 对于 customPinyin 补充词汇的处理
*/
export function customPinyin(
config: { [key: string]: string } = {},
config: { [word: string]: string } = {},
options?: CustomPinyinOptions
) {
const keys = Object.keys(config).sort(
(key1, key2) => stringLength(key2) - stringLength(key1)
const words = Object.keys(config).sort(
(word1, word2) => stringLength(word2) - stringLength(word1)
);
keys.forEach((key) => {
customDict[key] = config[key];
words.forEach((word) => {
customDict[word] = config[word];
});
const customPatterns = Object.keys(customDict).map((key) => ({
zh: key,
pinyin: customDict[key],
probability: Probability.Custom + stringLength(key),
length: key.length,
const customPatterns = Object.keys(customDict).map((word) => ({
zh: word,
pinyin: customDict[word],
probability: Probability.Custom + stringLength(word),
length: stringLength(word),
priority: Priority.Custom,
dict: CustomDictName,
}));
Expand All @@ -58,23 +58,21 @@ export function customPinyin(

function addCustomConfigToDict(
config: { [key: string]: string },
dict: string[],
dict: FastDictFactory,
handleType: CustomHandleType
) {
for (let key in config) {
const pinyins = config[key];
splitString(key).forEach((word, index) => {
for (let word in config) {
const pinyins = config[word];
splitString(word).forEach((char, index) => {
const pinyin = pinyins.split(' ')[index] || '';
const wordCode = word.charCodeAt(0);
if (handleType === 'replace' || (handleType === 'add' && !dict[wordCode] && !DICT1[wordCode])) {
if (handleType === 'replace' || (handleType === 'add' && !dict.get(char) && !DICT1.get(char))) {
// 直接覆盖原词典
dict[wordCode] = pinyin;
dict.set(char, pinyin);
} else {
// 补充至原词典
dict[wordCode] = dict[wordCode] || DICT1[wordCode];
if (!dict[wordCode].split(' ').includes(pinyin)) {
dict[wordCode] += ` ${pinyin}`;
dict[wordCode] = dict[wordCode].trim();
dict.set(char, dict.get(char) || DICT1.get(char));
if (!dict.get(char).split(' ').includes(pinyin)) {
dict.set(char, `${dict.get(char)} ${pinyin}`.trim());
}
}
});
Expand All @@ -91,15 +89,15 @@ export const getCustomPolyphonicDict = () => {

export function clearCustomDict(dict: CustomDictType | CustomDictType[]) {
if (dict === 'pinyin' || dict.indexOf('pinyin') !== -1) {
Object.keys(customDict).forEach(function (key) {
delete customDict[key];
Object.keys(customDict).forEach(function (word) {
delete customDict[word];
});
acTree.removeDict(CustomDictName);
}
if (dict === 'multiple' || dict.indexOf('multiple') !== -1) {
customMultipleDict.length = 0;
customMultipleDict.clear();
}
if (dict === 'polyphonic' || dict.indexOf('polyphonic') !== -1) {
customPolyphonicDict.length = 0;
customPolyphonicDict.clear();
}
}
45 changes: 22 additions & 23 deletions lib/core/dict/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,36 +26,36 @@ export function addDict(dict: DICT | {}, options?: string | DictOptions) {
const name = typeof options === "object" ? options.name : options;
const dictName = name || DefaultName;
const dict1Handle = (options as DictOptions)?.dict1 || "add";
for (let key in dict as DICT) {
const value = (dict as DICT)[key];
for (let word in dict as DICT) {
const value = (dict as DICT)[word];
const pinyin = Array.isArray(value) ? value[0] : value;
if (stringLength(key) === 1) {
if (stringLength(word) === 1) {
addToOriginDict(
dictName,
key,
word,
pinyin,
dict1Handle
);
}
if (Array.isArray(value)) {
patterns.push({
zh: key,
zh: word,
pinyin,
probability:
typeof value[1] === "number"
? value[1]
: Probability.DICT * key.length * key.length,
length: key.length,
: Probability.DICT * stringLength(word) * stringLength(word),
length: stringLength(word),
priority: Priority.Normal,
dict: dictName,
pos: typeof value[2] === "string" ? value[2] : "",
});
} else {
patterns.push({
zh: key,
zh: word,
pinyin,
probability: Probability.DICT * key.length * key.length,
length: key.length,
probability: Probability.DICT * stringLength(word) * stringLength(word),
length: stringLength(word),
priority: Priority.Normal,
dict: dictName,
});
Expand All @@ -71,34 +71,33 @@ export function removeDict(dictName?: string) {

function addToOriginDict(
dict: string | Symbol,
key: string,
char: string,
pinyin: string,
handle: "add" | "replace" | "ignore" = "add"
) {
if (!originDictMap.get(dict)) {
originDictMap.set(dict, {})
}
const originDict = originDictMap.get(dict)!;
const code = key.charCodeAt(0);
if (!originDict[key]) {
originDict[key] = DICT1[code] as string;
if (!originDict[char]) {
originDict[char] = DICT1.get(char) as string;
}
if (handle === "add") {
if (DICT1[code] && !DICT1[code].split(' ').includes(pinyin)) {
DICT1[code] += ` ${pinyin}`;
} else if (!DICT1[code]) {
DICT1[code] = pinyin;
const existedPinyin = DICT1.get(char);
if (existedPinyin && !existedPinyin.split(' ').includes(pinyin)) {
DICT1.set(char, `${existedPinyin} ${pinyin}`);
} else if (!DICT1.get(char)) {
DICT1.set(char, pinyin);
}
} else if (handle === "replace") {
DICT1[code] = pinyin;
DICT1.set(char, pinyin);
}
}

function removeOriginDict(dict: string | Symbol) {
const originDict = originDictMap.get(dict) || {};
for (let key in originDict) {
const code = key.charCodeAt(0);
DICT1[code] = originDict[key];
delete originDict[key];
for (let char in originDict) {
DICT1.set(char, originDict[char]);
delete originDict[char];
}
}
Loading

0 comments on commit a064200

Please sign in to comment.