From 912f5ad1c866acfa40816c278e0f1b9d247850e5 Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Sat, 28 Dec 2024 17:12:53 +0800 Subject: [PATCH] rfct(crates/lang_unicodes): migrate to opencc_rs --- .github/workflows/cross-build.yml | 19 +- Cargo.lock | 120 +++++++++++ Cargo.toml | 9 +- Cross.toml | 12 +- crates/lang_unicodes/Cargo.toml | 8 + crates/lang_unicodes/build.rs | 186 +++++++++++++++--- crates/lang_unicodes/package.json | 16 -- crates/lang_unicodes/scripts/cn_char_rank.mjs | 52 ----- crates/lang_unicodes/scripts/hangul.mjs | 8 - crates/lang_unicodes/src/cjk_unicodes.rs | 6 +- crates/proto/Cargo.toml | 6 +- crates/proto/build.rs | 7 +- packages/ffi/Cargo.toml | 8 +- packages/grpc/Cargo.toml | 8 +- packages/server/Cargo.toml | 8 +- packages/wasm-edge/Cargo.toml | 8 +- pnpm-lock.yaml | 9 - 17 files changed, 338 insertions(+), 152 deletions(-) delete mode 100644 crates/lang_unicodes/package.json delete mode 100644 crates/lang_unicodes/scripts/cn_char_rank.mjs delete mode 100644 crates/lang_unicodes/scripts/hangul.mjs diff --git a/.github/workflows/cross-build.yml b/.github/workflows/cross-build.yml index d6a3c01e..3351e18c 100644 --- a/.github/workflows/cross-build.yml +++ b/.github/workflows/cross-build.yml @@ -91,13 +91,6 @@ jobs: sudo apt update sudo apt install -y nodejs llvm clang pkg-config libssl-dev ${{ matrix.platform.setup }} - - - name: Build Node - working-directory: ./crates/lang_unicodes - run: | - npm i - npm run build - - name: Set up Homebrew id: set-up-homebrew if: ${{ runner.os == 'macOS' }} @@ -120,8 +113,9 @@ jobs: with: toolchain: stable - - name: BUILD PROTO + - name: BUILD PROTO & lang-unicodes run: | + cargo build -p lang-unicodes cargo build -p cn-font-proto - name: Build binary @@ -129,10 +123,9 @@ jobs: with: command: build target: ${{ matrix.platform.target }} - args: '--locked --release -p ffi' + args: '--locked --release -p ffi --no-default-features' strip: false env: - CARGO_WITH_NO_EXTRA: 'True' HARFBUZZ_SYS_NO_PKG_CONFIG: 
'True' - name: Rename Package On Mac @@ -186,12 +179,6 @@ jobs: - name: Update Rust run: /home/runner/.cargo/bin/rustup target add wasm32-wasip1 - - name: Build Node - working-directory: ./crates/lang_unicodes - run: | - npm i - npm run build - - name: Build working-directory: ./packages/wasm-edge run: | diff --git a/Cargo.lock b/Cargo.lock index ea160734..901867a5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -294,6 +294,26 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bindgen" +version = "0.71.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +dependencies = [ + "bitflags 2.6.0", + "cexpr", + "clang-sys", + "itertools", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.90", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -365,6 +385,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -401,6 +430,17 @@ dependencies = [ "inout", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "3.2.25" @@ -480,6 +520,15 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +[[package]] +name = "cmake" +version = "0.1.52" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "c682c223677e0e5b6b7f63a64b9351844c3f1b1678a68b7ee617e30fb082620e" +dependencies = [ + "cc", +] + [[package]] name = "cn-font-proto" version = "0.1.1" @@ -1022,6 +1071,12 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "grpc" version = "0.1.0" @@ -1542,6 +1597,7 @@ name = "lang-unicodes" version = "0.1.0" dependencies = [ "lazy_static", + "opencc-rs", ] [[package]] @@ -1556,6 +1612,16 @@ version = "0.2.168" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5aaeb2981e0606ca11d79718f8bb01164f1d6ed75080182d3abf017e6d244b6d" +[[package]] +name = "libloading" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" +dependencies = [ + "cfg-if", + "windows-targets", +] + [[package]] name = "libredox" version = "0.1.3" @@ -1566,6 +1632,15 @@ dependencies = [ "libc", ] +[[package]] +name = "link-cplusplus" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d240c6f7e1ba3a28b0249f774e6a9dd0175054b52dfbb61b16eb8505c3785c9" +dependencies = [ + "cc", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -1654,6 +1729,12 @@ dependencies = [ "rxml", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.4" @@ -1729,6 +1810,16 @@ dependencies = [ "libc", ] +[[package]] +name = "nom" +version = "7.1.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -1769,6 +1860,29 @@ version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +[[package]] +name = "opencc-rs" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a45d82f519a62e7439dbccd6d1dcfc349588dca7099732ec031f2a97ee79e44a" +dependencies = [ + "libc", + "opencc-sys", + "tempfile", + "thiserror 2.0.8", +] + +[[package]] +name = "opencc-sys" +version = "0.3.4+1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e173b43a777be3d35c2c8734f23303308b2d1f5c5ab22e4b6ccd9387b3dfbe34" +dependencies = [ + "bindgen", + "cmake", + "link-cplusplus", +] + [[package]] name = "openssl" version = "0.10.68" @@ -2233,6 +2347,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" + [[package]] name = "rustc_version" version = "0.4.1" diff --git a/Cargo.toml b/Cargo.toml index 8e04ef13..569e1087 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,12 +11,12 @@ license = "Apache-2.0" [dependencies] cn-font-utils = { version = "0.1.0", path = "crates/cn_font_utils" } harfbuzz_rs_now = "2.2.6" -lang-unicodes = { version = "0.1.0", path = "crates/lang_unicodes" } +lang-unicodes = { version = "0.1.0", path = "crates/lang_unicodes", default-features = false } log = "0.4.22" md5 = "0.7.0" opentype = "0.38.1" prost = "0.13.3" -cn-font-proto = { version = "0.1.1", path 
= "crates/proto" } +cn-font-proto = { version = "0.1.1", path = "crates/proto", default-features = false } rayon = "1.10.0" unicode-range = { version = "0.1.0", path = "crates/unicode_range" } woff = "0.3.3" @@ -27,11 +27,14 @@ chrono = "0.4.38" [dev-dependencies] env_logger = "0.11.5" - [build-dependencies] cross = "0.2.5" tonic-build = "0.12.3" +[features] +default = ["with_extra"] +with_extra = ["lang-unicodes/with_extra", "cn-font-proto/with_extra"] + [workspace] members = [ "crates/cn_font_utils", diff --git a/Cross.toml b/Cross.toml index 8f2092b0..3d2b2278 100644 --- a/Cross.toml +++ b/Cross.toml @@ -5,31 +5,31 @@ zig = false # do not use zig cc for the builds context = "." # the context folder to build the script in. defaults to `.` [build.env] -passthrough = ["CARGO_WITH_NO_EXTRA", "HARFBUZZ_SYS_NO_PKG_CONFIG"] +passthrough = ["HARFBUZZ_SYS_NO_PKG_CONFIG"] [target.aarch64-unknown-linux-gnu] pre-build = [ - "apt update && apt install -y nodejs llvm clang pkg-config libssl-dev", + "apt update && apt install -y llvm clang pkg-config libssl-dev", ] [target.x86_64-unknown-linux-gnu] pre-build = [ - "apt update && apt install -y nodejs llvm clang pkg-config libssl-dev", + "apt update && apt install -y llvm clang pkg-config libssl-dev", ] [target.x86_64-unknown-linux-musl] pre-build = [ - "apt update && apt install -y nodejs llvm clang pkg-config libssl-dev", + "apt update && apt install -y llvm clang pkg-config libssl-dev", ] [target.aarch64-unknown-linux-musl] pre-build = [ - "apt update && apt install -y nodejs llvm clang pkg-config libssl-dev", + "apt update && apt install -y llvm clang pkg-config libssl-dev", ] [target.riscv64gc-unknown-linux-gnu] pre-build = [ - "apt update && apt install -y nodejs llvm clang pkg-config libssl-dev", + "apt update && apt install -y llvm clang pkg-config libssl-dev", ] [target.x86_64-pc-windows-msvc] diff --git a/crates/lang_unicodes/Cargo.toml b/crates/lang_unicodes/Cargo.toml index 0c229e1f..743bb42e 100644 --- 
a/crates/lang_unicodes/Cargo.toml +++ b/crates/lang_unicodes/Cargo.toml @@ -14,3 +14,11 @@ lazy_static = "1.5.0" [lib] path = "src/lib.rs" + +[build-dependencies] +lazy_static = { version = "1.5.0", optional = true } +opencc-rs = { version = "0.4.7", optional = true } + +[features] +default = [ "with_extra" ] +with_extra = ["lazy_static", "opencc-rs"] diff --git a/crates/lang_unicodes/build.rs b/crates/lang_unicodes/build.rs index 36c12881..6c3773c5 100644 --- a/crates/lang_unicodes/build.rs +++ b/crates/lang_unicodes/build.rs @@ -1,32 +1,166 @@ -use std::env; -use std::process::Command; +#[cfg(feature = "with_extra")] +use lazy_static::lazy_static; +#[cfg(feature = "with_extra")] +use opencc_rs::{Config, OpenCC}; +use std::collections::HashSet; +use std::fs; +use std::fs::{create_dir, exists, read_to_string}; +#[cfg(feature = "with_extra")] +const CN_SYMBOL: &str = "⸺、。〈〉《》「」『』【】〔〕〖〗︐︑︒︓︔︕︖︐︑︒︓︔︕︖︗︘︙︰︱︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"; +#[cfg(feature = "with_extra")] +const HAN_ZI_PIN_LV: &str = 
"的一国在人了有中是年和大业不为发会工经上地市要个产这出行作生家以成到日民来我部对进多全建他公开们场展时理新方主企资实学报制政济用同于法高长现本月定化加动合品重关机分力自外者区能设后就等体下万元社过前面农也得与说之员而务利电文事可种总改三各好金第司其从平代当天水市提商十管内小技位目起海所立已通入量子问度北保心还科委都术使明着次将增基名向门应里美由规今题记点计去强两些表系办教正条最达特革收二期并程厂如道际及西口京华任调性导组东路活广意比投决交统党南安此领结营项情解议义山先车然价放世间因共院步物界集把持无但城相书村求治取原处府研质信四运县军件育局干队团又造形级标联专少费效据手施权江近深更认果格几看没职服台式益想数单样只被亿老受优常销志战流很接乡头给至难观指创证织论别五协变风批见究支那查张精林每转划准做需传争税构具百或才积势举必型易视快李参回引镇首推思完消值该走装众责备州供包副极整确知贸己环话反身选亚么带采王策真女谈严斯况色打德告仅它气料神率识劳境源青护列兴许户马港则节款拉直案股光较河花根布线土克再群医清速律她族历非感占续师何影功负验望财类货约艺售连纪按讯史示象养获石食抓富模始住赛客越闻央席坚份士热限米银息校均房周游千失八检足配存九命尔即防钱评复考依断范础油照段落访未额双让切须儿便空往你层低奖注黄英承远版维算破铁乐边初满病响药助致善突爱容香称购届余素请白宣健牌促培竞巴稳继紧字困刘旅声超随例担友号显却监材且春居适除红半买充陈火搞图阳六察试太什执片古七球修尽控讲排粮武预亲挥卖审措荣洲卫希店良属险曾围域令站苏龙念罗吨器汇康减习演普田班待星飞写矿轻扩言章汽靠毛终仍景置底福止离泽波兰核降训逐票菜座献钢眼损宁像苦印融独湖早予夫编换欧努著顾征升态套介送某斗状画留航派室临兵补宝略黑综云差纳密贫剧犯阿击遇岁阶烈督吃丰馆招害官树听庭另沙私针胜贷网愿托缺园假酒音巨既判输讨测读洋括筑欢刚庆久陆找楼激晚绝压故互签汉草木亩短绍迎吸警藏疗贵纷授登探索湾宏录申诉秀序顺死卡歌午孩桥喜川邓扬津温船库订练候退违否彩棉帮拿罪币角召灾妇杨奋绩虽煤免笔够永圳停奥鲜朝吴岛觉移尼急博贯拥束左细舞幅语俄奇般简拍脑债固威券追筹刻映繁伟甚饭右彻烟沿街血冲洪植誉刊玉厅救潮迅伍怎付倍顿述播励斤乎纸振旧障鼓艰呼吉男绿尚夏亏季松哈祖典韩遍夜轮板抗摄杂皮贡借幕罚伤岸扶乱曲脱践危澳童散味叶累谢孙邮雄兼微呢谁惠偿署择染答块徐鱼赞课盛延瑞怀堂驻零辆齐胡途封似润守毕坦母雨败朱污趋械纺租灵拓残含握跨衣储瓦蒙鉴析竟骨档秘禁赵宾异伊智钟键辉跃冷倒庄毒仪哪涉泛宗鹏归岗雷礼尤休泰疾肥珠叫牛宜抵挂寻父攻佳塞架符裁虑肉启丽露鲁秋昌估射册若宽厚盾硬末轨饮勤茶诗郑冠涨篇泥唱纯坡熟浙晓抢丝锦载笑勇杰患乌坐雪戏背塔翻沈遗聚渠哥享迹森辽衡掌牧附操赶览野盟殊仁错萨夺梅误词董潜卷矛腐亮冒盖旗井凡震峰坏倾距壮惊盘梁摆径忠冰峡丹避珍乘刑扎透迫箱莫跑穿祝乏厦渐软询折浪朋敢诚弱疑邀沉端床络疆缩脚甘贴勒荒唐静缓侵句尊塑肃怕耕痛援劣伙挑洗暴冬龄乔餐肯廉跟阵伐悉忘闭奔恢宋泉杯渡吗奉婚赴恩盐掉洁亡洛聘蔬混摩抽鸡剂胆麦谋雅废贺羊阔唯捐返隆穷辛猪帐饰郭颁灯绕诸伴顶祥谓恶番敏旦劲缴麻屋跳码鞋扣迈忙趣盈棋勃敬辑摊旺纠炼梦偏渔牙侨黎赔裕宫谷概稿柱弹殖秩凭拨幸洞伪沟姓遭涌陶迁诺拔畅忧胞丁蓄贝舍腾杀煌圆伦横薄畜毫豪弟呈佛邦您墨徽惯循蓝烧触陕拖伯盲宪净卢炭籍秦粉妻爆欣释玩俊欠蛋猛迪苗暂貌遵锡楚桂昆杜皇醒燃凤截铺液撤胶慢杭虚辞曼毅咨俗糖忽芳姐耗妈谊浦频阻允宅窗默胀弃倡灭甲症埃滨赏莱拒淡坛陵绘虎竹赢锋篮迷纽轿贩递娘圈挖炉替幼乃郊颇戴滑徒崇涛焦凝墙吧炎刀玻寿履圣昨酸朗媒桑铜仲亦诞揭纵漫愈辟赠旱奶泳枪骗虫池镜浓拆艾扫娱钻碍寒迟邻曹盗穆豆赚晨浩彭耳瓜扭脸燕摇寄仿炮晋泪欲饱壁锁刷柬诊磨捕寨滚膨孔添帝辖炸旨吁址驶抱嘉拜扰袋佩阴辈锅赖剩押怪浮枚栏毁柳恐敦孟旁仓岩伸岭耐懂捷璃溪暖纤汗疫巧旋侧冶陪鸣瓶纲挤旬舆喝陷缘稻饲滩隔慰朴隐灌拟偷闲赫恰慧蒋闹邹牵柴刺滞彰俱勘填琛尝贾搬淮奏荷滋覆役秒踏巩摸荡辅惜柜肖颗搏氏姑弄姜君舒兑宇割哲摘钦逃漠忆敌宿啊凌耀闯阅贪赤汪悲抑瓷冯厉粗菲琴堡斌掘稀衰驾雕牢氛驱妥悄郎巡臣羽灰癌颖姆漏袭贤鸟暗茂孤惩榜袁桌卓傅剑堆兆狠轰拳妹绒裂潘兄洽叹涵贿侯岚熊绪阁尾碑尖腿涂栽坝犹铸肩闪诱辩芬睡奠伏妙乙绸廷夕恒梯赁霞攀枝译描湘磁吕硕爸肝峻葡衷搭唤薪挺逝狗蔡宴蓬撞铝牲舰胁崛桃斜丧烂屏砖墓详逾函跌抚插戈凉啤脉滥赋柏堤腰泊寺尘蒂削仙踪冻汤睛艳荐劫框廊惑页拼堪携丈乳挪谱舶埔遥菌塘氧晶洒株颜虹岳胸忍甜匹瞩懈爷丛莲叙鸿逢抬嘴弘炒喷吊窝衔吹霸仔垦胎慎脏歧疏悠慕漂杆萍舟吐玲凯戒盼偶盆慨弊箭茅衫罐串辐腹钩碰昂酬晰姿彼锻飘嫁竣缝蹈悬紫浅缆喊昔驰湿剪侦坑姚魏扑挣焕皆狂泡骤堵膜禽锐芝帽擅沪晤婆埋劝碗玛顷鸭娃豫匆魂哭庞亭屡逼尺撒鹿讼弥坊碎缔霍壤萄铃稍丘肿烦苹庙雇汛孝辰吞汰怨酿耶咱欺丢琼棚披渴屈弗疲帕
昭盒仰萧牺撑抛鼠纱翼兹骑糊契铭淘顽撰乒淑妆窑柔姻苍谨卿灿栋敲窃菊郁催眉邱揽鼎韦肤娜俏呀拚寸爬悟尿罢圭葬聪沃肠厕慈恋绵橡圾垃翁粤脂歹憾阐甸巷蜂轴艘垄衬阜惨冀幽厘崭筋寓迄渗碘碧赌袖奈崔悦捞剥孕逆婴脆缅艇谭笼儒粒诈遣垂磋卸帜枣幢淀帆蛇宰殿猎叔夹帅沧魅俩牟钓葛罕渤汕溢擦袱嫩桶殷酷呆卧暑骄幻囊掀醉牡饼扇蒸赣俭椅枢彦樊吾仗彬砂绳巾喀勋愁碱谦壳轧潭浆挽邢啥焊钞烤廖猫狱腔喻御蕴坎魔刮瘤茫竭莉链淫愤纹咸睐睹裤夸滴雾搜拘龚凶茨傲鞍鹤蚀颈翠卉汁冈狮隧弯胃沛募琳疼蚕泼磷捧炳绣朵涯掏奎聂孜韵浑翔魄掩斥敞腊愧粘丑溉斑啦柯谐烯禄浴涝鬼薛瘦挡昏鹅湛逻虾沂辱叉鼻厨鲍鞭辣潇乓肺尹颂邵澜桐鹰妨闽屠畏翰塌亟寂赂犬聊暨垫泄漆旭蕾坪涤挫佐瞄拦硫棒杏爽碳畔熙襄祸乾淹臂莎辜阎庸砍捉勾垒衍坤噪毯倪扮铅遏哀愉瑶咬嫌闸恳齿杠怒兽浇肇鄂溶哄棵盯梨灶屯狭陋啡浸淋濒脊戚勉膏氨墅沸挨蔓抄芒秉刹饶厢咖魁骚缚遂恨跻螺辨菇帷凰椒汝瞬淄舱馈桩炬誓卜麟岂兔眠泵拐肚匪芦匈霉蜜荆雁窄秧枯仆嘱壶谅哨肌贬叠稽岐沫肆醇菱彪躺摔膀甫逊凑渊喂藤砸悔杉霜厄忌桔筒丙臭拾芜禹丸蟹嘛俞翅尸澄骂睦馨郝贮陌钧轩赃笋歉逸歪巍萃崖窟踢锣萎庐剖籽甩饥苑恼渣痕莞硅晴巢瘫缠隶筛穴昼埠宠肢饿仑逮兢趟糕妮邪抹俑萌匠扔酱葱礁掺雀髓悼挚蔚枫庚伞侃僵捆蒜溜傻蔗谜斋蝶沾闷驳耿槽黔吓肾芽栗朽荫榆皖曰徊奴迭僻蓉靖氟滔羡愚尧俺徘罩磊镑舌曙纶粪匙钉佼扯踊躲猴纬咽酝挠宛瑰歇抒茧穗祭鑫趁痴裙猜耘碌锈晒潍弦稼狼拢梧芯眷哑宙厌逛谴邯呵蜡寥钥耸媳熏蚁惕颠娟亨吟蒲梭瞻渝喉遮慌夷韶焰尉珊胖蕉粹裹琦秽侠奸挝绑曝棍婉镶熬傍燥氯骆晃鸽疯琢聋瑟暇绥禅溃腺垮阀撼煮佣滕淤蹲栖硝睁荟荧抖坟芭臻锭晖倦倘喘邑锤惧荔毗觅矮恭钙氮缸瞧颤萝佑怡瘾寡烹摧棠缪雏韧喇兜坯坷贞仇缉帘竖糟猖懒凿洼喧谣驼烫锌椰崩沥汾磅霖棘扛彗矩瞒陇绎诫斐卵铮钾宵簿秤畴斧擂剔躁冤讳寅焚漳鳖哺耻僧琅粟怖咏蜀淳柑缕烁氢蔽琪泣阮镀殴虞虐炊搁诀掠坠屿髦酋躯吵遐寞仕稚僚楠矶筝彝叮熔槐潢芹郸匾咋玄裔陡哗怜襟刃脾嵌拱慷痪跋孚峪钊滇苟晕墩膝羞乍腻詹讶敷肴莹衢柿朔袜枕烘匀歼泻樱吻翟堰苯隙娇獗汲蛙斩靡沁乞姨翩沼嘎畸矫骏薯绚窜藻矗皂楷腕篷徇耽娼犁榻茄棕汹峨蹄昧奢涩灼踩粥拣旷簇溯攒沓呕梳搅砌纫渭澡撕漓葆辍肪祁鞠蛮捏诵娣岱瀑啸裸鸦瑛躬舜忱豹纂恤惟赐俯犀媚嫂嗓蚊茬驭缀皱凳钮蚂姬扒嫖跪凹揣尬沦尴豁玫殡淌叭唇啃裘卑琐矢拯忡勿盎茵椎脖拂骅葫迢薇龟绞眶沐傣浊舅叛浚窘栓酶笛泌榄惹铲碟捡恪酯滤匿酵砚贼匮熠鳞麓镁氓苇廓巫踵竿蘑翘梓贻鳗帼冉泓狐涟崎窍瑜讽逗铎掷璀泗浏陲醋苛攘璧瀚哩暮矣蚌悖扼漯烛蝴屑墟俘侣庇陀煎秸弓捣譬炜炯拌扁彤锚禾侮秆绮嚣樟咐枉窦桦寇哉狸耍馒驹隋冕疮咄妄峙娄溥腑钠栩糙滦呐鲻娶祺刨褒橙茹谎抉慑媛橄戎迩雯璨雍惶扳桢霓账梗炕裴韬杖痹缤沽燎煞删辙爵缭劈烨槌媲凛莆颅锯膳澎坞瓣婷絮酌涡唁秃禺膊棣芸忻炽榨篆憨戍圩爹蹊饪胺贱睫蝇惫拇赈泾盏弧剿硒毓皓菏灸湄炙祠荻捍嚼朦屹紊藜驴寝兮隘祈榕臧蝉绢瞎闵鳌娥藉娅烽楂摒凄凸熄孵叩渎胳匡袍卒怠桓莽倩泸藕陨辗骋峭冥饺亢圃颐擒铵鳄簧愣璜钰拙瘠靳隽罹岑镭榴恕毋囤汀绽窖筷擎猿诲碾夭筐邃藩诬芙胚哇垣胧帖殉毙壑绰憋亥涅屁璞缮侍倚稠棺棱葵诣笨橱寰郡垢徕眺胰谆窥霄栉舸蹦坂瞪珲釉跤挟侄肘嘲刁缎嚷痒敛祛绅孰痫闺椿噶恍伶峦酥萦苎癫涪锲蜚拎嵩昊娴涣烙璋笃囚祯篱讴舷纭锄巅卦摹眸柄踞焉辄褚褐湃夙堕岔惦疚谍奕羚帧澈濮捎漾吼锰趴菩簸仃渲札谙咕桨咀郴咳呜蛟拧莘驯庵弼逞蹬姥撂镍晏疡爪骥楞钳懋寐淇琉杞菠铨翌靶侗瑙馅丐痊娓侈苓聆睿偌釜噬曦燮哟瑾瞿璇拮憬鹊勺憧嗜啼檐柚呱渍镌妃溺鸥粕沱榭隅毡禧瞅鲸淆阪茁渺瞥茜瘟礴伺谛锹蔼虔莺迸磕赡泱栈甄镐抠嬉诿甬绊饵谬梢颍揪琶褥佟腥辊溅琵鄯拴喃笙酰粱卤芮膛斓潼鸵侥讷婿吆羁嗣蜒栅疙拷戳镛芷钛蜿铀夯摞雌酣荼蝎锥姊瓢祀玺弛犷哦茸鱿绷茎惋亘珑莓掂迥鲤殃瘩叨螃奄腈疟沭钨昕膺涿糠氰揉狩檀悍缫哮衙瑚潞谤搀洱涓袤痰乖冗芋甭骸幌涮俨敖槛狄牒恺雹赎庶熨蛛佰蓦鄱煽腌黯疤倔剌斡诽锵筱妍掖铿脐捅弈邸湟眯赦拄啪玮轶蛾麋炫赊靴箔菁撬裳戌缨蝗撇奚瀛噩怯蓓匕咚瞰佬泞扉皋晾麒姗跚瘀鄙猕拭鲟祷脯砺驿陛瘁搓舵汞哼胫珀邬磺馏馍铢诧涧吏苔潺邳烷囿斟滁殆酚狡孺恬沅铬湍啧囱蒿鹃柠漱胥妖洙珂茉蹒圻鬓搂葩佘渥诙袒捂瞠妓铐澧袂馁汐匣逍谚窒蔑糯汶壹岖盔嘘迂嘀锢讥吭抨屎獭褪咫稷迦檬塬蠢蓟咎皿驮俐坍惭垛鹭鸾蹴撩诠恙臃遨睬踌浒搪郧竺翡宦冽憩萱拽卞槟躇蘸肋呛濡酮眨撮矸垸蛀黛涸脓徙撷曳峥渚镖钴骊袅磐掣沌埂嘿琏楣豚诡悸麝煦矾羲唉溧呻覃兖吱惰羹钝枸姣颓铣梆骇淅孢叱谧泯谟恃薹筵鏖栾鹜哽掬辘茗瓯绛筠铤袄殚梵挎遴榈蜕癣垠厮幄偕焱攥裨炖旮旯蔺骡娩伫猝窿虏屉缜咒筏骼璐剃涕猗淼侬阙嗅鸳嘈霏珩沮捺硼荃驷漩嘻眩掰伽脍婪煜鹄壕崂翎痞兀婺鸯楹咤徜嫉篓烃铂咪掐匝杼蕃箍荤砾嘶皑宕荪哎汴貂邡淦蕙弩堑惬偃徉箴赘啻凋穹酗憎芥唾闫晔苞昶甙笺吝蕊鳝衅猩薰昱趾淞坳怅翱汩琥岌阑粼羌霆篡塾酉裱韭唠廿闰攸黝蛤厥荞瑕柘祚疵愕蕨牦飨疹嗷癖芪漕隍徨逵泠
嵘嗡岫岷擞陂颊咔卯婶椭惘歙幺臆叽缰睽勐暄弋痔秭煲琮嘟犊玖怦丕溴罂瓮丞惮癜晦攫镰镯柞舫铆蹼妩熹铱褂丫笆妒噢噙琬冼荀蟾捶嗒町嫣肮皎旖恣钚砥吩茯馥钎甥嗦蜗浔谒辫亳彷珏咯淖妊佤玷嘹崴於辕贲扈伎旎孽耙娠戊冢跷砷焘羔圪耄钼悻荥唑稞邝莅杷醛嗽唆拗碴馋胱琨茏糜懦骞蜘嚓怵抡唢腆涎灏臼墒暹椽牍钒猾榔懵枇樵锶籼箫漪帚钵赓捻郅儋烬锂剽锑鄢鄞臾喳胄耋阱笠瓴啬杳萤莠嶂浜傩遒轼睢倜矽仉唬旌酪腼罄嬗畲祟桅悴讹憔龋嵊绶邕忖箩咆晌愫猷帛麾莒觑吮蟋庥懊阂蒯阡腮潸晟蟀臀罔骁崽绉粽忿肛蠡遛蜓煊蚜坻滹銮悯鼐撵噼忐湮侏粳矍铄坨铉盂锗阖溟俟忑赝鬃敝宸哆靓揩瘸鲅篝氦嚎浃缙飚锷癸柩蛎濂榷鲨钡盹鲫诘诩迤桎遁尕梏楫赳飒锃雉怆痼劾痢喽霹昙畹胭佚狈瘪姹吠铧谏雳咙畦荠娑褶忏惚痉橘漉诏呗晁惆砀馄戟峁昵拈蠕虱洵鹦蛹铛挛倏澍濉钅噜咛俳磬蜷霎肽砼聿怔砭谌箕蹶孪蔷糅挞饨惴禀淙哒枷楝闾蜻嗖淬垩矜郗蚤嫦喋镉饯髋潦镂簌偎鹉岙踱诃籁宓膘飙涞耆荏渑豌琰俎绌埭幡赅锆崮碣珞腋滢蓖伉馗聩幔锨蓥鹑砝酩枰鞘苋粑蹭倌犟俪嶙砻嵋滂葺苒枭翊婀飓阚喟傈藐蜃怂稣亵诒蜇岜霁瞌沏卅舀鹌俸嵇蟒汨砰鞣唏陉佯恿竽瘴祉焙诋濠螂叻垅谩朐稔芍瞳惺萸盅啄眈偻爿蟠炔垭噎蛰擘锏茭悌喔谑峋妪恽韫褓镳饽杈戛鸠萋襁榫霭苄跺杲嗨珉哌娆孀恸缄夔佗饷苷郜鼾颌訇谲溘咧褛逄颦洮逶嫡蠹碓烩醴栎鎏瓤伢蔫怿甾摈畈镣螨秣搔盱痍搐蹉佃绂疽骝霾悚缃懿咂奘轱邗蚝瘘醚湎瞑掮羟仨砣郢砧鳟跛踝轲窠郦踉躏戮篾骐鳍蹂郯跎倭诅鄄褴阆缈嗯妞沤跄箐苕窕楔饴峄腴圄谕揍踹罡佝颔觊篑鲢綦妾镗啕蚬窈揖眙蟑诛钗绯讣睾媾嗬祜镢囹苜坭蛐髯搡叟蹋觎捱碉呋罘荚鹫岿寮扪焖狞鳅嗄嗤擀痂嗟颉蚧儆锴龛嗑锟俚枥懑讫橇嗪虬跆骧陟灞恻涔酐鸪牯钜萘鲶缥曜蚓诤埕墀麸蝠蛊遑厩趄沔耦疱匍揿蚯讪唰舔呷蓿鹧膑刍耷鞑裆趸孑鲲绫埝嘭舢鸢螯吡蝙疸匐桁铠羸鲈囵唛仫庖劭郓骜粲峒腓鹳鳜蚶囫茴峤蟆蘖癯纾僳皙隰缬馐谪捭汊碜塍艮睑狍苫篦蜍锉沣诰晗喙麂謇蹇觐啾踽邈壬燧娲猥歆镒茔昝赭狰孳哧舛噔鹗蚣逅洹腱锒纰蛆蕤姝邰纣嘣钹衩婵孱蹿鲷萼椁浣镓遽赉趔蕲剜邂仡氤獐幛俾铋嗔茌氡诂豢桧畿倥捋仞忒疃浯蜈榛偬稗菖鲳厝踮叼痱貉玑婕琚疴掳钤垧氵黠跹怏揄氲铡濯芾笈崆钕菽隼傥仝囗芗埙簪暧桉镝蚪蜉藁笳菅龃喹橹抿啮蹑逖唔樨巽揶黟訾钣嵯凼恫掇剁珙沆噱揆耒铌泅疝葳隗滟龉钺殒蒡觇黜澹酊垡奂珈濑馕馊嚏痿岘氩茱滓焯抻豉敕掸碲靛摁淝鳏盥皈鲑颢犄翦铰椐胯屺邛庹猬蓊骛浠桠胤鸩痣蛭噌杵啜靼啶煅枋觥毂刽蝈蘅芨戬醮疖忾骷洌呤荦觞谡瀣蝣糌倬碚蹙痘砘绀虢蕻肓蛔唧桀蝌侩棂樯挈轫巳崧蓑藓鳕瑗帙馔豺痤郇殓髅轳逯嗫戕嚅蛳琊嘤疣蚱钯钿碇咣毽迳喱逦廪邙囡匏扦亻咝凇纨涠庠溆醺炀烊肄龈谀锱瘢枞皴贰晷闳斛屐讦婧苣蔻绺渌瑁螟叵颀穑膻羧螳绦誊蜥楦恂靥咿翳瓒枳啭樽嫒婊搽铒跗凫菡篁髻裾栲癞蓼氖孬喏砒姘衽缛嵬挹缢慵呦箸蹩槎榇舂嗲胴谔岢圹娌潋蛉酃鲵鲇娉亓碛芊忪谇笤韪勰呓俣圜愠仄炷毖筚伧棰磴滏篙肱笕堇馑荩榘哐傀崃罱痨儡鹂檩垴仵檄芎阉刈壅馀庾妯躅獒阊笞饬钏硐椴泔硌鹘鳇豇狙戡莨啉辇臬殇舐黍薮眭佻嗵煨莴蚴妤瘐擢蛏蹰龊辏绐氘骶莪珐缟聒讧岬胛桷谰戾撸鸬雒嘧囔铍骈掊茕噻铯柁艉龌硖罅魇酽咦嶷羿轸趵荸薜踟玳啖蔸槁鲛疥砬唳弭曩黏镊泮霈淠柒颧瘙痧辋郄燹泫郾鹞钇殪痈甑踯翥婢檗柽啐菪嶝腭嗝剐笏蟥戢阄噘撅尻贶辚蜢颞忸胼阕竦焐揠邺鳙啁稹徵诌隹舨哔卟伥苌鹚箪缍锇蝮诟洄浍诨犍硷噤垲郐椤嫫伲脲殍噗溱箬厍钽钍恹鬻爰砦蓁胝颛褙鳊邴铖镫腚钭颚鲂悱狒佶偈堀绔醪坜疠椋犸暝佞哝瞟荨芩逡溽裟挲抟暾崦芫荑薏莸欤栀斫镞嗳鸨跸骠俦谠簟棼驸掼倨橛犒邋耧蝼虻铙郫汔诮楸阒绻叁臊钐腧闩菘阗忝橐翕阋踅窨鹬鼋樾錾吒旃弁侪坼蚩嘬糍骢氐呃榧玢绋蚨钆岣菰罟嘏埚绗嚯藿笄袈羯肼暌啷蒗蜊獠鬣熳黾乜镆怩驽旆髂仟芡谯恁鳃艄莳艏趿遢鲐醍僮氽刎芴喑墉昀箦鄣摺钲贽缵鏊锛瓿廛瘳亍遄褡垌椟酆砩桴赙坩臌曷跽湫榉黧猁钌镏缦殁赧埤悭缱衾鲭铩猞眚铈谥耜飕饕餮骰乇绾鹇鲞爻蜴镱铟莜祗濞镔逋谄谶酲茺樗憷莼撺柢阏砜垓旰妫衮嗥郏鞯徼孓钪侉夼跬铼嫘蟊茆睨怄蹁谝嘌綮嫱筇犰穰铷筲哂炻豕秫笥涑铊帏闱鋈舾屣狎哓噫璎铕宥阈豸辎趑龇捌秕荜愎窆镲谗踔苁酢呔聃镦屙鲱鬲膈铪醐獾鲩虺葭牮礓苴讵颏裉诳栌氇镙哞柰袢帔睥苤嫔笸氆佥箧跫蚋鲥扌狲桫溏铽殄脘洧肟绡咻洫癔洇嵛磔胗肫赀眦吖瑷埯畚妣飑豳髌砗铳楮蔟毳锝堞疔葑缶菔疳彀胍磙顸薅翮猢怙蒺廑妗髁醌粝魉旒蝥缗衲呸醅芘蚍圮榀萁苘逑诎劬蕖朊剡蟮椹饣酞帑葶菟魍庑葸氙谖鞅狺夤嬴瘿饔雩鹆橼赜潴骓缁诹怍杓艹檫媸氚呲殂矬笪迨纛簦玎苊轭匚鼢呒缑诖炅鲧唿戽鬟恚袷瘕枧洚桕雎蠲剀诓瘌镧铑鳓蓠呖跞裢裣埒捩鲮熘嵝瘰镘脒腩筢耪辔牝嘁蛴戗蛩巯悫葚熵绱蛸螫毹妁纟嗾鳎绨粜菀沩鼯牾螅顼泶蕈鼹繇苡悒廴吲喁卣牖笮舴罾棹鸷碡锕嗌媪龅甏箅傧啵鹁晡氅魑篪怛籴礅蒽珥钫绠觚鸹涫颃篌锪蠖乩咭赍嵴铗湔槿赆僦皲佧箜蒉缧酹嘞疬臁膦泷蒌泺荬颟旄泖镅蠓冖幂耱襻鼙攵炝愀蘧氍犭禳桡糁馓酾槊狻锬羰鼗鹈畋髫萜堍璺怃崤囟睚痖菸餍徭瘗唷圉蜮砟谵澶朕摭轵诼笫廒聱庳髀笾龀裎雠蝽腠妲刂铥黩怼沲蘩趺苻拊阝鲋戆纥哏鲠
笱瞽庋簋刿掴猓蚵槲觳萑癀蟪钬虮掎鲣囝裥踺茳糨鹪狷麇芤刳愦髡悃缡鲡鳢奁墚尥柃胪镥脔杩劢墁玟蝻呶搦湓罴蜱俜鲆皤镨槭镪黢洳枘芟埏渖筮殳飧溻饧樘醣酡圬粞觋莶霰榍薤髹曛疋迓衤欹佾埸霪茚鼬伛瘵骣畛卮轾彘觯锺邾槠谘嵫髭蕞犴鞴畀滗煸褊冫孛羼耖褫彳艟辶茈璁爨榱萏坫鸫篼簖裰哚蹯瀵怫陔筻廾蛄绲崞蜾盍荭黉糇骺後鲎煳鹕冱瓠逭漶耠镬齑殛鲚跏蛱搛缣鹣僬噍衿缂喾狯纩栝蛞稂塄嫠詈蠊鹩躐鹨簏膂脶嬷昴瞀浼艨祢縻蘼芈糸宀眄鹋杪咩愍麽瘼鍪硇猱茑脬蟛貔仳犏钋芑葜愆锓蠼筌鬈蚺荛埽潲诜埘弑嗍蒴鸶缌澌姒蔌睃缇梃彖鼍芄隈鲔硪忤痦欷僖醯鼷跣枵忄擤勖痃碹谳轺铘圯纡窬窳饫蓣瀹趱驵缯揸笊絷跖舯螽籀舳粢驺陬阼揞菝魃癍鹎坌狴萆嬖襞碥髟鳔醭螬馇虿瘥惝怊鸱螭瘛帱徂汆脞瘅忉羝睇瓞鲽岽胨芏佴燔偾稃郛莩幞澉槔袼搿茛鞲觏酤牿鲴宄匦呙馘焓瘊虍岵鹱咴隳缋溷夥剞洎恝蒹谫僭艽挢敫卺冂扃锔窭锩觖劂氪骒哙悝蝰诔苈篥娈瞵锍栊癃舻辂稆猡蛑甍艋敉眇蠛侔镎肭艿蛲疒陧衄锘堋庀擗甓螵钷攴桤褰凵肷锖鞒吣黥俅蝤璩悛辁肜颡谂礻摅汜溲嗉荽闼骀炱螗耥裼铫莛亠箨蕹迕杌寤穸饩舄禊猃绁渫廨獬硎荇鸺貅糈揎镟獯讠厣罨蛘鳐崾舣媵尢蚰侑狳螈龠昃痄搌浈埴夂黹酎橥丶缒窀菹锿砹茇勹邶鐾舭忭缏灬瘭踣钸礤骖黪艚锸猹镡躔蒇冁鬯屮枨眵傺搋巛舡楱镩鹾戥觌阽铞垤揲蹀耵髑憝鸸鲕篚镄鲼唪祓艴黻黼鳆尜戤塥哿虼遘桄丨胲醢撖嚆薨堠烀轷锾缳擐哕阍劐攉丌墼蕺彐芰哜戋趼楗耩喈卩鲒骱刭弪獍鬏鞫犋屦醵桊爝捃胩锎蒈莰闶钶锞眍筘阃漤铹栳耢仂泐檑轹蔹懔垆锊倮蠃鞔硭漭猸鹛钔瞢礞喵苠鳘貊貘毪坶攮猊嬲肀聍甯狃耨孥胬恧蒎锫陴氕丿裒镤蜞岍搴箝慊椠蜣硗劁缲檎螓圊檠謦銎赇鼽糗麴鸲磲畎狨蝾薷襦颥蓐脎毵磉鳋唼歃彡骟滠矧胂蓍鲺贳搠厶兕锼螋瞍觫赕铴瑭慝掭祧龆蜩鲦茼酴煺柝腽軎阌阢诶菥蓰柙祆筅葙蟓魈躞砉醑儇岈砑珧酏劓堙撄潆舁蝓燠眢箢掾刖狁拶唣迮帻谮哳齄膪嫜忮骘膣踬荮瘃麈疰丬浞禚觜耔腙鄹鲰躜撙胙"; + +const CN_CHAR_RANK_FILE: &str = "./data/cn_char_rank.dat"; +const HANGUL_SYL_FILE: &str = "./data/hangul-syl.dat"; + +#[cfg(feature = "with_extra")] +fn encode_utf16(s: &char) -> u16 { + let mut buf = [0; 1]; + *s.encode_utf16(&mut buf).iter().next().unwrap() +} + +#[cfg(feature = "with_extra")] +lazy_static! 
{ + static ref OPENCC: OpenCC = OpenCC::new([Config::S2T]).unwrap(); +} + +#[cfg(feature = "with_extra")] +fn opencc_convert(s: String) -> String { + OPENCC.convert(s).unwrap() +} + +/* +@author modified by konghayao +@link https://github.com/sxei/pinyinjs/blob/master/other/%E5%B8%B8%E7%94%A86763%E4%B8%AA%E6%B1%89%E5%AD%97%E4%BD%BF%E7%94%A8%E9%A2%91%E7%8E%87%E8%A1%A8.txt,modify by konghayao + +常用6763个汉字使用频率表 + + +原文地址:http://blog.sina.com.cn/s/blog_5e2ffb490100dnfg.html + + +汉字频度表统计资料来源于清华大学,现公布如下,仅供参考。 + 使用字数 6763 字(国标字符集),范文合计总字数 86405823 个。 + 说明如下: + + 假若认识 500 字,则覆盖面为 78.53 % 。其余类推, + +列表如下: +字数 覆盖面( % ) + 500 78.53202 + 1000 91.91527 + 1500 96.47563 + 2000 98.38765 + 2500 99.24388 + 3000 99.63322 + 3500 99.82015 + 4000 99.91645 + 4500 99.96471 + 5000 99.98633 + 5500 99.99553 + 6000 99.99901 + 6479 100.00000 + 6500 100.00000 + 6763 100.00000 + +*/ + +#[cfg(feature = "with_extra")] +fn process_chinese_chars() { + let sc: Vec<char> = + CN_SYMBOL.chars().chain(HAN_ZI_PIN_LV.chars()).clone().collect(); + let tc: Vec<u16> = sc + .iter() + .map(|i| opencc_convert(i.to_string()).chars().next().unwrap()) + .map(|i| encode_utf16(&i)) + .collect(); + let sc: Vec<u16> = sc.iter().map(encode_utf16).collect(); + let hashset_tc: HashSet<&u16> = HashSet::from_iter(tc.iter()); + let common: Vec<u16> = + sc.iter().filter(|i| hashset_tc.contains(i)).copied().collect(); + let hashset_common: HashSet<&u16> = HashSet::from_iter(common.iter()); + let sc_set: Vec<u16> = + sc.iter().filter(|i| !hashset_common.contains(i)).copied().collect(); + let tc_set: Vec<u16> = + tc.iter().filter(|i| !hashset_common.contains(i)).copied().collect(); + + println!( + "common: {}\tsc_set: {}\ttc_set: {}\t使用 uint16存储", + common.len(), + sc_set.len(), + tc_set.len() + ); + let data: Vec<u16> = common + .iter() + .chain([0].iter()) + .chain(sc_set.iter()) + .chain([0].iter()) + .chain(tc_set.iter()) + .copied() + .collect(); + fs::write( + CN_CHAR_RANK_FILE, + data.iter().flat_map(|&x| x.to_le_bytes()).collect::<Vec<u8>>(), + ) + .unwrap(); +} + 
+// 处理韩文字符频率分布 +// 来源 http://nlp.kookmin.ac.kr/data/syl-2.txt +#[cfg(feature = "with_extra")] +fn process_korean_syllables() { + let data = read_to_string("./scripts/syl-2.txt").unwrap(); + let binding = + data.split("=====================================").collect::<Vec<&str>>(); + let content = binding.get(1).unwrap(); + let chars: Vec<char> = content + .split("\n") + .map(|i| { + i.split(" ") + .map(|i| i.to_string()) + .collect::<Vec<String>>() + .first() + .cloned() + .unwrap() + }) + .filter(|i| !i.is_empty()) + .map(|i| i.chars().next().unwrap()) + .collect(); + fs::write( + HANGUL_SYL_FILE, + chars + .iter() + .map(encode_utf16) + .flat_map(|x| x.to_le_bytes()) + .collect::<Vec<u8>>(), + ) + .unwrap() +} + +#[cfg(feature = "with_extra")] fn main() { - if env::var("CARGO_WITH_NO_EXTRA").is_ok() { - return; - } - // 检查 npm 是否存在 - if Command::new("npm").arg("--version").status().is_err() { - panic!("npm is not installed or not in PATH"); + println!("cargo::rerun-if-changed=scripts/syl-2.txt"); + println!("cargo::rerun-if-changed={}", CN_CHAR_RANK_FILE); + println!("cargo::rerun-if-changed={}", HANGUL_SYL_FILE); + + if !exists("./data").unwrap() { + create_dir("./data").unwrap(); } - // 执行 npm run build 并捕获输出 - let output = Command::new("npm") - .arg("run") - .arg("build") - .current_dir( - env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| ".".to_string()), - ) - .output() - .expect("Failed to execute 'npm run build'"); - - // 打印命令的输出和错误 - println!("status: {}", output.status); - println!("stdout: {}", String::from_utf8_lossy(&output.stdout)); - println!("stderr: {}", String::from_utf8_lossy(&output.stderr)); - - // 检查命令是否成功完成 - if !output.status.success() { - panic!("'npm run build' failed with status: {}", output.status); + process_chinese_chars(); + process_korean_syllables(); +} + +#[cfg(not(feature = "with_extra"))] +fn main() { + if exists(CN_CHAR_RANK_FILE).unwrap_or(false) + && exists(HANGUL_SYL_FILE).unwrap_or(false) + { + println!( + "feature with_extra is not set, data process will be 
skipped...exit" + ); + } else { + panic!("feature with_extra is not set but data file not found"); } } diff --git a/crates/lang_unicodes/package.json b/crates/lang_unicodes/package.json deleted file mode 100644 index 7879c8e8..00000000 --- a/crates/lang_unicodes/package.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": "lang_unicodes", - "version": "1.0.0", - "description": "", - "main": "index.js", - "scripts": { - "build": "node ./scripts/cn_char_rank.mjs && node ./scripts/hangul.mjs" - }, - "keywords": [], - "author": "", - "license": "ISC", - "devDependencies": { - "chinese-simple2traditional": "^2.2.2", - "fs-extra": "^11.2.0" - } -} diff --git a/crates/lang_unicodes/scripts/cn_char_rank.mjs b/crates/lang_unicodes/scripts/cn_char_rank.mjs deleted file mode 100644 index 0613fafb..00000000 --- a/crates/lang_unicodes/scripts/cn_char_rank.mjs +++ /dev/null @@ -1,52 +0,0 @@ -/** - * @author modified by konghayao -@link https://github.com/sxei/pinyinjs/blob/master/other/%E5%B8%B8%E7%94%A86763%E4%B8%AA%E6%B1%89%E5%AD%97%E4%BD%BF%E7%94%A8%E9%A2%91%E7%8E%87%E8%A1%A8.txt,modify by konghayao - -常用6763个汉字使用频率表 - - -原文地址:http://blog.sina.com.cn/s/blog_5e2ffb490100dnfg.html - - -汉字频度表统计资料来源于清华大学,现公布如下,仅供参考。 - 使用字数 6763 字(国标字符集),范文合计总字数 86405823 个。 - 说明如下: - - 假若认识 500 字,则覆盖面为 78.53 % 。其余类推, - -列表如下: -字数 覆盖面( % ) - 500 78.53202 - 1000 91.91527 - 1500 96.47563 - 2000 98.38765 - 2500 99.24388 - 3000 99.63322 - 3500 99.82015 - 4000 99.91645 - 4500 99.96471 - 5000 99.98633 - 5500 99.99553 - 6000 99.99901 - 6479 100.00000 - 6500 100.00000 - 6763 100.00000 - -**/ -import { toSimplified, toTraditional } from 'chinese-simple2traditional' -const cn_symbol = '⸺、。〈〉《》「」『』【】〔〕〖〗︐︑︒︓︔︕︖︐︑︒︓︔︕︖︗︘︙︰︱︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄' - -const hanzipinlv = 
"的一国在人了有中是年和大业不为发会工经上地市要个产这出行作生家以成到日民来我部对进多全建他公开们场展时理新方主企资实学报制政济用同于法高长现本月定化加动合品重关机分力自外者区能设后就等体下万元社过前面农也得与说之员而务利电文事可种总改三各好金第司其从平代当天水市提商十管内小技位目起海所立已通入量子问度北保心还科委都术使明着次将增基名向门应里美由规今题记点计去强两些表系办教正条最达特革收二期并程厂如道际及西口京华任调性导组东路活广意比投决交统党南安此领结营项情解议义山先车然价放世间因共院步物界集把持无但城相书村求治取原处府研质信四运县军件育局干队团又造形级标联专少费效据手施权江近深更认果格几看没职服台式益想数单样只被亿老受优常销志战流很接乡头给至难观指创证织论别五协变风批见究支那查张精林每转划准做需传争税构具百或才积势举必型易视快李参回引镇首推思完消值该走装众责备州供包副极整确知贸己环话反身选亚么带采王策真女谈严斯况色打德告仅它气料神率识劳境源青护列兴许户马港则节款拉直案股光较河花根布线土克再群医清速律她族历非感占续师何影功负验望财类货约艺售连纪按讯史示象养获石食抓富模始住赛客越闻央席坚份士热限米银息校均房周游千失八检足配存九命尔即防钱评复考依断范础油照段落访未额双让切须儿便空往你层低奖注黄英承远版维算破铁乐边初满病响药助致善突爱容香称购届余素请白宣健牌促培竞巴稳继紧字困刘旅声超随例担友号显却监材且春居适除红半买充陈火搞图阳六察试太什执片古七球修尽控讲排粮武预亲挥卖审措荣洲卫希店良属险曾围域令站苏龙念罗吨器汇康减习演普田班待星飞写矿轻扩言章汽靠毛终仍景置底福止离泽波兰核降训逐票菜座献钢眼损宁像苦印融独湖早予夫编换欧努著顾征升态套介送某斗状画留航派室临兵补宝略黑综云差纳密贫剧犯阿击遇岁阶烈督吃丰馆招害官树听庭另沙私针胜贷网愿托缺园假酒音巨既判输讨测读洋括筑欢刚庆久陆找楼激晚绝压故互签汉草木亩短绍迎吸警藏疗贵纷授登探索湾宏录申诉秀序顺死卡歌午孩桥喜川邓扬津温船库订练候退违否彩棉帮拿罪币角召灾妇杨奋绩虽煤免笔够永圳停奥鲜朝吴岛觉移尼急博贯拥束左细舞幅语俄奇般简拍脑债固威券追筹刻映繁伟甚饭右彻烟沿街血冲洪植誉刊玉厅救潮迅伍怎付倍顿述播励斤乎纸振旧障鼓艰呼吉男绿尚夏亏季松哈祖典韩遍夜轮板抗摄杂皮贡借幕罚伤岸扶乱曲脱践危澳童散味叶累谢孙邮雄兼微呢谁惠偿署择染答块徐鱼赞课盛延瑞怀堂驻零辆齐胡途封似润守毕坦母雨败朱污趋械纺租灵拓残含握跨衣储瓦蒙鉴析竟骨档秘禁赵宾异伊智钟键辉跃冷倒庄毒仪哪涉泛宗鹏归岗雷礼尤休泰疾肥珠叫牛宜抵挂寻父攻佳塞架符裁虑肉启丽露鲁秋昌估射册若宽厚盾硬末轨饮勤茶诗郑冠涨篇泥唱纯坡熟浙晓抢丝锦载笑勇杰患乌坐雪戏背塔翻沈遗聚渠哥享迹森辽衡掌牧附操赶览野盟殊仁错萨夺梅误词董潜卷矛腐亮冒盖旗井凡震峰坏倾距壮惊盘梁摆径忠冰峡丹避珍乘刑扎透迫箱莫跑穿祝乏厦渐软询折浪朋敢诚弱疑邀沉端床络疆缩脚甘贴勒荒唐静缓侵句尊塑肃怕耕痛援劣伙挑洗暴冬龄乔餐肯廉跟阵伐悉忘闭奔恢宋泉杯渡吗奉婚赴恩盐掉洁亡洛聘蔬混摩抽鸡剂胆麦谋雅废贺羊阔唯捐返隆穷辛猪帐饰郭颁灯绕诸伴顶祥谓恶番敏旦劲缴麻屋跳码鞋扣迈忙趣盈棋勃敬辑摊旺纠炼梦偏渔牙侨黎赔裕宫谷概稿柱弹殖秩凭拨幸洞伪沟姓遭涌陶迁诺拔畅忧胞丁蓄贝舍腾杀煌圆伦横薄畜毫豪弟呈佛邦您墨徽惯循蓝烧触陕拖伯盲宪净卢炭籍秦粉妻爆欣释玩俊欠蛋猛迪苗暂貌遵锡楚桂昆杜皇醒燃凤截铺液撤胶慢杭虚辞曼毅咨俗糖忽芳姐耗妈谊浦频阻允宅窗默胀弃倡灭甲症埃滨赏莱拒淡坛陵绘虎竹赢锋篮迷纽轿贩递娘圈挖炉替幼乃郊颇戴滑徒崇涛焦凝墙吧炎刀玻寿履圣昨酸朗媒桑铜仲亦诞揭纵漫愈辟赠旱奶泳枪骗虫池镜浓拆艾扫娱钻碍寒迟邻曹盗穆豆赚晨浩彭耳瓜扭脸燕摇寄仿炮晋泪欲饱壁锁刷柬诊磨捕寨滚膨孔添帝辖炸旨吁址驶抱嘉拜扰袋佩阴辈锅赖剩押怪浮枚栏毁柳恐敦孟旁仓岩伸岭耐懂捷璃溪暖纤汗疫巧旋侧冶陪鸣瓶纲挤旬舆喝陷缘稻饲滩隔慰朴隐灌拟偷闲赫恰慧蒋闹邹牵柴刺滞彰俱勘填琛尝贾搬淮奏荷滋覆役秒踏巩摸荡辅惜柜肖颗搏氏姑弄姜君舒兑宇割哲摘钦逃漠忆敌宿啊凌耀闯阅贪赤汪悲抑瓷冯厉粗菲琴堡斌掘稀衰驾雕牢氛驱妥悄郎巡臣羽灰癌颖姆漏袭贤鸟暗茂孤惩榜袁桌卓傅剑堆兆狠轰拳妹绒裂潘兄洽叹涵贿侯岚熊绪阁尾碑尖腿涂栽坝犹铸肩闪诱辩芬睡奠伏妙乙绸廷夕恒梯赁霞攀枝译描湘磁吕硕爸肝峻葡衷搭唤薪挺逝狗蔡宴蓬撞铝牲舰胁崛桃斜丧烂屏砖墓详逾函跌抚插戈凉啤脉滥赋柏堤腰泊寺尘蒂削仙踪冻汤睛艳荐劫框廊惑页拼堪携丈乳挪谱舶埔遥菌塘氧晶洒株颜虹岳胸忍甜匹瞩懈爷丛莲叙鸿逢抬嘴弘炒喷吊窝衔吹霸仔垦胎慎脏歧疏悠慕漂杆萍舟吐玲凯戒盼偶盆慨弊箭茅衫罐串辐腹钩碰昂酬晰姿彼锻飘嫁竣缝蹈悬紫浅缆喊昔驰湿剪侦坑姚魏扑挣焕皆狂泡骤堵膜禽锐芝帽擅沪晤婆埋劝碗玛顷鸭娃豫匆魂哭庞亭屡逼尺撒鹿讼弥坊碎缔霍壤萄铃稍丘肿烦苹庙雇汛孝辰吞汰怨酿耶咱欺丢琼棚披渴屈弗疲帕
昭盒仰萧牺撑抛鼠纱翼兹骑糊契铭淘顽撰乒淑妆窑柔姻苍谨卿灿栋敲窃菊郁催眉邱揽鼎韦肤娜俏呀拚寸爬悟尿罢圭葬聪沃肠厕慈恋绵橡圾垃翁粤脂歹憾阐甸巷蜂轴艘垄衬阜惨冀幽厘崭筋寓迄渗碘碧赌袖奈崔悦捞剥孕逆婴脆缅艇谭笼儒粒诈遣垂磋卸帜枣幢淀帆蛇宰殿猎叔夹帅沧魅俩牟钓葛罕渤汕溢擦袱嫩桶殷酷呆卧暑骄幻囊掀醉牡饼扇蒸赣俭椅枢彦樊吾仗彬砂绳巾喀勋愁碱谦壳轧潭浆挽邢啥焊钞烤廖猫狱腔喻御蕴坎魔刮瘤茫竭莉链淫愤纹咸睐睹裤夸滴雾搜拘龚凶茨傲鞍鹤蚀颈翠卉汁冈狮隧弯胃沛募琳疼蚕泼磷捧炳绣朵涯掏奎聂孜韵浑翔魄掩斥敞腊愧粘丑溉斑啦柯谐烯禄浴涝鬼薛瘦挡昏鹅湛逻虾沂辱叉鼻厨鲍鞭辣潇乓肺尹颂邵澜桐鹰妨闽屠畏翰塌亟寂赂犬聊暨垫泄漆旭蕾坪涤挫佐瞄拦硫棒杏爽碳畔熙襄祸乾淹臂莎辜阎庸砍捉勾垒衍坤噪毯倪扮铅遏哀愉瑶咬嫌闸恳齿杠怒兽浇肇鄂溶哄棵盯梨灶屯狭陋啡浸淋濒脊戚勉膏氨墅沸挨蔓抄芒秉刹饶厢咖魁骚缚遂恨跻螺辨菇帷凰椒汝瞬淄舱馈桩炬誓卜麟岂兔眠泵拐肚匪芦匈霉蜜荆雁窄秧枯仆嘱壶谅哨肌贬叠稽岐沫肆醇菱彪躺摔膀甫逊凑渊喂藤砸悔杉霜厄忌桔筒丙臭拾芜禹丸蟹嘛俞翅尸澄骂睦馨郝贮陌钧轩赃笋歉逸歪巍萃崖窟踢锣萎庐剖籽甩饥苑恼渣痕莞硅晴巢瘫缠隶筛穴昼埠宠肢饿仑逮兢趟糕妮邪抹俑萌匠扔酱葱礁掺雀髓悼挚蔚枫庚伞侃僵捆蒜溜傻蔗谜斋蝶沾闷驳耿槽黔吓肾芽栗朽荫榆皖曰徊奴迭僻蓉靖氟滔羡愚尧俺徘罩磊镑舌曙纶粪匙钉佼扯踊躲猴纬咽酝挠宛瑰歇抒茧穗祭鑫趁痴裙猜耘碌锈晒潍弦稼狼拢梧芯眷哑宙厌逛谴邯呵蜡寥钥耸媳熏蚁惕颠娟亨吟蒲梭瞻渝喉遮慌夷韶焰尉珊胖蕉粹裹琦秽侠奸挝绑曝棍婉镶熬傍燥氯骆晃鸽疯琢聋瑟暇绥禅溃腺垮阀撼煮佣滕淤蹲栖硝睁荟荧抖坟芭臻锭晖倦倘喘邑锤惧荔毗觅矮恭钙氮缸瞧颤萝佑怡瘾寡烹摧棠缪雏韧喇兜坯坷贞仇缉帘竖糟猖懒凿洼喧谣驼烫锌椰崩沥汾磅霖棘扛彗矩瞒陇绎诫斐卵铮钾宵簿秤畴斧擂剔躁冤讳寅焚漳鳖哺耻僧琅粟怖咏蜀淳柑缕烁氢蔽琪泣阮镀殴虞虐炊搁诀掠坠屿髦酋躯吵遐寞仕稚僚楠矶筝彝叮熔槐潢芹郸匾咋玄裔陡哗怜襟刃脾嵌拱慷痪跋孚峪钊滇苟晕墩膝羞乍腻詹讶敷肴莹衢柿朔袜枕烘匀歼泻樱吻翟堰苯隙娇獗汲蛙斩靡沁乞姨翩沼嘎畸矫骏薯绚窜藻矗皂楷腕篷徇耽娼犁榻茄棕汹峨蹄昧奢涩灼踩粥拣旷簇溯攒沓呕梳搅砌纫渭澡撕漓葆辍肪祁鞠蛮捏诵娣岱瀑啸裸鸦瑛躬舜忱豹纂恤惟赐俯犀媚嫂嗓蚊茬驭缀皱凳钮蚂姬扒嫖跪凹揣尬沦尴豁玫殡淌叭唇啃裘卑琐矢拯忡勿盎茵椎脖拂骅葫迢薇龟绞眶沐傣浊舅叛浚窘栓酶笛泌榄惹铲碟捡恪酯滤匿酵砚贼匮熠鳞麓镁氓苇廓巫踵竿蘑翘梓贻鳗帼冉泓狐涟崎窍瑜讽逗铎掷璀泗浏陲醋苛攘璧瀚哩暮矣蚌悖扼漯烛蝴屑墟俘侣庇陀煎秸弓捣譬炜炯拌扁彤锚禾侮秆绮嚣樟咐枉窦桦寇哉狸耍馒驹隋冕疮咄妄峙娄溥腑钠栩糙滦呐鲻娶祺刨褒橙茹谎抉慑媛橄戎迩雯璨雍惶扳桢霓账梗炕裴韬杖痹缤沽燎煞删辙爵缭劈烨槌媲凛莆颅锯膳澎坞瓣婷絮酌涡唁秃禺膊棣芸忻炽榨篆憨戍圩爹蹊饪胺贱睫蝇惫拇赈泾盏弧剿硒毓皓菏灸湄炙祠荻捍嚼朦屹紊藜驴寝兮隘祈榕臧蝉绢瞎闵鳌娥藉娅烽楂摒凄凸熄孵叩渎胳匡袍卒怠桓莽倩泸藕陨辗骋峭冥饺亢圃颐擒铵鳄簧愣璜钰拙瘠靳隽罹岑镭榴恕毋囤汀绽窖筷擎猿诲碾夭筐邃藩诬芙胚哇垣胧帖殉毙壑绰憋亥涅屁璞缮侍倚稠棺棱葵诣笨橱寰郡垢徕眺胰谆窥霄栉舸蹦坂瞪珲釉跤挟侄肘嘲刁缎嚷痒敛祛绅孰痫闺椿噶恍伶峦酥萦苎癫涪锲蜚拎嵩昊娴涣烙璋笃囚祯篱讴舷纭锄巅卦摹眸柄踞焉辄褚褐湃夙堕岔惦疚谍奕羚帧澈濮捎漾吼锰趴菩簸仃渲札谙咕桨咀郴咳呜蛟拧莘驯庵弼逞蹬姥撂镍晏疡爪骥楞钳懋寐淇琉杞菠铨翌靶侗瑙馅丐痊娓侈苓聆睿偌釜噬曦燮哟瑾瞿璇拮憬鹊勺憧嗜啼檐柚呱渍镌妃溺鸥粕沱榭隅毡禧瞅鲸淆阪茁渺瞥茜瘟礴伺谛锹蔼虔莺迸磕赡泱栈甄镐抠嬉诿甬绊饵谬梢颍揪琶褥佟腥辊溅琵鄯拴喃笙酰粱卤芮膛斓潼鸵侥讷婿吆羁嗣蜒栅疙拷戳镛芷钛蜿铀夯摞雌酣荼蝎锥姊瓢祀玺弛犷哦茸鱿绷茎惋亘珑莓掂迥鲤殃瘩叨螃奄腈疟沭钨昕膺涿糠氰揉狩檀悍缫哮衙瑚潞谤搀洱涓袤痰乖冗芋甭骸幌涮俨敖槛狄牒恺雹赎庶熨蛛佰蓦鄱煽腌黯疤倔剌斡诽锵筱妍掖铿脐捅弈邸湟眯赦拄啪玮轶蛾麋炫赊靴箔菁撬裳戌缨蝗撇奚瀛噩怯蓓匕咚瞰佬泞扉皋晾麒姗跚瘀鄙猕拭鲟祷脯砺驿陛瘁搓舵汞哼胫珀邬磺馏馍铢诧涧吏苔潺邳烷囿斟滁殆酚狡孺恬沅铬湍啧囱蒿鹃柠漱胥妖洙珂茉蹒圻鬓搂葩佘渥诙袒捂瞠妓铐澧袂馁汐匣逍谚窒蔑糯汶壹岖盔嘘迂嘀锢讥吭抨屎獭褪咫稷迦檬塬蠢蓟咎皿驮俐坍惭垛鹭鸾蹴撩诠恙臃遨睬踌浒搪郧竺翡宦冽憩萱拽卞槟躇蘸肋呛濡酮眨撮矸垸蛀黛涸脓徙撷曳峥渚镖钴骊袅磐掣沌埂嘿琏楣豚诡悸麝煦矾羲唉溧呻覃兖吱惰羹钝枸姣颓铣梆骇淅孢叱谧泯谟恃薹筵鏖栾鹜哽掬辘茗瓯绛筠铤袄殚梵挎遴榈蜕癣垠厮幄偕焱攥裨炖旮旯蔺骡娩伫猝窿虏屉缜咒筏骼璐剃涕猗淼侬阙嗅鸳嘈霏珩沮捺硼荃驷漩嘻眩掰伽脍婪煜鹄壕崂翎痞兀婺鸯楹咤徜嫉篓烃铂咪掐匝杼蕃箍荤砾嘶皑宕荪哎汴貂邡淦蕙弩堑惬偃徉箴赘啻凋穹酗憎芥唾闫晔苞昶甙笺吝蕊鳝衅猩薰昱趾淞坳怅翱汩琥岌阑粼羌霆篡塾酉裱韭唠廿闰攸黝蛤厥荞瑕柘祚疵愕蕨牦飨疹嗷癖芪漕隍徨逵泠
嵘嗡岫岷擞陂颊咔卯婶椭惘歙幺臆叽缰睽勐暄弋痔秭煲琮嘟犊玖怦丕溴罂瓮丞惮癜晦攫镰镯柞舫铆蹼妩熹铱褂丫笆妒噢噙琬冼荀蟾捶嗒町嫣肮皎旖恣钚砥吩茯馥钎甥嗦蜗浔谒辫亳彷珏咯淖妊佤玷嘹崴於辕贲扈伎旎孽耙娠戊冢跷砷焘羔圪耄钼悻荥唑稞邝莅杷醛嗽唆拗碴馋胱琨茏糜懦骞蜘嚓怵抡唢腆涎灏臼墒暹椽牍钒猾榔懵枇樵锶籼箫漪帚钵赓捻郅儋烬锂剽锑鄢鄞臾喳胄耋阱笠瓴啬杳萤莠嶂浜傩遒轼睢倜矽仉唬旌酪腼罄嬗畲祟桅悴讹憔龋嵊绶邕忖箩咆晌愫猷帛麾莒觑吮蟋庥懊阂蒯阡腮潸晟蟀臀罔骁崽绉粽忿肛蠡遛蜓煊蚜坻滹銮悯鼐撵噼忐湮侏粳矍铄坨铉盂锗阖溟俟忑赝鬃敝宸哆靓揩瘸鲅篝氦嚎浃缙飚锷癸柩蛎濂榷鲨钡盹鲫诘诩迤桎遁尕梏楫赳飒锃雉怆痼劾痢喽霹昙畹胭佚狈瘪姹吠铧谏雳咙畦荠娑褶忏惚痉橘漉诏呗晁惆砀馄戟峁昵拈蠕虱洵鹦蛹铛挛倏澍濉钅噜咛俳磬蜷霎肽砼聿怔砭谌箕蹶孪蔷糅挞饨惴禀淙哒枷楝闾蜻嗖淬垩矜郗蚤嫦喋镉饯髋潦镂簌偎鹉岙踱诃籁宓膘飙涞耆荏渑豌琰俎绌埭幡赅锆崮碣珞腋滢蓖伉馗聩幔锨蓥鹑砝酩枰鞘苋粑蹭倌犟俪嶙砻嵋滂葺苒枭翊婀飓阚喟傈藐蜃怂稣亵诒蜇岜霁瞌沏卅舀鹌俸嵇蟒汨砰鞣唏陉佯恿竽瘴祉焙诋濠螂叻垅谩朐稔芍瞳惺萸盅啄眈偻爿蟠炔垭噎蛰擘锏茭悌喔谑峋妪恽韫褓镳饽杈戛鸠萋襁榫霭苄跺杲嗨珉哌娆孀恸缄夔佗饷苷郜鼾颌訇谲溘咧褛逄颦洮逶嫡蠹碓烩醴栎鎏瓤伢蔫怿甾摈畈镣螨秣搔盱痍搐蹉佃绂疽骝霾悚缃懿咂奘轱邗蚝瘘醚湎瞑掮羟仨砣郢砧鳟跛踝轲窠郦踉躏戮篾骐鳍蹂郯跎倭诅鄄褴阆缈嗯妞沤跄箐苕窕楔饴峄腴圄谕揍踹罡佝颔觊篑鲢綦妾镗啕蚬窈揖眙蟑诛钗绯讣睾媾嗬祜镢囹苜坭蛐髯搡叟蹋觎捱碉呋罘荚鹫岿寮扪焖狞鳅嗄嗤擀痂嗟颉蚧儆锴龛嗑锟俚枥懑讫橇嗪虬跆骧陟灞恻涔酐鸪牯钜萘鲶缥曜蚓诤埕墀麸蝠蛊遑厩趄沔耦疱匍揿蚯讪唰舔呷蓿鹧膑刍耷鞑裆趸孑鲲绫埝嘭舢鸢螯吡蝙疸匐桁铠羸鲈囵唛仫庖劭郓骜粲峒腓鹳鳜蚶囫茴峤蟆蘖癯纾僳皙隰缬馐谪捭汊碜塍艮睑狍苫篦蜍锉沣诰晗喙麂謇蹇觐啾踽邈壬燧娲猥歆镒茔昝赭狰孳哧舛噔鹗蚣逅洹腱锒纰蛆蕤姝邰纣嘣钹衩婵孱蹿鲷萼椁浣镓遽赉趔蕲剜邂仡氤獐幛俾铋嗔茌氡诂豢桧畿倥捋仞忒疃浯蜈榛偬稗菖鲳厝踮叼痱貉玑婕琚疴掳钤垧氵黠跹怏揄氲铡濯芾笈崆钕菽隼傥仝囗芗埙簪暧桉镝蚪蜉藁笳菅龃喹橹抿啮蹑逖唔樨巽揶黟訾钣嵯凼恫掇剁珙沆噱揆耒铌泅疝葳隗滟龉钺殒蒡觇黜澹酊垡奂珈濑馕馊嚏痿岘氩茱滓焯抻豉敕掸碲靛摁淝鳏盥皈鲑颢犄翦铰椐胯屺邛庹猬蓊骛浠桠胤鸩痣蛭噌杵啜靼啶煅枋觥毂刽蝈蘅芨戬醮疖忾骷洌呤荦觞谡瀣蝣糌倬碚蹙痘砘绀虢蕻肓蛔唧桀蝌侩棂樯挈轫巳崧蓑藓鳕瑗帙馔豺痤郇殓髅轳逯嗫戕嚅蛳琊嘤疣蚱钯钿碇咣毽迳喱逦廪邙囡匏扦亻咝凇纨涠庠溆醺炀烊肄龈谀锱瘢枞皴贰晷闳斛屐讦婧苣蔻绺渌瑁螟叵颀穑膻羧螳绦誊蜥楦恂靥咿翳瓒枳啭樽嫒婊搽铒跗凫菡篁髻裾栲癞蓼氖孬喏砒姘衽缛嵬挹缢慵呦箸蹩槎榇舂嗲胴谔岢圹娌潋蛉酃鲵鲇娉亓碛芊忪谇笤韪勰呓俣圜愠仄炷毖筚伧棰磴滏篙肱笕堇馑荩榘哐傀崃罱痨儡鹂檩垴仵檄芎阉刈壅馀庾妯躅獒阊笞饬钏硐椴泔硌鹘鳇豇狙戡莨啉辇臬殇舐黍薮眭佻嗵煨莴蚴妤瘐擢蛏蹰龊辏绐氘骶莪珐缟聒讧岬胛桷谰戾撸鸬雒嘧囔铍骈掊茕噻铯柁艉龌硖罅魇酽咦嶷羿轸趵荸薜踟玳啖蔸槁鲛疥砬唳弭曩黏镊泮霈淠柒颧瘙痧辋郄燹泫郾鹞钇殪痈甑踯翥婢檗柽啐菪嶝腭嗝剐笏蟥戢阄噘撅尻贶辚蜢颞忸胼阕竦焐揠邺鳙啁稹徵诌隹舨哔卟伥苌鹚箪缍锇蝮诟洄浍诨犍硷噤垲郐椤嫫伲脲殍噗溱箬厍钽钍恹鬻爰砦蓁胝颛褙鳊邴铖镫腚钭颚鲂悱狒佶偈堀绔醪坜疠椋犸暝佞哝瞟荨芩逡溽裟挲抟暾崦芫荑薏莸欤栀斫镞嗳鸨跸骠俦谠簟棼驸掼倨橛犒邋耧蝼虻铙郫汔诮楸阒绻叁臊钐腧闩菘阗忝橐翕阋踅窨鹬鼋樾錾吒旃弁侪坼蚩嘬糍骢氐呃榧玢绋蚨钆岣菰罟嘏埚绗嚯藿笄袈羯肼暌啷蒗蜊獠鬣熳黾乜镆怩驽旆髂仟芡谯恁鳃艄莳艏趿遢鲐醍僮氽刎芴喑墉昀箦鄣摺钲贽缵鏊锛瓿廛瘳亍遄褡垌椟酆砩桴赙坩臌曷跽湫榉黧猁钌镏缦殁赧埤悭缱衾鲭铩猞眚铈谥耜飕饕餮骰乇绾鹇鲞爻蜴镱铟莜祗濞镔逋谄谶酲茺樗憷莼撺柢阏砜垓旰妫衮嗥郏鞯徼孓钪侉夼跬铼嫘蟊茆睨怄蹁谝嘌綮嫱筇犰穰铷筲哂炻豕秫笥涑铊帏闱鋈舾屣狎哓噫璎铕宥阈豸辎趑龇捌秕荜愎窆镲谗踔苁酢呔聃镦屙鲱鬲膈铪醐獾鲩虺葭牮礓苴讵颏裉诳栌氇镙哞柰袢帔睥苤嫔笸氆佥箧跫蚋鲥扌狲桫溏铽殄脘洧肟绡咻洫癔洇嵛磔胗肫赀眦吖瑷埯畚妣飑豳髌砗铳楮蔟毳锝堞疔葑缶菔疳彀胍磙顸薅翮猢怙蒺廑妗髁醌粝魉旒蝥缗衲呸醅芘蚍圮榀萁苘逑诎劬蕖朊剡蟮椹饣酞帑葶菟魍庑葸氙谖鞅狺夤嬴瘿饔雩鹆橼赜潴骓缁诹怍杓艹檫媸氚呲殂矬笪迨纛簦玎苊轭匚鼢呒缑诖炅鲧唿戽鬟恚袷瘕枧洚桕雎蠲剀诓瘌镧铑鳓蓠呖跞裢裣埒捩鲮熘嵝瘰镘脒腩筢耪辔牝嘁蛴戗蛩巯悫葚熵绱蛸螫毹妁纟嗾鳎绨粜菀沩鼯牾螅顼泶蕈鼹繇苡悒廴吲喁卣牖笮舴罾棹鸷碡锕嗌媪龅甏箅傧啵鹁晡氅魑篪怛籴礅蒽珥钫绠觚鸹涫颃篌锪蠖乩咭赍嵴铗湔槿赆僦皲佧箜蒉缧酹嘞疬臁膦泷蒌泺荬颟旄泖镅蠓冖幂耱襻鼙攵炝愀蘧氍犭禳桡糁馓酾槊狻锬羰鼗鹈畋髫萜堍璺怃崤囟睚痖菸餍徭瘗唷圉蜮砟谵澶朕摭轵诼笫廒聱庳髀笾龀裎雠蝽腠妲刂铥黩怼沲蘩趺苻拊阝鲋戆纥哏鲠
笱瞽庋簋刿掴猓蚵槲觳萑癀蟪钬虮掎鲣囝裥踺茳糨鹪狷麇芤刳愦髡悃缡鲡鳢奁墚尥柃胪镥脔杩劢墁玟蝻呶搦湓罴蜱俜鲆皤镨槭镪黢洳枘芟埏渖筮殳飧溻饧樘醣酡圬粞觋莶霰榍薤髹曛疋迓衤欹佾埸霪茚鼬伛瘵骣畛卮轾彘觯锺邾槠谘嵫髭蕞犴鞴畀滗煸褊冫孛羼耖褫彳艟辶茈璁爨榱萏坫鸫篼簖裰哚蹯瀵怫陔筻廾蛄绲崞蜾盍荭黉糇骺後鲎煳鹕冱瓠逭漶耠镬齑殛鲚跏蛱搛缣鹣僬噍衿缂喾狯纩栝蛞稂塄嫠詈蠊鹩躐鹨簏膂脶嬷昴瞀浼艨祢縻蘼芈糸宀眄鹋杪咩愍麽瘼鍪硇猱茑脬蟛貔仳犏钋芑葜愆锓蠼筌鬈蚺荛埽潲诜埘弑嗍蒴鸶缌澌姒蔌睃缇梃彖鼍芄隈鲔硪忤痦欷僖醯鼷跣枵忄擤勖痃碹谳轺铘圯纡窬窳饫蓣瀹趱驵缯揸笊絷跖舯螽籀舳粢驺陬阼揞菝魃癍鹎坌狴萆嬖襞碥髟鳔醭螬馇虿瘥惝怊鸱螭瘛帱徂汆脞瘅忉羝睇瓞鲽岽胨芏佴燔偾稃郛莩幞澉槔袼搿茛鞲觏酤牿鲴宄匦呙馘焓瘊虍岵鹱咴隳缋溷夥剞洎恝蒹谫僭艽挢敫卺冂扃锔窭锩觖劂氪骒哙悝蝰诔苈篥娈瞵锍栊癃舻辂稆猡蛑甍艋敉眇蠛侔镎肭艿蛲疒陧衄锘堋庀擗甓螵钷攴桤褰凵肷锖鞒吣黥俅蝤璩悛辁肜颡谂礻摅汜溲嗉荽闼骀炱螗耥裼铫莛亠箨蕹迕杌寤穸饩舄禊猃绁渫廨獬硎荇鸺貅糈揎镟獯讠厣罨蛘鳐崾舣媵尢蚰侑狳螈龠昃痄搌浈埴夂黹酎橥丶缒窀菹锿砹茇勹邶鐾舭忭缏灬瘭踣钸礤骖黪艚锸猹镡躔蒇冁鬯屮枨眵傺搋巛舡楱镩鹾戥觌阽铞垤揲蹀耵髑憝鸸鲕篚镄鲼唪祓艴黻黼鳆尜戤塥哿虼遘桄丨胲醢撖嚆薨堠烀轷锾缳擐哕阍劐攉丌墼蕺彐芰哜戋趼楗耩喈卩鲒骱刭弪獍鬏鞫犋屦醵桊爝捃胩锎蒈莰闶钶锞眍筘阃漤铹栳耢仂泐檑轹蔹懔垆锊倮蠃鞔硭漭猸鹛钔瞢礞喵苠鳘貊貘毪坶攮猊嬲肀聍甯狃耨孥胬恧蒎锫陴氕丿裒镤蜞岍搴箝慊椠蜣硗劁缲檎螓圊檠謦銎赇鼽糗麴鸲磲畎狨蝾薷襦颥蓐脎毵磉鳋唼歃彡骟滠矧胂蓍鲺贳搠厶兕锼螋瞍觫赕铴瑭慝掭祧龆蜩鲦茼酴煺柝腽軎阌阢诶菥蓰柙祆筅葙蟓魈躞砉醑儇岈砑珧酏劓堙撄潆舁蝓燠眢箢掾刖狁拶唣迮帻谮哳齄膪嫜忮骘膣踬荮瘃麈疰丬浞禚觜耔腙鄹鲰躜撙胙"; - -let sc = [...cn_symbol, ...hanzipinlv].filter(Boolean); -const tc = sc.map(i => toTraditional(i)).map((i) => i.codePointAt(0)) -sc = sc.map((i) => i.codePointAt(0)) -const common = sc.filter(i => tc.includes(i)); -const sc_set = sc.filter(i => !common.includes(i)); -const tc_set = tc.filter(i => !common.includes(i)); - -console.log(common.length, sc_set.length, tc_set.length, '使用 uint16存储'); -const data = new Uint16Array([...common, 0, ...sc_set, 0, ...tc_set]); - -import fs from 'fs-extra'; -await fs.outputFile('./data/cn_char_rank.dat', data); diff --git a/crates/lang_unicodes/scripts/hangul.mjs b/crates/lang_unicodes/scripts/hangul.mjs deleted file mode 100644 index 6f636258..00000000 --- a/crates/lang_unicodes/scripts/hangul.mjs +++ /dev/null @@ -1,8 +0,0 @@ -// 处理韩文字符频率分布 -// 来源 http://nlp.kookmin.ac.kr/data/syl-2.txt -import fs from 'fs-extra' -const data = fs.readFileSync('./scripts/syl-2.txt', 'utf-8') -const content = data.split('=====================================')[1] -const chars = content.split('\n').map(i => i.split(' ')[0]).filter(Boolean) -console.log(chars.length); -fs.writeFileSync('./data/hangul-syl.dat', new Uint16Array(chars.map(i => i.codePointAt(0)))) \ No newline at end of file diff 
--git a/crates/lang_unicodes/src/cjk_unicodes.rs b/crates/lang_unicodes/src/cjk_unicodes.rs index 62d06ce0..636d2cf8 100644 --- a/crates/lang_unicodes/src/cjk_unicodes.rs +++ b/crates/lang_unicodes/src/cjk_unicodes.rs @@ -72,9 +72,9 @@ mod tests { #[test] fn test() { assert_eq!(HIRAGANA_AND_KATAKANA.len(), 192); - assert_eq!(ZH_COMMON.len(), 4329); - assert_eq!(ZH_SC.len(), 2508); - assert_eq!(ZH_TC.len(), 2481); + assert_eq!(ZH_COMMON.len(), 4524); + assert_eq!(ZH_SC.len(), 2313); + assert_eq!(ZH_TC.len(), 2308); assert_eq!(HANGUL_SYL.len(), 2026); } } diff --git a/crates/proto/Cargo.toml b/crates/proto/Cargo.toml index 817f7c53..99deb3d9 100644 --- a/crates/proto/Cargo.toml +++ b/crates/proto/Cargo.toml @@ -15,10 +15,12 @@ prost = "0.13.3" prost-types = { version = "0.13.3", optional = true } [features] +default = ["with_extra"] server = ["tonic"] +with_extra = ["tonic-build"] [build-dependencies] -tonic-build = "0.12.3" +tonic-build = { version = "0.12.3", optional = true } [lib] -path = "src/lib/mod.rs" +path = "src/lib/mod.rs" \ No newline at end of file diff --git a/crates/proto/build.rs b/crates/proto/build.rs index 0ca8d190..5095ec94 100644 --- a/crates/proto/build.rs +++ b/crates/proto/build.rs @@ -1,7 +1,5 @@ +#[cfg(feature = "with_extra")] fn main() { - if std::env::var("CARGO_WITH_NO_EXTRA").is_ok() { - return; - } let out_dir = std::path::PathBuf::from("./src/lib"); let _ = std::fs::create_dir_all("./src/lib"); @@ -39,3 +37,6 @@ fn main() { let mod_code = mod_code+"\npub const INDEX_PROTO: &[u8] = include_bytes!(\"../index.proto\");"; let _ = std::fs::write("src/lib/mod.rs", mod_code); } + +#[cfg(not(feature = "with_extra"))] +fn main() {} diff --git a/packages/ffi/Cargo.toml b/packages/ffi/Cargo.toml index 2d4cd6c2..ea63bf08 100644 --- a/packages/ffi/Cargo.toml +++ b/packages/ffi/Cargo.toml @@ -4,10 +4,14 @@ version = "0.1.0" edition = "2021" [dependencies] -cn-font-proto = { version = "0.1.0", path = "../../crates/proto" } -cn-font-split = { version 
= "7.0.0", path = "../.." } +cn-font-proto = { version = "0.1.0", path = "../../crates/proto", default-features = false } +cn-font-split = { version = "7.0.0", path = "../..", default-features = false} libc = "0.2.165" prost = "0.13.3" +[features] +default = ["with_extra"] +with_extra = ["cn-font-proto/with_extra", "cn-font-split/with_extra"] + [lib] crate-type = ["cdylib"] diff --git a/packages/grpc/Cargo.toml b/packages/grpc/Cargo.toml index fc4cf16d..b1719705 100644 --- a/packages/grpc/Cargo.toml +++ b/packages/grpc/Cargo.toml @@ -4,10 +4,10 @@ version = "0.1.0" edition = "2021" [dependencies] -cn-font-proto = { version = "0.1.0", path = "../../crates/proto", features = [ +cn-font-proto = { version = "0.1.0", path = "../../crates/proto", default-features = false, features = [ "server", ] } -cn-font-split = { version = "7.0.0", path = "../.." } +cn-font-split = { version = "7.0.0", path = "../..", default-features = false} cn-font-utils = { version = "0.1.0", path = "../../crates/cn_font_utils" } prost = "0.13.3" tokio = { version = "1.19.2", features = ["macros", "rt-multi-thread"] } @@ -18,3 +18,7 @@ tracing-subscriber = "0.3.19" [build-dependencies] tonic-build = "0.12.3" + +[features] +default = ["with_extra"] +with_extra = ["cn-font-split/with_extra", "cn-font-proto/with_extra"] \ No newline at end of file diff --git a/packages/server/Cargo.toml b/packages/server/Cargo.toml index 8e062ace..546168bb 100644 --- a/packages/server/Cargo.toml +++ b/packages/server/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -cn-font-split = { version = "7.0.0", path = "../.." 
} +cn-font-split = { version = "7.0.0", path = "../..", default-features = false} axum = {version = "0.7.9", features = []} serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.68" @@ -14,6 +14,10 @@ tokio-stream = "0.1.16" async-stream = "0.3.6" futures-util = "0.3.31" rust-s3 = "0.35.1" -cn-font-proto = { version = "0.1.0", path = "../../crates/proto" } +cn-font-proto = { version = "0.1.0", path = "../../crates/proto", default-features = false } zip = "2.2.1" md5 = "0.7.0" + +[features] +default = ["with_extra"] +with_extra = ["cn-font-proto/with_extra", "cn-font-split/with_extra"] diff --git a/packages/wasm-edge/Cargo.toml b/packages/wasm-edge/Cargo.toml index b70eb06b..294c266a 100644 --- a/packages/wasm-edge/Cargo.toml +++ b/packages/wasm-edge/Cargo.toml @@ -7,8 +7,12 @@ edition = "2021" crate-type = ["cdylib"] [dependencies] -cn-font-proto = { version = "0.1.0", path = "../../crates/proto" } -cn-font-split = { version = "7.0.0", path = "../.." } +cn-font-proto = { version = "0.1.0", path = "../../crates/proto", default-features = false } +cn-font-split = { version = "7.0.0", path = "../..", default-features = false } cn-font-utils = { version = "0.1.0", path = "../../crates/cn_font_utils" } env_logger = "0.11.5" prost = "0.13.4" + +[features] +default = ["with_extra"] +with_extra = ["cn-font-split/with_extra", "cn-font-proto/with_extra"] \ No newline at end of file diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ee84a8d0..7087f2df 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -55,15 +55,6 @@ importers: specifier: ^2.1.4 version: 2.1.4(@types/node@22.8.2)(terser@5.31.0) - crates/lang_unicodes: - devDependencies: - chinese-simple2traditional: - specifier: ^2.2.2 - version: 2.2.2 - fs-extra: - specifier: ^11.2.0 - version: 11.2.0 - packages/edge-provider: dependencies: '@aws-sdk/client-s3':