From 490e2309dea117a3fd7173622077f9fe6cf96534 Mon Sep 17 00:00:00 2001 From: kakkokari-gtyih <67428053+kakkokari-gtyih@users.noreply.github.com> Date: Tue, 19 Nov 2024 18:40:09 +0900 Subject: [PATCH 1/2] =?UTF-8?q?fix(encoding):=20ISO-2022-JP=E3=81=AE?= =?UTF-8?q?=E3=83=9A=E3=83=BC=E3=82=B8=E3=81=A7=E6=96=87=E5=AD=97=E5=8C=96?= =?UTF-8?q?=E3=81=91=E3=81=8C=E7=99=BA=E7=94=9F=E3=81=99=E3=82=8B=E5=95=8F?= =?UTF-8?q?=E9=A1=8C=E3=82=92=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 2 ++ pnpm-lock.yaml | 17 +++++++++++++++ src/utils/encoding.ts | 50 ++++++++++++++++++++++++++++++++++++++++++- src/utils/got.ts | 2 +- 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index e58eeb45..3433e51d 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "@swc/jest": "^0.2.37", "@types/cheerio": "0.22.35", "@types/debug": "4.1.12", + "@types/encoding-japanese": "^2.2.1", "@types/escape-regexp": "^0.0.3", "@types/node": "22.9.0", "@typescript-eslint/eslint-plugin": "^7.17.0", @@ -42,6 +43,7 @@ }, "dependencies": { "cheerio": "1.0.0", + "encoding-japanese": "^2.2.0", "escape-regexp": "0.0.1", "got": "^14.4.4", "html-entities": "2.5.2", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ac7c54c3..a2ae7a86 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -11,6 +11,9 @@ importers: cheerio: specifier: 1.0.0 version: 1.0.0 + encoding-japanese: + specifier: ^2.2.0 + version: 2.2.0 escape-regexp: specifier: 0.0.1 version: 0.0.1 @@ -48,6 +51,9 @@ importers: '@types/debug': specifier: 4.1.12 version: 4.1.12 + '@types/encoding-japanese': + specifier: ^2.2.1 + version: 2.2.1 '@types/escape-regexp': specifier: ^0.0.3 version: 0.0.3 @@ -573,6 +579,9 @@ packages: '@types/debug@4.1.12': resolution: {integrity: sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==} + '@types/encoding-japanese@2.2.1': + resolution: {integrity: sha512-6jjepuTusvySxMLP7W6usamlbgf0F4sIDvm7EzYePjLHY7zWUv4yz2PLUnu0vuNVtXOTLu2cRdFcDg40J5Owsw==} + '@types/escape-regexp@0.0.3': resolution: {integrity: sha512-FQMYUxaf1dVeWLUzJFSvfdDugfOpDyM13p67QfyMdagxSkBa689opkr/q9uR/VWyrWrl0jAyQaSPKxX9MpAXFw==} @@ -1051,6 +1060,10 @@ packages: emoji-regex@8.0.0: resolution: {integrity: sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==} + encoding-japanese@2.2.0: + resolution: {integrity: sha512-EuJWwlHPZ1LbADuKTClvHtwbaFn4rOD+dRAbWysqEOXRc2Uui0hJInNJrsdH0c+OhJA4nrCBdSkW4DD5YxAo6A==} + engines: {node: '>=8.10.0'} + encoding-sniffer@0.2.0: resolution: {integrity: sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg==} @@ -3100,6 +3113,8 @@ snapshots: dependencies: '@types/ms': 0.7.34 + '@types/encoding-japanese@2.2.1': {} + '@types/escape-regexp@0.0.3': {} '@types/estree@1.0.6': {} @@ -3656,6 +3671,8 @@ snapshots: emoji-regex@8.0.0: {} + encoding-japanese@2.2.0: {} + encoding-sniffer@0.2.0: dependencies: iconv-lite: 0.6.3 diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts index 7a168802..4c401997 100644 --- a/src/utils/encoding.ts +++ b/src/utils/encoding.ts @@ -1,14 +1,47 @@ import iconv from 'iconv-lite'; +import Encoding from 'encoding-japanese'; import jschardet from 'jschardet'; +import type { Response } from 'got'; + const regCharset = new RegExp(/charset\s*=\s*["']?([\w-]+)/, 'i'); +export const ENCODING_JAPANESE_ENCODING_PREFIX = '__EJ__'; + +const ENCODING_JAPANESE_SUPPORTED_ENCODING: string[] = [ + 'UTF32', + 'UTF16', + 'UTF16BE', + 'UTF16LE', + 'BINARY', + 'ASCII', + 'JIS', + 'UTF8', + 'EUCJP', + 'SJIS', + 'UNICODE', + 'AUTO', +] satisfies Encoding.Encoding[]; + /** * Detect HTML encoding * @param body Body in Buffer * @returns encoding */ -export function detectEncoding(body: Buffer): string { +export function detectEncoding(res: Response): string { + // From header + const contentType = res.headers['content-type']; + if (contentType) { + const matchHeader = contentType.match(regCharset); + if (matchHeader) { + const candicate = matchHeader[1]; + const encoding = toEncoding(candicate); + if (encoding != null) return encoding; + } + } + + const body = res.rawBody; + // By detection const detected = jschardet.detect(body, { minimumThreshold: 0.99 }); // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition @@ -30,10 +63,25 @@ export function detectEncoding(body: Buffer): string { } export function toUtf8(body: Buffer, encoding: string): string { + if (encoding.startsWith(ENCODING_JAPANESE_ENCODING_PREFIX)) { + const _encoding = encoding.slice(ENCODING_JAPANESE_ENCODING_PREFIX.length); + + function assertEncoding(enc: string): enc is Encoding.Encoding { + return ENCODING_JAPANESE_SUPPORTED_ENCODING.includes(enc); + } + + if (assertEncoding(_encoding)) { + return Encoding.codeToString(Encoding.convert(body, 'UNICODE', _encoding)); + } + } return iconv.decode(body, encoding); } function toEncoding(candicate: string): string | null { + // iconvで処理できない + // https://github.com/ashtuchkin/iconv-lite/issues/60 + if (candicate.toLowerCase() === 'iso-2022-jp') return '__EJ__JIS'; + if (iconv.encodingExists(candicate)) { if (['shift_jis', 'shift-jis', 'windows-31j', 'x-sjis'].includes(candicate.toLowerCase())) return 'cp932'; return candicate; diff --git a/src/utils/got.ts b/src/utils/got.ts index 6a2b6d9d..bb976eae 100644 --- a/src/utils/got.ts +++ b/src/utils/got.ts @@ -77,7 +77,7 @@ export async function scpaping( method: 'GET', }); - const encoding = detectEncoding(response.rawBody); + const encoding = detectEncoding(response); const body = toUtf8(response.rawBody, encoding); const $ = cheerio.load(body); From d3f49f9590e9023bacd73520a9aba580d1742b80 Mon Sep 17 00:00:00 2001 From: kakkokari-gtyih <67428053+kakkokari-gtyih@users.noreply.github.com> Date: Tue, 19 Nov 2024 18:40:21 +0900 Subject: [PATCH 2/2] Update Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee9c5934..79e44467 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ (unreleased) ------------------ * センシティブフラグの判定を `` および `rating` ヘッダでも行うように +* 文字コードがISO-2022-JPのウェブサイトの処理に失敗して文字化けする問題を修正 * 依存関係の更新 * eslintの設定を更新