From 27f29c5aa3fe14bb60b0a148b4bc36081af34772 Mon Sep 17 00:00:00 2001 From: SoraSuegami Date: Tue, 17 Dec 2024 00:01:14 +0900 Subject: [PATCH] Revert "Revert "fix: error message and UTF-8 handling in selector search"" This reverts commit 8c40a80ae00342c12ae9cdca1ce4cc6e78cca8a0. --- packages/helpers/src/input-generators.ts | 189 +++++++++++++++--- packages/helpers/src/sha-utils.ts | 59 +++++- .../helpers/tests/input-generators.test.ts | 12 ++ 3 files changed, 232 insertions(+), 28 deletions(-) diff --git a/packages/helpers/src/input-generators.ts b/packages/helpers/src/input-generators.ts index 776f7cc3..cb16a723 100644 --- a/packages/helpers/src/input-generators.ts +++ b/packages/helpers/src/input-generators.ts @@ -55,14 +55,53 @@ function findSelectorInCleanContent( selector: string, positionMap: Map, ): { selector: string; originalIndex: number } { + // First build a clean string without soft line breaks const cleanString = new TextDecoder().decode(cleanContent); - const selectorIndex = cleanString.indexOf(selector); + let decodedString = ''; + let cleanIndex = 0; + const indexMap = new Map(); // decodedPos -> cleanPos + + while (cleanIndex < cleanString.length) { + // Handle multi-byte UTF-8 sequences in QP format (e.g., =E2=80=94 for em dash) + const qpMatch = cleanString.slice(cleanIndex).match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/); + if (qpMatch) { + const byte1 = parseInt(qpMatch[1], 16); + const byte2 = parseInt(qpMatch[2], 16); + const byte3 = parseInt(qpMatch[3], 16); + const bytes = new Uint8Array([byte1, byte2, byte3]); + decodedString += new TextDecoder().decode(bytes); + indexMap.set(decodedString.length - 1, cleanIndex); + cleanIndex += 9; // Skip over the entire QP sequence + continue; + } + + // Handle single-byte QP sequences + if (cleanString[cleanIndex] === '=' && + /[0-9A-F]{2}/.test(cleanString.slice(cleanIndex + 1, cleanIndex + 3))) { + const byte = parseInt(cleanString.slice(cleanIndex + 1, cleanIndex + 3), 16); + decodedString += String.fromCharCode(byte); + indexMap.set(decodedString.length - 1, cleanIndex); + cleanIndex += 3; + } else { + decodedString += cleanString[cleanIndex]; + indexMap.set(decodedString.length - 1, cleanIndex); + cleanIndex++; + } + } + + const selectorIndex = decodedString.indexOf(selector); if (selectorIndex === -1) { - throw new Error(`SHA precompute selector "${selector}" not found in cleaned body`); + throw new Error(`SHA precompute selector "${selector}" not found in the body`); } - const originalIndex = positionMap.get(selectorIndex); + // Map back to original position using our index maps + const cleanPos = indexMap.get(selectorIndex); + if (cleanPos === undefined) { + throw new Error(`Failed to map selector position in decoded content`); + } + + const originalIndex = positionMap.get(cleanPos); if (originalIndex === undefined) { throw new Error(`Failed to map selector position to original body`); } @@ -92,17 +131,39 @@ function getAdjustedSelector( cleanContent: Uint8Array, positionMap: Map, ): string { - // First try finding selector in original body - if (new TextDecoder().decode(originalBody).includes(selector)) { - return selector; - } + const decoder = new TextDecoder(); + const originalString = decoder.decode(originalBody); - // If not found, look in cleaned content and map back to original + // Look in cleaned and decoded content and map back to original const { originalIndex } = findSelectorInCleanContent(cleanContent, selector, positionMap); - const bodyString = new TextDecoder().decode(originalBody); - // Add 3 to length to account for potential soft line break - return bodyString.slice(originalIndex, originalIndex + selector.length + 3); + // Find the end of the QP sequence by looking for multi-byte UTF-8 characters + let encodedLength = 0; + let currentIndex = originalIndex; + + for (let i = 0; i < selector.length; i++) { + const char = selector[i]; + if (char.charCodeAt(0) > 127) { + // Look for QP-encoded multi-byte sequence + const qpMatch = originalString.slice(currentIndex).match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/); + if (qpMatch) { + encodedLength += 9; // Length of =XX=XX=XX + currentIndex += 9; + continue; + } + } + // Look for single-byte QP sequence or regular character + if (originalString[currentIndex] === '=' && + /[0-9A-F]{2}/.test(originalString.slice(currentIndex + 1, currentIndex + 3))) { + encodedLength += 3; + currentIndex += 3; + } else { + encodedLength++; + currentIndex++; + } + } + + return originalString.slice(originalIndex, originalIndex + encodedLength); } /** @@ -131,19 +192,55 @@ function removeSoftLineBreaks(body: Uint8Array): { cleanContent: Uint8Array; pos let cleanPos = 0; while (i < body.length) { - if ( - i + 2 < body.length && - body[i] === 61 && // '=' character - body[i + 1] === 13 && // '\r' character - body[i + 2] === 10 // '\n' character - ) { - i += 3; // Move past the soft line break - } else { - positionMap.set(cleanPos, i); - result.push(body[i]); - cleanPos++; - i++; + // Handle multi-byte UTF-8 sequences in QP format (e.g., =E2=80=94 for em dash) + if (i < body.length - 8 && body[i] === 61) { // '=' character + const slice = body.slice(i, i + 9); + const str = new TextDecoder().decode(slice); + const qpMatch = str.match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/); + if (qpMatch) { + const byte1 = parseInt(qpMatch[1], 16); + const byte2 = parseInt(qpMatch[2], 16); + const byte3 = parseInt(qpMatch[3], 16); + result.push(byte1, byte2, byte3); + positionMap.set(cleanPos, i); + positionMap.set(cleanPos + 1, i + 3); + positionMap.set(cleanPos + 2, i + 6); + cleanPos += 3; + i += 9; + continue; + } + } + + // Handle single-byte QP sequences + if (i < body.length - 2 && body[i] === 61) { // '=' character + const nextTwo = new TextDecoder().decode(body.slice(i + 1, i + 3)); + if (/[0-9A-F]{2}/.test(nextTwo)) { + const byte = parseInt(nextTwo, 16); + result.push(byte); + positionMap.set(cleanPos, i); + cleanPos++; + i += 3; + continue; + } } + + // Handle soft line breaks with optional whitespace + if (i < body.length - 1 && body[i] === 61) { // '=' character + let j = i + 1; + // Skip whitespace and newlines after the '=' + while (j < body.length && (body[j] === 13 || body[j] === 10 || body[j] === 32 || body[j] === 9)) { + j++; + } + if (j > i + 1) { + i = j; + continue; + } + } + + positionMap.set(cleanPos, i); + result.push(body[i]); + cleanPos++; + i++; } // Pad the result with zeros to make it the same length as body @@ -236,11 +333,51 @@ export function generateEmailVerifierInputsFromDKIMResult( circuitInputs.emailBodyLength = bodyRemainingLength.toString(); circuitInputs.precomputedSHA = Uint8ArrayToCharArray(precomputedSha); circuitInputs.bodyHashIndex = bodyHashIndex.toString(); - circuitInputs.emailBody = Uint8ArrayToCharArray(bodyRemaining); + + // First remove soft line breaks to ensure QP sequences aren't broken + const { cleanContent: contentWithoutBreaks } = removeSoftLineBreaks(bodyRemaining); + + // Then decode QP-encoded content + const decodedContent = new Uint8Array(contentWithoutBreaks.length); + let writePos = 0; + let readPos = 0; + + while (readPos < contentWithoutBreaks.length) { + // Handle multi-byte UTF-8 sequences + if (readPos < contentWithoutBreaks.length - 8 && contentWithoutBreaks[readPos] === 61) { // '=' character + const slice = contentWithoutBreaks.slice(readPos, readPos + 9); + const str = new TextDecoder().decode(slice); + const qpMatch = str.match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/); + if (qpMatch) { + const byte1 = parseInt(qpMatch[1], 16); + const byte2 = parseInt(qpMatch[2], 16); + const byte3 = parseInt(qpMatch[3], 16); + decodedContent[writePos++] = byte1; + decodedContent[writePos++] = byte2; + decodedContent[writePos++] = byte3; + readPos += 9; + continue; + } + } + + // Handle single-byte QP sequences + if (readPos < contentWithoutBreaks.length - 2 && contentWithoutBreaks[readPos] === 61) { + const nextTwo = new TextDecoder().decode(contentWithoutBreaks.slice(readPos + 1, readPos + 3)); + if (/[0-9A-F]{2}/.test(nextTwo)) { + decodedContent[writePos++] = parseInt(nextTwo, 16); + readPos += 3; + continue; + } + } + + decodedContent[writePos++] = contentWithoutBreaks[readPos++]; + } + + const finalDecodedContent = decodedContent.slice(0, writePos); + circuitInputs.emailBody = Uint8ArrayToCharArray(finalDecodedContent); if (params.removeSoftLineBreaks) { - const { cleanContent } = removeSoftLineBreaks(bodyRemaining); - circuitInputs.decodedEmailBodyIn = Uint8ArrayToCharArray(cleanContent); + circuitInputs.decodedEmailBodyIn = circuitInputs.emailBody; } if (params.enableBodyMasking) { diff --git a/packages/helpers/src/sha-utils.ts b/packages/helpers/src/sha-utils.ts index 2352b184..c74bb65f 100644 --- a/packages/helpers/src/sha-utils.ts +++ b/packages/helpers/src/sha-utils.ts @@ -41,12 +41,67 @@ export function generatePartialSHA({ let selectorIndex = 0; if (selectorString) { - const selector = new TextEncoder().encode(selectorString); - selectorIndex = findIndexInUint8Array(body, selector); + // First remove soft line breaks and get position mapping + const cleanContent = new Uint8Array(body); + const positionMap = new Map(); + let cleanPos = 0; + let i = 0; + + // Build clean content and position map + while (i < body.length) { + if (i < body.length - 1 && body[i] === 61) { // '=' character + // Check for multi-byte UTF-8 sequence in QP format + const qpMatch = body.slice(i, i + 9).toString().match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/); + if (qpMatch) { + // Handle 3-byte UTF-8 sequence + const byte1 = parseInt(qpMatch[1], 16); + const byte2 = parseInt(qpMatch[2], 16); + const byte3 = parseInt(qpMatch[3], 16); + cleanContent[cleanPos] = byte1; + cleanContent[cleanPos + 1] = byte2; + cleanContent[cleanPos + 2] = byte3; + positionMap.set(cleanPos, i); + positionMap.set(cleanPos + 1, i + 3); + positionMap.set(cleanPos + 2, i + 6); + cleanPos += 3; + i += 9; + continue; + } + + // Check for line continuation + let j = i + 1; + while (j < body.length && (body[j] === 13 || body[j] === 10 || body[j] === 32 || body[j] === 9)) { + j++; + } + if (j > i + 1) { + i = j; + continue; + } + } + positionMap.set(cleanPos, i); + cleanContent[cleanPos] = body[i]; + cleanPos++; + i++; + } + + // Create a view of only the valid content + const validContent = cleanContent.slice(0, cleanPos); + + // Find selector in decoded content + const cleanString = new TextDecoder().decode(validContent); + selectorIndex = cleanString.indexOf(selectorString); if (selectorIndex === -1) { throw new Error(`SHA precompute selector "${selectorString}" not found in the body`); } + + // Map back to original position + const originalIndex = positionMap.get(selectorIndex); + if (originalIndex === undefined) { + throw new Error(`Failed to map selector position to original body`); + } + + selectorIndex = originalIndex; } const shaCutoffIndex = Math.floor(selectorIndex / 64) * 64; diff --git a/packages/helpers/tests/input-generators.test.ts b/packages/helpers/tests/input-generators.test.ts index 0830ba94..96eade08 100644 --- a/packages/helpers/tests/input-generators.test.ts +++ b/packages/helpers/tests/input-generators.test.ts @@ -61,4 +61,16 @@ describe('Input generators', () => { }), ).rejects.toThrow('SHA precompute selector "Bla Bla" not found in the body'); }); + + it('should handle UTF-8 characters in selector and body', async () => { + const email = fs.readFileSync(path.join(__dirname, 'test-data/email-good-large.eml')); + + const inputs = await generateEmailVerifierInputs(email, { + shaPrecomputeSelector: 'Genesis Block —', + }); + + expect(inputs.emailBody).toBeDefined(); + const strBody = bytesToString(Uint8Array.from(inputs.emailBody!.map((b) => Number(b)))); + expect(strBody).toContain('Genesis Block —'); + }); });