Skip to content

Commit

Permalink
Revert "Revert "fix: error message and UTF-8 handling in selector search""
Browse files Browse the repository at this point in the history

This reverts commit 8c40a80.
  • Loading branch information
SoraSuegami committed Dec 16, 2024
1 parent 8c40a80 commit 27f29c5
Show file tree
Hide file tree
Showing 3 changed files with 232 additions and 28 deletions.
189 changes: 163 additions & 26 deletions packages/helpers/src/input-generators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,53 @@ function findSelectorInCleanContent(
selector: string,
positionMap: Map<number, number>,
): { selector: string; originalIndex: number } {
// First build a clean string without soft line breaks
const cleanString = new TextDecoder().decode(cleanContent);
const selectorIndex = cleanString.indexOf(selector);
let decodedString = '';
let cleanIndex = 0;
const indexMap = new Map<number, number>(); // decodedPos -> cleanPos

while (cleanIndex < cleanString.length) {
// Handle multi-byte UTF-8 sequences in QP format (e.g., =E2=80=94 for em dash)
const qpMatch = cleanString.slice(cleanIndex).match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/);
if (qpMatch) {
const byte1 = parseInt(qpMatch[1], 16);
const byte2 = parseInt(qpMatch[2], 16);
const byte3 = parseInt(qpMatch[3], 16);
const bytes = new Uint8Array([byte1, byte2, byte3]);
decodedString += new TextDecoder().decode(bytes);
indexMap.set(decodedString.length - 1, cleanIndex);
cleanIndex += 9; // Skip over the entire QP sequence
continue;
}

// Handle single-byte QP sequences
if (cleanString[cleanIndex] === '=' &&
/[0-9A-F]{2}/.test(cleanString.slice(cleanIndex + 1, cleanIndex + 3))) {
const byte = parseInt(cleanString.slice(cleanIndex + 1, cleanIndex + 3), 16);
decodedString += String.fromCharCode(byte);
indexMap.set(decodedString.length - 1, cleanIndex);
cleanIndex += 3;
} else {
decodedString += cleanString[cleanIndex];
indexMap.set(decodedString.length - 1, cleanIndex);
cleanIndex++;
}
}

const selectorIndex = decodedString.indexOf(selector);

if (selectorIndex === -1) {
throw new Error(`SHA precompute selector "${selector}" not found in cleaned body`);
throw new Error(`SHA precompute selector "${selector}" not found in the body`);
}

const originalIndex = positionMap.get(selectorIndex);
// Map back to original position using our index maps
const cleanPos = indexMap.get(selectorIndex);
if (cleanPos === undefined) {
throw new Error(`Failed to map selector position in decoded content`);
}

const originalIndex = positionMap.get(cleanPos);
if (originalIndex === undefined) {
throw new Error(`Failed to map selector position to original body`);
}
Expand Down Expand Up @@ -92,17 +131,39 @@ function getAdjustedSelector(
cleanContent: Uint8Array,
positionMap: Map<number, number>,
): string {
// First try finding selector in original body
if (new TextDecoder().decode(originalBody).includes(selector)) {
return selector;
}
const decoder = new TextDecoder();
const originalString = decoder.decode(originalBody);

// If not found, look in cleaned content and map back to original
// Look in cleaned and decoded content and map back to original
const { originalIndex } = findSelectorInCleanContent(cleanContent, selector, positionMap);
const bodyString = new TextDecoder().decode(originalBody);

// Add 3 to length to account for potential soft line break
return bodyString.slice(originalIndex, originalIndex + selector.length + 3);
// Find the end of the QP sequence by looking for multi-byte UTF-8 characters
let encodedLength = 0;
let currentIndex = originalIndex;

for (let i = 0; i < selector.length; i++) {
const char = selector[i];
if (char.charCodeAt(0) > 127) {
// Look for QP-encoded multi-byte sequence
const qpMatch = originalString.slice(currentIndex).match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/);
if (qpMatch) {
encodedLength += 9; // Length of =XX=XX=XX
currentIndex += 9;
continue;
}
}
// Look for single-byte QP sequence or regular character
if (originalString[currentIndex] === '=' &&
/[0-9A-F]{2}/.test(originalString.slice(currentIndex + 1, currentIndex + 3))) {
encodedLength += 3;
currentIndex += 3;
} else {
encodedLength++;
currentIndex++;
}
}

return originalString.slice(originalIndex, originalIndex + encodedLength);
}

/**
Expand Down Expand Up @@ -131,19 +192,55 @@ function removeSoftLineBreaks(body: Uint8Array): { cleanContent: Uint8Array; pos
let cleanPos = 0;

while (i < body.length) {
if (
i + 2 < body.length &&
body[i] === 61 && // '=' character
body[i + 1] === 13 && // '\r' character
body[i + 2] === 10 // '\n' character
) {
i += 3; // Move past the soft line break
} else {
positionMap.set(cleanPos, i);
result.push(body[i]);
cleanPos++;
i++;
// Handle multi-byte UTF-8 sequences in QP format (e.g., =E2=80=94 for em dash)
if (i < body.length - 8 && body[i] === 61) { // '=' character
const slice = body.slice(i, i + 9);
const str = new TextDecoder().decode(slice);
const qpMatch = str.match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/);
if (qpMatch) {
const byte1 = parseInt(qpMatch[1], 16);
const byte2 = parseInt(qpMatch[2], 16);
const byte3 = parseInt(qpMatch[3], 16);
result.push(byte1, byte2, byte3);
positionMap.set(cleanPos, i);
positionMap.set(cleanPos + 1, i + 3);
positionMap.set(cleanPos + 2, i + 6);
cleanPos += 3;
i += 9;
continue;
}
}

// Handle single-byte QP sequences
if (i < body.length - 2 && body[i] === 61) { // '=' character
const nextTwo = new TextDecoder().decode(body.slice(i + 1, i + 3));
if (/[0-9A-F]{2}/.test(nextTwo)) {
const byte = parseInt(nextTwo, 16);
result.push(byte);
positionMap.set(cleanPos, i);
cleanPos++;
i += 3;
continue;
}
}

// Handle soft line breaks with optional whitespace
if (i < body.length - 1 && body[i] === 61) { // '=' character
let j = i + 1;
// Skip whitespace and newlines after the '='
while (j < body.length && (body[j] === 13 || body[j] === 10 || body[j] === 32 || body[j] === 9)) {
j++;
}
if (j > i + 1) {
i = j;
continue;
}
}

positionMap.set(cleanPos, i);
result.push(body[i]);
cleanPos++;
i++;
}

// Pad the result with zeros to make it the same length as body
Expand Down Expand Up @@ -236,11 +333,51 @@ export function generateEmailVerifierInputsFromDKIMResult(
circuitInputs.emailBodyLength = bodyRemainingLength.toString();
circuitInputs.precomputedSHA = Uint8ArrayToCharArray(precomputedSha);
circuitInputs.bodyHashIndex = bodyHashIndex.toString();
circuitInputs.emailBody = Uint8ArrayToCharArray(bodyRemaining);

// First remove soft line breaks to ensure QP sequences aren't broken
const { cleanContent: contentWithoutBreaks } = removeSoftLineBreaks(bodyRemaining);

// Then decode QP-encoded content
const decodedContent = new Uint8Array(contentWithoutBreaks.length);
let writePos = 0;
let readPos = 0;

while (readPos < contentWithoutBreaks.length) {
// Handle multi-byte UTF-8 sequences
if (readPos < contentWithoutBreaks.length - 8 && contentWithoutBreaks[readPos] === 61) { // '=' character
const slice = contentWithoutBreaks.slice(readPos, readPos + 9);
const str = new TextDecoder().decode(slice);
const qpMatch = str.match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/);
if (qpMatch) {
const byte1 = parseInt(qpMatch[1], 16);
const byte2 = parseInt(qpMatch[2], 16);
const byte3 = parseInt(qpMatch[3], 16);
decodedContent[writePos++] = byte1;
decodedContent[writePos++] = byte2;
decodedContent[writePos++] = byte3;
readPos += 9;
continue;
}
}

// Handle single-byte QP sequences
if (readPos < contentWithoutBreaks.length - 2 && contentWithoutBreaks[readPos] === 61) {
const nextTwo = new TextDecoder().decode(contentWithoutBreaks.slice(readPos + 1, readPos + 3));
if (/[0-9A-F]{2}/.test(nextTwo)) {
decodedContent[writePos++] = parseInt(nextTwo, 16);
readPos += 3;
continue;
}
}

decodedContent[writePos++] = contentWithoutBreaks[readPos++];
}

const finalDecodedContent = decodedContent.slice(0, writePos);
circuitInputs.emailBody = Uint8ArrayToCharArray(finalDecodedContent);

if (params.removeSoftLineBreaks) {
const { cleanContent } = removeSoftLineBreaks(bodyRemaining);
circuitInputs.decodedEmailBodyIn = Uint8ArrayToCharArray(cleanContent);
circuitInputs.decodedEmailBodyIn = circuitInputs.emailBody;
}

if (params.enableBodyMasking) {
Expand Down
59 changes: 57 additions & 2 deletions packages/helpers/src/sha-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,67 @@ export function generatePartialSHA({
let selectorIndex = 0;

if (selectorString) {
const selector = new TextEncoder().encode(selectorString);
selectorIndex = findIndexInUint8Array(body, selector);
// First remove soft line breaks and get position mapping
const cleanContent = new Uint8Array(body);
const positionMap = new Map<number, number>();
let cleanPos = 0;
let i = 0;

// Build clean content and position map
while (i < body.length) {
if (i < body.length - 1 && body[i] === 61) { // '=' character
// Check for multi-byte UTF-8 sequence in QP format
const qpMatch = body.slice(i, i + 9).toString().match(/^=([0-9A-F]{2})=([0-9A-F]{2})=([0-9A-F]{2})/);
if (qpMatch) {
// Handle 3-byte UTF-8 sequence
const byte1 = parseInt(qpMatch[1], 16);
const byte2 = parseInt(qpMatch[2], 16);
const byte3 = parseInt(qpMatch[3], 16);
cleanContent[cleanPos] = byte1;
cleanContent[cleanPos + 1] = byte2;
cleanContent[cleanPos + 2] = byte3;
positionMap.set(cleanPos, i);
positionMap.set(cleanPos + 1, i + 3);
positionMap.set(cleanPos + 2, i + 6);
cleanPos += 3;
i += 9;
continue;
}

// Check for line continuation
let j = i + 1;
while (j < body.length && (body[j] === 13 || body[j] === 10 || body[j] === 32 || body[j] === 9)) {
j++;
}
if (j > i + 1) {
i = j;
continue;
}
}
positionMap.set(cleanPos, i);
cleanContent[cleanPos] = body[i];
cleanPos++;
i++;
}

// Create a view of only the valid content
const validContent = cleanContent.slice(0, cleanPos);

// Find selector in decoded content
const cleanString = new TextDecoder().decode(validContent);
selectorIndex = cleanString.indexOf(selectorString);

if (selectorIndex === -1) {
throw new Error(`SHA precompute selector "${selectorString}" not found in the body`);
}

// Map back to original position
const originalIndex = positionMap.get(selectorIndex);
if (originalIndex === undefined) {
throw new Error(`Failed to map selector position to original body`);
}

selectorIndex = originalIndex;
}

const shaCutoffIndex = Math.floor(selectorIndex / 64) * 64;
Expand Down
12 changes: 12 additions & 0 deletions packages/helpers/tests/input-generators.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,16 @@ describe('Input generators', () => {
}),
).rejects.toThrow('SHA precompute selector "Bla Bla" not found in the body');
});

// Checks that generating inputs with a selector containing a non-ASCII character
// (the em dash in 'Genesis Block —') succeeds and that the resulting emailBody,
// once converted back to text, contains that selector.
it('should handle UTF-8 characters in selector and body', async () => {
// Raw .eml fixture read from the test-data directory next to this test file.
const email = fs.readFileSync(path.join(__dirname, 'test-data/email-good-large.eml'));

// Use the em-dash selector as the SHA precompute selector.
const inputs = await generateEmailVerifierInputs(email, {
shaPrecomputeSelector: 'Genesis Block —',
});

expect(inputs.emailBody).toBeDefined();
// Each emailBody entry is converted with Number() to rebuild a byte array, which is
// then turned back into a string so the selector can be searched for.
// NOTE(review): presumably bytesToString decodes the bytes as UTF-8 — confirm.
const strBody = bytesToString(Uint8Array.from(inputs.emailBody!.map((b) => Number(b))));
expect(strBody).toContain('Genesis Block —');
});
});

0 comments on commit 27f29c5

Please sign in to comment.