Skip to content

Commit

Permalink
fix: do encoding transform in node for speed and size
Browse files Browse the repository at this point in the history
  • Loading branch information
connor4312 committed Jan 31, 2020
1 parent adc1fdd commit 01a756f
Show file tree
Hide file tree
Showing 9 changed files with 98 additions and 135 deletions.
24 changes: 24 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#-------------------------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information.
#-------------------------------------------------------------------------------------------------------------

# To fully customize the contents of this image, use the following Dockerfile instead:
# https://github.com/microsoft/vscode-dev-containers/tree/v0.100.0/containers/ubuntu-18.04-git/.devcontainer/Dockerfile
# Base image: the VS Code dev-container image for JavaScript/Node, tag 12.
FROM mcr.microsoft.com/vscode/devcontainers/javascript-node:12

# Prepend the Cargo bin directory and the emsdk upstream/bin directory to PATH
# so tools installed by the steps below can be invoked by name.
# NOTE(review): presumably upstream/bin is what provides the wasm-opt binary
# used by the package's build script -- confirm.
ENV PATH /root/.cargo/bin:/root/emsdk/upstream/bin:$PATH

# Install Rust
# rustup is fetched over HTTPS (TLS 1.2 or newer) and run non-interactively (-y);
# wasm-pack is then installed through cargo.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
RUN cargo install wasm-pack

# Install emsdk
# Clones emsdk into /root/emsdk and installs the "latest" SDK. No
# `emsdk activate` step is run here; only the PATH entry above exposes it.
RUN cd /root \
&& git clone https://github.com/emscripten-core/emsdk.git \
&& cd emsdk \
&& ./emsdk install latest


# Install benchmark util
# Installed globally so it is available from any working directory.
RUN npm i -g @c4312/matcha
29 changes: 29 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at:
// https://github.com/microsoft/vscode-dev-containers/tree/v0.100.0/containers/ubuntu-18.04-git
{
"name": "Ubuntu 18.04 & Git",
"dockerFile": "Dockerfile",

// Set *default* container specific settings.json values on container create.
"settings": {
"terminal.integrated.shell.linux": "/bin/bash"
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": []

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "uname -a",

// Uncomment to use the Docker CLI from inside the container. See https://aka.ms/vscode-remote/samples/docker-in-docker.
// "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ],

// Uncomment when using a ptrace-based debugger like C++, Go, and Rust
// "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ],

// Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root.
// "remoteUser": "vscode"
}
4 changes: 4 additions & 0 deletions .npmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/*
!/pkg
!/index.js
!/index.d.ts
1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,4 @@ crate-type = ["cdylib"]

[dependencies]
wasm-bindgen = "0.2"
encoding = "0.2"

1 change: 1 addition & 0 deletions index.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/**
 * Computes the Chrome content hash of `input`.
 *
 * A leading UTF-8, UTF-16LE or UTF-16BE byte order mark selects how the bytes
 * are interpreted; input without a mark is treated as UTF-8.
 *
 * @param input Bytes to hash.
 * @returns The 20-byte hash rendered as a 40-character hex string.
 */
export function hash(input: Buffer): string;
32 changes: 32 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
const { hash } = require('./pkg/chromehash');

const output = Buffer.alloc(4 * 5);

exports.hash = input => {
hash(normalize(input), output);
return output.toString('hex');
};

// Byte-order-mark detectors. Each returns true only when `buffer` is long
// enough to hold the mark and starts with exactly those bytes.
const hasUTF8BOM = buffer =>
  buffer.length >= 3 && buffer[0] === 0xef && buffer[1] === 0xbb && buffer[2] === 0xbf;
const hasUtf16LEBOM = buffer => buffer.length >= 2 && buffer[0] === 0xff && buffer[1] === 0xfe;
const hasUtf16BEBOM = buffer => buffer.length >= 2 && buffer[0] === 0xfe && buffer[1] === 0xff;

/**
 * Converts `buffer` into BOM-less UTF-16LE bytes, which is the form the
 * hashing routine consumes.
 *
 * - UTF-8 BOM:    the mark is dropped and the rest is transcoded to UTF-16LE.
 * - UTF-16LE BOM: the mark is dropped; the rest is returned as a view that
 *                 shares memory with `buffer` (it is never written to here).
 * - UTF-16BE BOM: the mark is dropped and the rest is byte-swapped into a
 *                 freshly allocated buffer.
 * - no BOM:       the whole input is treated as UTF-8 and transcoded.
 *
 * The input buffer is never modified.
 *
 * @param {Buffer} buffer Raw input bytes.
 * @returns {Buffer} UTF-16LE encoded bytes without a byte order mark.
 */
const normalize = buffer => {
  if (hasUTF8BOM(buffer)) {
    return utf8ToUtf16(buffer.slice(3));
  }

  if (hasUtf16LEBOM(buffer)) {
    return buffer.slice(2);
  }

  if (hasUtf16BEBOM(buffer)) {
    // `slice` returns a view over the caller's memory and `swap16` works in
    // place, so swap a private copy; otherwise hashing would corrupt the
    // caller's buffer and a second call would produce a different hash.
    const swapped = Buffer.from(buffer.slice(2));

    // `swap16` throws unless the length is a multiple of two. Swap only the
    // complete 16-bit units and leave a dangling final byte untouched, which
    // mirrors how the little-endian branch passes it through.
    const evenLength = swapped.length - (swapped.length % 2);
    swapped.subarray(0, evenLength).swap16();
    return swapped;
  }

  return utf8ToUtf16(buffer);
};

// Decodes `buffer` as UTF-8 text and re-encodes that text as UTF-16LE bytes.
const utf8ToUtf16 = buffer => Buffer.from(buffer.toString('utf8'), 'utf16le');

99 changes: 4 additions & 95 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
"name": "@c4312/chromehash",
"version": "0.1.0",
"description": "A Rust/WebAssembly implementation of the Chrome content hashing algorithm",
"main": "pkg/chromehash",
"main": "index.js",
"scripts": {
"prepublishOnly": "rimraf pkg && wasm-pack build --target nodejs --release"
"prepublishOnly": "rm -rf pkg && wasm-pack build --target nodejs --release && wasm-opt -O4 -o pkg/min.wasm pkg/chromehash_bg.wasm && mv pkg/min.wasm pkg/chromehash_bg.wasm"
},
"repository": {
"type": "git",
Expand All @@ -17,6 +17,6 @@
},
"homepage": "https://github.com/connor4312/chromehash#readme",
"devDependencies": {
"rimraf": "^3.0.1"
"prettier": "^1.19.1"
}
}
37 changes: 1 addition & 36 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,54 +1,19 @@
extern crate wasm_bindgen;

use wasm_bindgen::prelude::*;
use std::borrow::Cow;
use encoding::{Encoding, EncoderTrap, DecoderTrap};
use encoding::all::{UTF_8, UTF_16LE, UTF_16BE};

// These constants are fixed in the Chromium source. They are not configurable; they are only used for seeding the hashes.
const HASHES: usize = 5;
const PRIMES: [u64; HASHES] = [0x3FB75161, 0xAB1F4E4F, 0x82675BC5, 0xCD924D35, 0x81ABE279];
const RANDOM_EVN: [u64; HASHES] = [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0];
const RANDOM_ODD: [u64; HASHES] = [0xB4663807, 0xCC322BF5, 0xD4F91BBD, 0xA7BEA11D, 0x8F462907];

// Turns raw bytes into a `String`, picking the decoder from a leading byte
// order mark: UTF-8, UTF-16LE or UTF-16BE (the mark itself is dropped), and
// falling back to plain UTF-8 when no mark is present. The codecs are named
// one by one, instead of going through the `encoding` crate's generic
// lookup, so that only these few end up packaged into the wasm.
fn decode_str(input: &[u8]) -> Result<String, Cow<'static, str>> {
    if input.starts_with(&[0xef, 0xbb, 0xbf]) {
        // utf-8 BOM
        UTF_8.decode(&input[3..], DecoderTrap::Ignore)
    } else if input.starts_with(&[0xff, 0xfe]) {
        // utf-16 LE
        UTF_16LE.decode(&input[2..], DecoderTrap::Ignore)
    } else if input.starts_with(&[0xfe, 0xff]) {
        // utf-16 BE
        UTF_16BE.decode(&input[2..], DecoderTrap::Ignore)
    } else {
        // no BOM
        UTF_8.decode(input, DecoderTrap::Ignore)
    }
}

// Re-encodes the input as UTF-16LE bytes. If either the decode or the
// re-encode step reports an error, the original bytes are returned as-is.
fn normalize<'a>(input: &'a [u8]) -> Vec<u8> {
    let text = match decode_str(input) {
        Ok(text) => text,
        Err(_) => return input.to_vec(),
    };

    UTF_16LE
        .encode(&text, EncoderTrap::Ignore)
        .unwrap_or_else(|_| input.to_vec())
}

#[wasm_bindgen]
pub fn hash(raw_input: &mut [u8], output: &mut [u8]) {
pub fn hash(input: &[u8], output: &mut [u8]) {
let mut hashes: [u64; HASHES] = [0, 0, 0, 0, 0];
let mut zi: [u64; HASHES] = [1, 1, 1, 1, 1];

let input = normalize(raw_input);
let full_bytes = input.len() - (input.len() % 4);

let mut current = 0;
for i in (0..full_bytes).step_by(4) {
let v: u32 = input[i] as u32 | (input[i + 1] as u32) << 8 | ((input[i + 2] as u32) << 16) | ((input[i + 3] as u32) << 24);
Expand Down

0 comments on commit 01a756f

Please sign in to comment.