From e5b43e52c8f50b2871c73774f9a4d269f3d80ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 9 Mar 2024 21:55:57 +0100 Subject: [PATCH 01/17] move to class --- deps/streamsearch/sbmh.js | 357 ++++++++++++++++++-------------------- 1 file changed, 166 insertions(+), 191 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index b90c0e8..ee5b90d 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -1,228 +1,203 @@ 'use strict' -/** - * Copyright Brian White. All rights reserved. - * - * @see https://github.com/mscdex/streamsearch - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation - * by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool - */ -const EventEmitter = require('node:events').EventEmitter -const inherits = require('node:util').inherits - -function SBMH (needle) { - if (typeof needle === 'string') { - needle = Buffer.from(needle) - } - - if (!Buffer.isBuffer(needle)) { - throw new TypeError('The needle has to be a String or a Buffer.') - } +const { EventEmitter } = require('node:events') - const needleLength = needle.length +class SBMH extends EventEmitter { + constructor (needle) { + super() + if (typeof needle === 'string') { + needle = Buffer.from(needle) + } - if (needleLength === 0) { - throw new Error('The needle cannot be an empty String/Buffer.') - } + if (!Buffer.isBuffer(needle)) { + throw new TypeError('The needle has to be a String or a Buffer.') + } - if (needleLength > 256) { - throw new Error('The needle cannot have a length bigger than 256.') - } + const needleLength = needle.length - this.maxMatches = Infinity - this.matches = 0 + if (needleLength === 0) { + throw new Error('The needle cannot be an empty String/Buffer.') + } - this._occ = new Array(256) - .fill(needleLength) // Initialize occurrence table. - this._lookbehind_size = 0 - this._needle = needle - this._bufpos = 0 + if (needleLength > 256) { + throw new Error('The needle cannot have a length bigger than 256.') + } - this._lookbehind = Buffer.alloc(needleLength) + this.maxMatches = Infinity + this.matches = 0 - // Populate occurrence table with analysis of the needle, - // ignoring last letter. - for (var i = 0; i < needleLength - 1; ++i) { // eslint-disable-line no-var - this._occ[needle[i]] = needleLength - 1 - i - } -} -inherits(SBMH, EventEmitter) + this._occ = new Array(256) + .fill(needleLength) // Initialize occurrence table. + this._lookbehind_size = 0 + this._needle = needle + this._bufpos = 0 -SBMH.prototype.reset = function () { - this._lookbehind_size = 0 - this.matches = 0 - this._bufpos = 0 -} + this._lookbehind = Buffer.alloc(needleLength) -SBMH.prototype.push = function (chunk, pos) { - if (!Buffer.isBuffer(chunk)) { - chunk = Buffer.from(chunk, 'binary') + // Populate occurrence table with analysis of the needle, + // ignoring last letter. + for (var i = 0; i < needleLength - 1; ++i) { // eslint-disable-line no-var + this._occ[needle[i]] = needleLength - 1 - i + } } - const chlen = chunk.length - this._bufpos = pos || 0 - let r - while (r !== chlen && this.matches < this.maxMatches) { r = this._sbmh_feed(chunk) } - return r -} -SBMH.prototype._sbmh_feed = function (data) { - const len = data.length - const needle = this._needle - const needleLength = needle.length - const lastNeedleChar = needle[needleLength - 1] - - // Positive: points to a position in `data` - // pos == 3 points to data[3] - // Negative: points to a position in the lookbehind buffer - // pos == -2 points to lookbehind[lookbehind_size - 2] - let pos = -this._lookbehind_size - let ch - - if (pos < 0) { - // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool - // search with character lookup code that considers both the - // lookbehind buffer and the current round's haystack data. - // - // Loop until - // there is a match. - // or until - // we've moved past the position that requires the - // lookbehind buffer. In this case we switch to the - // optimized loop. - // or until - // the character to look at lies outside the haystack. - while (pos < 0 && pos <= len - needleLength) { - ch = this._sbmh_lookup_char(data, pos + needleLength - 1) - - if ( - ch === lastNeedleChar && - this._sbmh_memcmp(data, pos, needleLength - 1) - ) { - this._lookbehind_size = 0 - ++this.matches - this.emit('info', true) + reset () { + this._lookbehind_size = 0 + this.matches = 0 + this._bufpos = 0 + } - return (this._bufpos = pos + needleLength) - } - pos += this._occ[ch] + push (chunk, pos) { + if (!Buffer.isBuffer(chunk)) { + chunk = Buffer.from(chunk, 'binary') } + const chlen = chunk.length + this._bufpos = pos || 0 + let r + while (r !== chlen && this.matches < this.maxMatches) { r = this._sbmh_feed(chunk) } + return r + } - // No match. + _sbmh_feed (data) { + const len = data.length + const needle = this._needle + const needleLength = needle.length + const lastNeedleChar = needle[needleLength - 1] + + // Positive: points to a position in `data` + // pos == 3 points to data[3] + // Negative: points to a position in the lookbehind buffer + // pos == -2 points to lookbehind[lookbehind_size - 2] + let pos = -this._lookbehind_size + let ch if (pos < 0) { - // There's too few data for Boyer-Moore-Horspool to run, - // so let's use a different algorithm to skip as much as - // we can. - // Forward pos until - // the trailing part of lookbehind + data - // looks like the beginning of the needle + // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool + // search with character lookup code that considers both the + // lookbehind buffer and the current round's haystack data. + // + // Loop until + // there is a match. // or until - // pos == 0 - while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos)) { ++pos } - } - - if (pos >= 0) { - // Discard lookbehind buffer. - this.emit('info', false, this._lookbehind, 0, this._lookbehind_size) - this._lookbehind_size = 0 - } else { - // Cut off part of the lookbehind buffer that has - // been processed and append the entire haystack - // into it. - const bytesToCutOff = this._lookbehind_size + pos - if (bytesToCutOff > 0) { - // The cut off data is guaranteed not to contain the needle. - this.emit('info', false, this._lookbehind, 0, bytesToCutOff) + // we've moved past the position that requires the + // lookbehind buffer. In this case we switch to the + // optimized loop. + // or until + // the character to look at lies outside the haystack. + while (pos < 0 && pos <= len - needleLength) { + ch = this._sbmh_lookup_char(data, pos + needleLength - 1) + + if ( + ch === lastNeedleChar && + this._sbmh_memcmp(data, pos, needleLength - 1) + ) { + this._lookbehind_size = 0 + ++this.matches + this.emit('info', true) + + return (this._bufpos = pos + needleLength) + } + pos += this._occ[ch] } - this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff, - this._lookbehind_size - bytesToCutOff) - this._lookbehind_size -= bytesToCutOff - - data.copy(this._lookbehind, this._lookbehind_size) - this._lookbehind_size += len + // No match. + + if (pos < 0) { + // There's too few data for Boyer-Moore-Horspool to run, + // so let's use a different algorithm to skip as much as + // we can. + // Forward pos until + // the trailing part of lookbehind + data + // looks like the beginning of the needle + // or until + // pos == 0 + while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos)) { ++pos } + } - this._bufpos = len - return len + if (pos >= 0) { + // Discard lookbehind buffer. + this.emit('info', false, this._lookbehind, 0, this._lookbehind_size) + this._lookbehind_size = 0 + } else { + // Cut off part of the lookbehind buffer that has + // been processed and append the entire haystack + // into it. + const bytesToCutOff = this._lookbehind_size + pos + if (bytesToCutOff > 0) { + // The cut off data is guaranteed not to contain the needle. + this.emit('info', false, this._lookbehind, 0, bytesToCutOff) + } + + this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff, + this._lookbehind_size - bytesToCutOff) + this._lookbehind_size -= bytesToCutOff + + data.copy(this._lookbehind, this._lookbehind_size) + this._lookbehind_size += len + + this._bufpos = len + return len + } } - } - pos += (pos >= 0) * this._bufpos + pos += (pos >= 0) * this._bufpos - // Lookbehind buffer is now empty. We only need to check if the - // needle is in the haystack. - if (data.indexOf(needle, pos) !== -1) { - pos = data.indexOf(needle, pos) - ++this.matches - if (pos > 0) { this.emit('info', true, data, this._bufpos, pos) } else { this.emit('info', true) } + // Lookbehind buffer is now empty. We only need to check if the + // needle is in the haystack. + if (data.indexOf(needle, pos) !== -1) { + pos = data.indexOf(needle, pos) + ++this.matches + if (pos > 0) { this.emit('info', true, data, this._bufpos, pos) } else { this.emit('info', true) } - return (this._bufpos = pos + needleLength) - } else { - pos = len - needleLength - } + return (this._bufpos = pos + needleLength) + } else { + pos = len - needleLength + } - // There was no match. If there's trailing haystack data that we cannot - // match yet using the Boyer-Moore-Horspool algorithm (because the trailing - // data is less than the needle size) then match using a modified - // algorithm that starts matching from the beginning instead of the end. - // Whatever trailing data is left after running this algorithm is added to - // the lookbehind buffer. - while ( - pos < len && - ( - data[pos] !== needle[0] || + // There was no match. If there's trailing haystack data that we cannot + // match yet using the Boyer-Moore-Horspool algorithm (because the trailing + // data is less than the needle size) then match using a modified + // algorithm that starts matching from the beginning instead of the end. + // Whatever trailing data is left after running this algorithm is added to + // the lookbehind buffer. + while ( + pos < len && ( - (Buffer.compare( - data.subarray(pos, pos + len - pos), - needle.subarray(0, len - pos) - ) !== 0) + data[pos] !== needle[0] || + ( + (Buffer.compare( + data.subarray(pos, pos + len - pos), + needle.subarray(0, len - pos) + ) !== 0) + ) ) - ) - ) { - ++pos - } - if (pos < len) { - data.copy(this._lookbehind, 0, pos, pos + (len - pos)) - this._lookbehind_size = len - pos - } + ) { + ++pos + } + if (pos < len) { + data.copy(this._lookbehind, 0, pos, pos + (len - pos)) + this._lookbehind_size = len - pos + } - // Everything until pos is guaranteed not to contain needle data. - if (pos > 0) { this.emit('info', false, data, this._bufpos, pos < len ? pos : len) } + // Everything until pos is guaranteed not to contain needle data. + if (pos > 0) { this.emit('info', false, data, this._bufpos, pos < len ? pos : len) } - this._bufpos = len - return len -} + this._bufpos = len + return len + } -SBMH.prototype._sbmh_lookup_char = function (data, pos) { - return (pos < 0) - ? this._lookbehind[this._lookbehind_size + pos] - : data[pos] -} + _sbmh_lookup_char (data, pos) { + return (pos < 0) + ? this._lookbehind[this._lookbehind_size + pos] + : data[pos] + } -SBMH.prototype._sbmh_memcmp = function (data, pos, len) { - for (var i = 0; i < len; ++i) { // eslint-disable-line no-var - if (this._sbmh_lookup_char(data, pos + i) !== this._needle[i]) { return false } + _sbmh_memcmp (data, pos, len) { + for (var i = 0; i < len; ++i) { // eslint-disable-line no-var + if (this._sbmh_lookup_char(data, pos + i) !== this._needle[i]) { return false } + } + return true } - return true } module.exports = SBMH From 74eee01c4c31e5fbd4aa1cca0c78767968d003ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 9 Mar 2024 21:56:06 +0100 Subject: [PATCH 02/17] format --- deps/streamsearch/sbmh.js | 1 + 1 file changed, 1 insertion(+) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index ee5b90d..f6c3cf2 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -5,6 +5,7 @@ const { EventEmitter } = require('node:events') class SBMH extends EventEmitter { constructor (needle) { super() + if (typeof needle === 'string') { needle = Buffer.from(needle) } From 965f5bca755dcd6ac69ba34980774b66bfc6af64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 9 Mar 2024 22:21:50 +0100 Subject: [PATCH 03/17] small refactor --- deps/streamsearch/sbmh.js | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index f6c3cf2..40640ce 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -27,35 +27,37 @@ class SBMH extends EventEmitter { this.maxMatches = Infinity this.matches = 0 - this._occ = new Array(256) - .fill(needleLength) // Initialize occurrence table. - this._lookbehind_size = 0 this._needle = needle this._bufpos = 0 - + this._lookbehind_size = 0 this._lookbehind = Buffer.alloc(needleLength) + this._occ = new Array(256).fill(needleLength) // Initialize occurrence table. - // Populate occurrence table with analysis of the needle, - // ignoring last letter. + // Populate occurrence table with analysis of the needle, ignoring last letter. for (var i = 0; i < needleLength - 1; ++i) { // eslint-disable-line no-var this._occ[needle[i]] = needleLength - 1 - i } } reset () { - this._lookbehind_size = 0 this.matches = 0 + this._lookbehind_size = 0 this._bufpos = 0 } - push (chunk, pos) { + push (chunk, pos = 0) { if (!Buffer.isBuffer(chunk)) { chunk = Buffer.from(chunk, 'binary') } + + this._bufpos = pos const chlen = chunk.length - this._bufpos = pos || 0 let r - while (r !== chlen && this.matches < this.maxMatches) { r = this._sbmh_feed(chunk) } + + while (r !== chlen && this.matches < this.maxMatches) { + r = this._sbmh_feed(chunk) + } + return r } @@ -98,6 +100,7 @@ class SBMH extends EventEmitter { return (this._bufpos = pos + needleLength) } + pos += this._occ[ch] } @@ -175,6 +178,7 @@ class SBMH extends EventEmitter { ) { ++pos } + if (pos < len) { data.copy(this._lookbehind, 0, pos, pos + (len - pos)) this._lookbehind_size = len - pos @@ -184,6 +188,7 @@ class SBMH extends EventEmitter { if (pos > 0) { this.emit('info', false, data, this._bufpos, pos < len ? pos : len) } this._bufpos = len + return len } @@ -197,6 +202,7 @@ class SBMH extends EventEmitter { for (var i = 0; i < len; ++i) { // eslint-disable-line no-var if (this._sbmh_lookup_char(data, pos + i) !== this._needle[i]) { return false } } + return true } } From a4e91c508f61dc45f8507e689280e51380a67317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 9 Mar 2024 22:33:17 +0100 Subject: [PATCH 04/17] simplify check --- deps/streamsearch/sbmh.js | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 40640ce..3acb28f 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -148,16 +148,17 @@ class SBMH extends EventEmitter { // Lookbehind buffer is now empty. We only need to check if the // needle is in the haystack. - if (data.indexOf(needle, pos) !== -1) { - pos = data.indexOf(needle, pos) + pos = data.indexOf(needle, pos) + + if (pos !== -1) { ++this.matches if (pos > 0) { this.emit('info', true, data, this._bufpos, pos) } else { this.emit('info', true) } return (this._bufpos = pos + needleLength) - } else { - pos = len - needleLength } + pos = len - needleLength + // There was no match. If there's trailing haystack data that we cannot // match yet using the Boyer-Moore-Horspool algorithm (because the trailing // data is less than the needle size) then match using a modified From 5dac93474d7c2ca72cec146d479931326a2e9860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 9 Mar 2024 22:33:42 +0100 Subject: [PATCH 05/17] space --- deps/streamsearch/sbmh.js | 1 + 1 file changed, 1 insertion(+) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 3acb28f..1db10e2 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -152,6 +152,7 @@ class SBMH extends EventEmitter { if (pos !== -1) { ++this.matches + if (pos > 0) { this.emit('info', true, data, this._bufpos, pos) } else { this.emit('info', true) } return (this._bufpos = pos + needleLength) From 887954535fd45a275adc64b0a1b0d7b5120fc101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 9 Mar 2024 22:37:47 +0100 Subject: [PATCH 06/17] simplify check --- deps/streamsearch/sbmh.js | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 1db10e2..38064a4 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -170,12 +170,10 @@ class SBMH extends EventEmitter { pos < len && ( data[pos] !== needle[0] || - ( - (Buffer.compare( - data.subarray(pos, pos + len - pos), - needle.subarray(0, len - pos) - ) !== 0) - ) + Buffer.compare( + data.subarray(pos, pos + len - pos), + needle.subarray(0, len - pos) + ) !== 0 ) ) { ++pos From d0f1985288d3f57d17c5382d42d2f82de62b7174 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 9 Mar 2024 22:38:09 +0100 Subject: [PATCH 07/17] simplify check --- deps/streamsearch/sbmh.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 38064a4..41fc1dd 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -193,7 +193,7 @@ class SBMH extends EventEmitter { } _sbmh_lookup_char (data, pos) { - return (pos < 0) + return pos < 0 ? this._lookbehind[this._lookbehind_size + pos] : data[pos] } From e254a8285f09c56ac0efc28795ac73f81e0dfa70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 9 Mar 2024 22:42:14 +0100 Subject: [PATCH 08/17] refactor --- deps/streamsearch/sbmh.js | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 41fc1dd..bb38f32 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -50,14 +50,12 @@ class SBMH extends EventEmitter { chunk = Buffer.from(chunk, 'binary') } - this._bufpos = pos const chlen = chunk.length - let r - while (r !== chlen && this.matches < this.maxMatches) { - r = this._sbmh_feed(chunk) - } + this._bufpos = pos + let r + while (r !== chlen && this.matches < this.maxMatches) { r = this._sbmh_feed(chunk) } return r } From c281addf88a4f701af71f28d309fe802a8b23a70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sun, 10 Mar 2024 13:08:38 +0100 Subject: [PATCH 09/17] simplify --- deps/streamsearch/sbmh.js | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index bb38f32..b52aeb1 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -142,11 +142,9 @@ class SBMH extends EventEmitter { } } - pos += (pos >= 0) * this._bufpos - // Lookbehind buffer is now empty. We only need to check if the // needle is in the haystack. - pos = data.indexOf(needle, pos) + pos = data.indexOf(needle, pos + ((pos >= 0) * this._bufpos)) if (pos !== -1) { ++this.matches From 7748c790a26852df356fa1f2d6602c803496c5e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sun, 10 Mar 2024 13:19:20 +0100 Subject: [PATCH 10/17] put back license --- deps/streamsearch/sbmh.js | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index b52aeb1..3d3bf04 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -1,5 +1,32 @@ 'use strict' +/** + * Copyright Brian White. All rights reserved. + * + * @see https://github.com/mscdex/streamsearch + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation + * by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool + */ + const { EventEmitter } = require('node:events') class SBMH extends EventEmitter { From ca4602dc1f93ebb130b24ccdc92590b13eaffc01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sun, 10 Mar 2024 13:22:47 +0100 Subject: [PATCH 11/17] remove duplicate subtraction --- deps/streamsearch/sbmh.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 3d3bf04..10d59f7 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -152,14 +152,14 @@ class SBMH extends EventEmitter { // been processed and append the entire haystack // into it. const bytesToCutOff = this._lookbehind_size + pos + if (bytesToCutOff > 0) { // The cut off data is guaranteed not to contain the needle. this.emit('info', false, this._lookbehind, 0, bytesToCutOff) } - this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff, - this._lookbehind_size - bytesToCutOff) this._lookbehind_size -= bytesToCutOff + this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff, this._lookbehind_size) data.copy(this._lookbehind, this._lookbehind_size) this._lookbehind_size += len From bf65c01861468b3f3a41ca41d98ae3de4f221161 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Tue, 12 Mar 2024 18:53:37 +0100 Subject: [PATCH 12/17] refactor --- deps/streamsearch/sbmh.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 10d59f7..5d20bf1 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -68,8 +68,8 @@ class SBMH extends EventEmitter { reset () { this.matches = 0 - this._lookbehind_size = 0 this._bufpos = 0 + this._lookbehind_size = 0 } push (chunk, pos = 0) { From 9e986cf0786a6ff07491c062b8adb559b1ce900d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Fri, 15 Mar 2024 16:05:05 +0100 Subject: [PATCH 13/17] readability --- deps/streamsearch/sbmh.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 5d20bf1..70fd02b 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -77,10 +77,9 @@ class SBMH extends EventEmitter { chunk = Buffer.from(chunk, 'binary') } - const chlen = chunk.length - this._bufpos = pos + const chlen = chunk.length let r while (r !== chlen && this.matches < this.maxMatches) { r = this._sbmh_feed(chunk) } return r From 8b31e4d749269fc22f79c62f7bdff94dbf7ca873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 16 Mar 2024 21:39:35 +0100 Subject: [PATCH 14/17] revert class refactor --- deps/streamsearch/sbmh.js | 323 +++++++++++++++++++------------------- 1 file changed, 159 insertions(+), 164 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 70fd02b..79a3ff9 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -27,206 +27,201 @@ * by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool */ -const { EventEmitter } = require('node:events') +const EventEmitter = require('node:events').EventEmitter +const inherits = require('node:util').inherits -class SBMH extends EventEmitter { - constructor (needle) { - super() - - if (typeof needle === 'string') { - needle = Buffer.from(needle) - } +function SBMH (needle) { + if (typeof needle === 'string') { + needle = Buffer.from(needle) + } - if (!Buffer.isBuffer(needle)) { - throw new TypeError('The needle has to be a String or a Buffer.') - } + if (!Buffer.isBuffer(needle)) { + throw new TypeError('The needle has to be a String or a Buffer.') + } - const needleLength = needle.length + const needleLength = needle.length - if (needleLength === 0) { - throw new Error('The needle cannot be an empty String/Buffer.') - } + if (needleLength === 0) { + throw new Error('The needle cannot be an empty String/Buffer.') + } - if (needleLength > 256) { - throw new Error('The needle cannot have a length bigger than 256.') - } + if (needleLength > 256) { + throw new Error('The needle cannot have a length bigger than 256.') + } - this.maxMatches = Infinity - this.matches = 0 + this.maxMatches = Infinity + this.matches = 0 - this._needle = needle - this._bufpos = 0 - this._lookbehind_size = 0 - this._lookbehind = Buffer.alloc(needleLength) - this._occ = new Array(256).fill(needleLength) // Initialize occurrence table. + this._occ = new Array(256) + .fill(needleLength) // Initialize occurrence table. + this._lookbehind_size = 0 + this._needle = needle + this._bufpos = 0 - // Populate occurrence table with analysis of the needle, ignoring last letter. - for (var i = 0; i < needleLength - 1; ++i) { // eslint-disable-line no-var - this._occ[needle[i]] = needleLength - 1 - i - } - } + this._lookbehind = Buffer.alloc(needleLength) - reset () { - this.matches = 0 - this._bufpos = 0 - this._lookbehind_size = 0 + // Populate occurrence table with analysis of the needle, + // ignoring last letter. + for (var i = 0; i < needleLength - 1; ++i) { // eslint-disable-line no-var + this._occ[needle[i]] = needleLength - 1 - i } +} +inherits(SBMH, EventEmitter) - push (chunk, pos = 0) { - if (!Buffer.isBuffer(chunk)) { - chunk = Buffer.from(chunk, 'binary') - } - - this._bufpos = pos +SBMH.prototype.reset = function () { + this._lookbehind_size = 0 + this.matches = 0 + this._bufpos = 0 +} - const chlen = chunk.length - let r - while (r !== chlen && this.matches < this.maxMatches) { r = this._sbmh_feed(chunk) } - return r +SBMH.prototype.push = function (chunk, pos) { + if (!Buffer.isBuffer(chunk)) { + chunk = Buffer.from(chunk, 'binary') } + const chlen = chunk.length + this._bufpos = pos || 0 + let r + while (r !== chlen && this.matches < this.maxMatches) { r = this._sbmh_feed(chunk) } + return r +} - _sbmh_feed (data) { - const len = data.length - const needle = this._needle - const needleLength = needle.length - const lastNeedleChar = needle[needleLength - 1] +SBMH.prototype._sbmh_feed = function (data) { + const len = data.length + const needle = this._needle + const needleLength = needle.length + const lastNeedleChar = needle[needleLength - 1] + + // Positive: points to a position in `data` + // pos == 3 points to data[3] + // Negative: points to a position in the lookbehind buffer + // pos == -2 points to lookbehind[lookbehind_size - 2] + let pos = -this._lookbehind_size + let ch + + if (pos < 0) { + // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool + // search with character lookup code that considers both the + // lookbehind buffer and the current round's haystack data. + // + // Loop until + // there is a match. + // or until + // we've moved past the position that requires the + // lookbehind buffer. In this case we switch to the + // optimized loop. + // or until + // the character to look at lies outside the haystack. + while (pos < 0 && pos <= len - needleLength) { + ch = this._sbmh_lookup_char(data, pos + needleLength - 1) + + if ( + ch === lastNeedleChar && + this._sbmh_memcmp(data, pos, needleLength - 1) + ) { + this._lookbehind_size = 0 + ++this.matches + this.emit('info', true) - // Positive: points to a position in `data` - // pos == 3 points to data[3] - // Negative: points to a position in the lookbehind buffer - // pos == -2 points to lookbehind[lookbehind_size - 2] - let pos = -this._lookbehind_size - let ch + return (this._bufpos = pos + needleLength) + } + pos += this._occ[ch] + } + + // No match. if (pos < 0) { - // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool - // search with character lookup code that considers both the - // lookbehind buffer and the current round's haystack data. - // - // Loop until - // there is a match. - // or until - // we've moved past the position that requires the - // lookbehind buffer. In this case we switch to the - // optimized loop. + // There's too few data for Boyer-Moore-Horspool to run, + // so let's use a different algorithm to skip as much as + // we can. + // Forward pos until + // the trailing part of lookbehind + data + // looks like the beginning of the needle // or until - // the character to look at lies outside the haystack. - while (pos < 0 && pos <= len - needleLength) { - ch = this._sbmh_lookup_char(data, pos + needleLength - 1) - - if ( - ch === lastNeedleChar && - this._sbmh_memcmp(data, pos, needleLength - 1) - ) { - this._lookbehind_size = 0 - ++this.matches - this.emit('info', true) - - return (this._bufpos = pos + needleLength) - } - - pos += this._occ[ch] - } + // pos == 0 + while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos)) { ++pos } + } - // No match. - - if (pos < 0) { - // There's too few data for Boyer-Moore-Horspool to run, - // so let's use a different algorithm to skip as much as - // we can. - // Forward pos until - // the trailing part of lookbehind + data - // looks like the beginning of the needle - // or until - // pos == 0 - while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos)) { ++pos } + if (pos >= 0) { + // Discard lookbehind buffer. + this.emit('info', false, this._lookbehind, 0, this._lookbehind_size) + this._lookbehind_size = 0 + } else { + // Cut off part of the lookbehind buffer that has + // been processed and append the entire haystack + // into it. + const bytesToCutOff = this._lookbehind_size + pos + if (bytesToCutOff > 0) { + // The cut off data is guaranteed not to contain the needle. + this.emit('info', false, this._lookbehind, 0, bytesToCutOff) } - if (pos >= 0) { - // Discard lookbehind buffer. - this.emit('info', false, this._lookbehind, 0, this._lookbehind_size) - this._lookbehind_size = 0 - } else { - // Cut off part of the lookbehind buffer that has - // been processed and append the entire haystack - // into it. - const bytesToCutOff = this._lookbehind_size + pos - - if (bytesToCutOff > 0) { - // The cut off data is guaranteed not to contain the needle. - this.emit('info', false, this._lookbehind, 0, bytesToCutOff) - } - - this._lookbehind_size -= bytesToCutOff - this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff, this._lookbehind_size) + this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff, + this._lookbehind_size - bytesToCutOff) + this._lookbehind_size -= bytesToCutOff - data.copy(this._lookbehind, this._lookbehind_size) - this._lookbehind_size += len + data.copy(this._lookbehind, this._lookbehind_size) + this._lookbehind_size += len - this._bufpos = len - return len - } + this._bufpos = len + return len } + } - // Lookbehind buffer is now empty. We only need to check if the - // needle is in the haystack. - pos = data.indexOf(needle, pos + ((pos >= 0) * this._bufpos)) - - if (pos !== -1) { - ++this.matches - - if (pos > 0) { this.emit('info', true, data, this._bufpos, pos) } else { this.emit('info', true) } - - return (this._bufpos = pos + needleLength) - } + // Lookbehind buffer is now empty. We only need to check if the + // needle is in the haystack. + pos = data.indexOf(needle, pos + ((pos >= 0) * this._bufpos)) - pos = len - needleLength + if (pos !== -1) { + ++this.matches + if (pos > 0) { this.emit('info', true, data, this._bufpos, pos) } else { this.emit('info', true) } + return (this._bufpos = pos + needleLength) + } - // There was no match. If there's trailing haystack data that we cannot - // match yet using the Boyer-Moore-Horspool algorithm (because the trailing - // data is less than the needle size) then match using a modified - // algorithm that starts matching from the beginning instead of the end. - // Whatever trailing data is left after running this algorithm is added to - // the lookbehind buffer. - while ( - pos < len && + pos = len - needleLength + + // There was no match. If there's trailing haystack data that we cannot + // match yet using the Boyer-Moore-Horspool algorithm (because the trailing + // data is less than the needle size) then match using a modified + // algorithm that starts matching from the beginning instead of the end. + // Whatever trailing data is left after running this algorithm is added to + // the lookbehind buffer. + while ( + pos < len && + ( + data[pos] !== needle[0] || ( - data[pos] !== needle[0] || - Buffer.compare( + (Buffer.compare( data.subarray(pos, pos + len - pos), needle.subarray(0, len - pos) - ) !== 0 + ) !== 0) ) - ) { - ++pos - } - - if (pos < len) { - data.copy(this._lookbehind, 0, pos, pos + (len - pos)) - this._lookbehind_size = len - pos - } - - // Everything until pos is guaranteed not to contain needle data. - if (pos > 0) { this.emit('info', false, data, this._bufpos, pos < len ? pos : len) } - - this._bufpos = len - - return len + ) + ) { + ++pos } - - _sbmh_lookup_char (data, pos) { - return pos < 0 - ? this._lookbehind[this._lookbehind_size + pos] - : data[pos] + if (pos < len) { + data.copy(this._lookbehind, 0, pos, pos + (len - pos)) + this._lookbehind_size = len - pos } - _sbmh_memcmp (data, pos, len) { - for (var i = 0; i < len; ++i) { // eslint-disable-line no-var - if (this._sbmh_lookup_char(data, pos + i) !== this._needle[i]) { return false } - } + // Everything until pos is guaranteed not to contain needle data. + if (pos > 0) { this.emit('info', false, data, this._bufpos, pos < len ? pos : len) } + + this._bufpos = len + return len +} + +SBMH.prototype._sbmh_lookup_char = function (data, pos) { + return (pos < 0) + ? this._lookbehind[this._lookbehind_size + pos] + : data[pos] +} - return true +SBMH.prototype._sbmh_memcmp = function (data, pos, len) { + for (var i = 0; i < len; ++i) { // eslint-disable-line no-var + if (this._sbmh_lookup_char(data, pos + i) !== this._needle[i]) { return false } } + return true } module.exports = SBMH From 16ffb5041f33e7e9141a76319a4feb33eebfe313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 16 Mar 2024 21:42:17 +0100 Subject: [PATCH 15/17] remove unnecessary subtraction --- deps/streamsearch/sbmh.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index 79a3ff9..e96036b 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -155,9 +155,8 @@ SBMH.prototype._sbmh_feed = function (data) { this.emit('info', false, this._lookbehind, 0, bytesToCutOff) } - this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff, - this._lookbehind_size - bytesToCutOff) this._lookbehind_size -= bytesToCutOff + this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff, this._lookbehind_size) data.copy(this._lookbehind, this._lookbehind_size) this._lookbehind_size += len @@ -212,7 +211,7 @@ SBMH.prototype._sbmh_feed = function (data) { } SBMH.prototype._sbmh_lookup_char = function (data, pos) { - return (pos < 0) + return pos < 0 ? this._lookbehind[this._lookbehind_size + pos] : data[pos] } From dd789acc007b3e64351a0c08c1e59f5d069d648c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 16 Mar 2024 21:42:52 +0100 Subject: [PATCH 16/17] simplify --- deps/streamsearch/sbmh.js | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index e96036b..d95546d 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -188,12 +188,10 @@ SBMH.prototype._sbmh_feed = function (data) { pos < len && ( data[pos] !== needle[0] || - ( - (Buffer.compare( - data.subarray(pos, pos + len - pos), - needle.subarray(0, len - pos) - ) !== 0) - ) + Buffer.compare( + data.subarray(pos, pos + len - pos), + needle.subarray(0, len - pos) + ) !== 0 ) ) { ++pos From 4628515901d813df9e8703b135b85f5c542970c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCrg=C3=BCn=20Day=C4=B1o=C4=9Flu?= Date: Sat, 16 Mar 2024 21:45:50 +0100 Subject: [PATCH 17/17] imports --- deps/streamsearch/sbmh.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deps/streamsearch/sbmh.js b/deps/streamsearch/sbmh.js index d95546d..b967365 100644 --- a/deps/streamsearch/sbmh.js +++ b/deps/streamsearch/sbmh.js @@ -27,8 +27,8 @@ * by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool */ -const EventEmitter = require('node:events').EventEmitter -const inherits = require('node:util').inherits +const { EventEmitter } = require('node:events') +const { inherits } = require('node:util') function SBMH (needle) { if (typeof needle === 'string') {