From 50c6a6b540a4cafee17ee2f8bf0f2fbcc30f375a Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Wed, 23 Oct 2024 22:14:06 +0100 Subject: [PATCH 01/25] WIP: blobStore.entriesStream() --- package-lock.json | 12 +++ package.json | 1 + src/blob-store/entries-stream.js | 122 +++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 src/blob-store/entries-stream.js diff --git a/package-lock.json b/package-lock.json index 45fe44e9..468002f8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,6 +18,7 @@ "@mapeo/crypto": "1.0.0-alpha.10", "@mapeo/sqlite-indexer": "1.0.0-alpha.9", "@sinclair/typebox": "^0.29.6", + "@sindresorhus/merge-streams": "^4.0.0", "b4a": "^1.6.3", "bcp-47": "^2.1.0", "better-sqlite3": "^8.7.0", @@ -1288,6 +1289,17 @@ "version": "0.29.6", "license": "MIT" }, + "node_modules/@sindresorhus/merge-streams": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@sindresorhus/merge-streams/-/merge-streams-4.0.0.tgz", + "integrity": "sha512-tlqY9xq5ukxTUZBmoOp+m61cqwQD5pHJtFY3Mn8CA8ps6yghLH/Hw8UPdqg4OLmFW3IFlcXnQNmo/dh8HzXYIQ==", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/@sinonjs/commons": { "version": "2.0.0", "dev": true, diff --git a/package.json b/package.json index 741d7b3d..38a3759f 100644 --- a/package.json +++ b/package.json @@ -162,6 +162,7 @@ "@mapeo/crypto": "1.0.0-alpha.10", "@mapeo/sqlite-indexer": "1.0.0-alpha.9", "@sinclair/typebox": "^0.29.6", + "@sindresorhus/merge-streams": "^4.0.0", "b4a": "^1.6.3", "bcp-47": "^2.1.0", "better-sqlite3": "^8.7.0", diff --git a/src/blob-store/entries-stream.js b/src/blob-store/entries-stream.js new file mode 100644 index 00000000..dab3408a --- /dev/null +++ b/src/blob-store/entries-stream.js @@ -0,0 +1,122 @@ +import SubEncoder from 'sub-encoder' +import mergeStreams from '@sindresorhus/merge-streams' +import { Transform } from 'node:stream' + +/** @import Hyperdrive from 'hyperdrive' */ + +/** + * We treat the return type of `createEntriesStream` as a Readable, because the + * `add` and `remove` methods should not be used outside this module. + * @typedef {import('type-fest').Tagged} EntriesStream + */ + +const keyEncoding = new SubEncoder('files', 'utf-8') +const kAddDrive = Symbol('add-drive') + +/** + * @param {EntriesStream} entriesStream + * @param {Hyperdrive} drive + */ +export function addDrive(entriesStream, drive) { + // @ts-expect-error + entriesStream[kAddDrive](drive) +} + +/** + * + * @param {Array} drives + * @param {object} opts + * @param {boolean} [opts.live=false] + * @param {[string, ...string[]]} [opts.folders] + * @returns {EntriesStream} + */ +export function createEntriesStream( + drives, + { live = false, folders = ['/'] } = {} +) { + folders = normalizeFolders(folders) + const mergedEntriesStreams = mergeStreams([]) + for (const drive of drives) { + addDrive(drive) + } + Object.defineProperty(mergedEntriesStreams, kAddDrive, { + get() { + return addDrive + }, + writable: false, + enumerable: false, + configurable: false, + }) + // @ts-expect-error + return mergedEntriesStreams + + /** @param {Hyperdrive} drive */ + function addDrive(drive) { + const bee = drive.db + // This will also include old versions of files, but it is the only way to + // get a live stream from a Hyperbee, however we currently do not support + // edits of blobs, so this should not be an issue, and the consequence is + // that old versions are downloaded too, which is acceptable. 
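+    // For reference, entries yielded downstream of this stream look roughly
+    // like this (illustrative, abbreviated — see the Hyperdrive entry types):
+    //   { type: 'put', seq: 4, key: '/photo/original/abc123',
+    //     value: { blob: { blockOffset, blockLength, byteLength }, ... } }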
+ const historyStream = bee.createHistoryStream({ + live, + // `keyEncoding` is necessary because hyperdrive stores file index data + // under the `files` sub-encoding key + keyEncoding, + }) + const filteredHistoryStream = historyStream.pipe( + new Transform({ + transform(entry, _, callback) { + if (matchesFolder(entry.key, folders)) { + callback(null, entry) + } else { + callback() + } + }, + }) + ) + mergedEntriesStreams.add(filteredHistoryStream) + } +} + +/** + * Take an array of folders, remove any folders that are subfolders of another, + * remove duplicates, and add trailing slashes + * @param {string[]} folders + * @returns {[string, ...string[]]} + */ +function normalizeFolders(folders) { + folders = folders.map(addTrailingSlash) + /** @type {Set} */ + const normalized = new Set() + for (let i = 0; i < folders.length; i++) { + const isSubfolderOfAnotherFolder = !!folders.find((folder, index) => { + if (index === i) return false + // Deduping is done by the Set, if we do it here we don't get either + if (folder === folders[i]) return true + return folders[i].startsWith(folder) + }) + if (!isSubfolderOfAnotherFolder) normalized.add(folders[i]) + } + const normalizedArray = Array.from(normalized) + // @ts-expect-error - TS should know this, but doesn't + return normalizedArray.length === 0 ? ['/'] : normalizedArray +} + +/** @param {string} path */ +function addTrailingSlash(path) { + return path.endsWith('/') ? path : `${path}/` +} + +/** + * Returns true if the path is within one of the given folders + * + * @param {string} path + * @param {string[]} folders + * @returns {boolean} + */ +function matchesFolder(path, folders) { + for (const folder of folders) { + if (path.startsWith(folder)) return true + } + return false +} From 30a552e121d523319e154ff626e6d8a6a0726b38 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Thu, 24 Oct 2024 18:04:31 +0100 Subject: [PATCH 02/25] WIP: not quite working yet --- package-lock.json | 4 +- package.json | 1 + src/blob-store/downloader.js | 230 ++++++++++++++++++ src/blob-store/entries-stream.js | 117 +++++----- src/blob-store/index.js | 42 +++- src/blob-store/live-download.js | 373 ------------------------------ src/types.ts | 8 + test/blob-store/blob-store.js | 221 +++++++++++++++++- test/blob-store/combine-states.js | 149 ------------ types/hyperbee.d.ts | 165 +++++++++++++ types/hyperdrive.d.ts | 19 +- types/unix-path-resolve.d.ts | 4 + 12 files changed, 723 insertions(+), 610 deletions(-) create mode 100644 src/blob-store/downloader.js delete mode 100644 src/blob-store/live-download.js delete mode 100644 test/blob-store/combine-states.js create mode 100644 types/hyperbee.d.ts create mode 100644 types/unix-path-resolve.d.ts diff --git a/package-lock.json b/package-lock.json index 468002f8..31b9fdac 100644 --- a/package-lock.json +++ b/package-lock.json @@ -55,6 +55,7 @@ "tiny-typed-emitter": "^2.1.0", "type-fest": "^4.5.0", "undici": "^6.13.0", + "unix-path-resolve": "^1.0.2", "varint": "^6.0.0", "yauzl-promise": "^4.0.0" }, @@ -9594,7 +9595,8 @@ }, "node_modules/unix-path-resolve": { "version": "1.0.2", - "license": "MIT" + "resolved": "https://registry.npmjs.org/unix-path-resolve/-/unix-path-resolve-1.0.2.tgz", + "integrity": "sha512-kG4g5nobBBaMnH2XbrS4sLUXEpx4nY2J3C6KAlAUcnahG2HChxSPVKWYrqEq76iTo+cyMkLUjqxGaQR2tz097Q==" }, "node_modules/uri-js": { "version": "4.4.1", diff --git a/package.json b/package.json index 38a3759f..eb7d6d0f 100644 --- a/package.json +++ b/package.json @@ -199,6 +199,7 @@ "tiny-typed-emitter": "^2.1.0", 
"type-fest": "^4.5.0", "undici": "^6.13.0", + "unix-path-resolve": "^1.0.2", "varint": "^6.0.0", "yauzl-promise": "^4.0.0" } diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js new file mode 100644 index 00000000..1b01ad25 --- /dev/null +++ b/src/blob-store/downloader.js @@ -0,0 +1,230 @@ +import { TypedEmitter } from 'tiny-typed-emitter' +import { once } from 'node:events' +import { createEntriesStream } from './entries-stream.js' +import { noop } from '../utils.js' +/** @import Hyperdrive from 'hyperdrive' */ + +/** + * @typedef {object} BlobDownloadState + * @property {number} haveCount The number of files already downloaded + * @property {number} haveBytes The bytes already downloaded + * @property {number} wantCount The number of files pending download + * @property {number} wantBytes The bytes pending download + * @property {null} error If status = 'error' then this will be an Error object + * @property {'pending' | 'downloading' | 'downloaded'} status + */ + +/** @typedef {Omit & { status: 'error', error: Error }} BlobDownloadStateError */ + +/** + * @typedef {object} BlobDownloadEvents + * @property {(state: BlobDownloadState | BlobDownloadStateError ) => void} state Emitted with the current download state whenever it changes (not emitted during initial 'checking' status) + */ + +class State { + haveCount = 0 + haveBytes = 0 + /** @type {Set<{ done(): Promise, destroy(): void }>} */ + downloads = new Set() + wantBytes = 0 + error = null + + constructor({ live = false } = {}) { + /** @type {'pending' | 'downloading' | 'downloaded'} */ + this.status = live ? 'pending' : 'downloading' + } + + /** @type {BlobDownloadState | BlobDownloadStateError} */ + get value() { + if (this.error) { + return { + haveCount: this.haveCount, + haveBytes: this.haveBytes, + wantCount: this.downloads.size, + wantBytes: this.wantBytes, + error: this.error, + status: 'error', + } + } + return { + haveCount: this.haveCount, + haveBytes: this.haveBytes, + wantCount: this.downloads.size, + wantBytes: this.wantBytes, + error: null, + status: this.status, + } + } +} + +/** + * Hyperdrive Downloader class, like drive.download() for multiple drives, but + * will download all previous versions that match the filter, and is optionally + * "live", which will download any new files from replicating peers. 
+ *
+ * @extends {TypedEmitter<BlobDownloadEvents>}
+ */
+export class Downloader extends TypedEmitter {
+  /** @type {Map<string, Hyperdrive>} */
+  #drivesById = new Map()
+  #entriesStream
+  #donePromise
+  #ac = new AbortController()
+  #state
+
+  /** @param {import('hyperdrive')} drive */
+  #addDrive = (drive) => {
+    if (drive.key) {
+      this.#drivesById.set(drive.key.toString('hex'), drive)
+      return
+    }
+    drive
+      .ready()
+      .then(() => {
+        if (!drive.key) return // should never happen
+        this.#drivesById.set(drive.key.toString('hex'), drive)
+      })
+      .catch(noop)
+  }
+
+  /**
+   * Like drive.download() but 'live', and for multiple drives
+   * @param {Array<Hyperdrive>} drives
+   * @param {import('./index.js').InternalDriveEmitter} driveEmitter
+   * @param {object} [options]
+   * @param {import('../types.js').BlobFilter} [options.filter] Filter blobs of specific types and/or sizes to download
+   * @param {boolean} [options.live=false]
+   */
+  constructor(drives, driveEmitter, { filter, live = false } = {}) {
+    super()
+    this.#state = new State({ live })
+
+    this.#entriesStream = createEntriesStream(drives, driveEmitter, {
+      live,
+      folders: filterToFolders(filter),
+    })
+
+    this.#donePromise = this.#start()
+    this.#donePromise.catch(noop)
+
+    if (!live) return
+
+    driveEmitter.on('add-drive', this.#addDrive)
+    this.#ac.signal.addEventListener(
+      'abort',
+      () => {
+        driveEmitter.off('add-drive', this.#addDrive)
+      },
+      { once: true }
+    )
+  }
+
+  async #start() {
+    for await (const entry of this.#entriesStream) {
+      this.#ac.signal.throwIfAborted()
+      const {
+        driveId,
+        value: { blob },
+      } = entry
+      const drive = this.#drivesById.get(driveId)
+      if (!drive) throw new Error('Drive not found: ' + driveId)
+      const core = await getBlobsCore(drive, { signal: this.#ac.signal })
+      await this.#processEntry(core, blob)
+    }
+  }
+
+  /**
+   * Update state and queue missing entries for download
+   *
+   * @param {import('hypercore')} core
+   * @param {{ blockOffset: number, blockLength: number, byteLength: number }} blob
+   */
+  async #processEntry(
+    core,
+    { blockOffset: start, blockLength: length, byteLength }
+  ) {
+    const end = start + length
+    const have = await core.has(start, end)
+    this.#ac.signal.throwIfAborted()
+    if (have) {
+      this.#state.haveCount++
+      this.#state.haveBytes += byteLength
+    } else {
+      this.#state.wantBytes += byteLength
+      const download = core.download({ start, end })
+      this.#state.downloads.add(download)
+      download
+        .done()
+        .then(() => {
+          this.#state.haveCount++
+          this.#state.haveBytes += byteLength
+          this.#state.wantBytes -= byteLength
+        })
+        .catch((e) => {
+          this.#state.error = e
+          this.#ac.abort(e)
+        })
+        .finally(() => {
+          this.#state.downloads.delete(download)
+          this.emit('state', this.#state.value)
+        })
+    }
+  }
+
+  done() {
+    return this.#donePromise
+  }
+
+  /**
+   * @param {Error} [reason]
+   */
+  destroy(reason) {
+    this.#ac.abort(reason)
+  }
+
+  /**
+   * @returns {BlobDownloadState | BlobDownloadStateError}
+   */
+  get state() {
+    return this.#state.value
+  }
+}
+
+/**
+ * Convert a filter to an array of folders that need to be downloaded
+ *
+ * @param {import('../types.js').BlobFilter} [filter]
+ * @returns {string[]} array of folders that match the filter
+ */
+function filterToFolders(filter) {
+  if (!filter) return ['/']
+  const folders = []
+  for (const [
+    type,
+    variants,
+  ] of /** @type {import('type-fest').Entries<import('../types.js').BlobFilter>} */ (
+    Object.entries(filter)
+  )) {
+    // De-dupe variants array
+    for (const variant of new Set(variants)) {
+      folders.push(makePath({ type, variant }))
+    }
+  }
+  return folders
+}
+
+/** @param {Pick<import('../types.js').BlobId, 'type' | 'variant'>} opts */
+function makePath({ type, variant }) { + return `/${type}/${variant}` +} + +/** + * @param {Hyperdrive} drive + * @param {{signal?: AbortSignal}} [opts] + * @returns {Promise} + */ +async function getBlobsCore(drive, { signal } = {}) { + if (drive.blobs) return drive.blobs.core + const [blobs] = await once(drive, 'blobs', { signal }) + return blobs.core +} diff --git a/src/blob-store/entries-stream.js b/src/blob-store/entries-stream.js index dab3408a..eccceaa1 100644 --- a/src/blob-store/entries-stream.js +++ b/src/blob-store/entries-stream.js @@ -1,91 +1,98 @@ import SubEncoder from 'sub-encoder' import mergeStreams from '@sindresorhus/merge-streams' import { Transform } from 'node:stream' +import unixPathResolve from 'unix-path-resolve' /** @import Hyperdrive from 'hyperdrive' */ - -/** - * We treat the return type of `createEntriesStream` as a Readable, because the - * `add` and `remove` methods should not be used outside this module. - * @typedef {import('type-fest').Tagged} EntriesStream - */ +/** @import { BlobStoreEntriesStream } from '../types.js' */ const keyEncoding = new SubEncoder('files', 'utf-8') -const kAddDrive = Symbol('add-drive') - -/** - * @param {EntriesStream} entriesStream - * @param {Hyperdrive} drive - */ -export function addDrive(entriesStream, drive) { - // @ts-expect-error - entriesStream[kAddDrive](drive) -} /** * * @param {Array} drives + * @param {import('./index.js').InternalDriveEmitter} driveEmitter * @param {object} opts * @param {boolean} [opts.live=false] - * @param {[string, ...string[]]} [opts.folders] - * @returns {EntriesStream} + * @param {readonly string[]} [opts.folders] + * @returns {BlobStoreEntriesStream} */ export function createEntriesStream( drives, + driveEmitter, { live = false, folders = ['/'] } = {} ) { folders = normalizeFolders(folders) - const mergedEntriesStreams = mergeStreams([]) - for (const drive of drives) { - addDrive(drive) + const mergedEntriesStreams = mergeStreams( + drives.map((drive) => getFilteredHistoryStream(drive.db, { folders, live })) + ) + if (live) { + driveEmitter.on('add-drive', addDrive) + mergedEntriesStreams.on('close', () => { + driveEmitter.off('add-drive', addDrive) + }) } - Object.defineProperty(mergedEntriesStreams, kAddDrive, { - get() { - return addDrive - }, - writable: false, - enumerable: false, - configurable: false, - }) // @ts-expect-error return mergedEntriesStreams /** @param {Hyperdrive} drive */ function addDrive(drive) { - const bee = drive.db - // This will also include old versions of files, but it is the only way to - // get a live stream from a Hyperbee, however we currently do not support - // edits of blobs, so this should not be an issue, and the consequence is - // that old versions are downloaded too, which is acceptable. 
- const historyStream = bee.createHistoryStream({ - live, - // `keyEncoding` is necessary because hyperdrive stores file index data - // under the `files` sub-encoding key - keyEncoding, - }) - const filteredHistoryStream = historyStream.pipe( - new Transform({ - transform(entry, _, callback) { - if (matchesFolder(entry.key, folders)) { - callback(null, entry) - } else { - callback() - } - }, - }) + mergedEntriesStreams.add( + getFilteredHistoryStream(drive.db, { folders, live }) ) - mergedEntriesStreams.add(filteredHistoryStream) } } +/** + * + * @param {import('hyperbee')} bee + * @param {object} opts + * @param {boolean} opts.live + * @param {readonly string[]} opts.folders + */ +function getFilteredHistoryStream(bee, { folders, live }) { + let driveId = bee.core.discoveryKey?.toString('hex') + // This will also include old versions of files, but it is the only way to + // get a live stream from a Hyperbee, however we currently do not support + // edits of blobs, so this should not be an issue, and the consequence is + // that old versions are downloaded too, which is acceptable. + const historyStream = bee.createHistoryStream({ + live, + // `keyEncoding` is necessary because hyperdrive stores file index data + // under the `files` sub-encoding key + keyEncoding, + }) + return historyStream.pipe( + new Transform({ + objectMode: true, + /** @param {import('hyperdrive').HyperdriveEntry} entry */ + transform(entry, _, callback) { + if (matchesFolder(entry.key, folders)) { + // Unnecessary performance optimization to only call toString() once + // bee.discoveryKey will always be defined by the time it starts + // streaming, but could be null when the instance is first created. + driveId = driveId || bee.core.discoveryKey?.toString('hex') + callback(null, { ...entry, driveId }) + } else { + callback() + } + }, + }) + ) +} + /** * Take an array of folders, remove any folders that are subfolders of another, * remove duplicates, and add trailing slashes - * @param {string[]} folders - * @returns {[string, ...string[]]} + * @param {readonly string[]} folders + * @returns {readonly [string, ...string[]]} */ function normalizeFolders(folders) { - folders = folders.map(addTrailingSlash) + // 1. Add trailing slashes so that path.startsWith(folder) does not match a folder whose name starts with this folder. + // 2. 
Standardize path names as done internally in Hyperdrive: https://github.com/holepunchto/hyperdrive/blob/5ee0164fb39eadc0a073f7926800f81117a4c52e/index.js#L685 + folders = folders.map((folder) => + addTrailingSlash(unixPathResolve('/', folder)) + ) /** @type {Set} */ const normalized = new Set() for (let i = 0; i < folders.length; i++) { @@ -111,7 +118,7 @@ function addTrailingSlash(path) { * Returns true if the path is within one of the given folders * * @param {string} path - * @param {string[]} folders + * @param {readonly string[]} folders * @returns {boolean} */ function matchesFolder(path, folders) { diff --git a/src/blob-store/index.js b/src/blob-store/index.js index c1a1761f..cb6d83d5 100644 --- a/src/blob-store/index.js +++ b/src/blob-store/index.js @@ -3,12 +3,13 @@ import b4a from 'b4a' import util from 'node:util' import { discoveryKey } from 'hypercore-crypto' import { TypedEmitter } from 'tiny-typed-emitter' -import { LiveDownload } from './live-download.js' +import { Downloader } from './downloader.js' +import { createEntriesStream } from './entries-stream.js' /** @import { JsonObject } from 'type-fest' */ /** @import { Readable as NodeReadable } from 'node:stream' */ /** @import { Readable as StreamxReadable, Writable } from 'streamx' */ -/** @import { BlobId } from '../types.js' */ -/** @import { BlobDownloadEvents } from './live-download.js' */ +/** @import { BlobFilter, BlobId } from '../types.js' */ +/** @import { BlobDownloadEvents } from './downloader.js' */ /** * @internal @@ -123,15 +124,16 @@ export class BlobStore { * If no filter is specified, all blobs will be downloaded. If a filter is * specified, then _only_ blobs that match the filter will be downloaded. * - * @param {import('../types.js').BlobFilter} [filter] Filter blob types and/or variants to download. Filter is { [BlobType]: BlobVariants[] }. At least one blob variant must be specified for each blob type. - * @param {object} options - * @param {AbortSignal} [options.signal] Optional AbortSignal to cancel in-progress download - * @returns {TypedEmitter} + * @param {object} [options] + * @param {import('../types.js').BlobFilter} [options.filter] Filter blob types and/or variants to download. Filter is { [BlobType]: BlobVariants[] }. At least one blob variant must be specified for each blob type. + * @param {boolean} [options.live=false] Set to `true` for a downloader that never ends, and will continue downloading any new data that becomes available. + * @returns {Downloader} */ - download(filter, { signal } = {}) { - return new LiveDownload(this.#hyperdrives.values(), this.#driveEmitter, { + download({ filter, live = false } = {}) { + const drives = Array.from(this.#hyperdrives.values()) + return new Downloader(drives, this.#driveEmitter, { filter, - signal, + live, }) } @@ -154,6 +156,24 @@ export class BlobStore { return drive.createReadStream(path, options) } + /** + * This is a low-level method to create a stream of entries from all drives. + * It includes entries for unknown blob types and variants. 
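+   *
+   * Example (illustrative): stream only photo entries and log each key:
+   *
+   *     const entries = blobStore.createEntriesReadStream({ folders: ['/photo'] })
+   *     for await (const entry of entries) console.log(entry.driveId, entry.key)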
+ * + * @param {object} opts + * @param {boolean} [opts.live=false] Set to `true` to get a live stream of entries + * @param {readonly string[]} [opts.folders] Filter entries to only those in these folders + * @returns + */ + createEntriesReadStream({ live = false, folders } = {}) { + const drives = Array.from(this.#hyperdrives.values()) + const entriesStream = createEntriesStream(drives, this.#driveEmitter, { + live, + folders, + }) + return entriesStream + } + /** * Optimization for creating the blobs read stream when you have * previously read the entry from Hyperdrive using `drive.entry` @@ -163,7 +183,7 @@ export class BlobStore { * @param {boolean} [options.wait=false] Set to `true` to wait for a blob to download, otherwise will throw if blob is not available locally * @returns {Promise} */ - async createEntryReadStream(driveId, entry, options = { wait: false }) { + async createReadStreamFromEntry(driveId, entry, options = { wait: false }) { const drive = this.#getDrive(driveId) const blobs = await drive.getBlobs() diff --git a/src/blob-store/live-download.js b/src/blob-store/live-download.js deleted file mode 100644 index 0b8ac51c..00000000 --- a/src/blob-store/live-download.js +++ /dev/null @@ -1,373 +0,0 @@ -import { TypedEmitter } from 'tiny-typed-emitter' -import { once } from 'node:events' -import SubEncoder from 'sub-encoder' - -const keyEncoding = new SubEncoder('files', 'utf-8') - -/** - * @typedef {object} BlobDownloadState - * @property {number} haveCount The number of files already downloaded - * @property {number} haveBytes The bytes already downloaded - * @property {number} wantCount The number of files pending download - * @property {number} wantBytes The bytes pending download - * @property {null} error If status = 'error' then this will be an Error object - * @property {'checking' | 'downloading' | 'downloaded' | 'aborted'} status - */ - -/** @typedef {Omit & { status: 'error', error: Error }} BlobDownloadStateError */ - -/** - * @typedef {object} BlobDownloadEvents - * @property {(state: BlobDownloadState | BlobDownloadStateError ) => void} state Emitted with the current download state whenever it changes (not emitted during initial 'checking' status) - */ - -/** - * LiveDownload class - * @extends {TypedEmitter} - */ -export class LiveDownload extends TypedEmitter { - /** @type {Set} */ - #driveLiveDownloads = new Set() - #signal - - /** - * Like drive.download() but 'live', and for multiple drives - * @param {Iterable} drives - * @param {import('./index.js').InternalDriveEmitter} emitter - * @param {object} options - * @param {import('../types.js').BlobFilter} [options.filter] Filter blobs of specific types and/or sizes to download - * @param {AbortSignal} [options.signal] - */ - constructor(drives, emitter, { filter, signal }) { - super() - this.#signal = signal - - const emitState = () => { - this.emit('state', this.state) - } - - /** @param {import('hyperdrive')} drive */ - const addDrive = (drive) => { - const download = new DriveLiveDownload(drive, { - filter, - signal, - }) - this.#driveLiveDownloads.add(download) - download.on('state', emitState) - } - - for (const drive of drives) addDrive(drive) - emitter.on('add-drive', addDrive) - - signal?.addEventListener( - 'abort', - () => { - emitter.off('add-drive', addDrive) - for (const download of this.#driveLiveDownloads) { - download.off('state', emitState) - } - }, - { once: true } - ) - } - - /** - * @returns {BlobDownloadState | BlobDownloadStateError} - */ - get state() { - return 
combineStates(this.#driveLiveDownloads, { signal: this.#signal }) - } -} - -/** - * LiveDownload class - * @extends {TypedEmitter} - */ -export class DriveLiveDownload extends TypedEmitter { - #haveCount = 0 - #haveBytes = 0 - #wantBytes = 0 - #initialCheck = true - #drive - #folders - /** @type {Set<{ done(): Promise, destroy(): void }>} */ - #downloads = new Set() - /** @type {Error | null} */ - #error = null - #signal - - /** - * Like drive.download() but 'live', - * @param {import('hyperdrive')} drive - * @param {object} options - * @param {import('../types.js').BlobFilter} [options.filter] Filter blobs of specific types and/or sizes to download - * @param {AbortSignal} [options.signal] - */ - constructor(drive, { filter, signal } = {}) { - super() - this.#drive = drive - this.#folders = filterToFolders(filter) - this.#signal = signal - if (signal && !signal.aborted) { - signal.addEventListener( - 'abort', - () => { - for (const download of this.#downloads) download.destroy() - this.#downloads.clear() - this.emit('state', this.state) - }, - { once: true } - ) - } - this.#start().catch(this.#handleError.bind(this)) - } - - /** - * @returns {BlobDownloadState | BlobDownloadStateError} - */ - get state() { - if (this.#error) { - return { - haveCount: this.#haveCount, - haveBytes: this.#haveBytes, - wantCount: this.#downloads.size, - wantBytes: this.#wantBytes, - error: this.#error, - status: 'error', - } - } - return { - haveCount: this.#haveCount, - haveBytes: this.#haveBytes, - wantCount: this.#downloads.size, - wantBytes: this.#wantBytes, - error: null, - status: this.#signal?.aborted - ? 'aborted' - : this.#initialCheck - ? 'checking' - : this.#downloads.size > 0 - ? 'downloading' - : 'downloaded', - } - } - - async #start() { - const blobsCore = await this.#getBlobsCore() - /* c8 ignore next */ - if (this.#signal?.aborted || !blobsCore) return // Can't get here in tests - let seq = 0 - - for (const folder of this.#folders) { - // Don't emit state during initial iteration of existing data, since this is - // likely fast and not useful UX feedback - const entryStream = this.#drive.list(folder, { recursive: true }) - if (this.#signal) { - this.#signal.addEventListener('abort', () => entryStream.destroy(), { - once: true, - }) - } - for await (const entry of entryStream) { - if (this.#signal?.aborted) return - seq = Math.max(seq, entry.seq) - const { blob } = entry.value - if (!blob) continue - await this.#processEntry(blobsCore, blob) - } - if (this.#signal?.aborted) return - } - - this.#initialCheck = false - this.emit('state', this.state) - - const bee = this.#drive.db - // This will also download old versions of files, but it is the only way to - // get a live stream from a Hyperbee, however we currently do not support - // edits of blobs, so this should not be an issue. `keyEncoding` is - // necessary because hyperdrive stores file index data under the `files` - // sub-encoding key - const historyStream = bee.createHistoryStream({ - live: true, - gt: seq, - keyEncoding, - }) - if (this.#signal) { - this.#signal.addEventListener('abort', () => historyStream.destroy(), { - once: true, - }) - } - for await (const entry of historyStream) { - if (this.#signal?.aborted) return - const { blob } = entry.value - if (!blob) continue - if (!matchesFolder(entry.key, this.#folders)) continue - // TODO: consider cancelling downloads when a delete entry is found? - // Probably not worth the extra work. 
- if (entry.type !== 'put') continue - const wasDownloaded = this.state.status === 'downloaded' - await this.#processEntry(blobsCore, blob) - if (wasDownloaded && this.state.status === 'downloading') { - // State has changed, so emit - this.emit('state', this.state) - } - } - /* c8 ignore next 2 */ - // Could possibly reach here if aborted after check in loop, hard to test - this.emit('state', this.state) - } - - /** - * If a Hyperdrive has been added by its key and has never replicated, then - * drive.getBlobs() will not resolve until replication starts. Since we do not - * want the downloader to remain in the "checking" state forever, we catch - * this case and update the state before waiting for the hyperdrive hyperblobs - * instance. This also makes waiting for the blobs instance cancellable. - * - * @returns {Promise} - */ - async #getBlobsCore() { - if (this.#drive.blobs) return this.#drive.blobs.core - await this.#drive.ready() - await this.#drive.core.update({ wait: true }) - - // If no peers at this stage, we are not going to be able to get the blobs - // until a peer appears, so consider this state "downloaded", because - // otherwise this will just hang as "checking" - if (!this.#drive.core.peers.length) { - this.#initialCheck = false - this.emit('state', this.state) - } - try { - const [blobs] = await once(this.#drive, 'blobs', { signal: this.#signal }) - return blobs.core - } catch (e) { - if (e instanceof Error && e.name === 'AbortError') return - throw e - } - } - - /** @param {Error} e */ - #handleError(e) { - this.#error = e - this.emit('state', this.state) - } - - /** - * Update state and queue missing entries for download - * - * @param {import('hypercore')} core - * @param {{ blockOffset: number, blockLength: number, byteLength: number }} blob - */ - async #processEntry( - core, - { blockOffset: start, blockLength: length, byteLength } - ) { - const end = start + length - const have = await core.has(start, end) - if (have) { - this.#haveCount++ - this.#haveBytes += byteLength - } else { - this.#wantBytes += byteLength - const download = core.download({ start, end }) - this.#downloads.add(download) - download - .done() - .then(() => { - this.#downloads.delete(download) - this.#haveCount++ - this.#haveBytes += byteLength - this.#wantBytes -= byteLength - this.emit('state', this.state) - }) - .catch(this.#handleError.bind(this)) - } - } -} - -/** - * Reduce multiple states into one. Factored out for unit testing because I - * don't trust my coding. Probably a smarter way to do this, but this works. 
- * - * @param {Iterable<{ state: BlobDownloadState | BlobDownloadStateError }>} liveDownloads - * @param {{ signal?: AbortSignal }} options - * @returns {BlobDownloadState | BlobDownloadStateError} - */ -export function combineStates(liveDownloads, { signal } = {}) { - /** @type {BlobDownloadState | BlobDownloadStateError} */ - let combinedState = { - haveCount: 0, - haveBytes: 0, - wantCount: 0, - wantBytes: 0, - error: null, - status: 'downloaded', - } - for (const { state } of liveDownloads) { - combinedState.haveCount += state.haveCount - combinedState.haveBytes += state.haveBytes - combinedState.wantCount += state.wantCount - combinedState.wantBytes += state.wantBytes - if (state.status === combinedState.status) continue - if (state.status === 'error') { - combinedState = { ...combinedState, error: state.error, status: 'error' } - } else if ( - state.status === 'downloading' && - combinedState.status === 'downloaded' - ) { - combinedState = { ...combinedState, status: 'downloading' } - } else if ( - state.status === 'checking' && - (combinedState.status === 'downloaded' || - combinedState.status === 'downloading') - ) { - combinedState = { ...combinedState, status: 'checking' } - } - } - if (signal?.aborted) { - combinedState.status = 'aborted' - } - return combinedState -} - -/** - * Convert a filter to an array of folders that need to be downloaded - * - * @param {import('../types.js').BlobFilter} [filter] - * @returns {string[]} array of folders that match the filter - */ -function filterToFolders(filter) { - if (!filter) return ['/'] - const folders = [] - for (const [ - type, - variants, - ] of /** @type {import('type-fest').Entries} */ ( - Object.entries(filter) - )) { - // De-dupe variants array - for (const variant of new Set(variants)) { - folders.push(makePath({ type, variant })) - } - } - return folders -} - -/** - * Returns true if the path is within one of the given folders - * - * @param {string} path - * @param {string[]} folders - * @returns {boolean} - */ -function matchesFolder(path, folders) { - for (const folder of folders) { - if (path.startsWith(folder)) return true - } - return false -} - -/** @param {Pick} opts */ -function makePath({ type, variant }) { - return `/${type}/${variant}` -} diff --git a/src/types.ts b/src/types.ts index 41bc0640..17db3d4a 100644 --- a/src/types.ts +++ b/src/types.ts @@ -14,6 +14,8 @@ import { Duplex } from 'streamx' import RandomAccessStorage from 'random-access-storage' import { DefaultListener, ListenerSignature } from 'tiny-typed-emitter' import type { NAMESPACES } from './constants.js' +import type { Readable } from 'stream' +import type { HyperdriveEntry } from 'hyperdrive' export type Namespace = (typeof NAMESPACES)[number] @@ -146,3 +148,9 @@ export type DefaultEmitterEvents< newListener: (event: keyof L, listener: L[keyof L]) => void removeListener: (event: keyof L, listener: L[keyof L]) => void } + +export type BlobStoreEntriesStream = Readable & { + [Symbol.asyncIterator](): AsyncIterableIterator< + HyperdriveEntry & { driveId: string } + > +} diff --git a/test/blob-store/blob-store.js b/test/blob-store/blob-store.js index c6199271..73bcef86 100644 --- a/test/blob-store/blob-store.js +++ b/test/blob-store/blob-store.js @@ -10,10 +10,14 @@ import { createCoreManager, waitForCores, } from '../helpers/core-manager.js' -import { BlobStore } from '../../src/blob-store/index.js' +import { + BlobStore, + SUPPORTED_BLOB_VARIANTS, +} from '../../src/blob-store/index.js' import { setTimeout } from 'node:timers/promises' import { 
concat } from '../helpers/blob-store.js' import { discoveryKey } from 'hypercore-crypto' +import { setTimeout as delay } from 'node:timers/promises' // Test with buffers that are 3 times the default blockSize for hyperblobs const TEST_BUF_SIZE = 3 * 64 * 1024 @@ -288,7 +292,7 @@ test('blobStore.writerDriveId', async () => { // Tests: // A) Downloads from peers connected when download() is first called // B) Downloads from peers connected after download() is first called -test('live download', async function () { +test.skip('live download', async function () { const projectKey = randomBytes(32) const { blobStore: bs1, coreManager: cm1 } = testenv({ projectKey }) const { blobStore: bs2, coreManager: cm2 } = testenv({ projectKey }) @@ -337,7 +341,7 @@ test('live download', async function () { ) }) -test('sparse live download', async function () { +test.skip('sparse live download', async function () { const projectKey = randomBytes(32) const { blobStore: bs1, coreManager: cm1 } = testenv({ projectKey }) const { blobStore: bs2, coreManager: cm2 } = testenv({ projectKey }) @@ -367,7 +371,9 @@ test('sparse live download', async function () { const { destroy } = replicate(cm1, cm2) - const liveDownload = bs2.download({ photo: ['original', 'preview'] }) + const liveDownload = bs2.download({ + filter: { photo: ['original', 'preview'] }, + }) await downloaded(liveDownload) await destroy() @@ -388,7 +394,7 @@ test('sparse live download', async function () { ) }) -test('cancelled live download', async function () { +test.skip('cancelled live download', async function () { const projectKey = randomBytes(32) const { blobStore: bs1, coreManager: cm1 } = testenv({ projectKey }) const { blobStore: bs2, coreManager: cm2 } = testenv({ projectKey }) @@ -412,12 +418,11 @@ test('cancelled live download', async function () { // STEP 2: Replicate CM1 with CM3 const { destroy: destroy1 } = replicate(cm1, cm3) // STEP 3: Start live download to CM3 - const ac = new AbortController() - const liveDownload = bs3.download(undefined, { signal: ac.signal }) + const liveDownload = bs3.download() // STEP 4: Wait for blobs to be downloaded await downloaded(liveDownload) // STEP 5: Cancel download - ac.abort() + liveDownload.destroy() // STEP 6: Replicate CM2 with CM3 const { destroy: destroy2 } = replicate(cm2, cm3) // STEP 7: Write a blob to CM2 @@ -469,7 +474,7 @@ test('blobStore.getEntryReadStream(driveId, entry)', async () => { assert(entry) const buf = await concat( - await blobStore.createEntryReadStream(driveId, entry) + await blobStore.createReadStreamFromEntry(driveId, entry) ) assert.deepEqual(buf, diskbuf, 'should be equal') @@ -493,13 +498,207 @@ test('blobStore.getEntryReadStream(driveId, entry) should not wait', async () => await assert.rejects( async () => { - const stream = await blobStore.createEntryReadStream(driveId, entry) + const stream = await blobStore.createReadStreamFromEntry(driveId, entry) await concat(stream) }, { message: 'Block not available' } ) }) +test('blobStore.createEntriesReadStream({ live: false })', async (t) => { + const { blobStore } = testenv() + const blobIds = Array.from({ length: 50 }, randomBlobId) + + // Add some blobs with unknown variants and types + blobIds.push( + { + // @ts-expect-error + type: 'unknownType', + variant: 'original', + name: randomBytes(8).toString('hex'), + }, + { + type: 'photo', + variant: 'unknownVariant', + name: randomBytes(8).toString('hex'), + }, + { + type: 'photoExtra', + variant: 'original', + name: randomBytes(8).toString('hex'), + } + ) + for 
(const blobId of blobIds) { + await blobStore.put(blobId, Buffer.from([0])) + } + const inputKeys = blobIds.map(blobIdToKey) + + /** @param {import('../../src/types.js').BlobStoreEntriesStream} entriesStream */ + async function getKeys(entriesStream) { + const keys = new Set() + for await (const entry of entriesStream) { + keys.add(entry.key) + } + return keys + } + + await t.test('no folders filter, returns everything', async () => { + const expectedKeys = new Set(inputKeys) + const entriesStream = blobStore.createEntriesReadStream() + const keys = await getKeys(entriesStream) + assert.deepEqual(keys, expectedKeys, 'returns all keys') + }) + + await t.test('[] folders filter, returns everything', async () => { + const expectedKeys = new Set(inputKeys) + const entriesStream = blobStore.createEntriesReadStream({ folders: [] }) + const keys = await getKeys(entriesStream) + assert.deepEqual(keys, expectedKeys, 'returns all keys') + }) + + await t.test('single folders filter', async () => { + const folders = ['/photo'] + const unexpectedKeys = new Set( + inputKeys.filter((key) => key.startsWith(folders[0])) + ) + const expectedKeys = new Set( + inputKeys.filter((key) => key.startsWith(addTrailingSlash(folders[0]))) + ) + const entriesStream = blobStore.createEntriesReadStream({ folders }) + const keys = await getKeys(entriesStream) + assert.notDeepEqual( + keys, + unexpectedKeys, + 'does not return keys matched without trailing slash' + ) + assert.deepEqual(keys, expectedKeys, 'returns expected keys') + }) + + await t.test('multiple folders filter, no subfolder', async () => { + const folders = ['/video/original', '/photo/preview'] + const expectedKeys = new Set( + inputKeys.filter((key) => + folders.find((folder) => key.startsWith(addTrailingSlash(folder))) + ) + ) + const entriesStream = blobStore.createEntriesReadStream({ folders }) + const keys = await getKeys(entriesStream) + assert.deepEqual(keys, expectedKeys, 'returns expected keys') + }) + + await t.test('multiple folders filter, subfolder', async () => { + const folders = ['/photo/original', '/photo'] + const expectedKeys = new Set( + inputKeys.filter((key) => key.startsWith(addTrailingSlash(folders[1]))) + ) + const entriesStream = blobStore.createEntriesReadStream({ folders }) + const keys = await getKeys(entriesStream) + assert.deepEqual(keys, expectedKeys, 'returns expected keys') + }) + + await t.test('folders filter with trailing slashes', async () => { + const folders = ['/photo/original/'] + const expectedKeys = new Set( + inputKeys.filter((key) => key.startsWith(addTrailingSlash(folders[0]))) + ) + const entriesStream = blobStore.createEntriesReadStream({ folders }) + const keys = await getKeys(entriesStream) + assert.deepEqual(keys, expectedKeys, 'returns expected keys') + }) + + await t.test('folders filter without leading slash', async () => { + const folders = ['photo/original'] + const expectedKeys = new Set( + inputKeys.filter((key) => key.startsWith('/photo/original/')) + ) + const entriesStream = blobStore.createEntriesReadStream({ folders }) + const keys = await getKeys(entriesStream) + assert.deepEqual(keys, expectedKeys, 'returns expected keys') + }) + + await t.test('folders filter windows separator', async () => { + const folders = ['C:\\photo\\original'] + const expectedKeys = new Set( + inputKeys.filter((key) => key.startsWith('/photo/original/')) + ) + const entriesStream = blobStore.createEntriesReadStream({ folders }) + const keys = await getKeys(entriesStream) + assert.deepEqual(keys, expectedKeys, 
'returns expected keys') + }) + + await t.test('folders filter unknown blob type & variant', async () => { + const folders = ['/unknownType', '/photo/unknownVariant'] + const entriesStream = blobStore.createEntriesReadStream({ folders }) + const keys = await getKeys(entriesStream) + assert.deepEqual(keys.size, 2) + }) +}) + +test('blobStore.createEntriesReadStream({ live: true })', async () => { + const projectKey = randomBytes(32) + const { blobStore: bs1, coreManager: cm1 } = testenv({ projectKey }) + const { blobStore: bs2, coreManager: cm2 } = testenv({ projectKey }) + const { blobStore: bs3, coreManager: cm3 } = testenv({ projectKey }) + + const blob1 = randomBytes(TEST_BUF_SIZE) + const blob1Id = /** @type {const} */ ({ + type: 'photo', + variant: 'original', + name: 'blob1', + }) + const blob2 = randomBytes(TEST_BUF_SIZE) + const blob2Id = /** @type {const} */ ({ + type: 'photo', + variant: 'original', + name: 'blob2', + }) + const entries = [] + + // STEP 1: Write a blob to CM1 + await bs1.put(blob1Id, blob1) + // STEP 2: Replicate CM1 with CM3 + const { destroy: destroy1 } = replicate(cm1, cm3) + // STEP 3: Start live entries stream from CM3 + const entriesStream = bs3.createEntriesReadStream({ live: true }) + entriesStream.on('data', (entry) => entries.push(entry)) + // STEP 4: Wait for replication + await delay(200) + assert.equal(entries.length, 1, 'entry from replicated blobStore') + // STEP 5: Replicate CM2 with CM3 + const { destroy: destroy2 } = replicate(cm2, cm3) + // STEP 6: Write a blob to CM2 + await bs2.put(blob2Id, blob2) + // STEP 7: Wait for replication + await delay(200) + // STEP 8: destroy all the replication streams + await Promise.all([destroy1(), destroy2()]) + + assert.equal(entries.length, 2, '2 entries from replicated blobStore') +}) + +/** @returns {import('../../src/types.js').BlobId} */ +function randomBlobId() { + const types = /** @type {import('../../src/types.js').BlobType[]} */ ( + Object.keys(SUPPORTED_BLOB_VARIANTS) + ) + const type = types[Math.floor(Math.random() * types.length)] + const variant = + SUPPORTED_BLOB_VARIANTS[type][ + Math.floor(Math.random() * SUPPORTED_BLOB_VARIANTS[type].length) + ] + // @ts-expect-error + return { type, variant, name: randomBytes(8).toString('hex') } +} + +/** @param {import('../../src/types.js').BlobId} blobId */ +function blobIdToKey({ name, type, variant }) { + return `/${type}/${variant}/${name}` +} +/** @param {string} path */ +function addTrailingSlash(path) { + return path.endsWith('/') ? 
path : `${path}/` +} + /** * @param {Parameters} args */ @@ -512,7 +711,7 @@ function testenv(...args) { /** * Resolve when liveDownload status is 'downloaded' * - * @param {ReturnType} liveDownload + * @param {import('../../src/blob-store/downloader.js').Downloader} liveDownload * @returns {Promise} */ async function downloaded(liveDownload) { diff --git a/test/blob-store/combine-states.js b/test/blob-store/combine-states.js deleted file mode 100644 index 119d4c82..00000000 --- a/test/blob-store/combine-states.js +++ /dev/null @@ -1,149 +0,0 @@ -import { combineStates } from '../../src/blob-store/live-download.js' -import test from 'node:test' -import assert from 'node:assert/strict' - -const partial = { - haveCount: 0, - haveBytes: 0, - wantCount: 0, - wantBytes: 0, - error: null, -} - -const fixtures = /** @type {const} */ ([ - { - statuses: ['checking', 'downloading', 'downloaded'], - expected: 'checking', - }, - { - statuses: ['checking', 'downloading', 'downloading'], - expected: 'checking', - }, - { - statuses: ['downloading', 'downloading', 'downloaded'], - expected: 'downloading', - }, - { - statuses: ['downloaded', 'downloaded', 'downloaded'], - expected: 'downloaded', - }, - { - statuses: ['checking', 'checking', 'checking'], - expected: 'checking', - }, -]) - -test('expected combined state, no error or abort', () => { - for (const { statuses, expected } of fixtures) { - const inputs = statuses.map((status) => ({ state: { ...partial, status } })) - const expectedState = { ...partial, status: expected } - for (const permuted of permute(inputs)) { - assert.deepEqual(combineStates(permuted), expectedState) - } - } -}) - -test('expected combined state, with error', () => { - for (const { statuses } of fixtures) { - const inputs = [ - ...statuses.map((status) => ({ state: { ...partial, status } })), - { - state: { - ...partial, - error: new Error(), - status: /** @type {const} */ ('error'), - }, - }, - ] - const expectedState = { ...partial, error: new Error(), status: 'error' } - for (const permuted of permute(inputs)) { - assert.deepEqual(combineStates(permuted), expectedState) - } - } -}) - -test('expected combined state, with abort', () => { - const controller = new AbortController() - controller.abort() - const { signal } = controller - for (const { statuses } of fixtures) { - const inputs = statuses.map((status) => ({ state: { ...partial, status } })) - const expectedState = { ...partial, status: 'aborted' } - for (const permuted of permute(inputs)) { - assert.deepEqual(combineStates(permuted, { signal }), expectedState) - } - } -}) - -test('arithmetic test', () => { - const counts = [ - [1, 2, 3, 4], - [1, 2, 3, 4], - [1, 2, 3, 4], - ] - const expected = { - haveCount: 3, - haveBytes: 6, - wantCount: 9, - wantBytes: 12, - error: null, - status: 'downloaded', - } - const inputs = counts.map(([haveCount, haveBytes, wantCount, wantBytes]) => { - return { - state: { - haveCount, - haveBytes, - wantCount, - wantBytes, - error: null, - status: /** @type {const} */ ('downloaded'), - }, - } - }) - assert.deepEqual(combineStates(inputs), expected) -}) - -/** - * Returns an iterator of all permutations of the given array. - * - * Implements [Heap's algorithm][0]. - * - * [0]: https://en.wikipedia.org/wiki/Heap%27s_algorithm - * - * @template T - * @param {ReadonlyArray} arr - * @returns {IterableIterator>} - */ -function* permute(arr) { - const c = Array(arr.length).fill(0) - - yield arr - - let i = 1 - while (i < arr.length) { - if (c[i] < i) { - arr = swapping(arr, i % 2 ? 
c[i] : 0, i)
-      yield arr
-      c[i] += 1
-      i = 1
-    } else {
-      c[i] = 0
-      i += 1
-    }
-  }
-}
-
-/**
- * @template T
- * @param {ReadonlyArray<T>} arr
- * @param {number} index1
- * @param {number} index2
- * @returns {ReadonlyArray<T>}
- */
-function swapping(arr, index1, index2) {
-  const result = arr.slice()
-  result[index1] = arr[index2]
-  result[index2] = arr[index1]
-  return result
-}
diff --git a/types/hyperbee.d.ts b/types/hyperbee.d.ts
new file mode 100644
index 00000000..3f8ca806
--- /dev/null
+++ b/types/hyperbee.d.ts
@@ -0,0 +1,165 @@
+declare module 'hyperbee' {
+  import type { TypedEmitter } from 'tiny-typed-emitter'
+  import Hypercore from 'hypercore'
+  import { EventEmitter } from 'events'
+  import { Readable } from 'streamx'
+
+  type Encoding<T> = 'binary' | 'utf-8' | 'ascii' | 'json' | AbstractEncoding<T>
+
+  declare namespace Hyperbee {
+    interface HyperbeeOptions<T = any> {
+      keyEncoding?: Encoding<T>
+      valueEncoding?: Encoding<T>
+    }
+
+    interface HyperbeeEntry<T = any> {
+      seq: number
+      key: string
+      value: T
+    }
+
+    interface PutOptions<T = any> {
+      cas?: (prev: HyperbeeEntry<T>, next: HyperbeeEntry<T>) => boolean
+    }
+
+    interface DelOptions<T = any> {
+      cas?: (prev: T) => boolean
+    }
+
+    interface ReadStreamRange {
+      gt?: string
+      gte?: string
+      lt?: string
+      lte?: string
+    }
+
+    interface ReadStreamOptions {
+      reverse?: boolean
+      limit?: number
+    }
+
+    interface HistoryStreamOptions extends ReadStreamOptions {
+      live?: boolean
+      reverse?: boolean
+      gte?: number
+      gt?: number
+      lte?: number
+      lt?: number
+      // These options missing from the docs
+      keyEncoding?: Encoding<any>
+      valueEncoding?: Encoding<any>
+      encoding?: Encoding<any>
+    }
+
+    interface DiffStreamEntry<T = any> {
+      left: T
+      right: T
+    }
+
+    interface DiffStreamOptions extends ReadStreamOptions {}
+
+    interface GetAndWatchOptions {
+      keyEncoding?: 'binary' | 'utf-8' | 'ascii' | 'json' | AbstractEncoding<any>
+      valueEncoding?: 'binary' | 'utf-8' | 'ascii' | 'json' | AbstractEncoding<any>
+    }
+
+    interface SubDatabaseOptions extends HyperbeeOptions<any> {
+      sep?: Buffer
+    }
+
+    interface HeaderOptions {}
+  }
+
+  class Hyperbee<T = any> {
+    constructor(core: Hypercore, options?: Hyperbee.HyperbeeOptions<T>)
+
+    ready(): Promise<void>
+    close(): Promise<void>
+
+    readonly core: Hypercore
+    readonly version: number
+    // Below are not yet implemented on the version of hyperbee we're using
+    // readonly id: string
+    // readonly key: null | Buffer
+    // readonly discoveryKey: null | Buffer
+    // readonly writable: boolean
+    // readonly readable: boolean
+
+    put(
+      key: string,
+      value?: any,
+      options?: Hyperbee.PutOptions<T>
+    ): Promise<void>
+    del(key: string, options?: Hyperbee.DelOptions<T>): Promise<void>
+    get(key: string): Promise<Hyperbee.HyperbeeEntry<T> | null>
+    getBySeq(
+      seq: number,
+      options?: any
+    ): Promise<Omit<Hyperbee.HyperbeeEntry<T>, 'seq'> | null>
+
+    batch(): HyperbeeBatch<T>
+    replicate(isInitiatorOrStream: any): Readable
+    createReadStream(
+      range?: Hyperbee.ReadStreamRange,
+      options?: Hyperbee.ReadStreamOptions
+    ): Readable<Hyperbee.HyperbeeEntry<T>>
+    peek(
+      range?: Hyperbee.ReadStreamRange,
+      options?: Hyperbee.ReadStreamOptions
+    ): Promise<Hyperbee.HyperbeeEntry<T> | null>
+    createHistoryStream(options?: Hyperbee.HistoryStreamOptions): Readable<
+      Hyperbee.HyperbeeEntry<T> & {
+        type: 'put' | 'del'
+      }
+    >
+    createDiffStream(
+      otherVersion: number,
+      options?: Hyperbee.DiffStreamOptions
+    ): Readable<Hyperbee.DiffStreamEntry<T>>
+
+    getAndWatch(
+      key: string,
+      options?: Hyperbee.GetAndWatchOptions
+    ): Promise<EntryWatcher<T>>
+    watch(
+      range?: Hyperbee.ReadStreamRange
+    ): AsyncIterable<[any, any]> & { close: () => Promise<void> }
+
+    checkout(version: number): Hyperbee
+    snapshot(): Hyperbee
+
+    sub(prefix: string, options?: Hyperbee.SubDatabaseOptions): Hyperbee
+    getHeader(options?: any): Promise<any>
+
+    static isHyperbee(core: Hypercore, options?: any): Promise<boolean>
+  }
+
+  class HyperbeeBatch<T> {
+    put(key: string, value?: T, options?: PutOptions<T>): Promise<void>
+    get(key: string): Promise<HyperbeeEntry<T> | null>
+    del(key: string, options?: DelOptions<T>): Promise<void>
+    flush(): Promise<void>
+    close(): Promise<void>
+  }
+
+  class EntryWatcher<T> extends TypedEmitter<{
+    update: () => void
+  }> {
+    node: { seq: number; key: string; value: T }
+
+    close(): Promise<void>
+  }
+
+  interface AbstractEncoding<T = any> {
+    encode: (data: T) => Buffer
+    encode: (data: T, buffer: Buffer) => Buffer
+    encode: (data: T, buffer: Buffer, offset: number) => Buffer
+    encode: (data: T, buffer?: Buffer, offset?: number) => Buffer
+    decode: (buffer: Buffer) => T
+    decode: (buffer: Buffer, offset: number) => T
+    decode: (buffer: Buffer, offset: number, end: number) => T
+    decode: (buffer: Buffer, offset?: number, end?: number) => T
+  }
+
+  export = Hyperbee
+}
diff --git a/types/hyperdrive.d.ts b/types/hyperdrive.d.ts
index 3dd708a1..0f650389 100644
--- a/types/hyperdrive.d.ts
+++ b/types/hyperdrive.d.ts
@@ -2,6 +2,7 @@ declare module 'hyperdrive' {
   import Corestore from 'corestore'
   import Hypercore from 'hypercore'
   import Hyperblobs, { BlobId } from 'hyperblobs'
+  import Hyperbee from 'hyperbee'
   import { Readable, Writable } from 'streamx'
   import { TypedEmitter } from 'tiny-typed-emitter'
   import { JsonValue } from 'type-fest'
@@ -33,16 +34,14 @@ declare module 'hyperdrive' {
   }
 
   namespace Hyperdrive {
-    export interface HyperdriveEntry {
-      seq: number
-      key: string
-      value: {
-        executable: boolean // whether the blob at path is an executable
-        linkname: null | string // if entry not symlink, otherwise a string to the entry this links to
-        blob: BlobId // a Hyperblob id that can be used to fetch the blob associated with this entry
-        metadata: JsonValue
-      }
+    interface HyperdriveEntryValue {
+      executable: boolean // whether the blob at path is an executable
+      linkname: null | string // if entry not symlink, otherwise a string to the entry this links to
+      blob: BlobId // a Hyperblob id that can be used to fetch the blob associated with this entry
+      metadata: JsonValue
     }
+    export interface HyperdriveEntry
+      extends Hyperbee.HyperbeeEntry<HyperdriveEntryValue> {}
   }
 
   class Hyperdrive extends TypedEmitter {
@@ -58,7 +57,7 @@ declare module 'hyperdrive' {
     readonly key: Buffer | null
    readonly discoveryKey: Buffer | null
     readonly contentKey: Buffer | null // The public key of the Hyperblobs instance holding blobs associated with entries in the drive.
- readonly db: any // Hyperbee + readonly db: Hyperbee // Hyperbee readonly version: number ready(): Promise update(options?: { wait?: boolean }): Promise diff --git a/types/unix-path-resolve.d.ts b/types/unix-path-resolve.d.ts new file mode 100644 index 00000000..11eef547 --- /dev/null +++ b/types/unix-path-resolve.d.ts @@ -0,0 +1,4 @@ +declare module 'unix-path-resolve' { + function resolve(path: string, path: string): string + export = resolve +} From 04da19f91d6258b804d2b14967b927c1f0be967e Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Fri, 25 Oct 2024 20:43:36 +0100 Subject: [PATCH 03/25] WIP: IterableWeakMap for referencing live external objects --- package-lock.json | 22 ++++++++++-- package.json | 1 + src/blob-store/downloader.js | 57 ++++++++++++++++---------------- src/blob-store/entries-stream.js | 24 +++++++++----- src/blob-store/index.js | 46 ++++++++++++++++++-------- types/hyperdrive.d.ts | 1 + 6 files changed, 96 insertions(+), 55 deletions(-) diff --git a/package-lock.json b/package-lock.json index 31b9fdac..38175baa 100644 --- a/package-lock.json +++ b/package-lock.json @@ -57,6 +57,7 @@ "undici": "^6.13.0", "unix-path-resolve": "^1.0.2", "varint": "^6.0.0", + "weakref": "^0.2.1", "yauzl-promise": "^4.0.0" }, "devDependencies": { @@ -7473,13 +7474,15 @@ "integrity": "sha512-gC13b/bWrqQoKY2EmROCZ+AR0jitc6DnDGaQ6Ls9QpKmuSgJB1eQ7H3KETtQm7qSdMWMKCmsshyCmUwMLh3OAA==" }, "node_modules/protomux": { - "version": "3.4.1", - "license": "MIT", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/protomux/-/protomux-3.10.0.tgz", + "integrity": "sha512-YjNhvdYWI5HGbbUUKRR8DT3mg+RaZQT6V5T83ktd4veAdHg0CLvevcs33wo8rjdEwCnRaSNXkzlw48tbSaPVag==", "dependencies": { "b4a": "^1.3.1", "compact-encoding": "^2.5.1", "queue-tick": "^1.0.0", - "safety-catch": "^1.0.1" + "safety-catch": "^1.0.1", + "unslab": "^1.3.0" } }, "node_modules/proxy-addr": { @@ -9598,6 +9601,14 @@ "resolved": "https://registry.npmjs.org/unix-path-resolve/-/unix-path-resolve-1.0.2.tgz", "integrity": "sha512-kG4g5nobBBaMnH2XbrS4sLUXEpx4nY2J3C6KAlAUcnahG2HChxSPVKWYrqEq76iTo+cyMkLUjqxGaQR2tz097Q==" }, + "node_modules/unslab": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/unslab/-/unslab-1.3.0.tgz", + "integrity": "sha512-YATkfKAFj47kTzmiQrWXMyRvaVrHsW6MEALa4bm+FhiA2YG4oira+Z3DXN6LrYOYn2Y8eO94Lwl9DOHjs1FpoQ==", + "dependencies": { + "b4a": "^1.6.6" + } + }, "node_modules/uri-js": { "version": "4.4.1", "license": "BSD-2-Clause", @@ -9659,6 +9670,11 @@ "defaults": "^1.0.3" } }, + "node_modules/weakref": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/weakref/-/weakref-0.2.1.tgz", + "integrity": "sha512-yayGEhEmDx38epbAbjPrgLu8VRwxP7MrmprqktcuInsEWPi0g0qHdrwYLS5F1AOjWayqpE8F+qbCW/BLoLfFsQ==" + }, "node_modules/which": { "version": "2.0.2", "license": "ISC", diff --git a/package.json b/package.json index eb7d6d0f..ca80d7e2 100644 --- a/package.json +++ b/package.json @@ -201,6 +201,7 @@ "undici": "^6.13.0", "unix-path-resolve": "^1.0.2", "varint": "^6.0.0", + "weakref": "^0.2.1", "yauzl-promise": "^4.0.0" } } diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index 1b01ad25..5a53a09f 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -21,6 +21,16 @@ import { noop } from '../utils.js' * @property {(state: BlobDownloadState | BlobDownloadStateError ) => void} state Emitted with the current download state whenever it changes (not emitted during initial 'checking' status) */ +const kAddDrive = Symbol('addDrive to downloader') + 
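+// Note: a Symbol key keeps `addDrive` off the Downloader's public API — only
+// code with access to `kAddDrive` (the `addDriveToDownloader` helper below)
+// can call it; regular consumers of a Downloader instance cannot.
+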
+/** + * @param {Downloader} downloader + * @param {Hyperdrive} drive + */ +export function addDriveToDownloader(downloader, drive) { + downloader[kAddDrive](drive) +} + class State { haveCount = 0 haveBytes = 0 @@ -72,51 +82,24 @@ export class Downloader extends TypedEmitter { #ac = new AbortController() #state - /** @param {import('hyperdrive')} drive */ - #addDrive = (drive) => { - if (drive.key) { - this.#drivesById.set(drive.key.toString('hex'), drive) - return - } - drive - .ready() - .then(() => { - if (!drive.key) return // should never happen - this.#drivesById.set(drive.key.toString('hex'), drive) - }) - .catch(noop) - } - /** * Like drive.download() but 'live', and for multiple drives * @param {Array} drives - * @param {import('./index.js').InternalDriveEmitter} driveEmitter * @param {object} [options] * @param {import('../types.js').BlobFilter} [options.filter] Filter blobs of specific types and/or sizes to download * @param {boolean} [options.live=false] */ - constructor(drives, driveEmitter, { filter, live = false } = {}) { + constructor(drives, { filter, live = false } = {}) { super() this.#state = new State({ live }) - this.#entriesStream = createEntriesStream(drives, driveEmitter, { + this.#entriesStream = createEntriesStream(drives, { live, folders: filterToFolders(filter), }) this.#donePromise = this.#start() this.#donePromise.catch(noop) - - if (!live) return - - driveEmitter.on('add-drive', this.#addDrive) - this.#ac.signal.addEventListener( - 'abort', - () => { - driveEmitter.off('add-drive', this.#addDrive) - }, - { once: true } - ) } async #start() { @@ -171,6 +154,22 @@ export class Downloader extends TypedEmitter { } } + /** @param {import('hyperdrive')} drive */ + [kAddDrive](drive) { + if (this.#ac.signal.aborted) return + if (drive.key) { + this.#drivesById.set(drive.key.toString('hex'), drive) + return + } + drive + .ready() + .then(() => { + if (!drive.key) return // should never happen + this.#drivesById.set(drive.key.toString('hex'), drive) + }) + .catch(noop) + } + done() { return this.#donePromise } diff --git a/src/blob-store/entries-stream.js b/src/blob-store/entries-stream.js index eccceaa1..ab2248c8 100644 --- a/src/blob-store/entries-stream.js +++ b/src/blob-store/entries-stream.js @@ -7,11 +7,20 @@ import unixPathResolve from 'unix-path-resolve' /** @import { BlobStoreEntriesStream } from '../types.js' */ const keyEncoding = new SubEncoder('files', 'utf-8') +const kAddDrive = Symbol('addDrive to entries stream') + +/** + * @param {BlobStoreEntriesStream} entriesStream + * @param {Hyperdrive} drive + */ +export function addDriveToEntriesStream(entriesStream, drive) { + // @ts-expect-error - We don't expose this method in the type + entriesStream[kAddDrive](drive) +} /** * * @param {Array} drives - * @param {import('./index.js').InternalDriveEmitter} driveEmitter * @param {object} opts * @param {boolean} [opts.live=false] * @param {readonly string[]} [opts.folders] @@ -19,20 +28,17 @@ const keyEncoding = new SubEncoder('files', 'utf-8') */ export function createEntriesStream( drives, - driveEmitter, { live = false, folders = ['/'] } = {} ) { folders = normalizeFolders(folders) const mergedEntriesStreams = mergeStreams( drives.map((drive) => getFilteredHistoryStream(drive.db, { folders, live })) ) - if (live) { - driveEmitter.on('add-drive', addDrive) - mergedEntriesStreams.on('close', () => { - driveEmitter.off('add-drive', addDrive) - }) - } - // @ts-expect-error + Object.defineProperty(mergedEntriesStreams, kAddDrive, { + value: addDrive, + 
writable: false, + enumerable: false, + }) return mergedEntriesStreams /** @param {Hyperdrive} drive */ diff --git a/src/blob-store/index.js b/src/blob-store/index.js index cb6d83d5..f04ff8e6 100644 --- a/src/blob-store/index.js +++ b/src/blob-store/index.js @@ -2,13 +2,17 @@ import Hyperdrive from 'hyperdrive' import b4a from 'b4a' import util from 'node:util' import { discoveryKey } from 'hypercore-crypto' -import { TypedEmitter } from 'tiny-typed-emitter' -import { Downloader } from './downloader.js' -import { createEntriesStream } from './entries-stream.js' +import { addDriveToDownloader, Downloader } from './downloader.js' +import { + addDriveToEntriesStream, + createEntriesStream, +} from './entries-stream.js' +import { IterableWeakSet } from 'weakref' + /** @import { JsonObject } from 'type-fest' */ /** @import { Readable as NodeReadable } from 'node:stream' */ /** @import { Readable as StreamxReadable, Writable } from 'streamx' */ -/** @import { BlobFilter, BlobId } from '../types.js' */ +/** @import { BlobFilter, BlobId, BlobStoreEntriesStream } from '../types.js' */ /** @import { BlobDownloadEvents } from './downloader.js' */ /** @@ -16,8 +20,6 @@ import { createEntriesStream } from './entries-stream.js' * @typedef {NodeReadable | StreamxReadable} Readable */ -/** @typedef {TypedEmitter<{ 'add-drive': (drive: import('hyperdrive')) => void }>} InternalDriveEmitter */ - // prop = blob type name // value = array of blob variants supported for that type const SUPPORTED_BLOB_VARIANTS = /** @type {const} */ ({ @@ -42,11 +44,10 @@ export class BlobStore { /** @type {Map} Indexed by hex-encoded discovery key */ #hyperdrives = new Map() #writer - /** - * Used to communicate to live download instances when new drives are added - * @type {InternalDriveEmitter} - */ - #driveEmitter = new TypedEmitter() + /** @type {IterableWeakSet} */ + #liveDownloaders = new IterableWeakSet() + /** @type {IterableWeakSet + close(): Promise } export = Hyperdrive From 8e4742c0cd490a0230f68065cba47f100e2be5a5 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Mon, 28 Oct 2024 13:45:50 +0000 Subject: [PATCH 04/25] WIP: cleanup, reduce, fix --- src/blob-store/downloader.js | 229 ++++++++++++++++++++++--------- src/blob-store/entries-stream.js | 95 +++---------- src/blob-store/index.js | 16 ++- 3 files changed, 193 insertions(+), 147 deletions(-) diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index 5a53a09f..eeee2cb0 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -1,9 +1,17 @@ import { TypedEmitter } from 'tiny-typed-emitter' import { once } from 'node:events' import { createEntriesStream } from './entries-stream.js' -import { noop } from '../utils.js' + /** @import Hyperdrive from 'hyperdrive' */ +/** + * Download statuses: + * - 'processingEntries' - checking initial entries from blob index cores to determine what to download + * - 'downloading' - one or more blob chunks are currently downloading + * - 'downloaded' - all blob chunks that match the filter have been downloaded (non-live downloaders only) + * - 'waiting' - live downloader is waiting for new entries from sync + */ + /** * @typedef {object} BlobDownloadState * @property {number} haveCount The number of files already downloaded @@ -11,7 +19,7 @@ import { noop } from '../utils.js' * @property {number} wantCount The number of files pending download * @property {number} wantBytes The bytes pending download * @property {null} error If status = 'error' then this will be an Error object - * 
@property {'pending' | 'downloading' | 'downloaded'} status + * @property {'processingEntries' | 'downloading' | 'downloaded' | 'waiting'} status */ /** @typedef {Omit & { status: 'error', error: Error }} BlobDownloadStateError */ @@ -35,13 +43,23 @@ class State { haveCount = 0 haveBytes = 0 /** @type {Set<{ done(): Promise, destroy(): void }>} */ - downloads = new Set() + queuedDownloads = new Set() + /** + * The initial length of each drive, if > 0. Once we have processed entries up + * to the initial length, we remove the drive from this map. We use this map + * to determine whether we are in the "processing initial entries" state, or + * if we are downloading, downloaded, or waiting for new entries. + * + * @type {Map} + */ + initialLengthsByDriveId = new Map() wantBytes = 0 error = null + processingInitialEntries = true + live constructor({ live = false } = {}) { - /** @type {'pending' | 'downloading' | 'downloaded'} */ - this.status = live ? 'pending' : 'downloading' + this.live = live } /** @type {BlobDownloadState | BlobDownloadStateError} */ @@ -50,27 +68,42 @@ class State { return { haveCount: this.haveCount, haveBytes: this.haveBytes, - wantCount: this.downloads.size, + wantCount: this.queuedDownloads.size, wantBytes: this.wantBytes, error: this.error, status: 'error', } } + /** @type {BlobDownloadState['status']} */ + let status + if (this.processingInitialEntries) { + status = 'processingEntries' + } else if (this.queuedDownloads.size) { + status = 'downloading' + } else if (this.live) { + status = 'waiting' + } else { + status = 'downloaded' + } return { haveCount: this.haveCount, haveBytes: this.haveBytes, - wantCount: this.downloads.size, + wantCount: this.queuedDownloads.size, wantBytes: this.wantBytes, error: null, - status: this.status, + status, } } } /** - * Hyperdrive Downloader class, like drive.download() for multiple drives, but - * will download all previous versions that match the filter, and is optionally - * "live", which will download any new files from replicating peers. + * Like hyperdrive.download() but optionally 'live', and for multiple drives. + * Emits `state` events with the current download state. + * + * NB: unlike hyperdrive.download(), this will also download deleted and + * previous versions of blobs - we don't currently support editing or deleting + * of blobs, so this should not be an issue, and if we do in the future, + * downloading deleted and previous versions may be desirable behavior anyway * * @extends {TypedEmitter} */ @@ -78,107 +111,150 @@ export class Downloader extends TypedEmitter { /** @type {Map} */ #drivesById = new Map() #entriesStream - #donePromise + #processEntriesPromise #ac = new AbortController() #state + #live + #pathPrefixes /** - * Like drive.download() but 'live', and for multiple drives * @param {Array} drives * @param {object} [options] * @param {import('../types.js').BlobFilter} [options.filter] Filter blobs of specific types and/or sizes to download - * @param {boolean} [options.live=false] + * @param {boolean} [options.live=false] If true, the downloader will never be done, and will wait for new entries from the drives */ constructor(drives, { filter, live = false } = {}) { super() + this.#live = live this.#state = new State({ live }) + this.#pathPrefixes = filter ? 
pathPrefixesFromFilters(filter) : []
 
-    this.#entriesStream = createEntriesStream(drives, {
-      live,
-      folders: filterToFolders(filter),
-    })
+    this.#entriesStream = createEntriesStream(drives, { live })
+    this.#entriesStream.once('error', this.#ac.abort)
+
+    this.#ac.signal.addEventListener('abort', this.#cleanup, { once: true })
 
-    this.#donePromise = this.#start()
-    this.#donePromise.catch(noop)
+    this.#processEntriesPromise = this.#processEntries(drives)
+    this.#processEntriesPromise.catch(this.#ac.abort)
   }
 
-  async #start() {
+  /**
+   * Start processing entries from the entries stream - if an entry matches the
+   * filter, and we don't already have it, queue it for download. If the
+   * Downloader is live, this method will never resolve, otherwise it will
+   * resolve when all the entries have been processed, but not necessarily
+   * downloaded.
+   *
+   * @param {Hyperdrive[]} drives
+   */
+  async #processEntries(drives) {
+    await Promise.all(drives.map((drive) => this[kAddDrive](drive)))
     for await (const entry of this.#entriesStream) {
       this.#ac.signal.throwIfAborted()
       const {
+        seq,
         driveId,
+        key: filePath,
         value: { blob },
       } = entry
+      // If we have processed all entries up to the initial length of the drive,
+      // then we've processed the "initial entries" in the drive.
+      if (this.#state.initialLengthsByDriveId.has(driveId)) {
+        const initialLength = this.#state.initialLengthsByDriveId.get(driveId)
+        if (typeof initialLength === 'number' && seq >= initialLength - 1) {
+          this.#state.initialLengthsByDriveId.delete(driveId)
+        }
+      }
+      if (!this.#shouldDownloadFile(filePath)) continue
       const drive = this.#drivesById.get(driveId)
       if (!drive) throw new Error('Drive not found: ' + driveId)
       const core = await getBlobsCore(drive, { signal: this.#ac.signal })
+      this.#ac.signal.throwIfAborted()
       await this.#processEntry(core, blob)
+      this.#ac.signal.throwIfAborted()
+      this.emit('state', this.#state.value)
+      // This loop will never end if live.
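+      // (when not live, the merged history streams end once their existing
+      // entries are read, so this loop exits and `done()` can resolve)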
} } + /** @param {string} filePath */ + #shouldDownloadFile(filePath) { + if (!this.#pathPrefixes.length) return true + return this.#pathPrefixes.some((prefix) => filePath.startsWith(prefix)) + } + /** * Update state and queue missing entries for download * - * @param {import('hypercore')} core + * @param {import('hypercore')} blobsCore * @param {{ blockOffset: number, blockLength: number, byteLength: number }} blob */ async #processEntry( - core, + blobsCore, { blockOffset: start, blockLength: length, byteLength } ) { const end = start + length - const have = await core.has(start, end) + const have = await blobsCore.has(start, end) this.#ac.signal.throwIfAborted() if (have) { this.#state.haveCount++ this.#state.haveBytes += byteLength } else { this.#state.wantBytes += byteLength - const download = core.download({ start, end }) - this.#state.downloads.add(download) + const download = blobsCore.download({ start, end }) + this.#state.queuedDownloads.add(download) download .done() .then(() => { this.#state.haveCount++ this.#state.haveBytes += byteLength this.#state.wantBytes -= byteLength + this.emit('state', this.#state.value) }) .catch((e) => { this.#state.error = e this.#ac.abort(e) }) .finally(() => { - this.#state.downloads.delete(download) - this.emit('state', this.#state.value) + this.#state.queuedDownloads.delete(download) }) } } /** @param {import('hyperdrive')} drive */ - [kAddDrive](drive) { - if (this.#ac.signal.aborted) return - if (drive.key) { - this.#drivesById.set(drive.key.toString('hex'), drive) - return - } - drive - .ready() - .then(() => { - if (!drive.key) return // should never happen - this.#drivesById.set(drive.key.toString('hex'), drive) - }) - .catch(noop) + async [kAddDrive](drive) { + this.#ac.signal.throwIfAborted() + await drive.ready() + this.#ac.signal.throwIfAborted() + if (!drive.key) throw new Error('Unexpected: missing drive key') // should never happen + this.#drivesById.set(drive.key.toString('hex'), drive) + if (drive.db.core.length === 0) return + this.#state.initialLengthsByDriveId.set( + drive.key.toString('hex'), + drive.db.core.length + ) } - done() { - return this.#donePromise + destroy() { + this.#ac.abort() } - /** - * @param {Error} [reason] - */ - destroy(reason) { - this.#ac.abort(reason) + async done() { + if (this.#live) throw new Error('Live downloader will never be done') + await this.#processEntriesPromise + await Promise.all( + Array.from(this.#state.queuedDownloads, (download) => download.done()) + ) + this.#cleanup() + } + + #cleanup = () => { + for (const download of this.#state.queuedDownloads) download.destroy() + this.#ac.signal.removeEventListener('abort', this.#cleanup) + this.#entriesStream.removeListener('error', this.#ac.abort) + this.#state.queuedDownloads.clear() + this.#drivesById.clear() + this.#entriesStream.destroy() } /** @@ -190,31 +266,50 @@ export class Downloader extends TypedEmitter { } /** - * Convert a filter to an array of folders that need to be downloaded + * This is a more generic version of the BlobFilter type that can filter unknown + * blob types and variants from the blob store. 
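+ * e.g. `{ photo: ['original', 'thumbnail'] }` - unlike `BlobFilter`, the type
+ * and variant names are not constrained to the known blob types and variants.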
* - * @param {import('../types.js').BlobFilter} [filter] - * @returns {string[]} array of folders that match the filter + * @typedef {{ [type: string]: readonly string[] }} GenericBlobFilter */ -function filterToFolders(filter) { - if (!filter) return ['/'] - const folders = [] - for (const [ - type, - variants, - ] of /** @type {import('type-fest').Entries} */ ( - Object.entries(filter) - )) { - // De-dupe variants array - for (const variant of new Set(variants)) { - folders.push(makePath({ type, variant })) + +/** + * Convert a filter to an array of path prefixes that match the filter. These + * path prefixes can be used to filter entries by + * `entry.key.startsWith(pathPrefix)`. + * + * @param {GenericBlobFilter} filter + * @returns {readonly string[]} array of folders that match the filter + */ +function pathPrefixesFromFilters(filter) { + const pathPrefixes = [] + for (const [type, variants] of Object.entries(filter)) { + const dedupedVariants = new Set(variants) + for (const variant of dedupedVariants) { + pathPrefixes.push(`/${type}/${variant}/`) } } - return folders + return filterSubfoldersAndDuplicates(pathPrefixes) } -/** @param {Pick} opts */ -function makePath({ type, variant }) { - return `/${type}/${variant}` +/** + * Take an array of folders, remove any folders that are duplicates or subfolders of another + * + * @param {readonly string[]} folders + * @returns {readonly string[]} + */ +function filterSubfoldersAndDuplicates(folders) { + /** @type {Set} */ + const filtered = new Set() + for (let i = 0; i < folders.length; i++) { + const isSubfolderOfAnotherFolder = !!folders.find((folder, index) => { + if (index === i) return false + // Deduping is done by the Set, if we do it here we don't get either + if (folder === folders[i]) return true + return folders[i].startsWith(folder) + }) + if (!isSubfolderOfAnotherFolder) filtered.add(folders[i]) + } + return Array.from(filtered) } /** diff --git a/src/blob-store/entries-stream.js b/src/blob-store/entries-stream.js index ab2248c8..e4acb58b 100644 --- a/src/blob-store/entries-stream.js +++ b/src/blob-store/entries-stream.js @@ -1,7 +1,6 @@ import SubEncoder from 'sub-encoder' import mergeStreams from '@sindresorhus/merge-streams' import { Transform } from 'node:stream' -import unixPathResolve from 'unix-path-resolve' /** @import Hyperdrive from 'hyperdrive' */ /** @import { BlobStoreEntriesStream } from '../types.js' */ @@ -23,16 +22,11 @@ export function addDriveToEntriesStream(entriesStream, drive) { * @param {Array} drives * @param {object} opts * @param {boolean} [opts.live=false] - * @param {readonly string[]} [opts.folders] * @returns {BlobStoreEntriesStream} */ -export function createEntriesStream( - drives, - { live = false, folders = ['/'] } = {} -) { - folders = normalizeFolders(folders) +export function createEntriesStream(drives, { live = false } = {}) { const mergedEntriesStreams = mergeStreams( - drives.map((drive) => getFilteredHistoryStream(drive.db, { folders, live })) + drives.map((drive) => getHistoryStream(drive.db, { live })) ) Object.defineProperty(mergedEntriesStreams, kAddDrive, { value: addDrive, @@ -43,9 +37,7 @@ export function createEntriesStream( /** @param {Hyperdrive} drive */ function addDrive(drive) { - mergedEntriesStreams.add( - getFilteredHistoryStream(drive.db, { folders, live }) - ) + mergedEntriesStreams.add(getHistoryStream(drive.db, { live })) } } @@ -54,10 +46,8 @@ export function createEntriesStream( * @param {import('hyperbee')} bee * @param {object} opts * @param {boolean} opts.live 
- * @param {readonly string[]} opts.folders */ -function getFilteredHistoryStream(bee, { folders, live }) { - let driveId = bee.core.discoveryKey?.toString('hex') +function getHistoryStream(bee, { live }) { // This will also include old versions of files, but it is the only way to // get a live stream from a Hyperbee, however we currently do not support // edits of blobs, so this should not be an issue, and the consequence is @@ -68,68 +58,27 @@ function getFilteredHistoryStream(bee, { folders, live }) { // under the `files` sub-encoding key keyEncoding, }) - return historyStream.pipe( - new Transform({ - objectMode: true, - /** @param {import('hyperdrive').HyperdriveEntry} entry */ - transform(entry, _, callback) { - if (matchesFolder(entry.key, folders)) { - // Unnecessary performance optimization to only call toString() once - // bee.discoveryKey will always be defined by the time it starts - // streaming, but could be null when the instance is first created. - driveId = driveId || bee.core.discoveryKey?.toString('hex') - callback(null, { ...entry, driveId }) - } else { - callback() - } - }, - }) - ) + return historyStream.pipe(new AddDiscoveryIds(bee.core)) } -/** - * Take an array of folders, remove any folders that are subfolders of another, - * remove duplicates, and add trailing slashes - * @param {readonly string[]} folders - * @returns {readonly [string, ...string[]]} - */ -function normalizeFolders(folders) { - // 1. Add trailing slashes so that path.startsWith(folder) does not match a folder whose name starts with this folder. - // 2. Standardize path names as done internally in Hyperdrive: https://github.com/holepunchto/hyperdrive/blob/5ee0164fb39eadc0a073f7926800f81117a4c52e/index.js#L685 - folders = folders.map((folder) => - addTrailingSlash(unixPathResolve('/', folder)) - ) - /** @type {Set} */ - const normalized = new Set() - for (let i = 0; i < folders.length; i++) { - const isSubfolderOfAnotherFolder = !!folders.find((folder, index) => { - if (index === i) return false - // Deduping is done by the Set, if we do it here we don't get either - if (folder === folders[i]) return true - return folders[i].startsWith(folder) - }) - if (!isSubfolderOfAnotherFolder) normalized.add(folders[i]) - } - const normalizedArray = Array.from(normalized) - // @ts-expect-error - TS should know this, but doesn't - return normalizedArray.length === 0 ? ['/'] : normalizedArray -} +class AddDiscoveryIds extends Transform { + #core + #discoveryKey -/** @param {string} path */ -function addTrailingSlash(path) { - return path.endsWith('/') ? path : `${path}/` -} + /** @param {import('hypercore')} core */ + constructor(core) { + super({ objectMode: true }) + this.#core = core + this.#discoveryKey = core.discoveryKey?.toString('hex') + } -/** - * Returns true if the path is within one of the given folders - * - * @param {string} path - * @param {readonly string[]} folders - * @returns {boolean} - */ -function matchesFolder(path, folders) { - for (const folder of folders) { - if (path.startsWith(folder)) return true + /** @type {Transform['_transform']} */ + _transform(entry, _, callback) { + // Minimal performance optimization to only call toString() once. + // core.discoveryKey will always be defined by the time it starts + // streaming, but could be null when the instance is first created. 
+ const driveId = + this.#discoveryKey || this.#core.discoveryKey?.toString('hex') + callback(null, { ...entry, driveId }) } - return false } diff --git a/src/blob-store/index.js b/src/blob-store/index.js index f04ff8e6..20a6059c 100644 --- a/src/blob-store/index.js +++ b/src/blob-store/index.js @@ -46,7 +46,7 @@ export class BlobStore { #writer /** @type {IterableWeakSet} */ #liveDownloaders = new IterableWeakSet() - /** @type {IterableWeakSet} */ #liveEntriesStreams = new IterableWeakSet() /** @@ -147,6 +147,9 @@ export class BlobStore { live, }) if (live) { + // If the returned downloader is "live", then we need to add incoming + // drives to it, so we keep a weak reference that will be available for as + // long as the returned downloader is referenced. this.#liveDownloaders.add(downloader) } return downloader @@ -177,16 +180,15 @@ export class BlobStore { * * @param {object} opts * @param {boolean} [opts.live=false] Set to `true` to get a live stream of entries - * @param {readonly string[]} [opts.folders] Filter entries to only those in these folders * @returns */ - createEntriesReadStream({ live = false, folders } = {}) { + createEntriesReadStream({ live = false } = {}) { const drives = Array.from(this.#hyperdrives.values()) - const entriesStream = createEntriesStream(drives, { - live, - folders, - }) + const entriesStream = createEntriesStream(drives, { live }) if (live) { + // If the returned entries stream is "live", then we need to add incoming + // drives to it, so we keep a weak reference that will be available for as + // long as the returned entries stream is referenced. this.#liveEntriesStreams.add(entriesStream) } return entriesStream From 9f125725e9d83f62b35b9cd2c4754d2b4980fe67 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Mon, 28 Oct 2024 13:47:15 +0000 Subject: [PATCH 05/25] cleanup one more thing --- src/blob-store/downloader.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index eeee2cb0..f07f0f4d 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -253,6 +253,7 @@ export class Downloader extends TypedEmitter { this.#ac.signal.removeEventListener('abort', this.#cleanup) this.#entriesStream.removeListener('error', this.#ac.abort) this.#state.queuedDownloads.clear() + this.#state.initialLengthsByDriveId.clear() this.#drivesById.clear() this.#entriesStream.destroy() } From 7e6fa62fb9df93bc86e85ce5ee7ae20236028544 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Mon, 28 Oct 2024 14:06:19 +0000 Subject: [PATCH 06/25] WIP more cleanup --- src/blob-store/downloader.js | 41 ++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index f07f0f4d..fd1ae968 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -96,9 +96,19 @@ class State { } } +// This class contains a large amount of parallel async code, and contains lots +// of references and some listeners that need to be deferenced when this class +// is finished with (e.g when a download is complete, or there is an error). +// I've highlighted lines which could throw an error which would put the +// downloader in an "error" state. Currently this does not emit an "error" +// event, but we may want to add one for _live_ downloaders in the future. +// Non-live downloaders can return error state in `done()`. + /** * Like hyperdrive.download() but optionally 'live', and for multiple drives. 
- * Emits `state` events with the current download state. + * Emits `state` events with the current download state. A 'live' downloader + * must be `destroy()`ed to clean up resources and avoid memory leaks. A + * non-live downloader will clean up after itself when it is done. * * NB: unlike hyperdrive.download(), this will also download deleted and * previous versions of blobs - we don't currently support editing or deleting @@ -114,7 +124,6 @@ export class Downloader extends TypedEmitter { #processEntriesPromise #ac = new AbortController() #state - #live #pathPrefixes /** @@ -125,7 +134,6 @@ export class Downloader extends TypedEmitter { */ constructor(drives, { filter, live = false } = {}) { super() - this.#live = live this.#state = new State({ live }) this.#pathPrefixes = filter ? pathPrefixesFromFilters(filter) : [] @@ -142,12 +150,12 @@ export class Downloader extends TypedEmitter { * Start processing entries from the entries stream - if an entry matches the * filter, and we don't already have it, queue it for download. If the * Downloader is live, this method will never resolve, otherwise it will - * resolve when all the entries have been processed, but not necessarily - * downloaded. + * resolve when all the entries have been processed and downloaded. * * @param {Hyperdrive[]} drives */ async #processEntries(drives) { + // ERROR HANDLING: Should only throw if drive.ready() throws for any drive await Promise.all(drives.map((drive) => this[kAddDrive](drive))) for await (const entry of this.#entriesStream) { this.#ac.signal.throwIfAborted() @@ -167,14 +175,21 @@ export class Downloader extends TypedEmitter { } if (!this.#shouldDownloadFile(filePath)) continue const drive = this.#drivesById.get(driveId) + // ERROR HANDLING: this is unexpected and should not happen if (!drive) throw new Error('Drive not found: ' + driveId) + // ERROR HANDLING: this should not throw const core = await getBlobsCore(drive, { signal: this.#ac.signal }) this.#ac.signal.throwIfAborted() + // ERROR HANDLING: this will throw if core.has() throws await this.#processEntry(core, blob) this.#ac.signal.throwIfAborted() this.emit('state', this.#state.value) // This loop will never end if live. } + await Promise.all( + Array.from(this.#state.queuedDownloads, (download) => download.done()) + ) + this.#cleanup() } /** @param {string} filePath */ @@ -212,6 +227,8 @@ export class Downloader extends TypedEmitter { this.emit('state', this.#state.value) }) .catch((e) => { + // ERROR HANDLING: _should_ only happen if the download is destroyed + if (this.#state.error) return this.#state.error = e this.#ac.abort(e) }) @@ -223,7 +240,6 @@ export class Downloader extends TypedEmitter { /** @param {import('hyperdrive')} drive */ async [kAddDrive](drive) { - this.#ac.signal.throwIfAborted() await drive.ready() this.#ac.signal.throwIfAborted() if (!drive.key) throw new Error('Unexpected: missing drive key') // should never happen @@ -235,23 +251,26 @@ export class Downloader extends TypedEmitter { ) } + /** + * Cancel the downloads and clean up resources. + */ destroy() { this.#ac.abort() } + /** + * Will resolve when all blobs have been downloaded. Will never resolve for a + * live downloader. 
+ */ async done() { - if (this.#live) throw new Error('Live downloader will never be done') await this.#processEntriesPromise - await Promise.all( - Array.from(this.#state.queuedDownloads, (download) => download.done()) - ) - this.#cleanup() } #cleanup = () => { for (const download of this.#state.queuedDownloads) download.destroy() this.#ac.signal.removeEventListener('abort', this.#cleanup) this.#entriesStream.removeListener('error', this.#ac.abort) + // queuedDownloads should always be empty by here anyway, but just in case. this.#state.queuedDownloads.clear() this.#state.initialLengthsByDriveId.clear() this.#drivesById.clear() From 1222673709b756bd890a697f5427b0a137fa74ab Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Mon, 28 Oct 2024 20:23:52 +0000 Subject: [PATCH 07/25] cleanup and remove unused code --- package-lock.json | 6 - package.json | 1 - src/blob-store/downloader.js | 290 ++++--------------------------- src/blob-store/entries-stream.js | 30 ++-- src/blob-store/index.js | 180 +++++++++---------- src/blob-store/utils.js | 71 ++++++++ types/hyperdrive.d.ts | 2 +- 7 files changed, 209 insertions(+), 371 deletions(-) create mode 100644 src/blob-store/utils.js diff --git a/package-lock.json b/package-lock.json index 38175baa..8b5b0892 100644 --- a/package-lock.json +++ b/package-lock.json @@ -57,7 +57,6 @@ "undici": "^6.13.0", "unix-path-resolve": "^1.0.2", "varint": "^6.0.0", - "weakref": "^0.2.1", "yauzl-promise": "^4.0.0" }, "devDependencies": { @@ -9670,11 +9669,6 @@ "defaults": "^1.0.3" } }, - "node_modules/weakref": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/weakref/-/weakref-0.2.1.tgz", - "integrity": "sha512-yayGEhEmDx38epbAbjPrgLu8VRwxP7MrmprqktcuInsEWPi0g0qHdrwYLS5F1AOjWayqpE8F+qbCW/BLoLfFsQ==" - }, "node_modules/which": { "version": "2.0.2", "license": "ISC", diff --git a/package.json b/package.json index ca80d7e2..eb7d6d0f 100644 --- a/package.json +++ b/package.json @@ -201,7 +201,6 @@ "undici": "^6.13.0", "unix-path-resolve": "^1.0.2", "varint": "^6.0.0", - "weakref": "^0.2.1", "yauzl-promise": "^4.0.0" } } diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index fd1ae968..89d2ffe9 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -1,148 +1,52 @@ import { TypedEmitter } from 'tiny-typed-emitter' -import { once } from 'node:events' import { createEntriesStream } from './entries-stream.js' +import { pathPrefixesFromFilter } from './utils.js' /** @import Hyperdrive from 'hyperdrive' */ -/** - * Download statuses: - * - 'processingEntries' - checking initial entries from blob index cores to determine what to download - * - 'downloading' - one or more blob chunks are currently downloading - * - 'downloaded' - all blob chunks that match the filter have been downloaded (non-live downloaders only) - * - 'waiting' - live downloader is waiting for new entries from sync - */ - -/** - * @typedef {object} BlobDownloadState - * @property {number} haveCount The number of files already downloaded - * @property {number} haveBytes The bytes already downloaded - * @property {number} wantCount The number of files pending download - * @property {number} wantBytes The bytes pending download - * @property {null} error If status = 'error' then this will be an Error object - * @property {'processingEntries' | 'downloading' | 'downloaded' | 'waiting'} status - */ - -/** @typedef {Omit & { status: 'error', error: Error }} BlobDownloadStateError */ - -/** - * @typedef {object} BlobDownloadEvents - * @property 
{(state: BlobDownloadState | BlobDownloadStateError ) => void} state Emitted with the current download state whenever it changes (not emitted during initial 'checking' status) - */ - -const kAddDrive = Symbol('addDrive to downloader') - -/** - * @param {Downloader} downloader - * @param {Hyperdrive} drive - */ -export function addDriveToDownloader(downloader, drive) { - downloader[kAddDrive](drive) -} - -class State { - haveCount = 0 - haveBytes = 0 - /** @type {Set<{ done(): Promise, destroy(): void }>} */ - queuedDownloads = new Set() - /** - * The initial length of each drive, if > 0. Once we have processed entries up - * to the initial length, we remove the drive from this map. We use this map - * to determine whether we are in the "processing initial entries" state, or - * if we are downloading, downloaded, or waiting for new entries. - * - * @type {Map} - */ - initialLengthsByDriveId = new Map() - wantBytes = 0 - error = null - processingInitialEntries = true - live - - constructor({ live = false } = {}) { - this.live = live - } - - /** @type {BlobDownloadState | BlobDownloadStateError} */ - get value() { - if (this.error) { - return { - haveCount: this.haveCount, - haveBytes: this.haveBytes, - wantCount: this.queuedDownloads.size, - wantBytes: this.wantBytes, - error: this.error, - status: 'error', - } - } - /** @type {BlobDownloadState['status']} */ - let status - if (this.processingInitialEntries) { - status = 'processingEntries' - } else if (this.queuedDownloads.size) { - status = 'downloading' - } else if (this.live) { - status = 'waiting' - } else { - status = 'downloaded' - } - return { - haveCount: this.haveCount, - haveBytes: this.haveBytes, - wantCount: this.queuedDownloads.size, - wantBytes: this.wantBytes, - error: null, - status, - } - } -} - // This class contains a large amount of parallel async code, and contains lots // of references and some listeners that need to be deferenced when this class // is finished with (e.g when a download is complete, or there is an error). // I've highlighted lines which could throw an error which would put the // downloader in an "error" state. Currently this does not emit an "error" -// event, but we may want to add one for _live_ downloaders in the future. -// Non-live downloaders can return error state in `done()`. +// event. /** - * Like hyperdrive.download() but optionally 'live', and for multiple drives. - * Emits `state` events with the current download state. A 'live' downloader - * must be `destroy()`ed to clean up resources and avoid memory leaks. A - * non-live downloader will clean up after itself when it is done. + * Like hyperdrive.download() but 'live', and for multiple drives. 
* * NB: unlike hyperdrive.download(), this will also download deleted and * previous versions of blobs - we don't currently support editing or deleting * of blobs, so this should not be an issue, and if we do in the future, * downloading deleted and previous versions may be desirable behavior anyway * - * @extends {TypedEmitter} + * @extends {TypedEmitter<{ error: (error: Error) => void }>} */ export class Downloader extends TypedEmitter { - /** @type {Map} */ - #drivesById = new Map() + /** @type {import('./index.js').THyperdriveIndex} */ + #driveIndex + /** @type {Set<{ done(): Promise, destroy(): void }>} */ + #queuedDownloads = new Set() #entriesStream #processEntriesPromise #ac = new AbortController() - #state #pathPrefixes /** - * @param {Array} drives + * @param {import('./index.js').THyperdriveIndex} driveIndex * @param {object} [options] - * @param {import('../types.js').BlobFilter} [options.filter] Filter blobs of specific types and/or sizes to download - * @param {boolean} [options.live=false] If true, the downloader will never be done, and will wait for new entries from the drives + * @param {import('../types.js').BlobFilter | null} [options.filter] Filter blobs of specific types and/or sizes to download */ - constructor(drives, { filter, live = false } = {}) { + constructor(driveIndex, { filter } = {}) { super() - this.#state = new State({ live }) - this.#pathPrefixes = filter ? pathPrefixesFromFilters(filter) : [] + this.#pathPrefixes = filter ? pathPrefixesFromFilter(filter) : [] + this.#driveIndex = driveIndex - this.#entriesStream = createEntriesStream(drives, { live }) + this.#entriesStream = createEntriesStream(driveIndex, { live: true }) this.#entriesStream.once('error', this.#ac.abort) this.#ac.signal.addEventListener('abort', this.#cleanup, { once: true }) - this.#processEntriesPromise = this.#processEntries(drives) + this.#processEntriesPromise = this.#processEntries() this.#processEntriesPromise.catch(this.#ac.abort) } @@ -151,45 +55,27 @@ export class Downloader extends TypedEmitter { * filter, and we don't already have it, queue it for download. If the * Downloader is live, this method will never resolve, otherwise it will * resolve when all the entries have been processed and downloaded. - * - * @param {Hyperdrive[]} drives */ - async #processEntries(drives) { - // ERROR HANDLING: Should only throw if drive.ready() throws for any drive - await Promise.all(drives.map((drive) => this[kAddDrive](drive))) + async #processEntries() { for await (const entry of this.#entriesStream) { this.#ac.signal.throwIfAborted() const { - seq, driveId, key: filePath, value: { blob }, } = entry - // If we have processed all entries up to the initial length of the drive, - // the we've processed the "initial entries" in the drive. 
- if (this.#state.initialLengthsByDriveId.has(driveId)) { - const initialLength = this.#state.initialLengthsByDriveId.get(driveId) - if (typeof initialLength === 'number' && seq >= initialLength - 1) { - this.#state.initialLengthsByDriveId.delete(driveId) - } - } if (!this.#shouldDownloadFile(filePath)) continue - const drive = this.#drivesById.get(driveId) + const drive = this.#driveIndex.get(driveId) // ERROR HANDLING: this is unexpected and should not happen if (!drive) throw new Error('Drive not found: ' + driveId) // ERROR HANDLING: this should not throw - const core = await getBlobsCore(drive, { signal: this.#ac.signal }) + const blobs = await drive.getBlobs() this.#ac.signal.throwIfAborted() - // ERROR HANDLING: this will throw if core.has() throws - await this.#processEntry(core, blob) + // ERROR HANDLING: this will throw if core.has() throws, which should not happen + await this.#processEntry(blobs.core, blob) this.#ac.signal.throwIfAborted() - this.emit('state', this.#state.value) - // This loop will never end if live. + // This loop will never end unless thrown, since this is a live stream } - await Promise.all( - Array.from(this.#state.queuedDownloads, (download) => download.done()) - ) - this.#cleanup() } /** @param {string} filePath */ @@ -204,51 +90,22 @@ export class Downloader extends TypedEmitter { * @param {import('hypercore')} blobsCore * @param {{ blockOffset: number, blockLength: number, byteLength: number }} blob */ - async #processEntry( - blobsCore, - { blockOffset: start, blockLength: length, byteLength } - ) { + async #processEntry(blobsCore, { blockOffset: start, blockLength: length }) { const end = start + length const have = await blobsCore.has(start, end) this.#ac.signal.throwIfAborted() - if (have) { - this.#state.haveCount++ - this.#state.haveBytes += byteLength - } else { - this.#state.wantBytes += byteLength - const download = blobsCore.download({ start, end }) - this.#state.queuedDownloads.add(download) - download - .done() - .then(() => { - this.#state.haveCount++ - this.#state.haveBytes += byteLength - this.#state.wantBytes -= byteLength - this.emit('state', this.#state.value) - }) - .catch((e) => { - // ERROR HANDLING: _should_ only happen if the download is destroyed - if (this.#state.error) return - this.#state.error = e - this.#ac.abort(e) - }) - .finally(() => { - this.#state.queuedDownloads.delete(download) - }) - } - } - - /** @param {import('hyperdrive')} drive */ - async [kAddDrive](drive) { - await drive.ready() - this.#ac.signal.throwIfAborted() - if (!drive.key) throw new Error('Unexpected: missing drive key') // should never happen - this.#drivesById.set(drive.key.toString('hex'), drive) - if (drive.db.core.length === 0) return - this.#state.initialLengthsByDriveId.set( - drive.key.toString('hex'), - drive.db.core.length - ) + if (have) return + const download = blobsCore.download({ start, end }) + this.#queuedDownloads.add(download) + download + .done() + .catch((e) => { + // TODO: emit error rather than abort downloader if error here + this.#ac.abort(e) + }) + .finally(() => { + this.#queuedDownloads.delete(download) + }) } /** @@ -258,87 +115,12 @@ export class Downloader extends TypedEmitter { this.#ac.abort() } - /** - * Will resolve when all blobs have been downloaded. Will never resolve for a - * live downloader. 
- */ - async done() { - await this.#processEntriesPromise - } - #cleanup = () => { - for (const download of this.#state.queuedDownloads) download.destroy() + for (const download of this.#queuedDownloads) download.destroy() this.#ac.signal.removeEventListener('abort', this.#cleanup) this.#entriesStream.removeListener('error', this.#ac.abort) // queuedDownloads should always be empty by here anyway, but just in case. - this.#state.queuedDownloads.clear() - this.#state.initialLengthsByDriveId.clear() - this.#drivesById.clear() + this.#queuedDownloads.clear() this.#entriesStream.destroy() } - - /** - * @returns {BlobDownloadState | BlobDownloadStateError} - */ - get state() { - return this.#state.value - } -} - -/** - * This is a more generic version of the BlobFilter type that can filter unknown - * blob types and variants from the blob store. - * - * @typedef {{ [type: string]: readonly string[] }} GenericBlobFilter - */ - -/** - * Convert a filter to an array of path prefixes that match the filter. These - * path prefixes can be used to filter entries by - * `entry.key.startsWith(pathPrefix)`. - * - * @param {GenericBlobFilter} filter - * @returns {readonly string[]} array of folders that match the filter - */ -function pathPrefixesFromFilters(filter) { - const pathPrefixes = [] - for (const [type, variants] of Object.entries(filter)) { - const dedupedVariants = new Set(variants) - for (const variant of dedupedVariants) { - pathPrefixes.push(`/${type}/${variant}/`) - } - } - return filterSubfoldersAndDuplicates(pathPrefixes) -} - -/** - * Take an array of folders, remove any folders that are duplicates or subfolders of another - * - * @param {readonly string[]} folders - * @returns {readonly string[]} - */ -function filterSubfoldersAndDuplicates(folders) { - /** @type {Set} */ - const filtered = new Set() - for (let i = 0; i < folders.length; i++) { - const isSubfolderOfAnotherFolder = !!folders.find((folder, index) => { - if (index === i) return false - // Deduping is done by the Set, if we do it here we don't get either - if (folder === folders[i]) return true - return folders[i].startsWith(folder) - }) - if (!isSubfolderOfAnotherFolder) filtered.add(folders[i]) - } - return Array.from(filtered) -} - -/** - * @param {Hyperdrive} drive - * @param {{signal?: AbortSignal}} [opts] - * @returns {Promise} - */ -async function getBlobsCore(drive, { signal } = {}) { - if (drive.blobs) return drive.blobs.core - const [blobs] = await once(drive, 'blobs', { signal }) - return blobs.core } diff --git a/src/blob-store/entries-stream.js b/src/blob-store/entries-stream.js index e4acb58b..b7bc175d 100644 --- a/src/blob-store/entries-stream.js +++ b/src/blob-store/entries-stream.js @@ -6,33 +6,23 @@ import { Transform } from 'node:stream' /** @import { BlobStoreEntriesStream } from '../types.js' */ const keyEncoding = new SubEncoder('files', 'utf-8') -const kAddDrive = Symbol('addDrive to entries stream') - -/** - * @param {BlobStoreEntriesStream} entriesStream - * @param {Hyperdrive} drive - */ -export function addDriveToEntriesStream(entriesStream, drive) { - // @ts-expect-error - We don't expose this method in the type - entriesStream[kAddDrive](drive) -} /** * - * @param {Array} drives + * @param {import('./index.js').THyperdriveIndex} driveIndex * @param {object} opts * @param {boolean} [opts.live=false] * @returns {BlobStoreEntriesStream} */ -export function createEntriesStream(drives, { live = false } = {}) { +export function createEntriesStream(driveIndex, { live = false } = {}) { const 
mergedEntriesStreams = mergeStreams( - drives.map((drive) => getHistoryStream(drive.db, { live })) + [...driveIndex].map((drive) => getHistoryStream(drive.db, { live })) + ) + driveIndex.on('add-drive', addDrive) + // Close is always emitted, so we can use it to remove the listener + mergedEntriesStreams.once('close', () => + driveIndex.off('add-drive', addDrive) ) - Object.defineProperty(mergedEntriesStreams, kAddDrive, { - value: addDrive, - writable: false, - enumerable: false, - }) return mergedEntriesStreams /** @param {Hyperdrive} drive */ @@ -58,10 +48,10 @@ function getHistoryStream(bee, { live }) { // under the `files` sub-encoding key keyEncoding, }) - return historyStream.pipe(new AddDiscoveryIds(bee.core)) + return historyStream.pipe(new AddDriveIds(bee.core)) } -class AddDiscoveryIds extends Transform { +class AddDriveIds extends Transform { #core #discoveryKey diff --git a/src/blob-store/index.js b/src/blob-store/index.js index 20a6059c..4645f459 100644 --- a/src/blob-store/index.js +++ b/src/blob-store/index.js @@ -1,19 +1,18 @@ import Hyperdrive from 'hyperdrive' import b4a from 'b4a' import util from 'node:util' +import { pipeline } from 'node:stream' import { discoveryKey } from 'hypercore-crypto' -import { addDriveToDownloader, Downloader } from './downloader.js' -import { - addDriveToEntriesStream, - createEntriesStream, -} from './entries-stream.js' -import { IterableWeakSet } from 'weakref' +import { Downloader } from './downloader.js' +import { createEntriesStream } from './entries-stream.js' +import { FilterEntriesStream } from './utils.js' +import { noop } from '../utils.js' +import { TypedEmitter } from 'tiny-typed-emitter' /** @import { JsonObject } from 'type-fest' */ /** @import { Readable as NodeReadable } from 'node:stream' */ /** @import { Readable as StreamxReadable, Writable } from 'streamx' */ /** @import { BlobFilter, BlobId, BlobStoreEntriesStream } from '../types.js' */ -/** @import { BlobDownloadEvents } from './downloader.js' */ /** * @internal @@ -41,57 +40,19 @@ class ErrNotFound extends Error { } export class BlobStore { - /** @type {Map} Indexed by hex-encoded discovery key */ - #hyperdrives = new Map() - #writer - /** @type {IterableWeakSet} */ - #liveDownloaders = new IterableWeakSet() - /** @type {IterableWeakSet} */ - #liveEntriesStreams = new IterableWeakSet() + #driveIndex + /** @type {Downloader} */ + #downloader /** * @param {object} options * @param {import('../core-manager/index.js').CoreManager} options.coreManager + * @param {BlobFilter | null} options.downloadFilter - Filter blob types and/or variants to download. Set to `null` to download all blobs. 
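+   * e.g. `{ photo: ['thumbnail', 'preview'] }` would download only photo
+   * thumbnails and previews.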
*/ - constructor({ coreManager }) { - /** @type {undefined | (Hyperdrive & { key: Buffer })} */ - let writer - const corestore = new PretendCorestore({ coreManager }) - const blobIndexCores = coreManager.getCores('blobIndex') - const { key: writerKey } = coreManager.getWriterCore('blobIndex') - for (const { key } of blobIndexCores) { - // @ts-ignore - we know pretendCorestore is not actually a Corestore - const drive = new Hyperdrive(corestore, key) - // We use the discovery key to derive the id for a drive - this.#hyperdrives.set(getDiscoveryId(key), drive) - if (key.equals(writerKey)) { - writer = proxyProps(drive, { key: writerKey }) - } - } - if (!writer) { - throw new Error('Could not find a writer for the blobIndex namespace') - } - this.#writer = writer - - coreManager.on('add-core', ({ key, namespace }) => { - if (namespace !== 'blobIndex') return - // We use the discovery key to derive the id for a drive - const driveId = getDiscoveryId(key) - if (this.#hyperdrives.has(driveId)) return - // @ts-ignore - we know pretendCorestore is not actually a Corestore - const drive = new Hyperdrive(corestore, key) - this.#hyperdrives.set(driveId, drive) - for (const downloader of this.#liveDownloaders) { - addDriveToDownloader(downloader, drive) - } - for (const entriesStream of this.#liveEntriesStreams) { - try { - addDriveToEntriesStream(entriesStream, drive) - } catch { - // This happens when the stream is already closed, so we can remove our reference. - this.#liveEntriesStreams.delete(entriesStream) - } - } + constructor({ coreManager, downloadFilter }) { + this.#driveIndex = new HyperdriveIndex(coreManager) + this.#downloader = new Downloader(this.#driveIndex, { + filter: downloadFilter, }) } @@ -99,7 +60,7 @@ export class BlobStore { * @returns {string} */ get writerDriveId() { - return getDiscoveryId(this.#writer.key) + return getDiscoveryId(this.#driveIndex.writerKey) } /** @@ -107,7 +68,7 @@ export class BlobStore { * @returns {Hyperdrive} */ #getDrive(driveId) { - const drive = this.#hyperdrives.get(driveId) + const drive = this.#driveIndex.get(driveId) if (!drive) throw new Error('Drive not found ' + driveId.slice(0, 7)) return drive } @@ -128,31 +89,16 @@ export class BlobStore { } /** - * Download blobs from all drives, optionally filtering particular blob types - * or blob variants. Download will be 'live' and will continue downloading new - * data as it becomes available from any replicating drive. + * Set the filter for downloading blobs. * - * If no filter is specified, all blobs will be downloaded. If a filter is - * specified, then _only_ blobs that match the filter will be downloaded. - * - * @param {object} [options] - * @param {import('../types.js').BlobFilter} [options.filter] Filter blob types and/or variants to download. Filter is { [BlobType]: BlobVariants[] }. At least one blob variant must be specified for each blob type. - * @param {boolean} [options.live=false] Set to `true` for a downloader that never ends, and will continue downloading any new data that becomes available. - * @returns {Downloader} + * @param {import('../types.js').BlobFilter | null} filter Filter blob types and/or variants to download. Filter is { [BlobType]: BlobVariants[] }. At least one blob variant must be specified for each blob type. 
+ * @returns {void} */ - download({ filter, live = false } = {}) { - const drives = Array.from(this.#hyperdrives.values()) - const downloader = new Downloader(drives, { + setDownloadFilter(filter) { + this.#downloader.destroy() + this.#downloader = new Downloader(this.#driveIndex, { filter, - live, }) - if (live) { - // If the returned downloader is "live", then we need to add incoming - // drives to it, so we keep a weak reference that will be available for as - // long as the returned downloader is referenced. - this.#liveDownloaders.add(downloader) - } - return downloader } /** @@ -180,18 +126,14 @@ export class BlobStore { * * @param {object} opts * @param {boolean} [opts.live=false] Set to `true` to get a live stream of entries + * @param {import('../types.js').BlobFilter | null} [opts.filter] Filter blob types and/or variants in returned entries. Filter is { [BlobType]: BlobVariants[] }. * @returns */ - createEntriesReadStream({ live = false } = {}) { - const drives = Array.from(this.#hyperdrives.values()) - const entriesStream = createEntriesStream(drives, { live }) - if (live) { - // If the returned entries stream is "live", then we need to add incoming - // drives to it, so we keep a weak reference that will be available for as - // long as the returned entries stream is referenced. - this.#liveEntriesStreams.add(entriesStream) - } - return entriesStream + createEntriesReadStream({ live = false, filter } = {}) { + const entriesStream = createEntriesStream(this.#driveIndex, { live }) + if (!filter) return entriesStream + const filterStream = new FilterEntriesStream(filter) + return pipeline(entriesStream, filterStream, noop) } /** @@ -246,7 +188,7 @@ export class BlobStore { */ async put({ type, variant, name }, blob, options) { const path = makePath({ type, variant, name }) - await this.#writer.put(path, blob, options) + await this.#driveIndex.writer.put(path, blob, options) return this.writerDriveId } @@ -258,7 +200,7 @@ export class BlobStore { */ createWriteStream({ type, variant, name }, options) { const path = makePath({ type, variant, name }) - const stream = this.#writer.createWriteStream(path, options) + const stream = this.#driveIndex.writer.createWriteStream(path, options) return proxyProps(stream, { driveId: this.writerDriveId, }) @@ -276,7 +218,7 @@ export class BlobStore { { type, variant, name, driveId }, options = { follow: false, wait: false } ) { - const drive = this.#hyperdrives.get(driveId) + const drive = this.#driveIndex.get(driveId) if (!drive) throw new Error('Drive not found ' + driveId.slice(0, 7)) const path = makePath({ type, variant, name }) const entry = await drive.entry(path, options) @@ -297,6 +239,66 @@ export class BlobStore { } } +// Don't want to export the class, but do want to export the type. 
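+// e.g. the Downloader can type a parameter as `THyperdriveIndex` even though
+// the class itself can only be constructed within this module.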
+/** @typedef {HyperdriveIndex} THyperdriveIndex */ + +/** + * @extends {TypedEmitter<{ 'add-drive': (drive: Hyperdrive) => void }>} + */ +class HyperdriveIndex extends TypedEmitter { + /** @type {Map} */ + #hyperdrives = new Map() + #writer + #writerKey + /** @param {import('../core-manager/index.js').CoreManager} coreManager */ + constructor(coreManager) { + super() + /** @type {undefined | Hyperdrive} */ + let writer + const corestore = new PretendCorestore({ coreManager }) + const blobIndexCores = coreManager.getCores('blobIndex') + const writerCoreRecord = coreManager.getWriterCore('blobIndex') + this.#writerKey = writerCoreRecord.key + for (const { key } of blobIndexCores) { + // @ts-ignore - we know pretendCorestore is not actually a Corestore + const drive = new Hyperdrive(corestore, key) + // We use the discovery key to derive the id for a drive + this.#hyperdrives.set(getDiscoveryId(key), drive) + if (key.equals(this.#writerKey)) { + writer = drive + } + } + if (!writer) { + throw new Error('Could not find a writer for the blobIndex namespace') + } + this.#writer = writer + + coreManager.on('add-core', ({ key, namespace }) => { + if (namespace !== 'blobIndex') return + // We use the discovery key to derive the id for a drive + const driveId = getDiscoveryId(key) + if (this.#hyperdrives.has(driveId)) return + // @ts-ignore - we know pretendCorestore is not actually a Corestore + const drive = new Hyperdrive(corestore, key) + this.#hyperdrives.set(driveId, drive) + this.emit('add-drive', drive) + }) + } + get writer() { + return this.#writer + } + get writerKey() { + return this.#writerKey + } + [Symbol.iterator]() { + return this.#hyperdrives.values() + } + /** @param {string} driveId */ + get(driveId) { + return this.#hyperdrives.get(driveId) + } +} + /** * @template {object} T * @template {object} U diff --git a/src/blob-store/utils.js b/src/blob-store/utils.js new file mode 100644 index 00000000..ac41d8a7 --- /dev/null +++ b/src/blob-store/utils.js @@ -0,0 +1,71 @@ +/** + * This is a more generic version of the BlobFilter type that can filter unknown + * blob types and variants from the blob store. + * + * @typedef {{ [type: string]: readonly string[] }} GenericBlobFilter + */ + +import { Transform } from 'node:stream' + +/** + * Convert a filter to an array of path prefixes that match the filter. These + * path prefixes can be used to filter entries by + * `entry.key.startsWith(pathPrefix)`. 
+ * + * @param {GenericBlobFilter} filter + * @returns {readonly string[]} array of folders that match the filter + */ +export function pathPrefixesFromFilter(filter) { + const pathPrefixes = [] + for (const [type, variants] of Object.entries(filter)) { + const dedupedVariants = new Set(variants) + for (const variant of dedupedVariants) { + pathPrefixes.push(`/${type}/${variant}/`) + } + } + return filterSubfoldersAndDuplicates(pathPrefixes) +} + +/** @type {import("../types.js").BlobStoreEntriesStream} */ +export class FilterEntriesStream extends Transform { + #pathPrefixes + /** @param {GenericBlobFilter} filter */ + constructor(filter) { + super({ objectMode: true }) + this.#pathPrefixes = pathPrefixesFromFilter(filter) + } + /** + * @param {import("hyperdrive").HyperdriveEntry} entry + * @param {Parameters[1]} _ + * @param {Parameters[2]} callback + */ + _transform(entry, _, callback) { + const { key: filePath } = entry + const isIncludedInFilter = this.#pathPrefixes.some((pathPrefix) => + filePath.startsWith(pathPrefix) + ) + if (isIncludedInFilter) this.push(entry) + callback() + } +} + +/** + * Take an array of folders, remove any folders that are duplicates or subfolders of another + * + * @param {readonly string[]} folders + * @returns {readonly string[]} + */ +function filterSubfoldersAndDuplicates(folders) { + /** @type {Set} */ + const filtered = new Set() + for (let i = 0; i < folders.length; i++) { + const isSubfolderOfAnotherFolder = !!folders.find((folder, index) => { + if (index === i) return false + // Deduping is done by the Set, if we do it here we don't get either + if (folder === folders[i]) return true + return folders[i].startsWith(folder) + }) + if (!isSubfolderOfAnotherFolder) filtered.add(folders[i]) + } + return Array.from(filtered) +} diff --git a/types/hyperdrive.d.ts b/types/hyperdrive.d.ts index a6549d69..a8a8269b 100644 --- a/types/hyperdrive.d.ts +++ b/types/hyperdrive.d.ts @@ -69,7 +69,7 @@ declare module 'hyperdrive' { path: string, opts?: HyperdriveGetOpts ): Promise - getBlobs(): Promise + getBlobs(): Promise get( path: string, opts?: { follow?: boolean } & HyperdriveGetOpts From 6fd542bbf8b1d5a6a51a2ac2097c35406657d3d3 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Mon, 28 Oct 2024 20:30:52 +0000 Subject: [PATCH 08/25] other small fixes --- src/blob-store/downloader.js | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index 89d2ffe9..226fc8f3 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -68,12 +68,9 @@ export class Downloader extends TypedEmitter { const drive = this.#driveIndex.get(driveId) // ERROR HANDLING: this is unexpected and should not happen if (!drive) throw new Error('Drive not found: ' + driveId) - // ERROR HANDLING: this should not throw const blobs = await drive.getBlobs() this.#ac.signal.throwIfAborted() - // ERROR HANDLING: this will throw if core.has() throws, which should not happen await this.#processEntry(blobs.core, blob) - this.#ac.signal.throwIfAborted() // This loop will never end unless thrown, since this is a live stream } } From edd8d152af463d2b818add120ee54918162c0681 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Mon, 28 Oct 2024 20:33:37 +0000 Subject: [PATCH 09/25] cleanup package-lock --- package-lock.json | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/package-lock.json b/package-lock.json index 8b5b0892..dc930d6f 100644 --- a/package-lock.json +++ b/package-lock.json @@ 
-7473,15 +7473,13 @@ "integrity": "sha512-gC13b/bWrqQoKY2EmROCZ+AR0jitc6DnDGaQ6Ls9QpKmuSgJB1eQ7H3KETtQm7qSdMWMKCmsshyCmUwMLh3OAA==" }, "node_modules/protomux": { - "version": "3.10.0", - "resolved": "https://registry.npmjs.org/protomux/-/protomux-3.10.0.tgz", - "integrity": "sha512-YjNhvdYWI5HGbbUUKRR8DT3mg+RaZQT6V5T83ktd4veAdHg0CLvevcs33wo8rjdEwCnRaSNXkzlw48tbSaPVag==", + "version": "3.4.1", + "license": "MIT", "dependencies": { "b4a": "^1.3.1", "compact-encoding": "^2.5.1", "queue-tick": "^1.0.0", - "safety-catch": "^1.0.1", - "unslab": "^1.3.0" + "safety-catch": "^1.0.1" } }, "node_modules/proxy-addr": { @@ -9597,16 +9595,7 @@ }, "node_modules/unix-path-resolve": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/unix-path-resolve/-/unix-path-resolve-1.0.2.tgz", - "integrity": "sha512-kG4g5nobBBaMnH2XbrS4sLUXEpx4nY2J3C6KAlAUcnahG2HChxSPVKWYrqEq76iTo+cyMkLUjqxGaQR2tz097Q==" - }, - "node_modules/unslab": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/unslab/-/unslab-1.3.0.tgz", - "integrity": "sha512-YATkfKAFj47kTzmiQrWXMyRvaVrHsW6MEALa4bm+FhiA2YG4oira+Z3DXN6LrYOYn2Y8eO94Lwl9DOHjs1FpoQ==", - "dependencies": { - "b4a": "^1.6.6" - } + "license": "MIT" }, "node_modules/uri-js": { "version": "4.4.1", From cb4d1c84fce0775236befd780637607550f01c01 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Tue, 29 Oct 2024 10:03:03 +0000 Subject: [PATCH 10/25] cleanup error handling --- src/blob-store/downloader.js | 32 ++++++++++++++++++++------------ src/blob-store/index.js | 7 ++++++- src/mapeo-project.js | 7 +++++++ 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index 226fc8f3..23ccc1f2 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -4,16 +4,19 @@ import { pathPrefixesFromFilter } from './utils.js' /** @import Hyperdrive from 'hyperdrive' */ -// This class contains a large amount of parallel async code, and contains lots -// of references and some listeners that need to be deferenced when this class -// is finished with (e.g when a download is complete, or there is an error). -// I've highlighted lines which could throw an error which would put the -// downloader in an "error" state. Currently this does not emit an "error" -// event. - /** * Like hyperdrive.download() but 'live', and for multiple drives. * + * Will emit an 'error' event for any unexpected errors. A consumer must attach + * an error listener to avoid uncaught errors. Sources of errors include: + * + * - If the entries stream emits an error + * - If a drive referenced in an entry is not found + * - If core.has() throws (e.g. if hypercore is closed) + * - If core.download().done() throws, which should not happen according to + * current hypercore code. 
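
Since unexpected failures now surface as 'error' events, a consumer must attach a listener. A rough usage sketch (not part of this patch; `driveIndex` construction is elided, and `Downloader` is internal to the blob store):

```javascript
const downloader = new Downloader(driveIndex, { filter: null })
downloader.on('error', (err) => {
  // Without this listener, an unexpected entries-stream or hypercore
  // failure becomes an unhandled 'error' event.
  console.error('background blob download failed', err)
})
// Deliberate teardown aborts quietly and does not emit 'error':
downloader.destroy()
```
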
+ * - If the entries stream ends unexpectedly (it should be live and not end) + * * NB: unlike hyperdrive.download(), this will also download deleted and * previous versions of blobs - we don't currently support editing or deleting * of blobs, so this should not be an issue, and if we do in the future, @@ -44,7 +47,7 @@ export class Downloader extends TypedEmitter { this.#entriesStream = createEntriesStream(driveIndex, { live: true }) this.#entriesStream.once('error', this.#ac.abort) - this.#ac.signal.addEventListener('abort', this.#cleanup, { once: true }) + this.#ac.signal.addEventListener('abort', this.#handleAbort, { once: true }) this.#processEntriesPromise = this.#processEntries() this.#processEntriesPromise.catch(this.#ac.abort) @@ -71,8 +74,8 @@ export class Downloader extends TypedEmitter { const blobs = await drive.getBlobs() this.#ac.signal.throwIfAborted() await this.#processEntry(blobs.core, blob) - // This loop will never end unless thrown, since this is a live stream } + throw new Error('Entries stream ended unexpectedly') } /** @param {string} filePath */ @@ -97,7 +100,7 @@ export class Downloader extends TypedEmitter { download .done() .catch((e) => { - // TODO: emit error rather than abort downloader if error here + // According to the code, this should never throw. this.#ac.abort(e) }) .finally(() => { @@ -112,9 +115,14 @@ export class Downloader extends TypedEmitter { this.#ac.abort() } - #cleanup = () => { + #handleAbort = () => { + const abortReason = this.#ac.signal.reason + const wasAbortedByDestroy = abortReason && abortReason.name === 'AbortError' + if (!wasAbortedByDestroy) { + this.emit('error', abortReason) + } for (const download of this.#queuedDownloads) download.destroy() - this.#ac.signal.removeEventListener('abort', this.#cleanup) + this.#ac.signal.removeEventListener('abort', this.#handleAbort) this.#entriesStream.removeListener('error', this.#ac.abort) // queuedDownloads should always be empty by here anyway, but just in case. this.#queuedDownloads.clear() diff --git a/src/blob-store/index.js b/src/blob-store/index.js index 4645f459..db3fefaf 100644 --- a/src/blob-store/index.js +++ b/src/blob-store/index.js @@ -39,7 +39,8 @@ class ErrNotFound extends Error { } } -export class BlobStore { +/** @extends {TypedEmitter<{ error: (error: Error) => void }>} */ +export class BlobStore extends TypedEmitter { #driveIndex /** @type {Downloader} */ #downloader @@ -50,10 +51,12 @@ export class BlobStore { * @param {BlobFilter | null} options.downloadFilter - Filter blob types and/or variants to download. Set to `null` to download all blobs. 
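
For reference, the two accepted filter shapes, using values that appear later in this patch series:

```javascript
// Archive devices: `null` downloads every blob.
new BlobStore({ coreManager, downloadFilter: null })

// Non-archive devices: only the listed types/variants are downloaded.
new BlobStore({
  coreManager,
  downloadFilter: { photo: ['preview', 'thumbnail'] },
})
```
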
*/ constructor({ coreManager, downloadFilter }) { + super() this.#driveIndex = new HyperdriveIndex(coreManager) this.#downloader = new Downloader(this.#driveIndex, { filter: downloadFilter, }) + this.#downloader.on('error', (error) => this.emit('error', error)) } /** @@ -95,10 +98,12 @@ export class BlobStore { * @returns {void} */ setDownloadFilter(filter) { + this.#downloader.removeAllListeners() this.#downloader.destroy() this.#downloader = new Downloader(this.#driveIndex, { filter, }) + this.#downloader.on('error', (error) => this.emit('error', error)) } /** diff --git a/src/mapeo-project.js b/src/mapeo-project.js index db85d986..70589113 100644 --- a/src/mapeo-project.js +++ b/src/mapeo-project.js @@ -318,6 +318,13 @@ export class MapeoProject extends TypedEmitter { this.#blobStore = new BlobStore({ coreManager: this.#coreManager, + downloadFilter: null, + }) + + this.#blobStore.on('error', (err) => { + // TODO: Handle this error in some way - this error will come from an + // unexpected error with background blob downloads + console.error('BlobStore error', err) }) this.$blobs = new BlobApi({ From 35f297d3d7b97391c214b46131a2c07a04211f80 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Tue, 29 Oct 2024 10:47:48 +0000 Subject: [PATCH 11/25] fix tests, fix bugs --- src/blob-store/downloader.js | 22 +- src/blob-store/entries-stream.js | 5 +- src/blob-store/index.js | 2 +- src/blob-store/utils.js | 4 + test/blob-store/blob-store.js | 214 +++++------------- test/blob-store/live-download.js | 358 ------------------------------- 6 files changed, 77 insertions(+), 528 deletions(-) delete mode 100644 test/blob-store/live-download.js diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index 23ccc1f2..9b1b0963 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -45,12 +45,12 @@ export class Downloader extends TypedEmitter { this.#driveIndex = driveIndex this.#entriesStream = createEntriesStream(driveIndex, { live: true }) - this.#entriesStream.once('error', this.#ac.abort) + this.#entriesStream.once('error', this.#handleError) this.#ac.signal.addEventListener('abort', this.#handleAbort, { once: true }) this.#processEntriesPromise = this.#processEntries() - this.#processEntriesPromise.catch(this.#ac.abort) + this.#processEntriesPromise.catch(this.#handleError) } /** @@ -99,10 +99,8 @@ export class Downloader extends TypedEmitter { this.#queuedDownloads.add(download) download .done() - .catch((e) => { - // According to the code, this should never throw. - this.#ac.abort(e) - }) + // According to the code, this should never throw. 
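
The destroy/abort flow reduces to a small pattern, paraphrased from this hunk (`emitter` stands in for the Downloader instance):

```javascript
const ac = new AbortController()
const handleError = (error) => {
  if (ac.signal.aborted) return // already tearing down, stay quiet
  emitter.emit('error', error) // unexpected failure: surface it
  ac.abort(error)
}
const destroy = () => ac.abort() // deliberate: never emits 'error'
```
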
+ .catch(this.#handleError) .finally(() => { this.#queuedDownloads.delete(download) }) @@ -115,12 +113,14 @@ export class Downloader extends TypedEmitter { this.#ac.abort() } + /** @param {any} error */ + #handleError = (error) => { + if (this.#ac.signal.aborted) return + this.emit('error', error) + this.#ac.abort(error) + } + #handleAbort = () => { - const abortReason = this.#ac.signal.reason - const wasAbortedByDestroy = abortReason && abortReason.name === 'AbortError' - if (!wasAbortedByDestroy) { - this.emit('error', abortReason) - } for (const download of this.#queuedDownloads) download.destroy() this.#ac.signal.removeEventListener('abort', this.#handleAbort) this.#entriesStream.removeListener('error', this.#ac.abort) diff --git a/src/blob-store/entries-stream.js b/src/blob-store/entries-stream.js index b7bc175d..b9a5ebb9 100644 --- a/src/blob-store/entries-stream.js +++ b/src/blob-store/entries-stream.js @@ -1,6 +1,7 @@ import SubEncoder from 'sub-encoder' import mergeStreams from '@sindresorhus/merge-streams' -import { Transform } from 'node:stream' +import { Transform, pipeline } from 'node:stream' +import { noop } from '../utils.js' /** @import Hyperdrive from 'hyperdrive' */ /** @import { BlobStoreEntriesStream } from '../types.js' */ @@ -48,7 +49,7 @@ function getHistoryStream(bee, { live }) { // under the `files` sub-encoding key keyEncoding, }) - return historyStream.pipe(new AddDriveIds(bee.core)) + return pipeline(historyStream, new AddDriveIds(bee.core), noop) } class AddDriveIds extends Transform { diff --git a/src/blob-store/index.js b/src/blob-store/index.js index db3fefaf..1eeddde5 100644 --- a/src/blob-store/index.js +++ b/src/blob-store/index.js @@ -131,7 +131,7 @@ export class BlobStore extends TypedEmitter { * * @param {object} opts * @param {boolean} [opts.live=false] Set to `true` to get a live stream of entries - * @param {import('../types.js').BlobFilter | null} [opts.filter] Filter blob types and/or variants in returned entries. Filter is { [BlobType]: BlobVariants[] }. + * @param {import('./utils.js').GenericBlobFilter | null} [opts.filter] Filter blob types and/or variants in returned entries. Filter is { [BlobType]: BlobVariants[] }. 
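
On the `pipeline()` change above: unlike `.pipe()`, `stream.pipeline()` destroys every stream in the chain when any one of them fails, so the history stream cannot be left dangling if the transform errors. A minimal sketch of the idea (stream names assumed from this patch):

```javascript
import { pipeline } from 'node:stream'

// pipeline() returns the last stream in the chain; the noop callback
// ignores the teardown result because errors are surfaced through the
// returned stream itself.
const noop = () => {}
const entries = pipeline(historyStream, addDriveIdsTransform, noop)
```
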
* @returns */ createEntriesReadStream({ live = false, filter } = {}) { diff --git a/src/blob-store/utils.js b/src/blob-store/utils.js index ac41d8a7..8563ef8f 100644 --- a/src/blob-store/utils.js +++ b/src/blob-store/utils.js @@ -18,6 +18,10 @@ import { Transform } from 'node:stream' export function pathPrefixesFromFilter(filter) { const pathPrefixes = [] for (const [type, variants] of Object.entries(filter)) { + if (variants.length === 0) { + pathPrefixes.push(`/${type}/`) + continue + } const dedupedVariants = new Set(variants) for (const variant of dedupedVariants) { pathPrefixes.push(`/${type}/${variant}/`) diff --git a/test/blob-store/blob-store.js b/test/blob-store/blob-store.js index 73bcef86..7bd5e412 100644 --- a/test/blob-store/blob-store.js +++ b/test/blob-store/blob-store.js @@ -14,7 +14,6 @@ import { BlobStore, SUPPORTED_BLOB_VARIANTS, } from '../../src/blob-store/index.js' -import { setTimeout } from 'node:timers/promises' import { concat } from '../helpers/blob-store.js' import { discoveryKey } from 'hypercore-crypto' import { setTimeout as delay } from 'node:timers/promises' @@ -290,9 +289,9 @@ test('blobStore.writerDriveId', async () => { }) // Tests: -// A) Downloads from peers connected when download() is first called -// B) Downloads from peers connected after download() is first called -test.skip('live download', async function () { +// A) Downloads from peers blobs added before replication +// B) Downloads from peers blobs added after replication +test('download all blobs', async function () { const projectKey = randomBytes(32) const { blobStore: bs1, coreManager: cm1 } = testenv({ projectKey }) const { blobStore: bs2, coreManager: cm2 } = testenv({ projectKey }) @@ -315,17 +314,13 @@ test.skip('live download', async function () { const driveId1 = await bs1.put(blob1Id, blob1) // STEP 2: Replicate CM1 with CM3 const { destroy: destroy1 } = replicate(cm1, cm3) - // STEP 3: Start live download to CM3 - const liveDownload = bs3.download() - // STEP 4: Wait for blobs to be downloaded - await downloaded(liveDownload) - // STEP 5: Replicate CM2 with CM3 + // STEP 3: Replicate CM2 with CM3 const { destroy: destroy2 } = replicate(cm2, cm3) - // STEP 6: Write a blob to CM2 + // STEP 4: Write a blob to CM2 const driveId2 = await bs2.put(blob2Id, blob2) - // STEP 7: Wait for blobs to be downloaded - await downloaded(liveDownload) - // STEP 8: destroy all the replication streams + // STEP 5: Wait for blobs to be downloaded + await delay(200) + // STEP 6: destroy all the replication streams await Promise.all([destroy1(), destroy2()]) // Both blob1 and blob2 (from CM1 and CM2) should have been downloaded to CM3 @@ -341,10 +336,13 @@ test.skip('live download', async function () { ) }) -test.skip('sparse live download', async function () { +test('filtered download, filter changed', async function () { const projectKey = randomBytes(32) const { blobStore: bs1, coreManager: cm1 } = testenv({ projectKey }) - const { blobStore: bs2, coreManager: cm2 } = testenv({ projectKey }) + const { blobStore: bs2, coreManager: cm2 } = testenv({ + projectKey, + downloadFilter: { photo: ['thumbnail', 'preview'] }, + }) const blob1 = randomBytes(TEST_BUF_SIZE) const blob1Id = /** @type {const} */ ({ @@ -371,77 +369,37 @@ test.skip('sparse live download', async function () { const { destroy } = replicate(cm1, cm2) - const liveDownload = bs2.download({ - filter: { photo: ['original', 'preview'] }, - }) - await downloaded(liveDownload) - - await destroy() + // Wait for blobs to be downloaded + await 
delay(200) - assert.deepEqual( - await bs2.get({ ...blob1Id, driveId }), - blob1, - 'blob1 was downloaded' - ) assert.deepEqual( await bs2.get({ ...blob2Id, driveId }), blob2, - 'blob2 was downloaded' + 'preview was downloaded' + ) + assert.deepEqual( + await bs2.get({ ...blob3Id, driveId }), + blob3, + 'thumbnail was downloaded' ) await assert.rejects( - () => bs2.get({ ...blob3Id, driveId }), - 'blob3 was not downloaded' + () => bs2.get({ ...blob1Id, driveId }), + 'original was not downloaded' ) -}) -test.skip('cancelled live download', async function () { - const projectKey = randomBytes(32) - const { blobStore: bs1, coreManager: cm1 } = testenv({ projectKey }) - const { blobStore: bs2, coreManager: cm2 } = testenv({ projectKey }) - const { blobStore: bs3, coreManager: cm3 } = testenv({ projectKey }) + // Change the filter to download all + bs2.setDownloadFilter(null) - const blob1 = randomBytes(TEST_BUF_SIZE) - const blob1Id = /** @type {const} */ ({ - type: 'photo', - variant: 'original', - name: 'blob1', - }) - const blob2 = randomBytes(TEST_BUF_SIZE) - const blob2Id = /** @type {const} */ ({ - type: 'photo', - variant: 'original', - name: 'blob2', - }) - - // STEP 1: Write a blob to CM1 - const driveId1 = await bs1.put(blob1Id, blob1) - // STEP 2: Replicate CM1 with CM3 - const { destroy: destroy1 } = replicate(cm1, cm3) - // STEP 3: Start live download to CM3 - const liveDownload = bs3.download() - // STEP 4: Wait for blobs to be downloaded - await downloaded(liveDownload) - // STEP 5: Cancel download - liveDownload.destroy() - // STEP 6: Replicate CM2 with CM3 - const { destroy: destroy2 } = replicate(cm2, cm3) - // STEP 7: Write a blob to CM2 - const driveId2 = await bs2.put(blob2Id, blob2) - // STEP 8: Wait for blobs to (not) download - await setTimeout(200) - // STEP 9: destroy all the replication streams - await Promise.all([destroy1(), destroy2()]) + // Wait for blobs to be downloaded + await delay(200) - // Both blob1 and blob2 (from CM1 and CM2) should have been downloaded to CM3 assert.deepEqual( - await bs3.get({ ...blob1Id, driveId: driveId1 }), + await bs2.get({ ...blob1Id, driveId }), blob1, - 'blob1 was downloaded' - ) - await assert.rejects( - async () => bs3.get({ ...blob2Id, driveId: driveId2 }), - 'blob2 was not downloaded' + 'original was downloaded' ) + + await destroy() }) test('blobStore.getEntryBlob(driveId, entry)', async () => { @@ -542,29 +500,29 @@ test('blobStore.createEntriesReadStream({ live: false })', async (t) => { return keys } - await t.test('no folders filter, returns everything', async () => { + await t.test('no filter, returns everything', async () => { const expectedKeys = new Set(inputKeys) const entriesStream = blobStore.createEntriesReadStream() const keys = await getKeys(entriesStream) assert.deepEqual(keys, expectedKeys, 'returns all keys') }) - await t.test('[] folders filter, returns everything', async () => { + await t.test('null filter, returns everything', async () => { const expectedKeys = new Set(inputKeys) - const entriesStream = blobStore.createEntriesReadStream({ folders: [] }) + const entriesStream = blobStore.createEntriesReadStream({ filter: null }) const keys = await getKeys(entriesStream) assert.deepEqual(keys, expectedKeys, 'returns all keys') }) - await t.test('single folders filter', async () => { - const folders = ['/photo'] + await t.test('blob type only, returns all variants', async () => { + const filter = { photo: [] } const unexpectedKeys = new Set( - inputKeys.filter((key) => key.startsWith(folders[0])) + 
inputKeys.filter((key) => key.startsWith('/photo')) ) const expectedKeys = new Set( - inputKeys.filter((key) => key.startsWith(addTrailingSlash(folders[0]))) + inputKeys.filter((key) => key.startsWith('/photo/')) ) - const entriesStream = blobStore.createEntriesReadStream({ folders }) + const entriesStream = blobStore.createEntriesReadStream({ filter }) const keys = await getKeys(entriesStream) assert.notDeepEqual( keys, @@ -574,61 +532,29 @@ test('blobStore.createEntriesReadStream({ live: false })', async (t) => { assert.deepEqual(keys, expectedKeys, 'returns expected keys') }) - await t.test('multiple folders filter, no subfolder', async () => { - const folders = ['/video/original', '/photo/preview'] + await t.test('multiple types and variants filter', async () => { + const filter = { + video: ['original'], + photo: ['preview'], + } const expectedKeys = new Set( - inputKeys.filter((key) => - folders.find((folder) => key.startsWith(addTrailingSlash(folder))) + inputKeys.filter( + (key) => + key.startsWith('/video/original/') || + key.startsWith('/photo/preview/') ) ) - const entriesStream = blobStore.createEntriesReadStream({ folders }) - const keys = await getKeys(entriesStream) - assert.deepEqual(keys, expectedKeys, 'returns expected keys') - }) - - await t.test('multiple folders filter, subfolder', async () => { - const folders = ['/photo/original', '/photo'] - const expectedKeys = new Set( - inputKeys.filter((key) => key.startsWith(addTrailingSlash(folders[1]))) - ) - const entriesStream = blobStore.createEntriesReadStream({ folders }) - const keys = await getKeys(entriesStream) - assert.deepEqual(keys, expectedKeys, 'returns expected keys') - }) - - await t.test('folders filter with trailing slashes', async () => { - const folders = ['/photo/original/'] - const expectedKeys = new Set( - inputKeys.filter((key) => key.startsWith(addTrailingSlash(folders[0]))) - ) - const entriesStream = blobStore.createEntriesReadStream({ folders }) - const keys = await getKeys(entriesStream) - assert.deepEqual(keys, expectedKeys, 'returns expected keys') - }) - - await t.test('folders filter without leading slash', async () => { - const folders = ['photo/original'] - const expectedKeys = new Set( - inputKeys.filter((key) => key.startsWith('/photo/original/')) - ) - const entriesStream = blobStore.createEntriesReadStream({ folders }) - const keys = await getKeys(entriesStream) - assert.deepEqual(keys, expectedKeys, 'returns expected keys') - }) - - await t.test('folders filter windows separator', async () => { - const folders = ['C:\\photo\\original'] - const expectedKeys = new Set( - inputKeys.filter((key) => key.startsWith('/photo/original/')) - ) - const entriesStream = blobStore.createEntriesReadStream({ folders }) + const entriesStream = blobStore.createEntriesReadStream({ filter }) const keys = await getKeys(entriesStream) assert.deepEqual(keys, expectedKeys, 'returns expected keys') }) await t.test('folders filter unknown blob type & variant', async () => { - const folders = ['/unknownType', '/photo/unknownVariant'] - const entriesStream = blobStore.createEntriesReadStream({ folders }) + const filter = { + unknownType: [], + photo: ['unknownVariant'], + } + const entriesStream = blobStore.createEntriesReadStream({ filter }) const keys = await getKeys(entriesStream) assert.deepEqual(keys.size, 2) }) @@ -694,36 +620,12 @@ function randomBlobId() { function blobIdToKey({ name, type, variant }) { return `/${type}/${variant}/${name}` } -/** @param {string} path */ -function addTrailingSlash(path) { - 
return path.endsWith('/') ? path : `${path}/` -} /** - * @param {Parameters} args + * @param {Parameters[0] & { downloadFilter?: ConstructorParameters[0]['downloadFilter'] }} opts */ -function testenv(...args) { - const coreManager = createCoreManager(...args) - const blobStore = new BlobStore({ coreManager }) +function testenv({ downloadFilter = null, ...coreManagerOpts } = {}) { + const coreManager = createCoreManager(coreManagerOpts) + const blobStore = new BlobStore({ coreManager, downloadFilter }) return { blobStore, coreManager } } - -/** - * Resolve when liveDownload status is 'downloaded' - * - * @param {import('../../src/blob-store/downloader.js').Downloader} liveDownload - * @returns {Promise} - */ -async function downloaded(liveDownload) { - return new Promise((res) => { - liveDownload.on('state', function onState(state) { - // If liveDownload is created before all cores have been added to the - // replication stream, then initially it will emit `downloaded` (since it - // has downloaded the zero data there is available to download), so we - // also wait for the `downloaded` once data has actually downloaded - if (state.status !== 'downloaded' || state.haveCount === 0) return - liveDownload.off('state', onState) - res() - }) - }) -} diff --git a/test/blob-store/live-download.js b/test/blob-store/live-download.js deleted file mode 100644 index 293985df..00000000 --- a/test/blob-store/live-download.js +++ /dev/null @@ -1,358 +0,0 @@ -import test from 'node:test' -import assert from 'node:assert/strict' -import { DriveLiveDownload } from '../../src/blob-store/live-download.js' -import Hyperdrive from 'hyperdrive' -import Corestore from 'corestore' -import RAM from 'random-access-memory' -import { setTimeout } from 'node:timers/promises' -import { once } from 'node:events' -import { randomBytes } from 'node:crypto' -/** - * @import { - * BlobDownloadState, - * BlobDownloadStateError - * } from '../../src/blob-store/live-download.js' - */ - -// Test with buffers that are 3 times the default blockSize for hyperblobs -const TEST_BUF_SIZE = 3 * 64 * 1024 - -test('live download', async () => { - const { drive1, drive2, replicate } = await testEnv() - - await drive1.put('/foo', randomBytes(TEST_BUF_SIZE)) - const drive1Entry = await drive1.entry('/foo') - assert(drive1Entry) - const { - value: { blob: blob1 }, - } = drive1Entry - - const stream = replicate() - const blobCore2 = (await drive2.getBlobs())?.core - assert(blobCore2) - - const download = new DriveLiveDownload(drive2) - await waitForState(download, 'downloaded') - // Can't use `drive2.get()` here because connected to replication stream, so - // it would download anyway (no `waitFor = false` support for Hyperdrive yet) - assert( - await blobCore2.has( - blob1.blockOffset, - blob1.blockOffset + blob1.blockLength - ), - 'First blob is downloaded' - ) - assert(blob1.blockLength > 1, 'Blob is more than one block length') - - const expected = randomBytes(TEST_BUF_SIZE) - await drive1.put('/bar', expected) - - await waitForState(download, 'downloaded') - stream.destroy() - await once(stream, 'close') - - assert.deepEqual( - await drive2.get('/bar'), - expected, - 'Second blob is downloaded' - ) -}) - -test('sparse live download', async () => { - const { drive1, drive2, replicate } = await testEnv() - - const buf1 = randomBytes(TEST_BUF_SIZE) - const buf2 = randomBytes(TEST_BUF_SIZE) - - await drive1.put('photo/original/one', buf1) - await drive1.put('video/original/one', randomBytes(TEST_BUF_SIZE)) - - const stream = replicate() - - 
const download = new DriveLiveDownload(drive2, { - filter: { photo: ['original'] }, - }) - await waitForState(download, 'downloaded') - - await drive1.put('photo/original/two', buf2) - await drive1.put('video/original/two', randomBytes(TEST_BUF_SIZE)) - await waitForState(download, 'downloaded') - - stream.destroy() - await once(stream, 'close') - - assert.deepEqual(await drive2.get('photo/original/one'), buf1) - assert.deepEqual(await drive2.get('photo/original/two'), buf2) - - await assert.rejects( - drive2.get('video/original/one', { wait: false }), - { - message: /BLOCK_NOT_AVAILABLE/, - }, - 'Block not available' - ) - await assert.rejects( - drive2.get('video/original/two', { wait: false }), - { - message: /BLOCK_NOT_AVAILABLE/, - }, - 'Block not available' - ) -}) - -test('Abort download (same tick)', async () => { - const { drive1, drive2, replicate } = await testEnv() - await drive1.put('/foo', randomBytes(TEST_BUF_SIZE)) - const stream = replicate() - const controller = new AbortController() - const download = new DriveLiveDownload(drive2, { signal: controller.signal }) - controller.abort() - stream.destroy() - await once(stream, 'close') - assert.deepEqual(download.state, { - haveCount: 0, - haveBytes: 0, - wantCount: 0, - wantBytes: 0, - error: null, - status: 'aborted', - }) - assert.equal(await drive2.get('/foo'), null, 'nothing downloaded') -}) - -test('Abort download (next event loop)', async () => { - const { drive1, drive2, replicate } = await testEnv() - await drive1.put('/one', randomBytes(TEST_BUF_SIZE)) - const stream = replicate() - const controller = new AbortController() - const download = new DriveLiveDownload(drive2, { signal: controller.signal }) - // This is the only way to trigger abort before the entryStream loop - await drive2.getBlobs() - controller.abort() - stream.destroy() - await once(stream, 'close') - assert.deepEqual(download.state, { - haveCount: 0, - haveBytes: 0, - wantCount: 0, - wantBytes: 0, - error: null, - status: 'aborted', - }) - await assert.rejects( - drive2.get('/foo', { wait: false }), - { - message: /Block not available locally/, - }, - 'Block not available locally' - ) -}) - -test('Abort download (after initial download)', async () => { - const { drive1, drive2, replicate } = await testEnv() - - const buf1 = randomBytes(TEST_BUF_SIZE) - await drive1.put('/one', buf1) - - const stream = replicate() - const controller = new AbortController() - const download = new DriveLiveDownload(drive2, { signal: controller.signal }) - await waitForState(download, 'downloaded') - - controller.abort() - - await drive1.put('/two', randomBytes(TEST_BUF_SIZE)) - - // Nothing should happen here, but allow some time to see if it does - await setTimeout(200) - - stream.destroy() - await once(stream, 'close') - - assert.deepEqual(await drive2.get('/one'), buf1, 'First blob is downloaded') - await assert.rejects( - drive2.get('/two', { wait: false }), - { - message: /BLOCK_NOT_AVAILABLE/, - }, - 'Second blob is not downloaded' - ) -}) - -test('Live download when data is already downloaded', async () => { - const { drive1, drive2, replicate } = await testEnv() - - const buf1 = randomBytes(20) - await drive1.put('/one', buf1) - - const stream1 = replicate() - - await drive2.db.core.update({ wait: true }) - await drive2.download() - assert.deepEqual(await drive2.get('/one'), buf1, 'First blob is downloaded') - - stream1.destroy() - await once(stream1, 'close') - - const stream2 = replicate() - const download = new DriveLiveDownload(drive2) - await 
waitForState(download, 'downloaded') - assert.deepEqual( - download.state, - { - haveCount: 1, - haveBytes: buf1.byteLength, - wantCount: 0, - wantBytes: 0, - error: null, - status: 'downloaded', - }, - 'Blob already downloaded is included in state' - ) - - const buf2 = randomBytes(TEST_BUF_SIZE) - await drive1.put('/two', buf2) - await waitForState(download, 'downloaded') - - stream2.destroy() - await once(stream2, 'close') - - assert.deepEqual(await drive2.get('/two'), buf2, 'Second blob is downloaded') -}) - -test('Live download continues across disconnection and reconnect', async () => { - const { drive1, drive2, replicate } = await testEnv() - - const buf1 = randomBytes(TEST_BUF_SIZE) - await drive1.put('/one', buf1) - - const stream1 = replicate() - - const download = new DriveLiveDownload(drive2) - await waitForState(download, 'downloaded') - - assert.deepEqual(await drive2.get('/one'), buf1, 'First blob is downloaded') - - stream1.destroy() - await once(stream1, 'close') - - const buf2 = randomBytes(TEST_BUF_SIZE) - await drive1.put('/two', buf2) - - const stream2 = replicate() - await waitForState(download, 'downloaded') - - stream2.destroy() - await once(stream2, 'close') - - assert.deepEqual(await drive2.get('/two'), buf2, 'Second blob is downloaded') -}) - -test('Initial status', async () => { - const { drive1 } = await testEnv() - - const download = new DriveLiveDownload(drive1) - assert.equal( - download.state.status, - 'checking', - "initial status is 'checking'" - ) -}) - -test('Unitialized drive with no data', async () => { - // This test is important because it catches an edge case where a drive might - // have been added by its key, but has never replicated, so it has no data so - // the content feed will never be read from the header, which might result in - // it forever being in the 'checking' status. This tests that we catch this - // and resolve status to 'downloaded'. 
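
With `DriveLiveDownload` and its 'state' events deleted, the rewritten tests wait on fixed `delay(200)` calls. A polling helper is one hypothetical alternative until a waitForSync-style signal exists (this helper is not part of the patch series):

```javascript
import { setTimeout as delay } from 'node:timers/promises'

async function waitUntil(predicate, { timeout = 5000, step = 50 } = {}) {
  const deadline = Date.now() + timeout
  while (Date.now() < deadline) {
    if (await predicate()) return
    await delay(step)
  }
  throw new Error('waitUntil: timed out')
}
```
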
- const { drive2 } = await testEnv() - const download = new DriveLiveDownload(drive2) - await waitForState(download, 'downloaded') - assert.equal( - download.state.status, - 'downloaded', - 'uninitialized drive without peers results in `downloaded` state' - ) -}) - -test('live download started before initial replication', async () => { - const { drive1, drive2, replicate } = await testEnv() - - await drive1.put('/foo', randomBytes(TEST_BUF_SIZE)) - const drive1Entry = await drive1.entry('/foo') - assert(drive1Entry) - const { - value: { blob: blob1 }, - } = drive1Entry - - const download = new DriveLiveDownload(drive2) - await waitForState(download, 'downloaded') - // initially drive2 is not replicating and empty, so we expect a 'downloaded' status - assert.equal(download.state.status, 'downloaded') - - const stream = replicate() - const blobCore2 = (await drive2.getBlobs())?.core - assert(blobCore2) - await waitForState(download, 'downloaded') - - // Can't use `drive2.get()` here because connected to replication stream, so - // it would download anyway (no `waitFor = false` support for Hyperdrive yet) - assert( - await blobCore2.has( - blob1.blockOffset, - blob1.blockOffset + blob1.blockLength - ), - 'First blob is downloaded' - ) - assert(blob1.blockLength > 1, 'Blob is more than one block length') - - const expected = randomBytes(TEST_BUF_SIZE) - await drive1.put('/bar', expected) - - await waitForState(download, 'downloaded') - stream.destroy() - await once(stream, 'close') - - assert.deepEqual( - await drive2.get('/bar'), - expected, - 'Second blob is downloaded' - ) -}) - -/** - * @param {DriveLiveDownload} download - * @param {(BlobDownloadState | BlobDownloadStateError)['status']} status - * @returns {Promise} - */ -async function waitForState(download, status) { - return new Promise((res) => { - download.on('state', function onState(state) { - // console.log('download state', state) - if (state.status !== status) return - download.off('state', onState) - res() - }) - }) -} - -async function testEnv() { - const store1 = new Corestore(() => new RAM()) - const store2 = new Corestore(() => new RAM()) - const drive1 = new Hyperdrive(store1) - await drive1.ready() - const drive2 = new Hyperdrive(store2, drive1.key) - await drive2.ready() - - function replicate() { - const s = store1.replicate(true) - s.pipe(store2.replicate(false)).pipe(s) - return s - } - - return { - drive1, - drive2, - replicate, - } -} From 81c925396e1ceb041566b184f03fedbb579ebf01 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Tue, 29 Oct 2024 10:50:20 +0000 Subject: [PATCH 12/25] fix method name change --- src/fastify-plugins/blobs.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fastify-plugins/blobs.js b/src/fastify-plugins/blobs.js index 894595c6..34e41b9c 100644 --- a/src/fastify-plugins/blobs.js +++ b/src/fastify-plugins/blobs.js @@ -93,7 +93,7 @@ async function routes(fastify, options) { let blobStream try { - blobStream = await blobStore.createEntryReadStream(driveId, entry) + blobStream = await blobStore.createReadStreamFromEntry(driveId, entry) } catch (e) { reply.code(404) throw e From b8470840f46bd39c3e5723b8ba180529cc994210 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Tue, 29 Oct 2024 10:55:26 +0000 Subject: [PATCH 13/25] add blobStore.close() --- src/blob-store/index.js | 5 +++++ src/mapeo-project.js | 1 + 2 files changed, 6 insertions(+) diff --git a/src/blob-store/index.js b/src/blob-store/index.js index 1eeddde5..b1f73003 100644 --- a/src/blob-store/index.js 
+++ b/src/blob-store/index.js @@ -242,6 +242,11 @@ export class BlobStore extends TypedEmitter { return drive.clear(path, options) } + + close() { + this.#downloader.removeAllListeners() + this.#downloader.destroy() + } } // Don't want to export the class, but do want to export the type. diff --git a/src/mapeo-project.js b/src/mapeo-project.js index 70589113..66baaaf7 100644 --- a/src/mapeo-project.js +++ b/src/mapeo-project.js @@ -442,6 +442,7 @@ export class MapeoProject extends TypedEmitter { */ async close() { this.#l.log('closing project %h', this.#projectId) + this.#blobStore.close() const dataStorePromises = [] for (const dataStore of Object.values(this.#dataStores)) { dataStorePromises.push(dataStore.close()) From d26ef47fed39ed8b1d5005d6c0983f89a94e296c Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Tue, 29 Oct 2024 11:50:50 +0000 Subject: [PATCH 14/25] setDownloadFilter based on archive device setting --- src/mapeo-project.js | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/mapeo-project.js b/src/mapeo-project.js index 6acc70ca..dd95b1c2 100644 --- a/src/mapeo-project.js +++ b/src/mapeo-project.js @@ -72,6 +72,13 @@ export const kIsArchiveDevice = Symbol('isArchiveDevice (temp - test only)') const EMPTY_PROJECT_SETTINGS = Object.freeze({}) +/** @type {import('./types.js').BlobFilter} */ +const NON_ARCHIVE_DEVICE_DOWNLOAD_FILTER = { + photo: ['preview', 'thumbnail'], + // Don't download any audio of video files, since previews and + // thumbnails aren't supported yet. +} + /** * @extends {TypedEmitter<{ close: () => void }>} */ @@ -326,7 +333,9 @@ export class MapeoProject extends TypedEmitter { this.#blobStore = new BlobStore({ coreManager: this.#coreManager, - downloadFilter: null, + downloadFilter: isArchiveDevice + ? null + : NON_ARCHIVE_DEVICE_DOWNLOAD_FILTER, }) this.#blobStore.on('error', (err) => { @@ -664,6 +673,10 @@ export class MapeoProject extends TypedEmitter { /** @param {boolean} isArchiveDevice */ async [kSetIsArchiveDevice](isArchiveDevice) { + if (this.#isArchiveDevice === isArchiveDevice) return + this.#blobStore.setDownloadFilter( + isArchiveDevice ? 
null : NON_ARCHIVE_DEVICE_DOWNLOAD_FILTER + ) this.#isArchiveDevice = isArchiveDevice } From 9d936a2318a8fe3446ba71b480a8d158a8d633e1 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Tue, 29 Oct 2024 11:56:34 +0000 Subject: [PATCH 15/25] don't auto-download blobs in core-manager --- src/core-manager/index.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core-manager/index.js b/src/core-manager/index.js index 60db08cf..9a20341c 100644 --- a/src/core-manager/index.js +++ b/src/core-manager/index.js @@ -281,7 +281,8 @@ export class CoreManager extends TypedEmitter { keyPair, encryptionKey: this.#encryptionKeys[namespace], }) - if (this.#autoDownload) { + if (this.#autoDownload && namespace !== 'blob') { + // Blob downloads are managed by BlobStore core.download({ start: 0, end: -1 }) } // Every peer adds a listener, so could have many peers From 1d2489c7ee2068da8313753bbe14efc364237ead Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Mon, 4 Nov 2024 00:37:52 +0000 Subject: [PATCH 16/25] Remove unnecessary comment --- types/hyperdrive.d.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/types/hyperdrive.d.ts b/types/hyperdrive.d.ts index a8a8269b..a795cb05 100644 --- a/types/hyperdrive.d.ts +++ b/types/hyperdrive.d.ts @@ -57,7 +57,7 @@ declare module 'hyperdrive' { readonly key: Buffer | null readonly discoveryKey: Buffer | null readonly contentKey: Buffer | null // The public key of the Hyperblobs instance holding blobs associated with entries in the drive. - readonly db: Hyperbee // Hyperbee + readonly db: Hyperbee readonly version: number ready(): Promise update(options?: { wait?: boolean }): Promise From 98c9dd359627a2bf781bbc967e158cafbe46b6d4 Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Mon, 4 Nov 2024 00:38:43 +0000 Subject: [PATCH 17/25] Run npm install --- package-lock.json | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/package-lock.json b/package-lock.json index 87f768e5..648094bf 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1180,19 +1180,6 @@ "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.29.6.tgz", "integrity": "sha512-aX5IFYWlMa7tQ8xZr3b2gtVReCvg7f3LEhjir/JAjX2bJCMVJA5tIPv30wTD4KDfcwMd7DDYY3hFDeGmOgtrZQ==" }, - "node_modules/@sindresorhus/merge-streams": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/@sindresorhus/merge-streams/-/merge-streams-4.0.0.tgz", - "integrity": "sha512-tlqY9xq5ukxTUZBmoOp+m61cqwQD5pHJtFY3Mn8CA8ps6yghLH/Hw8UPdqg4OLmFW3IFlcXnQNmo/dh8HzXYIQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/@sindresorhus/merge-streams": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/@sindresorhus/merge-streams/-/merge-streams-4.0.0.tgz", From 64966727514caff2968771e70262c72de5519dc9 Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Mon, 4 Nov 2024 00:39:54 +0000 Subject: [PATCH 18/25] Use more accurate types for Hyperbee options --- types/hyperbee.d.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/types/hyperbee.d.ts b/types/hyperbee.d.ts index 3f8ca806..dbb1edaa 100644 --- a/types/hyperbee.d.ts +++ b/types/hyperbee.d.ts @@ -19,11 +19,14 @@ declare module 'hyperbee' { } interface PutOptions { - cas?: (prev: HyperbeeEntry, next: HyperbeeEntry) => boolean + cas?: ( + prev: HyperbeeEntry, + next: HyperbeeEntry + ) => boolean | PromiseLike } interface DelOptions { - cas?: (prev: T) => boolean + cas?: (prev: 
T) => boolean | PromiseLike } interface ReadStreamRange { From a79a0ccf0a7dc8338426d2bc16ebcb066e8cd23d Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Mon, 4 Nov 2024 00:41:19 +0000 Subject: [PATCH 19/25] `getBySeq` is not defined in our version of Hyperbee --- types/hyperbee.d.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/types/hyperbee.d.ts b/types/hyperbee.d.ts index dbb1edaa..9aa4054b 100644 --- a/types/hyperbee.d.ts +++ b/types/hyperbee.d.ts @@ -87,6 +87,10 @@ declare module 'hyperbee' { // readonly discoveryKey: null | Buffer // readonly writable: boolean // readonly readable: boolean + // getBySeq( + // seq: number, + // options?: any + // ): Promise, 'seq'> | null> put( key: string, @@ -95,10 +99,6 @@ declare module 'hyperbee' { ): Promise del(key: string, options?: Hyperbee.DelOptions): Promise get(key: string): Promise | null> - getBySeq( - seq: number, - options?: any - ): Promise, 'seq'> | null> batch(): HyperbeeBatch replicate(isInitiatorOrStream: any): Readable From f0ab9974dc30b244edde7d1532c273603568aca1 Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Mon, 4 Nov 2024 00:43:20 +0000 Subject: [PATCH 20/25] Add a type for error --- src/blob-store/downloader.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index 9b1b0963..799619fb 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -113,7 +113,7 @@ export class Downloader extends TypedEmitter { this.#ac.abort() } - /** @param {any} error */ + /** @param {Error} error */ #handleError = (error) => { if (this.#ac.signal.aborted) return this.emit('error', error) From 13a4314e40126d2a4d6e3b58e14f839d6432a078 Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Tue, 5 Nov 2024 15:52:54 +0000 Subject: [PATCH 21/25] Add another throw if aborted --- src/blob-store/downloader.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index 799619fb..0d6a1cd2 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -74,6 +74,7 @@ export class Downloader extends TypedEmitter { const blobs = await drive.getBlobs() this.#ac.signal.throwIfAborted() await this.#processEntry(blobs.core, blob) + this.#ac.signal.throwIfAborted() } throw new Error('Entries stream ended unexpectedly') } From dcff18bde001a1da4d7c877f9bfd0830652b507b Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Tue, 5 Nov 2024 09:54:13 -0600 Subject: [PATCH 22/25] Clean up AddDriveIds --- src/blob-store/entries-stream.js | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/blob-store/entries-stream.js b/src/blob-store/entries-stream.js index b9a5ebb9..f2556767 100644 --- a/src/blob-store/entries-stream.js +++ b/src/blob-store/entries-stream.js @@ -54,13 +54,13 @@ function getHistoryStream(bee, { live }) { class AddDriveIds extends Transform { #core - #discoveryKey + #cachedDriveId /** @param {import('hypercore')} core */ constructor(core) { super({ objectMode: true }) this.#core = core - this.#discoveryKey = core.discoveryKey?.toString('hex') + this.#cachedDriveId = core.discoveryKey?.toString('hex') } /** @type {Transform['_transform']} */ @@ -68,8 +68,13 @@ class AddDriveIds extends Transform { // Minimal performance optimization to only call toString() once. // core.discoveryKey will always be defined by the time it starts // streaming, but could be null when the instance is first created. 
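
The cleanup below replaces the `||` fallback with an explicit cache: `discoveryKey` is null until the core is ready, and the hex string should only be computed once it becomes available. The idea in isolation (a sketch, not the exact class code):

```javascript
let cachedDriveId // undefined until core.discoveryKey is set

function driveIdFor(core) {
  if (!cachedDriveId) cachedDriveId = core.discoveryKey?.toString('hex')
  return cachedDriveId
}
```
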
- const driveId = - this.#discoveryKey || this.#core.discoveryKey?.toString('hex') + let driveId + if (this.#cachedDriveId) { + driveId = this.#cachedDriveId + } else { + driveId = this.#core.discoveryKey?.toString('hex') + this.#cachedDriveId = driveId + } callback(null, { ...entry, driveId }) } } From 36256319f92e1d821b7fd18dfff1cc0b8a1e3c94 Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Wed, 30 Oct 2024 11:15:46 -0500 Subject: [PATCH 23/25] Invert "does this entry match the filter" logic (#947) Before this change, we turned filters into a list of path prefixes and then checked entry paths against those prefixes. Rough pseudocode: ```javascript function doesEntryMatchFilter({ path }, filter) { const pathPrefixes = pathPrefixesFromFilter(filter) return pathPrefixes.some((prefix) => path.startsWith(prefix)) } ``` For performance and simplicity, I think it's cleaner if we "look up" entry paths in the existing filter object. Rough pseudocode: ```javascript function doesEntryMatchFilter({ path }, filter) { const { type, variant } = parsePath(path) return filter[type]?.includes(variant) } ``` I think this has two advantages: - It's less code. We don't need to worry about de-duping paths (e.g., `/photos/original` versus `/photos/`). We don't need to worry about sketchy paths (e.g., `/photos/garbage/../original`). - It's faster (at least, in theory). Rather than having to iterate over every path prefix, we only need to iterate over the variants of each path. (This could be further optimized with a `Set`.) --- src/blob-store/downloader.js | 16 ++++----- src/blob-store/utils.js | 67 +++++++++++++----------------------- test/blob-store/utils.js | 50 +++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 53 deletions(-) create mode 100644 test/blob-store/utils.js diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index 0d6a1cd2..c4275f24 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -1,8 +1,9 @@ import { TypedEmitter } from 'tiny-typed-emitter' import { createEntriesStream } from './entries-stream.js' -import { pathPrefixesFromFilter } from './utils.js' +import { filePathMatchesFilter } from './utils.js' /** @import Hyperdrive from 'hyperdrive' */ +/** @import { BlobFilter } from '../types.js' */ /** * Like hyperdrive.download() but 'live', and for multiple drives. @@ -32,7 +33,7 @@ export class Downloader extends TypedEmitter { #entriesStream #processEntriesPromise #ac = new AbortController() - #pathPrefixes + #shouldDownloadFile /** * @param {import('./index.js').THyperdriveIndex} driveIndex @@ -41,9 +42,12 @@ export class Downloader extends TypedEmitter { */ constructor(driveIndex, { filter } = {}) { super() - this.#pathPrefixes = filter ? pathPrefixesFromFilter(filter) : [] this.#driveIndex = driveIndex + this.#shouldDownloadFile = filter + ? 
filePathMatchesFilter.bind(null, filter) + : () => true + this.#entriesStream = createEntriesStream(driveIndex, { live: true }) this.#entriesStream.once('error', this.#handleError) @@ -79,12 +83,6 @@ export class Downloader extends TypedEmitter { throw new Error('Entries stream ended unexpectedly') } - /** @param {string} filePath */ - #shouldDownloadFile(filePath) { - if (!this.#pathPrefixes.length) return true - return this.#pathPrefixes.some((prefix) => filePath.startsWith(prefix)) - } - /** * Update state and queue missing entries for download * diff --git a/src/blob-store/utils.js b/src/blob-store/utils.js index 8563ef8f..f9f9e685 100644 --- a/src/blob-store/utils.js +++ b/src/blob-store/utils.js @@ -8,35 +8,38 @@ import { Transform } from 'node:stream' /** - * Convert a filter to an array of path prefixes that match the filter. These - * path prefixes can be used to filter entries by - * `entry.key.startsWith(pathPrefix)`. - * * @param {GenericBlobFilter} filter - * @returns {readonly string[]} array of folders that match the filter + * @param {string} filePath + * @returns {boolean} */ -export function pathPrefixesFromFilter(filter) { - const pathPrefixes = [] - for (const [type, variants] of Object.entries(filter)) { - if (variants.length === 0) { - pathPrefixes.push(`/${type}/`) - continue - } - const dedupedVariants = new Set(variants) - for (const variant of dedupedVariants) { - pathPrefixes.push(`/${type}/${variant}/`) - } +export function filePathMatchesFilter(filter, filePath) { + const pathParts = filePath.split('/', 4) + const [shouldBeEmpty, type, variant] = pathParts + + if (typeof shouldBeEmpty !== 'string' || shouldBeEmpty) return false + + if (!type) return false + if (!Object.hasOwn(filter, type)) return false + + const allowedVariants = filter[type] ?? 
[] + if (allowedVariants.length === 0) { + return pathParts.length >= 3 + } else { + return ( + pathParts.length >= 4 && + typeof variant === 'string' && + allowedVariants.includes(variant) + ) } - return filterSubfoldersAndDuplicates(pathPrefixes) } /** @type {import("../types.js").BlobStoreEntriesStream} */ export class FilterEntriesStream extends Transform { - #pathPrefixes + #isIncludedInFilter /** @param {GenericBlobFilter} filter */ constructor(filter) { super({ objectMode: true }) - this.#pathPrefixes = pathPrefixesFromFilter(filter) + this.#isIncludedInFilter = filePathMatchesFilter.bind(null, filter) } /** * @param {import("hyperdrive").HyperdriveEntry} entry @@ -45,31 +48,7 @@ export class FilterEntriesStream extends Transform { */ _transform(entry, _, callback) { const { key: filePath } = entry - const isIncludedInFilter = this.#pathPrefixes.some((pathPrefix) => - filePath.startsWith(pathPrefix) - ) - if (isIncludedInFilter) this.push(entry) + if (this.#isIncludedInFilter(filePath)) this.push(entry) callback() } } - -/** - * Take an array of folders, remove any folders that are duplicates or subfolders of another - * - * @param {readonly string[]} folders - * @returns {readonly string[]} - */ -function filterSubfoldersAndDuplicates(folders) { - /** @type {Set} */ - const filtered = new Set() - for (let i = 0; i < folders.length; i++) { - const isSubfolderOfAnotherFolder = !!folders.find((folder, index) => { - if (index === i) return false - // Deduping is done by the Set, if we do it here we don't get either - if (folder === folders[i]) return true - return folders[i].startsWith(folder) - }) - if (!isSubfolderOfAnotherFolder) filtered.add(folders[i]) - } - return Array.from(filtered) -} diff --git a/test/blob-store/utils.js b/test/blob-store/utils.js new file mode 100644 index 00000000..08516fd9 --- /dev/null +++ b/test/blob-store/utils.js @@ -0,0 +1,50 @@ +import test from 'node:test' +import assert from 'node:assert/strict' +import { filePathMatchesFilter } from '../../src/blob-store/utils.js' + +test('filePathMatchesFilter', () => { + const filter = { + photo: ['a', 'b'], + video: [], + } + + const shouldMatch = [ + '/photo/a/foo.jpg', + '/photo/b/foo.jpg', + '/photo/a/', + '/video/foo.mp4', + '/video/foo/bar.mp4', + '/video/', + '/video///', + ] + for (const filePath of shouldMatch) { + assert( + filePathMatchesFilter(filter, filePath), + `${filePath} matches filter` + ) + } + + const shouldntMatch = [ + '/photo/c/foo.jpg', + '/photo/c/', + '/photo/a', + '/photo/ax/foo.jpg', + '/photo/c/../a/foo.jpg', + '/photo', + '/photo/', + '/photo//', + '/PHOTO/a/foo.jpg', + '/audio/a/foo.mp3', + 'photo/a/foo.jpg', + '//photo/a/foo.jpg', + ' /photo/a/foo.jpg', + '/hasOwnProperty/', + '/hasOwnProperty/a/foo.jpg', + ] + for (const filePath of shouldntMatch) { + assert( + !filePathMatchesFilter(filter, filePath), + `${filePath} doesn't match filter` + ) + } +}) From 01b045ff2e707a0e64b63591a01f5b1b92601f02 Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Tue, 5 Nov 2024 16:00:04 +0000 Subject: [PATCH 24/25] Clean up a comment --- src/blob-store/downloader.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blob-store/downloader.js b/src/blob-store/downloader.js index c4275f24..857c7d1a 100644 --- a/src/blob-store/downloader.js +++ b/src/blob-store/downloader.js @@ -123,7 +123,7 @@ export class Downloader extends TypedEmitter { for (const download of this.#queuedDownloads) download.destroy() this.#ac.signal.removeEventListener('abort', this.#handleAbort) 
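
For reference, the behaviour of `filePathMatchesFilter` from patch 23, as pinned down by the new unit tests:

```javascript
const filter = { photo: ['a', 'b'], video: [] }

filePathMatchesFilter(filter, '/photo/a/foo.jpg') // true
filePathMatchesFilter(filter, '/video/foo.mp4') // true: empty array = any variant
filePathMatchesFilter(filter, '/photo/c/foo.jpg') // false: variant not listed
filePathMatchesFilter(filter, '/hasOwnProperty/a/foo.jpg') // false: Object.hasOwn guard
```
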
this.#entriesStream.removeListener('error', this.#ac.abort) - // queuedDownloads should always be empty by here anyway, but just in case. + // queuedDownloads is likely to be empty here anyway, but clear just in case. this.#queuedDownloads.clear() this.#entriesStream.destroy() } From d374b20ed21f124803891f61db29e85ddac198e0 Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Tue, 5 Nov 2024 20:45:54 +0000 Subject: [PATCH 25/25] E2E tests for sparse blob downloads --- src/fastify-plugins/blobs.js | 15 ++++++ test-e2e/sync.js | 100 +++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) diff --git a/src/fastify-plugins/blobs.js b/src/fastify-plugins/blobs.js index 34e41b9c..65b00896 100644 --- a/src/fastify-plugins/blobs.js +++ b/src/fastify-plugins/blobs.js @@ -1,9 +1,11 @@ import fp from 'fastify-plugin' import { filetypemime } from 'magic-bytes.js' +import { pEvent } from 'p-event' import { Type as T } from '@sinclair/typebox' import { SUPPORTED_BLOB_VARIANTS } from '../blob-store/index.js' import { HEX_REGEX_32_BYTES, Z_BASE_32_REGEX_32_BYTES } from './constants.js' +import { getErrorMessage } from '../lib/error.js' /** @import { BlobId } from '../types.js' */ @@ -99,6 +101,19 @@ async function routes(fastify, options) { throw e } + try { + await pEvent(blobStream, 'readable', { rejectionEvents: ['error'] }) + } catch (err) { + // This matches [how Hyperblobs checks if a blob is unavailable][0]. + // [0]: https://github.com/holepunchto/hyperblobs/blob/518088d2b828082fd70a276fa2c8848a2cf2a56b/index.js#L49 + if (getErrorMessage(err) === 'Block not available') { + reply.code(404) + throw new Error('Blob not found') + } else { + throw err + } + } + // Extract the 'mimeType' property of the metadata and use it for the response header if found if ( metadata && diff --git a/test-e2e/sync.js b/test-e2e/sync.js index 95fb1a43..627f99f7 100644 --- a/test-e2e/sync.js +++ b/test-e2e/sync.js @@ -180,6 +180,106 @@ test('syncing blobs', async (t) => { }) }) +test('non-archive devices only sync a subset of blobs', async (t) => { + const invitor = createManager('invitor', t) + + const fastify = Fastify() + const fastifyController = new FastifyController({ fastify }) + t.after(() => fastifyController.stop()) + const invitee = createManager('invitee', t, { fastify }) + invitee.setIsArchiveDevice(false) + + const managers = [invitee, invitor] + + await Promise.all([ + invitor.setDeviceInfo({ name: 'invitor', deviceType: 'mobile' }), + invitee.setDeviceInfo({ name: 'invitee', deviceType: 'mobile' }), + fastifyController.start(), + ]) + + const disconnectPeers = connectPeers(managers) + t.after(() => disconnectPeers()) + const projectId = await invitor.createProject({ name: 'Mapeo' }) + await invite({ invitor, invitees: [invitee], projectId }) + + const projects = await Promise.all([ + invitor.getProject(projectId), + invitee.getProject(projectId), + ]) + const [invitorProject, inviteeProject] = projects + + const fixturesPath = new URL('../test/fixtures/', import.meta.url) + const imagesFixturesPath = new URL('images/', fixturesPath) + const photoFixturePaths = { + original: new URL('02-digidem-logo.jpg', imagesFixturesPath).pathname, + preview: new URL('02-digidem-logo-preview.jpg', imagesFixturesPath) + .pathname, + thumbnail: new URL('02-digidem-logo-thumb.jpg', imagesFixturesPath) + .pathname, + } + const audioFixturePath = new URL('blob-api/audio.mp3', fixturesPath).pathname + + const [photoBlob, audioBlob] = await Promise.all([ + invitorProject.$blobs.create( + photoFixturePaths, + 
blobMetadata({ mimeType: 'image/jpeg' }) + ), + invitorProject.$blobs.create( + { original: audioFixturePath }, + blobMetadata({ mimeType: 'audio/mpeg' }) + ), + ]) + + invitorProject.$sync.start() + inviteeProject.$sync.start() + + // TODO: We should replace this with `await waitForSync(projects, 'full')` once + // the following issues are merged: + // + // - + // - + await delay(2000) + + /** + * @param {BlobId} blobId + * @param {string} path + */ + const assertLoads = async (blobId, path) => { + const expectedBytesPromise = fs.readFile(path) + + const originalBlobUrl = await inviteeProject.$blobs.getUrl(blobId) + const response = await request(originalBlobUrl, { reset: true }) + assert.equal(response.statusCode, 200) + assert.deepEqual( + Buffer.from(await response.body.arrayBuffer()), + await expectedBytesPromise, + 'blob makes it to the other side' + ) + } + + /** @param {BlobId} blobId */ + const assert404 = async (blobId) => { + const originalBlobUrl = await inviteeProject.$blobs.getUrl(blobId) + const response = await request(originalBlobUrl, { reset: true }) + assert.equal(response.statusCode, 404, 'blob is not synced') + } + + await Promise.all([ + assert404({ ...photoBlob, variant: 'original' }), + assert404({ ...audioBlob, variant: 'original' }), + // We have to tell TypeScript that the blob's type is "photo", which it + // isn't smart enough to figure out. + assertLoads( + { ...photoBlob, type: 'photo', variant: 'preview' }, + photoFixturePaths.preview + ), + assertLoads( + { ...photoBlob, type: 'photo', variant: 'thumbnail' }, + photoFixturePaths.thumbnail + ), + ]) +}) + test('start and stop sync', async function (t) { // Checks that both peers need to start syncing for data to sync, and that // $sync.stop() actually stops data syncing