From 4c15cf735fdb1e5894d76d4fe1afffac621c0ce6 Mon Sep 17 00:00:00 2001 From: Evan Hahn Date: Tue, 30 Apr 2024 09:29:08 -0500 Subject: [PATCH] fix: force-closing local discovery TCP server could always time out (#498) We have a way to force-close the local discovery TCP server. This was susceptible to the following race condition bug: 1. A connection opens and is established. 2. A duplicate connection opens. [We decide to keep the new one. We start to close the old connection and enqueue a "promotion" of the new one.][2] 3. We close the TCP server. This does not immediately halt the server; it ["stops the server from accepting new connections and keeps existing connections"][docs]. [We iterate through all existing connections and destroy them][3], but the duplicate connection is not yet in this list so it is never destroyed and the server will never close. 4. The old connection is closed and ["promotion" of the new connection completes][4]. It is never destroyed. This caused [intermittent CI failures][ci] and should be fixed by this change. [docs]: https://nodejs.org/docs/latest-v18.x/api/net.html#serverclosecallback [ci]: https://github.com/digidem/mapeo-core-next/actions/runs/7996074898/job/21837815901 [2]: https://github.com/digidem/mapeo-core-next/blob/5e05e35200297435f4ebd33df2c5b1444ed4ef1b/src/discovery/local-discovery.js#L229-L232 [3]: https://github.com/digidem/mapeo-core-next/blob/5e05e35200297435f4ebd33df2c5b1444ed4ef1b/src/discovery/local-discovery.js#L278-L280 [4]: https://github.com/digidem/mapeo-core-next/blob/5e05e35200297435f4ebd33df2c5b1444ed4ef1b/src/discovery/local-discovery.js#L173 --- src/discovery/local-discovery.js | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/discovery/local-discovery.js b/src/discovery/local-discovery.js index d498788b0..b6f3230b9 100644 --- a/src/discovery/local-discovery.js +++ b/src/discovery/local-discovery.js @@ -226,6 +226,16 @@ export class LocalDiscovery extends TypedEmitter { return } } + + // Bail if the server has already stopped by this point. This should be + // rare but can happen during connection swaps if the new connection is + // "promoted" after the server's doors have already been closed. + if (!this.#server.listening) { + this.#log('server stopped, destroying connection %h', remotePublicKey) + conn.destroy() + return + } + this.#noiseConnections.set(remoteId, conn) conn.on('close', () => {