From a66d65ea08f46253268c0c1597e296c17c9c07f8 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Tue, 7 Jan 2025 12:48:30 -0600
Subject: [PATCH] Data Explorer: Always read fresh file contents when ingesting
 a file into duckdb (#5890)

Addresses #5860. duckdb-wasm has some kind of file caching behavior that is
not resilient to file changes on the underlying file system. This patch nukes
this issue from orbit by always registering the file contents as a virtual
file and reading from that. I think if and when we migrate to the duckdb
NodeJS bindings per #5889, we can avoid this chicanery. This does the same
thing for Parquet files, too, since they would also become stale.

e2e: @:data-explorer

### QA Notes

Here was the reproducible failure from the issue:

* Run `iris |> readr::write_csv("test.csv")`
* Click on the file in the File Explorer to open it up; note that we
  correctly see the iris data
* Run `mtcars |> readr::write_csv("test.csv")`
* Again click on the file in the File Explorer to open it up; note that we
  incorrectly still see the iris data
---
 extensions/positron-duckdb/src/extension.ts | 37 +++++++++------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/extensions/positron-duckdb/src/extension.ts b/extensions/positron-duckdb/src/extension.ts
index 7eb9d1ed2ef..d6fb2118a44 100644
--- a/extensions/positron-duckdb/src/extension.ts
+++ b/extensions/positron-duckdb/src/extension.ts
@@ -1221,7 +1221,7 @@ export class DataExplorerRpcHandler {
 		fs.watchFile(filePath, { interval: 1000 }, async () => {
 			const newTableName = `positron_${this._tableIndex++}`;
-			await this.createTableFromFilePath(filePath, newTableName, true);
+			await this.createTableFromFilePath(filePath, newTableName);
 			const newSchema = (await this.db.runQuery(`DESCRIBE ${newTableName};`)).toArray();
 			await tableView.onFileUpdated(newTableName, newSchema);
@@ -1241,11 +1241,8 @@ export class DataExplorerRpcHandler {
 	/**
 	 * Import data file into DuckDB by creating table or view
 	 * @param filePath The file path on disk.
	 * @param catalogName The table name to use in the DuckDB catalog.
-	 * @param forceRefresh If true, force an uncached read from disk to circumvent DuckDB's
-	 * file handle caching.
 	 */
-	async createTableFromFilePath(filePath: string, catalogName: string,
-		forceRefresh: boolean = false) {
+	async createTableFromFilePath(filePath: string, catalogName: string) {
 		const getCsvImportQuery = (_filePath: string, options: Array<string>) => {
 			return `CREATE OR REPLACE TABLE ${catalogName} AS
@@ -1278,24 +1275,22 @@ export class DataExplorerRpcHandler {
 		const fileExt = path.extname(filePath);
-		if (fileExt === '.parquet' || fileExt === '.parq') {
-			// Always create a view for Parquet files
-			const query = `CREATE VIEW ${catalogName} AS
-				SELECT * FROM parquet_scan('${filePath}');`;
-			await this.db.runQuery(query);
-		} else if (forceRefresh) {
-			// For smaller text files, read the entire contents and register it as a temp file
-			// to avoid caching in duckdb-wasm
-			const fileContents = fs.readFileSync(filePath, { encoding: null });
-			const virtualPath = path.basename(filePath);
-			await this.db.db.registerFileBuffer(virtualPath, fileContents);
-			try {
+		// Read the entire contents and register it as a temp file
+		// to avoid file handle caching in duckdb-wasm
+		const fileContents = fs.readFileSync(filePath, { encoding: null });
+		const virtualPath = path.basename(filePath);
+		await this.db.db.registerFileBuffer(virtualPath, fileContents);
+		try {
+			if (fileExt === '.parquet' || fileExt === '.parq') {
+				// Always create a view for Parquet files
+				const query = `CREATE OR REPLACE TABLE ${catalogName} AS
+					SELECT * FROM parquet_scan('${virtualPath}');`;
+				await this.db.runQuery(query);
+			} else {
 				await importDelimited(virtualPath);
-			} finally {
-				await this.db.db.dropFile(virtualPath);
 			}
-		} else {
-			await importDelimited(filePath);
+		} finally {
+			await this.db.db.dropFile(virtualPath);
 		}
 	}
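
For reviewers, here is a minimal standalone sketch of the pattern the commit message describes, using the public duckdb-wasm API directly: read the current bytes from disk, register them as a virtual file, import through the virtual path, and drop the buffer in a `finally` block so nothing stale is reused on the next ingest. This is illustrative only; the `ingestFresh` helper, its `db`/`conn` parameters, and the simplified `read_csv_auto` import are assumptions standing in for the extension's actual `DataExplorerRpcHandler` plumbing.

```typescript
import * as fs from 'fs';
import * as path from 'path';
import * as duckdb from '@duckdb/duckdb-wasm';

// Hypothetical helper showing the ingest-fresh pattern: always read the
// current bytes from disk, expose them to duckdb-wasm as a virtual file,
// import through that virtual path, then drop the buffer so no cached
// file handle or stale contents can be reused later.
async function ingestFresh(
	db: duckdb.AsyncDuckDB,
	conn: duckdb.AsyncDuckDBConnection,
	filePath: string,
	catalogName: string
): Promise<void> {
	// Node's Buffer is a Uint8Array, which registerFileBuffer accepts.
	const fileContents = fs.readFileSync(filePath);
	const virtualPath = path.basename(filePath);
	await db.registerFileBuffer(virtualPath, fileContents);
	try {
		const ext = path.extname(filePath);
		if (ext === '.parquet' || ext === '.parq') {
			// Materialize a table so the data survives dropping the virtual file.
			await conn.query(`CREATE OR REPLACE TABLE ${catalogName} AS
				SELECT * FROM parquet_scan('${virtualPath}');`);
		} else {
			// Simplified CSV path; the extension builds its options explicitly.
			await conn.query(`CREATE OR REPLACE TABLE ${catalogName} AS
				SELECT * FROM read_csv_auto('${virtualPath}');`);
		}
	} finally {
		// Unregister the buffer; the next call re-reads the file from disk.
		await db.dropFile(virtualPath);
	}
}
```

Materializing with `CREATE OR REPLACE TABLE` (rather than a view) matters here because the virtual file is dropped immediately after the import, so a view over it would go stale.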