Skip to content

Commit

Permalink
Data Explorer: Support opening gzipped csv files with DuckDB by decom…
Browse files Browse the repository at this point in the history
…pressing in memory in NodeJS (#5901)

Addresses #5332. This works around duckdb-wasm's limitation that it
cannot read directly from gzipped CSV/TSV files by decompressing the file
in memory and registering the result as a virtual file. This is fine for
small files, but very large gzipped files will feel a bit slow to open and
use a lot of memory (we will have to tackle that problem via #5889).

### Release Notes

#### New Features

- Support opening gzipped CSV/TSV files (those with a .gz extension) in
the data explorer from the file explorer pane (#5332).

#### Bug Fixes

- N/A

e2e: @:data-explorer
  • Loading branch information
wesm authored Jan 8, 2025
1 parent 0823a33 commit 8d2adc8
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 7 deletions.
25 changes: 20 additions & 5 deletions extensions/positron-duckdb/src/extension.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ import {
import * as duckdb from '@duckdb/duckdb-wasm';
import * as path from 'path';
import * as fs from 'fs';
import * as zlib from 'zlib';
import Worker from 'web-worker';
import { Table, Vector } from 'apache-arrow';
import { pathToFileURL } from 'url';
Expand Down Expand Up @@ -417,6 +418,7 @@ class ColumnProfileEvaluator {
const numRows = Number(stats.get('num_rows'));
const nullCount = Number(stats.get(`null_count_${field}`));


return {
values,
counts,
Expand Down Expand Up @@ -1243,6 +1245,12 @@ export class DataExplorerRpcHandler {
* @param catalogName The table name to use in the DuckDB catalog.
*/
async createTableFromFilePath(filePath: string, catalogName: string) {
let fileExt = path.extname(filePath);
const isGzipped = fileExt === '.gz';

if (isGzipped) {
fileExt = path.extname(filePath.slice(0, -3));
}

const getCsvImportQuery = (_filePath: string, options: Array<String>) => {
return `CREATE OR REPLACE TABLE ${catalogName} AS
Expand Down Expand Up @@ -1273,15 +1281,22 @@ export class DataExplorerRpcHandler {
}
};

const fileExt = path.extname(filePath);

// Read the entire contents and register it as a temp file
// to avoid file handle caching in duckdb-wasm
const fileContents = fs.readFileSync(filePath, { encoding: null });
const virtualPath = path.basename(filePath);
let fileContents = fs.readFileSync(filePath, { encoding: null });
if (isGzipped) {
fileContents = zlib.gunzipSync(fileContents);
}

// For gzipped files, use the base name without the .gz extension
const virtualPath = isGzipped ?
path.basename(filePath, '.gz') :
path.basename(filePath);

await this.db.db.registerFileBuffer(virtualPath, fileContents);
try {
if (fileExt === '.parquet' || fileExt === '.parq') {
const baseExt = path.extname(virtualPath);
if (baseExt === '.parquet' || baseExt === '.parq') {
// Always create a view for Parquet files
const query = `CREATE OR REPLACE TABLE ${catalogName} AS
SELECT * FROM parquet_scan('${virtualPath}');`;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { PositronDataExplorerEditor } from './positronDataExplorerEditor.js';
import { PositronDataExplorerEditorInput } from './positronDataExplorerEditorInput.js';
import { registerPositronDataExplorerActions } from './positronDataExplorerActions.js';
import { extname } from '../../../../base/common/resources.js';
import { posix } from '../../../../base/common/path.js';
import { IPositronDataExplorerService } from '../../../services/positronDataExplorer/browser/interfaces/positronDataExplorerService.js';
import { PositronDataExplorerUri } from '../../../services/positronDataExplorer/common/positronDataExplorerUri.js';

Expand Down Expand Up @@ -76,15 +77,20 @@ class PositronDataExplorerContribution extends Disposable {
}
));

const DUCKDB_SUPPORTED_EXTENSIONS = ['parquet', 'parq', 'csv', 'tsv'];
const DUCKDB_SUPPORTED_EXTENSIONS = ['parquet', 'parq', 'csv', 'tsv', 'gz'];

this._register(editorResolverService.registerEditor(
`*.{${DUCKDB_SUPPORTED_EXTENSIONS.join(',')}}`,
editorInfo,
{
singlePerResource: true,
canSupportResource: resource => {
return DUCKDB_SUPPORTED_EXTENSIONS.includes(extname(resource).substring(1));
let fileExt = extname(resource).substring(1);
if (fileExt === 'gz') {
// Strip the .gz and get the actual extension
fileExt = posix.extname(resource.path.slice(0, -3)).substring(1);
}
return DUCKDB_SUPPORTED_EXTENSIONS.includes(fileExt);
}
},
{
Expand Down

0 comments on commit 8d2adc8

Please sign in to comment.