diff --git a/Cargo.lock b/Cargo.lock index f342e23..0ec1460 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,11 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "ahash" version = "0.4.7" @@ -12,7 +18,7 @@ version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" dependencies = [ - "memchr", + "memchr 2.3.4", ] [[package]] @@ -63,7 +69,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a40b47ad93e1a5404e6c18dec46b628214fee441c70f4ab5d6942142cc268a3d" dependencies = [ "lazy_static", - "memchr", + "memchr 2.3.4", "regex-automata", "serde", ] @@ -74,6 +80,12 @@ version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + [[package]] name = "cfg-if" version = "1.0.0" @@ -145,6 +157,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "crc32fast" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "csv" version = "1.1.6" @@ -164,7 +185,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" dependencies = [ - "memchr", + "memchr 2.3.4", ] [[package]] @@ -195,6 +216,24 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "fixedbitset" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" + +[[package]] +name = "flate2" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" +dependencies = [ + "cfg-if 1.0.0", + "crc32fast", + "libc", + "miniz_oxide", +] + [[package]] name = "float-cmp" version = "0.8.0" @@ -204,6 +243,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "fuchsia-cprng" version = "0.1.1" @@ -216,7 +261,7 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "wasi", ] @@ -295,13 +340,22 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "lock_api" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -310,12 +364,40 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" +[[package]] +name = "memchr" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a" +dependencies = [ + "libc", +] + [[package]] name = "memchr" version = "2.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +[[package]] +name = "miniz_oxide" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" +dependencies = [ + "adler", + "autocfg 1.0.1", +] + +[[package]] +name = "nom" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05aec50c70fd288702bcd93284a8444607f3292dbdf2a30de5ea5dcdbe72287b" +dependencies = [ + "memchr 1.0.2", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -405,6 +487,40 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb2e1c3ee07430c2cf76151675e583e0f19985fa6efae47d6848a3e2c824f85" +[[package]] +name = "parking_lot" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3" +dependencies = [ + "cfg-if 0.1.10", + "cloudabi", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "petgraph" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "pkg-config" version = "0.3.19" @@ -480,6 +596,7 @@ dependencies = [ "assert_cmd", "clap", "csv", + "flate2", "log", "md5", "predicates", @@ -487,6 +604,7 @@ dependencies = [ "simple_logger", "sqlparser", "statistical", + "tree_magic", "uuid", ] @@ -614,6 +732,12 @@ dependencies = [ "rand_core 0.3.1", ] +[[package]] +name = "redox_syscall" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" + [[package]] name = "regex" version = "1.4.6" @@ -621,7 +745,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" dependencies = [ "aho-corasick", - "memchr", + "memchr 2.3.4", "regex-syntax", ] @@ -651,7 +775,7 @@ dependencies = [ "fallible-streaming-iterator", "hashlink", "libsqlite3-sys", - "memchr", + "memchr 2.3.4", "smallvec", ] @@ -661,6 +785,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + [[package]] name = "serde" version = "1.0.125" @@ -775,6 +905,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "tree_magic" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1d99367ce3e553a84738f73bd626ccca541ef90ae757fdcdc4cbe728e6cb629" +dependencies = [ + "fnv", + "lazy_static", + "nom", + "parking_lot", + "petgraph", +] + [[package]] name = "treeline" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 6ebb486..72757c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,8 @@ simple_logger="1.11.0" clap = "3.0.0-beta.2" md5 = "0.7.0" statistical = "1.0.0" +tree_magic = "0.2.3" +flate2 = "1.0.20" [dev-dependencies] assert_cmd="0.10" diff --git a/src/csv/csv_data.rs b/src/csv/csv_data.rs index 82d24f2..800a96b 100644 --- a/src/csv/csv_data.rs +++ b/src/csv/csv_data.rs @@ -2,6 +2,7 @@ use csv::{StringRecord, Trim}; use log::debug; use std::error::Error; use std::fmt::{Display, Formatter}; +use std::fs::File; #[derive(Eq, PartialEq, Debug)] pub enum CsvWrapper { @@ -44,13 +45,23 @@ impl CsvData { trim: bool, ) -> Result> { debug!("Trying to load CSV from filename {}", filename); + let file = File::open(filename)?; + CsvData::from_reader(file, filename, delimiter, trim) + } + + pub fn from_reader( + reader: R, + filename: &str, + delimiter: char, + trim: bool, + ) -> Result> { let mut records = Vec::with_capacity(10000); let trim = if trim { Trim::All } else { Trim::None }; let mut rdr = csv::ReaderBuilder::new() .buffer_capacity(16 * (1 << 10)) .delimiter(delimiter as u8) .trim(trim) - .from_path(filename)?; + .from_reader(reader); for result in rdr.records() { let record = result?; diff --git a/src/main.rs b/src/main.rs index 7849b87..37904b2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,7 @@ use crate::qsv::{execute_analysis, execute_query, write_to_stdout}; use clap::{AppSettings, Clap}; use simple_logger::SimpleLogger; use std::error::Error; +use std::path::Path; #[derive(Clap)] #[clap( @@ -23,6 +24,7 @@ struct Opts { enum SubCommand { Query(Query), Analyze(Analyze), + FileType(FileType), } #[derive(Clap)] @@ -44,6 +46,10 @@ struct Analyze { #[clap(long)] trim: bool, } +#[derive(Clap)] +struct FileType { + filename: String, +} fn main() -> Result<(), Box> { SimpleLogger::from_env().init()?; let opts: Opts = Opts::parse(); @@ -71,6 +77,12 @@ fn main() -> Result<(), Box> { let results = execute_analysis(subcmd.query.as_str(), &options)?; println!("{}", results); } + + SubCommand::FileType(ft) => { + let path = Path::new(ft.filename.as_str()); + let t = tree_magic::from_filepath(path); + println!("{}", t); + } } Ok(()) } diff --git a/src/qsv.rs b/src/qsv.rs index 86587c3..d579f1d 100644 --- a/src/qsv.rs +++ b/src/qsv.rs @@ -5,10 +5,11 @@ use crate::db::Db; use crate::parser::collector::Collector; use crate::parser::rewriter::Rewriter; use crate::parser::Parser; -use log::debug; +use flate2::read::GzDecoder; +use log::{debug, error}; use std::collections::HashMap; use std::error::Error; -use std::ffi::OsStr; +use std::fs::File; use std::io::Write; use std::path::Path; use uuid::Uuid; @@ -31,16 +32,21 @@ pub fn execute_query(query: &str, options: &Options) -> Result { + debug!( + "Potential filename from SQL was able to be loaded: {}", + filename + ); + } + Ok(None) => { + debug!( + "Identifier in SQL could not be loaded as file, as it didn't exist: {}", + filename + ); + } + Err(e) => return Err(e), } } let rewritten = Rewriter::new(files_to_tables); @@ -62,7 +68,7 @@ pub fn execute_analysis( collector.collect(statement); //TODO: should we handle multiple SQL statements later? let mut hashmap: HashMap = HashMap::new(); for filename in collector.table_identifiers.iter() { - if let Ok(inference) = maybe_load_analysis(filename, options) { + if let Ok(Some(inference)) = maybe_load_analysis(filename, options) { hashmap.insert(filename.clone(), inference); debug!( "Potential filename from SQL was able to be loaded: {}", @@ -81,33 +87,41 @@ pub fn execute_analysis( fn maybe_load_analysis( filename: &str, options: &Options, -) -> Result> { - let csv = CsvData::from_filename(filename, options.delimiter, options.trim)?; - debug!( - "Attempting to load identifier from SQL as file: {}", - filename - ); +) -> Result, Box> { + let path = Path::new(filename); + if !path.exists() { + return Ok(None); + } + let mime_type = tree_magic::from_filepath(path); + let csv = csv_data_from_mime_type(filename, mime_type.as_str(), options)?; let inference = if options.textonly { ColumnInference::default_inference(&csv) } else { ColumnInference::from_csv(&csv) }; - Ok(inference) + Ok(Some(inference)) } fn maybe_load_file( files_to_tables: &mut HashMap, filename: &str, db: &mut Db, options: &Options, -) -> Result<(), Box> { - let csv = CsvData::from_filename(filename, options.delimiter, options.trim)?; +) -> Result, Box> { + let path = Path::new(filename); + if !path.exists() { + return Ok(None); + } + let mime_type = tree_magic::from_filepath(path); + debug!("File '{}' has MIME type: '{}'", filename, mime_type); + let csv = csv_data_from_mime_type(filename, mime_type.as_str(), options)?; let path = Path::new(filename); debug!( "Attempting to load identifier from SQL as file: {}", filename ); - let table_name = path.file_stem(); //TODO: should we canonicalize path? - let table_name = sanitize(table_name).unwrap_or_else(|| Uuid::new_v4().to_string()); + let without_extension = remove_extension(path); + let table_name = sanitize(without_extension) + .unwrap_or_else(|| String::from("t") + &Uuid::new_v4().as_u128().to_string()); let inference = if options.textonly { ColumnInference::default_inference(&csv) } else { @@ -126,8 +140,37 @@ fn maybe_load_file( debug!("Inserting {} rows into {}", records.len(), table_name); db.insert(table_name, &headers, records); files_to_tables.insert(filename.to_string(), String::from(table_name)); - Ok(()) + Ok(Some(())) +} +fn csv_data_from_mime_type( + filename: &str, + mime_type: &str, + options: &Options, +) -> Result> { + if mime_type == "application/gzip" { + let reader = File::open(filename)?; + let d = GzDecoder::new(reader); + CsvData::from_reader(d, filename, options.delimiter, options.trim) + } else if mime_type == "text/plain" { + CsvData::from_filename(filename, options.delimiter, options.trim) + } else { + let error_format = format!("Unsupported MIME type {} for file {}", mime_type, filename); + error!("{}", error_format); + Err(error_format.into()) + } } + +fn remove_extension(p0: &Path) -> Option { + let file_name = p0.file_name()?; + let file_str = file_name.to_str()?; + let mut split = file_str.split('.'); + if let Some(str) = split.next() { + Some(String::from(str)) + } else { + None + } +} + ///Writes a set of rows to STDOUT pub fn write_to_stdout(results: Rows) -> Result<(), Box> { let stdout = std::io::stdout(); @@ -140,9 +183,9 @@ pub fn write_to_stdout(results: Rows) -> Result<(), Box> { Ok(()) } -fn sanitize(str: Option<&OsStr>) -> Option { +fn sanitize(str: Option) -> Option { match str { - Some(s) => s.to_str().map(|v| v.replace(" ", "_")), + Some(s) => Some(s.replace(" ", "_")), None => None, } } diff --git a/testdata/people.csv.gz b/testdata/people.csv.gz new file mode 100644 index 0000000..becc792 Binary files /dev/null and b/testdata/people.csv.gz differ diff --git a/tests/integration.rs b/tests/integration.rs index 891725e..aa34c30 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -1,4 +1,3 @@ -#[cfg(test)] mod query_subcommand { use std::process::Command; @@ -131,8 +130,15 @@ mod query_subcommand { cmd.assert().success(); Ok(()) } + + #[test] + fn it_will_run_a_query_from_a_gz_file() -> Result<(), Box> { + let mut cmd = build_cmd(); + cmd.arg("select * from testdata/people.csv.gz"); + cmd.assert().success(); + Ok(()) + } } -#[cfg(test)] mod analyze_subcommand { use std::process::Command;