From a3e57f14a1640426f41356062f2e6e8e98a05f55 Mon Sep 17 00:00:00 2001 From: Dermot Haughey Date: Fri, 23 Apr 2021 16:38:23 -0500 Subject: [PATCH] initial commit --- .gitignore | 1 + Cargo.lock | 189 +++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 12 +++ src/csv/csv.rs | 53 ++++++++++++ src/csv/inference.rs | 71 ++++++++++++++++ src/csv/mod.rs | 2 + src/db.rs | 24 ++++++ src/main.rs | 6 ++ testdata/test.csv | 2 + 9 files changed, 360 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/csv/csv.rs create mode 100644 src/csv/inference.rs create mode 100644 src/csv/mod.rs create mode 100644 src/db.rs create mode 100644 src/main.rs create mode 100644 testdata/test.csv diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..0b32dfe --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,189 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+[[package]] +name = "ahash" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e" + +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" + +[[package]] +name = "bstr" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a40b47ad93e1a5404e6c18dec46b628214fee441c70f4ab5d6942142cc268a3d" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + 
+[[package]] +name = "hashbrown" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" +dependencies = [ + "ahash", +] + +[[package]] +name = "hashlink" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d99cf782f0dc4372d26846bec3de7804ceb5df083c2d4462c0b8d2330e894fa8" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "itertools" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libsqlite3-sys" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19cb1effde5f834799ac5e5ef0e40d45027cd74f271b1de786ba8abb30e2164d" +dependencies = [ + "pkg-config", + "vcpkg", +] + +[[package]] +name = "memchr" +version = "2.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" + +[[package]] +name = "pkg-config" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" + +[[package]] +name = "qsv" +version = "0.1.0" +dependencies = [ + "csv", + "itertools", + "rusqlite", +] + +[[package]] +name = "regex-automata" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" +dependencies = [ + "byteorder", +] + +[[package]] +name = "rusqlite" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc783b7ddae608338003bac1fa00b6786a75a9675fbd8e87243ecfdea3f6ed2" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "memchr", + "smallvec", +] + +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" + +[[package]] +name = "serde" +version = "1.0.125" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171" + +[[package]] +name = "smallvec" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" + +[[package]] +name = "vcpkg" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbdbff6266a24120518560b5dc983096efb98462e51d0d68169895b237be3e5d" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..3a4ffda --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "qsv" +version = "0.1.0" +authors = ["Dermot Haughey "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +rusqlite = "0.25.1" +csv="1.1" +itertools = "0.10.0" diff --git a/src/csv/csv.rs b/src/csv/csv.rs new file mode 100644 index 0000000..79c7a9b --- /dev/null +++ b/src/csv/csv.rs @@ -0,0 +1,53 @@ +use csv::StringRecord; +use std::error::Error; +use std::fs::File; +use std::io::BufReader; +#[derive(Eq, PartialEq, Debug)] +pub enum CsvWrapper { + Numeric(i64), + String(String), +} +impl CsvWrapper { + pub fn 
get_type(&self) -> CsvType { + match self { + CsvWrapper::Numeric(_) => CsvType::Numeric, + CsvWrapper::String(_) => CsvType::String, + } + } +} + +#[derive(Debug, Eq, Hash, Clone, Copy, PartialEq)] +pub enum CsvType { + Numeric, + String, +} +pub struct Csv { + pub records: Vec<StringRecord>, + pub headers: StringRecord, +} +impl Csv { + fn from_filename(filename: &str) -> Result<Csv, Box<dyn Error>> { + let mut records = Vec::with_capacity(100); + let file_reader = File::open(filename)?; + let mut rdr = csv::Reader::from_reader(BufReader::new(file_reader)); + for result in rdr.records() { + let record = result?; + records.push(record); + } + let headers = rdr.headers()?; + Ok(Csv { + records, + headers: headers.to_owned(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn it_can_load_file() { + let csv = Csv::from_filename("testdata/test.csv").unwrap(); + assert_eq!(csv.records, vec!(StringRecord::from(vec!("bar", "13")))) + } +} diff --git a/src/csv/inference.rs b/src/csv/inference.rs new file mode 100644 index 0000000..d3d75bd --- /dev/null +++ b/src/csv/inference.rs @@ -0,0 +1,71 @@ +use crate::csv::csv::{Csv, CsvType, CsvWrapper}; +use csv::StringRecord; +use itertools::Itertools; +use std::collections::HashMap; +use std::num::ParseIntError; + +struct ColumnInference { + columns_to_types: HashMap<String, CsvType>, +} + +impl ColumnInference { + fn from_csv(csv: Csv) -> ColumnInference { + let mut columns_to_types: HashMap<String, CsvType> = HashMap::new(); + for (i, header) in csv.headers.iter().enumerate() { + let t: Vec<CsvWrapper> = csv + .records + .iter() + .map(|s| parse(s.get(i).unwrap())) + .collect(); + let types: Vec<CsvType> = t.iter().map(|s| s.get_type()).collect(); + let unique_types: Vec<&CsvType> = types.iter().unique().collect(); + + if unique_types.len() == 1 { + columns_to_types.insert(String::from(header), unique_types[0].to_owned()); + } else { + columns_to_types.insert(String::from(header), CsvType::String); + } + } + ColumnInference { columns_to_types } + } + pub fn get_type(&self, s: String) ->
Option<&CsvType> { + self.columns_to_types.get(s.as_str()) + } +} +fn parse(s: &str) -> CsvWrapper { + let is_numeric: Result<i64, ParseIntError> = s.parse(); + is_numeric + .map(CsvWrapper::Numeric) + .unwrap_or_else(|_| CsvWrapper::String(String::from(s))) +} +#[cfg(test)] +mod test { + use super::*; + #[test] + fn it_should_parse_integers() { + assert_eq!(parse("1"), CsvWrapper::Numeric(1)); + assert_eq!(parse("-1"), CsvWrapper::Numeric(-1)); + } + #[test] + fn it_should_parse_strings() { + assert_eq!(parse("foo"), CsvWrapper::String(String::from("foo"))); + assert_eq!(parse("bar"), CsvWrapper::String(String::from("bar"))); + } + #[test] + fn it_should_recognize_integer_column() { + let headers = StringRecord::from(vec!["foo", "bar"]); + let records = vec![ + StringRecord::from(vec!["entry1", "1"]), + StringRecord::from(vec!["entry2", "2"]), + ]; + let inference = ColumnInference::from_csv(Csv { headers, records }); + assert_eq!( + inference.get_type(String::from("foo")), + Some(&CsvType::String) + ); + assert_eq!( + inference.get_type(String::from("bar")), + Some(&CsvType::Numeric) + ); + } +} diff --git a/src/csv/mod.rs b/src/csv/mod.rs new file mode 100644 index 0000000..34b80c0 --- /dev/null +++ b/src/csv/mod.rs @@ -0,0 +1,2 @@ +mod csv; +mod inference; diff --git a/src/db.rs b/src/db.rs new file mode 100644 index 0000000..aeaded9 --- /dev/null +++ b/src/db.rs @@ -0,0 +1,24 @@ +use rusqlite::{Connection, Result}; + +struct Db { + pub connection: Connection, +} +impl Db { + fn open_in_memory() -> Result<Db> { + let connection = Connection::open_in_memory()?; + Ok(Db { connection }) + } +} +#[cfg(test)] +mod test { + use super::*; + #[test] + fn can_execute_a_query() { + let db = Db::open_in_memory().unwrap(); + let result: usize = db + .connection + .query_row("SELECT 1 = 1", [], |row| row.get(0)) + .unwrap(); + assert_eq!(result, 1); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..bc8fd8d --- /dev/null +++ b/src/main.rs @@ -0,0 +1,6 @@ +mod csv;
+mod db; + +fn main() { + println!("Hello, world!"); +} diff --git a/testdata/test.csv b/testdata/test.csv new file mode 100644 index 0000000..fe539a1 --- /dev/null +++ b/testdata/test.csv @@ -0,0 +1,2 @@ +foo,age +bar,13 \ No newline at end of file