diff --git a/Cargo.lock b/Cargo.lock index 8dd4795..e5b4359 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -833,6 +833,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -1443,6 +1449,7 @@ dependencies = [ "serde_json", "size_format", "structopt", + "strum", "tempfile", "tokio", "tokio-rusqlite", @@ -1495,6 +1502,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rustversion" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" + [[package]] name = "ryu" version = "1.0.18" @@ -1660,13 +1673,35 @@ version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro-error", "proc-macro2", "quote", "syn 1.0.109", ] +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.89", +] + [[package]] name = "syn" version = "1.0.109" diff --git a/Cargo.toml b/Cargo.toml index 3c77dfa..6802f19 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,6 +55,7 @@ serde = {version = "1.0.163", features = ["derive"]} serde_json = "1.0.96" size_format = "1.0.2" structopt = "0.3.26" +strum = {version = "0.26", features = ["derive"]} tempfile = "3.5.0" tokio = {version = "1.28.1", features = ["full"]} tokio-rusqlite = "0.5.0" diff --git a/exampledir/test/excel.xlsx b/exampledir/test/excel.xlsx new file mode 100644 index 0000000..1d3fb36 Binary files /dev/null and b/exampledir/test/excel.xlsx differ diff --git a/src/adapters.rs b/src/adapters.rs index f1f36f9..45c90fb 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -11,6 +11,7 @@ pub mod zip; use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*}; use anyhow::{format_err, Context, Result}; use async_trait::async_trait; +use custom::Builtin; use custom::CustomAdapterConfig; use custom::BUILTIN_SPAWNING_ADAPTERS; use log::*; @@ -38,7 +39,7 @@ pub struct AdapterMeta { pub fast_matchers: Vec, /// list of matchers when we have mime type detection active (interpreted as ORed) /// warning: this *overrides* the fast matchers - pub slow_matchers: Option>, + pub slow_matchers: Vec, /// if true, slow_matchers is merged with fast matchers if accurate is enabled /// for example, in sqlite you want this disabled since the db extension can mean other things and the mime type matching is very accurate for sqlite. /// but for tar you want it enabled, since the tar extension is very accurate but the tar mime matcher can have false negatives @@ -48,39 +49,63 @@ pub struct AdapterMeta { } impl AdapterMeta { // todo: this is pretty ugly - pub fn get_matchers<'a>( - &'a self, - slow: bool, - ) -> Box> + 'a> { + pub fn get_matchers(&self, slow: bool) -> Box> + '_> { match ( slow, self.keep_fast_matchers_if_accurate, &self.slow_matchers, + &self.fast_matchers, ) { - (true, false, Some(ref sm)) => Box::new(sm.iter().map(Cow::Borrowed)), - (true, true, Some(ref sm)) => Box::new( + (true, false, sm, _) => Box::new(sm.iter().map(Cow::Borrowed)), + (true, true, sm, fm) => Box::new( sm.iter().map(Cow::Borrowed).chain( - self.fast_matchers - .iter() - .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), + fm.iter() + .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))) + .collect::>(), ), ), - // don't have slow matchers or slow matching disabled - (true, _, None) | (false, _, _) => Box::new( - self.fast_matchers - .iter() - .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), - ), + // slow matching disabled + (false, _, _, fm) => { + Box::new(fm.iter().map(|e| Cow::Owned(FileMatcher::Fast(e.clone())))) + } } } } -pub trait GetMetadata { - fn metadata(&self) -> &AdapterMeta; +pub trait Adapter { + fn name(&self) -> String; + fn version(&self) -> i32; + fn description(&self) -> String; + fn recurses(&self) -> bool; + fn disabled_by_default(&self) -> bool; + fn keep_fast_matchers_if_accurate(&self) -> bool; + fn extensions(&self) -> Vec; + fn mimetypes(&self) -> Vec; + + fn metadata(&self) -> AdapterMeta { + return AdapterMeta { + name: self.name(), + version: self.version(), + description: self.description(), + recurses: true, + fast_matchers: self + .extensions() + .iter() + .map(|s| FastFileMatcher::FileExtension(s.to_string())) + .collect(), + slow_matchers: self + .mimetypes() + .iter() + .map(|mimetype| FileMatcher::MimeType(mimetype.to_string())) + .collect(), + disabled_by_default: self.disabled_by_default(), + keep_fast_matchers_if_accurate: self.keep_fast_matchers_if_accurate(), + }; + } } #[async_trait] -pub trait FileAdapter: GetMetadata + Send + Sync { +pub trait FileAdapter: Adapter + Send + Sync { /// adapt a file. /// /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher @@ -109,7 +134,67 @@ pub struct AdaptInfo { /// (enabledAdapters, disabledAdapters) type AdaptersTuple = (Vec>, Vec>); -pub fn get_all_adapters(custom_adapters: Option>) -> AdaptersTuple { +pub fn get_all_adapters( + custom_extensions: Option>, + custom_mimetypes: Option>, + custom_adapters: Option>, +) -> AdaptersTuple { + let extensions: &mut HashMap> = &mut HashMap::new(); + if let Some(ce) = custom_extensions.as_ref() { + for (ext, builtin) in ce { + extensions + .entry(*builtin) + .or_default() + .push(ext.to_string()); + } + } + for (builtin, exts) in [ + (Builtin::BZ2, decompress::EXTENSIONS_BZ2), + (Builtin::GZ, decompress::EXTENSIONS_GZ), + (Builtin::XZ, decompress::EXTENSIONS_XZ), + (Builtin::ZST, decompress::EXTENSIONS_ZST), + (Builtin::FFMPEG, ffmpeg::EXTENSIONS), + (Builtin::MBOX, mbox::EXTENSIONS), + (Builtin::SQLITE, sqlite::EXTENSIONS), + (Builtin::TAR, tar::EXTENSIONS), + (Builtin::ZIP, zip::EXTENSIONS), + ] { + for ext in exts { + if !custom_extensions + .as_ref() + .is_some_and(|ce| ce.contains_key(ext.to_owned())) + { + extensions.entry(builtin).or_default().push(ext.to_string()); + } + } + } + + let mimetypes: &mut HashMap> = &mut HashMap::new(); + if let Some(cm) = custom_mimetypes.as_ref() { + for (mime, builtin) in cm { + mimetypes + .entry(*builtin) + .or_default() + .push(mime.to_string()); + } + } + for (builtin, mimes) in [ + (Builtin::BZ2, decompress::MIMETYPES_BZ2), + (Builtin::GZ, decompress::MIMETYPES_GZ), + (Builtin::XZ, decompress::MIMETYPES_XZ), + (Builtin::ZST, decompress::MIMETYPES_ZST), + (Builtin::FFMPEG, ffmpeg::MIMETYPES), + (Builtin::MBOX, mbox::MIMETYPES), + (Builtin::SQLITE, sqlite::MIMETYPES), + (Builtin::TAR, tar::MIMETYPES), + (Builtin::ZIP, zip::MIMETYPES), + ] { + let val = mimetypes.entry(builtin).or_default(); + for mime in mimes { + val.push(mime.to_string()); + } + } + // order in descending priority let mut adapters: Vec> = vec![]; if let Some(custom_adapters) = custom_adapters { @@ -120,12 +205,36 @@ pub fn get_all_adapters(custom_adapters: Option>) -> Ad let internal_adapters: Vec> = vec![ Arc::new(PostprocPageBreaks::default()), - Arc::new(ffmpeg::FFmpegAdapter::new()), - Arc::new(zip::ZipAdapter::new()), - Arc::new(decompress::DecompressAdapter::new()), - Arc::new(mbox::MboxAdapter::new()), - Arc::new(tar::TarAdapter::new()), - Arc::new(sqlite::SqliteAdapter::new()), + Arc::new(ffmpeg::FFmpegAdapter { + extensions: extensions[&Builtin::FFMPEG].clone(), + mimetypes: mimetypes[&Builtin::FFMPEG].clone(), + }), + Arc::new(zip::ZipAdapter { + extensions: extensions[&Builtin::ZIP].clone(), + mimetypes: mimetypes[&Builtin::ZIP].clone(), + }), + Arc::new(decompress::DecompressAdapter { + extensions_gz: extensions[&Builtin::GZ].clone(), + extensions_bz2: extensions[&Builtin::BZ2].clone(), + extensions_xz: extensions[&Builtin::XZ].clone(), + extensions_zst: extensions[&Builtin::ZST].clone(), + mimetypes_gz: mimetypes[&Builtin::GZ].clone(), + mimetypes_bz2: mimetypes[&Builtin::BZ2].clone(), + mimetypes_xz: mimetypes[&Builtin::XZ].clone(), + mimetypes_zst: mimetypes[&Builtin::ZST].clone(), + }), + Arc::new(mbox::MboxAdapter { + extensions: extensions[&Builtin::MBOX].clone(), + mimetypes: mimetypes[&Builtin::MBOX].clone(), + }), + Arc::new(sqlite::SqliteAdapter { + extensions: extensions[&Builtin::SQLITE].clone(), + mimetypes: mimetypes[&Builtin::SQLITE].clone(), + }), + Arc::new(tar::TarAdapter { + extensions: extensions[&Builtin::TAR].clone(), + mimetypes: mimetypes[&Builtin::TAR].clone(), + }), ]; adapters.extend( BUILTIN_SPAWNING_ADAPTERS @@ -148,10 +257,13 @@ pub fn get_all_adapters(custom_adapters: Option>) -> Ad * - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority) */ pub fn get_adapters_filtered>( + custom_extensions: Option>, + custom_identifiers: Option>, custom_adapters: Option>, adapter_names: &[T], ) -> Result>> { - let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters); + let (def_enabled_adapters, def_disabled_adapters) = + get_all_adapters(custom_extensions, custom_identifiers, custom_adapters); let adapters = if !adapter_names.is_empty() { let adapters_map: HashMap<_, _> = def_enabled_adapters .iter() diff --git a/src/adapters/custom.rs b/src/adapters/custom.rs index 814c2b8..13b67c1 100644 --- a/src/adapters/custom.rs +++ b/src/adapters/custom.rs @@ -1,12 +1,8 @@ use super::*; -use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; use crate::adapted_iter::one_file; -use crate::{ - adapted_iter::AdaptedFilesIterBox, - expand::expand_str_ez, - matching::{FastFileMatcher, FileMatcher}, -}; +use crate::{adapted_iter::AdaptedFilesIterBox, expand::expand_str_ez, matching::FileMatcher}; use crate::{join_handle_to_stream, to_io_err}; use anyhow::Result; use async_stream::stream; @@ -17,12 +13,30 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::path::Path; use std::process::Stdio; +use strum::EnumString; use tokio::io::AsyncReadExt; use tokio::process::Child; use tokio::process::Command; use tokio_util::io::StreamReader; // mostly the same as AdapterMeta + SpawningFileAdapter + +#[derive( + Clone, Copy, Debug, Deserialize, EnumString, Eq, Hash, JsonSchema, PartialEq, Serialize, +)] +#[strum(serialize_all = "lowercase")] +pub enum Builtin { + BZ2, + GZ, + XZ, + ZST, + FFMPEG, + MBOX, + SQLITE, + TAR, + ZIP, +} + #[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Clone)] pub struct CustomAdapterConfig { /// The unique identifier and name of this adapter. @@ -44,8 +58,8 @@ pub struct CustomAdapterConfig { /// The file extensions this adapter supports, for example `["epub", "mobi"]`. pub extensions: Vec, - /// If not null and `--rga-accurate` is enabled, mimetype matching is used instead of file name matching. - pub mimetypes: Option>, + /// If not empty `--rga-accurate` is enabled, mimetype matching is used instead of file name matching. + pub mimetypes: Vec, /// If `--rga-accurate`, only match by mime types and ignore extensions completely. pub match_only_by_mime: Option, @@ -72,7 +86,7 @@ pub struct CustomAdapterConfig { pub output_path_hint: Option, } -fn strs(arr: &[&str]) -> Vec { +pub fn strs(arr: &[&str]) -> Vec { arr.iter().map(ToString::to_string).collect() } @@ -119,7 +133,7 @@ lazy_static! { version: 3, extensions: strs(&["epub", "odt", "docx", "fb2", "ipynb", "html", "htm"]), binary: "pandoc".to_string(), - mimetypes: None, + mimetypes: Vec::new(), // simpler markdown (with more information loss but plainer text) //.arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans") args: strs(&[ @@ -139,8 +153,7 @@ lazy_static! { .to_owned(), extensions: strs(&["pdf"]), - mimetypes: Some(strs(&["application/pdf"])), - + mimetypes: strs(&["application/pdf"]), binary: "pdftotext".to_string(), args: strs(&["-", "-"]), disabled_by_default: None, @@ -199,16 +212,46 @@ pub fn pipe_output( } pub struct CustomSpawningFileAdapter { + name: String, + version: i32, + description: String, + recurses: bool, + disabled_by_default: bool, + keep_fast_matchers_if_accurate: bool, + extensions: Vec, + mimetypes: Vec, binary: String, args: Vec, - meta: AdapterMeta, output_path_hint: Option, } -impl GetMetadata for CustomSpawningFileAdapter { - fn metadata(&self) -> &AdapterMeta { - &self.meta + +impl Adapter for CustomSpawningFileAdapter { + fn name(&self) -> String { + self.name.clone() + } + fn version(&self) -> i32 { + self.version + } + fn description(&self) -> String { + self.description.clone() + } + fn recurses(&self) -> bool { + self.recurses + } + fn disabled_by_default(&self) -> bool { + self.disabled_by_default + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + self.keep_fast_matchers_if_accurate + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } + fn arg_replacer(arg: &str, filepath_hint: &Path) -> Result { expand_str_ez(arg, |s| match s { "input_virtual_path" => Ok(filepath_hint.to_string_lossy()), @@ -281,33 +324,22 @@ impl FileAdapter for CustomSpawningFileAdapter { impl CustomAdapterConfig { pub fn to_adapter(&self) -> CustomSpawningFileAdapter { CustomSpawningFileAdapter { + name: self.name.clone(), + version: self.version, + description: format!( + "{}\nRuns: {} {}", + self.description, + self.binary, + self.args.join(" ") + ), + recurses: false, + disabled_by_default: self.disabled_by_default.unwrap_or(false), + keep_fast_matchers_if_accurate: !self.match_only_by_mime.unwrap_or(false), + extensions: self.extensions.clone(), + mimetypes: self.mimetypes.clone(), binary: self.binary.clone(), args: self.args.clone(), output_path_hint: self.output_path_hint.clone(), - meta: AdapterMeta { - name: self.name.clone(), - version: self.version, - description: format!( - "{}\nRuns: {} {}", - self.description, - self.binary, - self.args.join(" ") - ), - recurses: true, - fast_matchers: self - .extensions - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: self.mimetypes.as_ref().map(|mimetypes| { - mimetypes - .iter() - .map(|s| FileMatcher::MimeType(s.to_string())) - .collect() - }), - keep_fast_matchers_if_accurate: !self.match_only_by_mime.unwrap_or(false), - disabled_by_default: self.disabled_by_default.unwrap_or(false), - }, } } } @@ -334,13 +366,9 @@ mod test { let o = adapted_to_vec(r).await?; assert_eq!( String::from_utf8(o)?, - "PREFIX:Page 1: hello world -PREFIX:Page 1: this is just a test. -PREFIX:Page 1: -PREFIX:Page 1: 1 -PREFIX:Page 1: -PREFIX:Page 1: -" + ["hello world", "this is just a test.", "", "1", "", "\n",] + .map(|s| format!("{}{}", "PREFIX:Page 1: ", s)) + .join("\n") ); Ok(()) } @@ -360,7 +388,7 @@ PREFIX:Page 1: disabled_by_default: None, version: 1, extensions: vec!["txt".to_string()], - mimetypes: None, + mimetypes: Vec::new(), match_only_by_mime: None, binary: "sed".to_string(), args: vec!["s/e/u/g".to_string()], diff --git a/src/adapters/decompress.rs b/src/adapters/decompress.rs index f4b96a7..cf67b57 100644 --- a/src/adapters/decompress.rs +++ b/src/adapters/decompress.rs @@ -3,51 +3,133 @@ use crate::adapted_iter::one_file; use super::*; use anyhow::Result; -use lazy_static::lazy_static; +use std::path::{Path, PathBuf}; +use std::str::FromStr; use tokio::io::BufReader; -use std::path::{Path, PathBuf}; +pub const EXTENSIONS_GZ: &[&str] = &["als", "gz", "tgz"]; +pub const EXTENSIONS_BZ2: &[&str] = &["bz2", "tbz", "tbz2"]; +pub const EXTENSIONS_XZ: &[&str] = &["xz"]; +pub const EXTENSIONS_ZST: &[&str] = &["zst"]; -static EXTENSIONS: &[&str] = &["als", "bz2", "gz", "tbz", "tbz2", "tgz", "xz", "zst"]; -static MIME_TYPES: &[&str] = &[ - "application/gzip", - "application/x-bzip", - "application/x-xz", - "application/zstd", -]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "decompress".to_owned(), - version: 1, - description: - "Reads compressed file as a stream and runs a different extractor on the contents." - .to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some( - MIME_TYPES - .iter() - .map(|s| FileMatcher::MimeType(s.to_string())) - .collect() - ), - disabled_by_default: false, - keep_fast_matchers_if_accurate: true - }; +#[derive(Debug, PartialEq, Eq)] +struct DecompressError; + +#[derive(Debug, PartialEq)] +enum Extension { + Gz, + Bz2, + Xz, + Zst, +} +impl FromStr for Extension { + type Err = DecompressError; + + fn from_str(ext: &str) -> Result { + if EXTENSIONS_GZ.contains(&ext) { + Ok(Extension::Gz) + } else if EXTENSIONS_BZ2.contains(&ext) { + Ok(Extension::Bz2) + } else if EXTENSIONS_XZ.contains(&ext) { + Ok(Extension::Xz) + } else if EXTENSIONS_ZST.contains(&ext) { + Ok(Extension::Zst) + } else { + Err(DecompressError) + } + } } -#[derive(Default)] -pub struct DecompressAdapter; -impl DecompressAdapter { - pub fn new() -> DecompressAdapter { - DecompressAdapter +pub const MIMETYPES_GZ: &[&str] = &["application/gzip"]; +pub const MIMETYPES_BZ2: &[&str] = &["application/x-bzip"]; +pub const MIMETYPES_XZ: &[&str] = &["application/x-xz"]; +pub const MIMETYPES_ZST: &[&str] = &["application/zstd"]; + +#[derive(Debug, PartialEq)] +enum Mime { + Gz, + Bz2, + Xz, + Zst, +} +impl FromStr for Mime { + type Err = DecompressError; + + fn from_str(ext: &str) -> Result { + if MIMETYPES_GZ.contains(&ext) { + Ok(Mime::Gz) + } else if MIMETYPES_BZ2.contains(&ext) { + Ok(Mime::Bz2) + } else if MIMETYPES_XZ.contains(&ext) { + Ok(Mime::Xz) + } else if MIMETYPES_ZST.contains(&ext) { + Ok(Mime::Zst) + } else { + Err(DecompressError) + } } } -impl GetMetadata for DecompressAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +#[derive(Default)] +pub struct DecompressAdapter { + pub extensions_gz: Vec, + pub extensions_bz2: Vec, + pub extensions_xz: Vec, + pub extensions_zst: Vec, + pub mimetypes_gz: Vec, + pub mimetypes_bz2: Vec, + pub mimetypes_xz: Vec, + pub mimetypes_zst: Vec, +} + +impl Adapter for DecompressAdapter { + fn name(&self) -> String { + String::from("decompress") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Reads compressed file as a stream and runs a different extractor on the contents.", + ) + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Vec { + let mut extensions: Vec = Vec::new(); + for exts in [ + &self.extensions_gz, + &self.extensions_bz2, + &self.extensions_xz, + &self.extensions_zst, + ] { + for ext in exts { + extensions.push(ext.to_string()) + } + } + extensions + } + fn mimetypes(&self) -> Vec { + let mut mimetypes: Vec = Vec::new(); + for mimes in [ + &self.mimetypes_gz, + &self.mimetypes_bz2, + &self.mimetypes_xz, + &self.mimetypes_zst, + ] { + for mime in mimes { + mimetypes.push(mime.to_string()) + } + } + mimetypes } } @@ -61,19 +143,19 @@ fn decompress_any(reason: &FileMatcher, inp: ReadBox) -> Result { let zst = |inp: ReadBox| Box::pin(bufread::ZstdDecoder::new(BufReader::new(inp))); Ok(match reason { - Fast(FileExtension(ext)) => match ext.as_ref() { - "als" | "gz" | "tgz" => gz(inp), - "bz2" | "tbz" | "tbz2" => bz2(inp), - "zst" => zst(inp), - "xz" => xz(inp), - ext => Err(format_err!("don't know how to decompress {}", ext))?, + Fast(FileExtension(ext)) => match Extension::from_str(ext) { + Ok(Extension::Gz) => gz(inp), + Ok(Extension::Bz2) => bz2(inp), + Ok(Extension::Zst) => xz(inp), + Ok(Extension::Xz) => zst(inp), + Err(_) => Err(format_err!("don't know how to decompress {}", ext))?, }, - MimeType(mime) => match mime.as_ref() { - "application/gzip" => gz(inp), - "application/x-bzip" => bz2(inp), - "application/x-xz" => xz(inp), - "application/zstd" => zst(inp), - mime => Err(format_err!("don't know how to decompress mime {}", mime))?, + MimeType(mime) => match Mime::from_str(mime) { + Ok(Mime::Gz) => gz(inp), + Ok(Mime::Bz2) => bz2(inp), + Ok(Mime::Xz) => xz(inp), + Ok(Mime::Zst) => zst(inp), + Err(_) => Err(format_err!("don't know how to decompress mime {}", mime))?, }, }) } @@ -137,7 +219,7 @@ mod tests { #[tokio::test] async fn gz() -> Result<()> { - let adapter = DecompressAdapter; + let adapter = DecompressAdapter::default(); let filepath = test_data_dir().join("hello.gz"); @@ -150,7 +232,7 @@ mod tests { #[tokio::test] async fn pdf_gz() -> Result<()> { - let adapter = DecompressAdapter; + let adapter = DecompressAdapter::default(); let filepath = test_data_dir().join("short.pdf.gz"); diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index 32298fe..6e23a2a 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -2,7 +2,6 @@ use super::*; use super::{custom::map_exe_error, writing::async_writeln}; use anyhow::*; use async_trait::async_trait; -use lazy_static::lazy_static; use regex::Regex; use serde::{Deserialize, Serialize}; use std::process::Stdio; @@ -10,41 +9,45 @@ use tokio::io::AsyncWrite; use tokio::io::{AsyncBufReadExt, BufReader}; use tokio::process::Command; use writing::WritingFileAdapter; -// todo: + // maybe todo: read list of extensions from // ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null // but really, the probability of getting useful information from a .flv is low -static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"]; +pub const EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"]; +pub const MIMETYPES: &[&str] = &[]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "ffmpeg".to_owned(), - version: 1, - description: - "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata" - .to_owned(), - recurses: false, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: None, - disabled_by_default: false, - keep_fast_matchers_if_accurate: true - }; +#[derive(Clone)] +pub struct FFmpegAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct FFmpegAdapter; - -impl FFmpegAdapter { - pub fn new() -> FFmpegAdapter { - FFmpegAdapter +impl Adapter for FFmpegAdapter { + fn name(&self) -> String { + String::from("ffmpeg") } -} -impl GetMetadata for FFmpegAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata.", + ) + } + fn recurses(&self) -> bool { + false + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } diff --git a/src/adapters/mbox.rs b/src/adapters/mbox.rs index ee39d0d..d6bf31f 100644 --- a/src/adapters/mbox.rs +++ b/src/adapters/mbox.rs @@ -9,42 +9,45 @@ use tokio::io::AsyncReadExt; use std::{collections::VecDeque, io::Cursor}; -static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; -static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"]; +pub const EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; +pub const MIMETYPES: &[&str] = &["application/mbox", "message/rfc822"]; + lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "mail".to_owned(), - version: 1, - description: - "Reads mailbox/mail files and runs extractors on the contents and attachments." - .to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some( - MIME_TYPES - .iter() - .map(|s| FileMatcher::MimeType(s.to_string())) - .collect() - ), - disabled_by_default: true, - keep_fast_matchers_if_accurate: true - }; static ref FROM_REGEX: Regex = Regex::new("\r?\nFrom [^\n]+\n").unwrap(); } + #[derive(Default)] -pub struct MboxAdapter; +pub struct MboxAdapter { + pub extensions: Vec, + pub mimetypes: Vec, +} -impl MboxAdapter { - pub fn new() -> MboxAdapter { - MboxAdapter +impl Adapter for MboxAdapter { + fn name(&self) -> String { + String::from("mail") } -} -impl GetMetadata for MboxAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Reads mailbox/mail files and runs extractors on the contents and attachments.", + ) + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } @@ -138,7 +141,7 @@ mod tests { #[tokio::test] async fn mail_simple() -> Result<()> { - let adapter = MboxAdapter; + let adapter = MboxAdapter::default(); let filepath = test_data_dir().join("github_email.eml"); @@ -171,7 +174,7 @@ mod tests { #[tokio::test] async fn mbox_simple() -> Result<()> { - let adapter = MboxAdapter; + let adapter = MboxAdapter::default(); let filepath = test_data_dir().join("test.mbx"); @@ -197,7 +200,7 @@ mod tests { async fn mbox_attachment() -> Result<()> { init_logging(); - let adapter = MboxAdapter; + let adapter = MboxAdapter::default(); let filepath = test_data_dir().join("mail_with_attachment.mbox"); diff --git a/src/adapters/postproc.rs b/src/adapters/postproc.rs index 45ec2a7..5339d10 100644 --- a/src/adapters/postproc.rs +++ b/src/adapters/postproc.rs @@ -1,4 +1,4 @@ -//trait RunFnAdapter: GetMetadata {} +//trait RunFnAdapter: Adapter {} //impl FileAdapter for T where T: RunFnAdapter {} @@ -19,30 +19,38 @@ use tokio_util::io::StreamReader; use crate::adapted_iter::one_file; use crate::adapted_iter::AdaptedFilesIterBox; -use crate::matching::FastFileMatcher; -use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; fn add_newline(ar: impl AsyncRead + Send) -> impl AsyncRead + Send { ar.chain(Cursor::new(&[b'\n'])) } pub struct PostprocPrefix {} -impl GetMetadata for PostprocPrefix { - fn metadata(&self) -> &super::AdapterMeta { - lazy_static::lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "postprocprefix".to_owned(), - version: 1, - description: "Adds the line prefix to each line (e.g. the filename within a zip)".to_owned(), - recurses: false, - fast_matchers: vec![], - slow_matchers: None, - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; - } - &METADATA +impl Adapter for PostprocPrefix { + fn name(&self) -> String { + String::from("postprocprefix") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Adds the line prefix to each line (e.g. the filename within a zip)") + } + fn recurses(&self) -> bool { + false + } + fn mimetypes(&self) -> Vec { + [].into() + } + fn extensions(&self) -> Vec { + [].into() + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + false + } + fn disabled_by_default(&self) -> bool { + false } } #[async_trait] @@ -155,21 +163,30 @@ pub fn postproc_prefix(line_prefix: &str, inp: impl AsyncRead + Send) -> impl As #[derive(Default)] pub struct PostprocPageBreaks {} -impl GetMetadata for PostprocPageBreaks { - fn metadata(&self) -> &super::AdapterMeta { - lazy_static::lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "postprocpagebreaks".to_owned(), - version: 1, - description: "Adds the page number to each line for an input file that specifies page breaks as ascii page break character.\nMainly to be used internally by the poppler adapter.".to_owned(), - recurses: false, - fast_matchers: vec![FastFileMatcher::FileExtension("asciipagebreaks".to_string())], - slow_matchers: None, - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; - } - &METADATA +impl Adapter for PostprocPageBreaks { + fn name(&self) -> String { + String::from("postprocpagebreaks") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Adds the page number to each line for an input file that specifies page breaks as ascii page break character.\nMainly to be used internally by the poppler adapter.") + } + fn recurses(&self) -> bool { + false + } + fn extensions(&self) -> Vec { + vec![String::from("asciipagebreaks")] + } + fn mimetypes(&self) -> Vec { + [].into() + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true } } #[async_trait] diff --git a/src/adapters/sqlite.rs b/src/adapters/sqlite.rs index 0e8c1b9..ddaf487 100644 --- a/src/adapters/sqlite.rs +++ b/src/adapters/sqlite.rs @@ -1,7 +1,6 @@ use super::{writing::WritingFileAdapter, *}; use anyhow::Result; use async_trait::async_trait; -use lazy_static::lazy_static; use log::*; use rusqlite::types::ValueRef; use rusqlite::*; @@ -10,39 +9,50 @@ use tokio::io::AsyncWrite; use tokio_util::io::SyncIoBridge; -static EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"]; +pub const EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"]; +pub const MIMETYPES: &[&str] = &["application/x-sqlite3"]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "sqlite".to_owned(), - version: 1, - description: - "Uses sqlite bindings to convert sqlite databases into a simple plain text format" - .to_owned(), - recurses: false, // set to true if we decide to make sqlite blobs searchable (gz blob in db is kinda common I think) - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some(vec![FileMatcher::MimeType( - "application/x-sqlite3".to_owned() - )]), - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; +#[derive(Clone)] +pub struct SqliteAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct SqliteAdapter; - -impl SqliteAdapter { - pub fn new() -> SqliteAdapter { - SqliteAdapter +impl Default for SqliteAdapter { + fn default() -> SqliteAdapter { + SqliteAdapter { + extensions: EXTENSIONS.iter().map(|&s| s.to_string()).collect(), + mimetypes: MIMETYPES.iter().map(|&s| s.to_string()).collect(), + } } } -impl GetMetadata for SqliteAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +impl Adapter for SqliteAdapter { + fn name(&self) -> String { + String::from("sqlite") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Uses sqlite bindings to convert sqlite databases into a simple plain text format", + ) + } + fn recurses(&self) -> bool { + false + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + false + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index 144bd20..e4fcf68 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -1,48 +1,56 @@ -use crate::{ - adapted_iter::AdaptedFilesIterBox, - adapters::AdapterMeta, - matching::{FastFileMatcher, FileMatcher}, - print_bytes, -}; +use crate::{adapted_iter::AdaptedFilesIterBox, matching::FileMatcher, print_bytes}; use anyhow::*; use async_stream::stream; use async_trait::async_trait; -use lazy_static::lazy_static; use log::*; use std::path::PathBuf; use tokio_stream::StreamExt; -use super::{AdaptInfo, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; -static EXTENSIONS: &[&str] = &["tar"]; +pub const EXTENSIONS: &[&str] = &["tar"]; +pub const MIMETYPES: &[&str] = &[]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "tar".to_owned(), - version: 1, - description: "Reads a tar file as a stream and recurses down into its contents".to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: None, - keep_fast_matchers_if_accurate: true, - disabled_by_default: false - }; +#[derive(Clone)] +pub struct TarAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct TarAdapter; -impl TarAdapter { - pub fn new() -> TarAdapter { - TarAdapter +impl Default for TarAdapter { + fn default() -> TarAdapter { + TarAdapter { + extensions: EXTENSIONS.iter().map(|&s| s.to_string()).collect(), + mimetypes: MIMETYPES.iter().map(|&s| s.to_string()).collect(), + } } } -impl GetMetadata for TarAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +impl Adapter for TarAdapter { + fn name(&self) -> String { + String::from("tar") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Reads a tar file as a stream and recurses down into its contents") + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } @@ -108,7 +116,7 @@ mod tests { let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); - let adapter = TarAdapter::new(); + let adapter = TarAdapter::default(); let r = loop_adapt(&adapter, d, a).await.context("adapt")?; let o = adapted_to_vec(r).await.context("adapted_to_vec")?; assert_eq!( diff --git a/src/adapters/writing.rs b/src/adapters/writing.rs index b17152a..1ed13d0 100644 --- a/src/adapters/writing.rs +++ b/src/adapters/writing.rs @@ -2,13 +2,13 @@ use std::pin::Pin; use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err}; -use super::{AdaptInfo, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; use anyhow::{Context, Result}; use async_trait::async_trait; use tokio::io::{AsyncReadExt, AsyncWrite}; #[async_trait] -pub trait WritingFileAdapter: GetMetadata + Send + Sync + Clone { +pub trait WritingFileAdapter: Adapter + Send + Sync + Clone { async fn adapt_write( a: super::AdaptInfo, detection_reason: &crate::matching::FileMatcher, diff --git a/src/adapters/zip.rs b/src/adapters/zip.rs index 8c30407..d616528 100644 --- a/src/adapters/zip.rs +++ b/src/adapters/zip.rs @@ -2,39 +2,50 @@ use super::*; use crate::print_bytes; use anyhow::*; use async_stream::stream; -use lazy_static::lazy_static; use log::*; -// TODO: allow users to configure file extensions instead of hard coding the list -// https://github.com/phiresky/ripgrep-all/pull/208#issuecomment-2173241243 -static EXTENSIONS: &[&str] = &["zip", "jar", "xpi", "kra", "snagx"]; +pub const EXTENSIONS: &[&str] = &["zip", "jar", "xpi", "kra", "snagx"]; +pub const MIMETYPES: &[&str] = &["application/zip"]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "zip".to_owned(), - version: 1, - description: "Reads a zip file as a stream and recurses down into its contents".to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some(vec![FileMatcher::MimeType("application/zip".to_owned())]), - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; +#[derive(Debug, Clone)] +pub struct ZipAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct ZipAdapter; -impl ZipAdapter { - pub fn new() -> ZipAdapter { - ZipAdapter +impl Default for ZipAdapter { + fn default() -> ZipAdapter { + ZipAdapter { + extensions: EXTENSIONS.iter().map(|&s| s.to_string()).collect(), + mimetypes: MIMETYPES.iter().map(|&s| s.to_string()).collect(), + } } } -impl GetMetadata for ZipAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +impl Adapter for ZipAdapter { + fn name(&self) -> String { + String::from("zip") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Reads a zip file as a stream and recurses down into its contents") + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + false + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } @@ -225,7 +236,7 @@ mod test { async fn only_seek_zip_fs() -> Result<()> { let zip = test_data_dir().join("only-seek-zip.zip"); let (a, d) = simple_fs_adapt_info(&zip).await?; - let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?; + let _v = adapted_to_vec(loop_adapt(&ZipAdapter::default(), d, a).await?).await?; // assert_eq!(String::from_utf8(v)?, ""); Ok(()) @@ -242,7 +253,7 @@ mod test { #[tokio::test] async fn recurse() -> Result<()> { let zipfile = create_zip("outer.txt", "outer text file", true).await?; - let adapter = ZipAdapter::new(); + let adapter = ZipAdapter::default(); let (a, d) = simple_adapt_info( &PathBuf::from("outer.zip"), @@ -257,4 +268,25 @@ mod test { Ok(()) } + + #[tokio::test] + async fn search_xlsx_with_extension_config() -> Result<()> { + let zip = test_data_dir().join("excel.xlsx"); + let (a, d) = simple_fs_adapt_info(&zip).await?; + let v = adapted_to_vec( + loop_adapt( + &ZipAdapter { + extensions: vec![String::from("xlsx")], + mimetypes: Vec::new(), + }, + d, + a, + ) + .await?, + ) + .await?; + assert_eq!(String::from_utf8(v[..18].to_vec())?, "PREFIX:_rels/.rels"); // first filename in the spreadsheet archive + + Ok(()) + } } diff --git a/src/bin/rga.rs b/src/bin/rga.rs index c3ed99d..7da46c8 100644 --- a/src/bin/rga.rs +++ b/src/bin/rga.rs @@ -12,7 +12,11 @@ use std::process::Command; use std::time::Instant; fn list_adapters(args: RgaConfig) -> Result<()> { - let (enabled_adapters, disabled_adapters) = get_all_adapters(args.custom_adapters); + let (enabled_adapters, disabled_adapters) = get_all_adapters( + args.custom_extensions, + args.custom_mimetypes, + args.custom_adapters, + ); println!("Adapters:\n"); let print = |adapter: std::sync::Arc| { @@ -27,8 +31,6 @@ fn list_adapters(args: RgaConfig) -> Result<()> { .join(", "); let slow_matchers = meta .slow_matchers - .as_ref() - .unwrap_or(&vec![]) .iter() .filter_map(|m| match m { FileMatcher::MimeType(x) => Some(x.to_string()), @@ -87,14 +89,19 @@ fn main() -> anyhow::Result<()> { return Ok(()); } - let adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?; + let adapters = get_adapters_filtered( + config.custom_extensions.clone(), + config.custom_mimetypes.clone(), + config.custom_adapters.clone(), + &config.adapters, + )?; let pre_glob = if !config.accurate { let extensions = adapters .iter() - .flat_map(|a| &a.metadata().fast_matchers) - .flat_map(|m| match m { - FastFileMatcher::FileExtension(ext) => vec![ext.clone(), ext.to_ascii_uppercase()], + .flat_map(|a| a.metadata().fast_matchers) + .map(|matcher| match matcher { + FastFileMatcher::FileExtension(_) => matcher.to_string(), }) .collect::>() .join(","); diff --git a/src/config.rs b/src/config.rs index 709aadc..6eb89b7 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,9 +1,11 @@ -use crate::{adapters::custom::CustomAdapterConfig, project_dirs}; -use anyhow::{Context, Result}; +use crate::adapters::custom; +use crate::project_dirs; +use anyhow::{anyhow, Context, Result}; use derive_more::FromStr; use log::*; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use std::collections::HashMap; use std::ffi::OsString; use std::io::Read; use std::{fs::File, io::Write, iter::IntoIterator, path::PathBuf, str::FromStr}; @@ -96,6 +98,21 @@ impl FromStr for CacheMaxBlobLen { } } +fn parse_custom_identifiers(s: &str) -> Result> { + let identifiers: &mut HashMap = &mut HashMap::new(); + for pair in s.split(",") { + match pair.split_once("=") { + Some((k, v)) => { + let v_parsed = custom::Builtin::from_str(v) + .with_context(|| format!("No known built-in adapter {}", v))?; + identifiers.insert(k.to_string(), v_parsed); + } + None => return Err(anyhow!("Must be in form {{id}}={{adapter}}")), + } + } + Ok(identifiers.to_owned()) +} + /// # rga configuration /// /// This is kind of a "polyglot" struct serving multiple purposes: @@ -165,9 +182,31 @@ pub struct RgaConfig { #[serde(default, skip_serializing_if = "is_default")] #[structopt(skip)] // config file only - pub custom_adapters: Option>, + pub custom_adapters: Option>, + + /// Map extensions to built-in adapters. + /// + /// The syntax is "{extension}={adapter}", e.g. "xlsx=zip" to process files ending with ".xlsx" using the zip adapter. + #[serde(default, skip_serializing_if = "is_default")] + #[structopt( + long = "--rga-custom-extensions", + require_equals = true, + parse(try_from_str = parse_custom_identifiers), + )] + pub custom_extensions: Option>, - #[serde(skip)] + /// Map mimetypes to built-in adapters. + /// + /// The syntax is "{mimetype}={adapter}", e.g. "application/vnd.ms-excel=zip" to process Microsoft Excel files using the zip adapter. + #[serde(default, skip_serializing_if = "is_default")] + #[structopt( + long = "--rga-custom-mimetypes", + require_equals = true, + parse(try_from_str = parse_custom_identifiers), + )] + pub custom_mimetypes: Option>, + + #[serde(skip)] // CLI only #[structopt(long = "--rga-config-file", require_equals = true)] pub config_file_path: Option, diff --git a/src/matching.rs b/src/matching.rs index 3b67ba4..2839103 100644 --- a/src/matching.rs +++ b/src/matching.rs @@ -7,8 +7,8 @@ use anyhow::*; use regex::{Regex, RegexSet}; +use std::fmt; use std::iter::Iterator; - use std::sync::Arc; // match only based on file path @@ -24,6 +24,20 @@ pub enum FastFileMatcher { // todo: maybe allow matching a directory (e.g. /var/lib/postgres) } +impl std::fmt::Display for FastFileMatcher { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + FastFileMatcher::FileExtension(val) => { + // Write strictly the first element into the supplied output + // stream: `f`. Returns `fmt::Result` which indicates whether the + // operation succeeded or failed. Note that `write!` uses syntax which + // is very similar to `println!`. + write!(f, "{}", val) + } + } + } +} + #[derive(Clone, Debug)] pub enum FileMatcher { /// any type of fast matcher @@ -40,12 +54,12 @@ impl From for FileMatcher { } } -pub struct FileMeta { +pub struct FileMeta<'a> { // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either, // and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed pub lossy_filename: String, // only given when slow matching is enabled - pub mimetype: Option<&'static str>, + pub mimetype: Option<&'a str>, } pub fn extension_to_regex(extension: &str) -> Regex { diff --git a/src/preproc.rs b/src/preproc.rs index 32f3fa8..86500b1 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -32,7 +32,12 @@ async fn choose_adapter( archive_recursion_depth: i32, inp: &mut (impl AsyncBufRead + Unpin), ) -> Result, FileMatcher, ActiveAdapters)>> { - let active_adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?; + let active_adapters = get_adapters_filtered( + config.custom_extensions.clone(), + config.custom_mimetypes.clone(), + config.custom_adapters.clone(), + &config.adapters, + )?; let adapters = adapter_matcher(&active_adapters, config.accurate)?; let filename = filepath_hint .file_name() @@ -255,7 +260,7 @@ pub async fn loop_adapt_inner( ai.filepath_hint.to_string_lossy(), &adapter.metadata().name ); - for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? { + for await ifile in loop_adapt(adapter.clone().as_ref(), detection_reason, ai).await? { yield ifile; } }