Skip to content

Commit

Permalink
Add zip option to GraphML export
Browse files Browse the repository at this point in the history
  • Loading branch information
thomaskrause committed Feb 28, 2024
1 parent c46fc44 commit 525a6f8
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 52 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added simple chunker module based on
[text-splitter](https://crates.io/crates/text-splitter).
- `check` can write check report to file
- Added `zip` option to GraphML export to directly export a ZIP file, which can be
imported more easily into ANNIS.

### Changed

Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ toml = "0.8.0"
tracing-subscriber = {version = "0.3", features = ["env-filter"]}
umya-spreadsheet = "~1.1.1"
xml-rs = "0.8"
zip = "0.6.6"

[dev-dependencies]
assert_cmd = "2.0.11"
Expand Down
185 changes: 133 additions & 52 deletions src/exporter/graphml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
//! conform to the [graphANNIS data
//! model](https://korpling.github.io/graphANNIS/docs/v2/data-model.html).
use std::{
borrow::Cow,
cmp::Ordering,
collections::{BTreeMap, BTreeSet},
fs::{create_dir_all, File},
path::Path,
io::BufReader,
path::{Path, PathBuf},
};

use crate::{
Expand All @@ -18,9 +20,9 @@ use graphannis::{
model::{AnnotationComponent, AnnotationComponentType},
};
use graphannis_core::{
annostorage::ValueSearch,
annostorage::{NodeAnnotationStorage, ValueSearch},
dfs::CycleSafeDFS,
graph::{ANNIS_NS, NODE_NAME_KEY, NODE_TYPE_KEY},
graph::{ANNIS_NS, NODE_NAME_KEY, NODE_TYPE, NODE_TYPE_KEY},
util::{join_qname, split_qname},
};
use itertools::Itertools;
Expand All @@ -33,6 +35,7 @@ pub const MODULE_NAME: &str = "export_graphml";
pub struct GraphMLExporter {
/// Optional string; presumably a visualizer configuration that is added to the
/// export. NOTE(review): its use is not visible in this hunk; confirm.
add_vis: Option<String>,
/// When `true`, `export_corpus` calls `vis_from_graph` to infer a visualizer
/// configuration from the graph.
guess_vis: bool,
/// When `true`, the GraphML is written as an entry of a ZIP archive and the
/// file extension becomes `zip` instead of `graphml`.
zip: bool,
}

impl Module for GraphMLExporter {
Expand Down Expand Up @@ -393,6 +396,43 @@ fn vis_from_graph(graph: &AnnotationGraph) -> Result<String, Box<dyn std::error:
Ok(vis)
}

impl GraphMLExporter {
    /// Find all nodes whose `annis::node_type` is `"file"` and return an iterator
    /// over the paths of the linked files, exactly as they are given in the
    /// `annis::file` annotation of each node.
    ///
    /// Nodes of type `"file"` that carry no `annis::file` annotation are skipped.
    ///
    /// # Errors
    ///
    /// Each item is an `Err` if the annotation storage fails while searching for
    /// file nodes or while fetching the annotation value of a node.
    fn get_linked_files<'a>(
        &'a self,
        graph: &'a AnnotationGraph,
    ) -> anyhow::Result<impl Iterator<Item = anyhow::Result<PathBuf>> + 'a> {
        let linked_file_key = AnnoKey {
            ns: ANNIS_NS.into(),
            name: "file".into(),
        };
        let node_annos: &dyn NodeAnnotationStorage = graph.get_node_annos();
        let it = node_annos
            // Find all nodes of the type "file"
            .exact_anno_search(Some(ANNIS_NS), NODE_TYPE, ValueSearch::Some("file"))
            // Fetch the linked file path for each node. The node name is not part
            // of the result, so it is not looked up here.
            .map(move |m| {
                m.and_then(|m| node_annos.get_value_for_item(&m.node, &linked_file_key))
            })
            // Drop nodes without a linked file, keep errors untouched
            .filter_map_ok(|file_path_value| {
                file_path_value.map(|value| PathBuf::from(value.as_ref()))
            })
            .map(|item| item.map_err(anyhow::Error::from));
        Ok(it)
    }
}

impl Exporter for GraphMLExporter {
fn export_corpus(
&self,
Expand All @@ -402,52 +442,49 @@ impl Exporter for GraphMLExporter {
tx: Option<StatusSender>,
) -> Result<(), Box<dyn std::error::Error>> {
let reporter = ProgressReporter::new_unknown_total_work(tx, step_id)?;
let file_name;
let extension = self.file_extension();
if let Some(part_of_c) = graph

// Get the toplevel corpus name from the corpus structure
let part_of_c = graph
.get_all_components(Some(AnnotationComponentType::PartOf), None)
.first()
{
let corpus_nodes = graph.get_node_annos().exact_anno_search(
Some(NODE_TYPE_KEY.ns.as_str()),
NODE_TYPE_KEY.name.as_str(),
ValueSearch::Some("corpus"),
);
let part_of_storage = graph.get_graphstorage(part_of_c).unwrap();
let corpus_root = corpus_nodes
.into_iter()
.find(|n| {
part_of_storage
.get_outgoing_edges((*n).as_ref().unwrap().node)
.count()
== 0
})
.unwrap()?
.node;
file_name = format!(
"{}.{extension}",
graph
.get_node_annos()
.get_value_for_item(&corpus_root, &NODE_NAME_KEY)?
.unwrap()
);
} else {
let reason = String::from("Could not determine file name for graphML.");
let err = AnnattoError::Export {
reason,
.cloned()
.ok_or_else(|| AnnattoError::Export {
reason: "Could not determine file name for graphML.".into(),
exporter: self.module_name().to_string(),
path: output_path.to_path_buf(),
};
return Err(Box::new(err));
})?;

let corpus_nodes = graph.get_node_annos().exact_anno_search(
Some(NODE_TYPE_KEY.ns.as_str()),
NODE_TYPE_KEY.name.as_str(),
ValueSearch::Some("corpus"),
);
let part_of_storage = graph.get_graphstorage(&part_of_c).unwrap();
let corpus_root = corpus_nodes
.into_iter()
.find(|n| {
part_of_storage
.get_outgoing_edges((*n).as_ref().unwrap().node)
.count()
== 0
})
.unwrap()?
.node;
let toplevel_corpus_name = graph

Check failure on line 473 in src/exporter/graphml.rs

View workflow job for this annotation

GitHub Actions / Static code analysis

unnecessary closure used to substitute value for `Option::None`
.get_node_annos()
.get_value_for_item(&corpus_root, &NODE_NAME_KEY)?
.unwrap_or_else(|| Cow::Borrowed("corpus"));

// Use the corpus name to determine the file name
let extension = self.file_extension();
let file_name = format!("{toplevel_corpus_name}.{extension}");

if !output_path.exists() {
create_dir_all(output_path)?;
}
let output_file_path = match output_path.is_dir() {
true => output_path.join(file_name),
false => {
create_dir_all(output_path)?;
output_path.join(file_name)
}
};
let output_file_path = output_path.join(file_name);
let output_file = File::create(output_file_path.clone())?;

let infered_vis = if self.guess_vis {
Some(vis_from_graph(graph)?)
} else {
Expand All @@ -463,18 +500,62 @@ impl Exporter for GraphMLExporter {
vis_str
};
reporter.info(format!("Starting export to {}", &output_file_path.display()).as_str())?;
graphannis_core::graph::serialization::graphml::export(
graph,
Some(format!("\n{vis}\n").as_str()),
output_file,
|msg| {
reporter.info(msg).expect("Could not send status message");
},
)?;

if self.zip {
    // Create a ZIP file at the given location
    let mut zip = zip::ZipWriter::new(output_file);

    // Create an entry in the ZIP file and write the GraphML to this file entry
    let options = zip::write::FileOptions::default()
        .compression_method(zip::CompressionMethod::Deflated);

    zip.start_file(format!("{toplevel_corpus_name}.graphml"), options)?;

    graphannis_core::graph::serialization::graphml::export(
        graph,
        Some(format!("\n{vis}\n").as_str()),
        &mut zip,
        |msg| {
            reporter.info(msg).expect("Could not send status message");
        },
    )?;

    // Insert all linked files with a *relative* path into the ZIP file.
    // We can't rewrite the links in the GraphML at this point and have
    // to assume that when unpacking it again, the absolute file paths
    // should point to the original files. But when relative paths are
    // used, we can store the files in the ZIP file itself and then, when
    // unpacked, the paths are still valid regardless of whether they
    // existed in the first place on the target system.
    for file in self.get_linked_files(graph)? {
        let original_path = file?;

        // Files with an absolute path are not copied. Without a new entry
        // started for them, their bytes would be appended to whichever entry
        // is currently open (the GraphML or a previously added file).
        if !original_path.is_relative() {
            continue;
        }

        zip.start_file(original_path.to_string_lossy(), options)?;
        let file_to_copy = File::open(&original_path)?;
        let mut reader = BufReader::new(file_to_copy);
        std::io::copy(&mut reader, &mut zip)?;
    }

    // Finish the archive explicitly so that errors while writing the central
    // directory are reported instead of being swallowed when `zip` is dropped.
    zip.finish()?;
} else {
    // Directly write the GraphML to the output file
    graphannis_core::graph::serialization::graphml::export(
        graph,
        Some(format!("\n{vis}\n").as_str()),
        output_file,
        |msg| {
            reporter.info(msg).expect("Could not send status message");
        },
    )?;
}

Ok(())
}

fn file_extension(&self) -> &str {
"graphml"
if self.zip {
"zip"
} else {
"graphml"
}
}
}

0 comments on commit 525a6f8

Please sign in to comment.