Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix structured data metadata only trim #132

Merged
merged 1 commit into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 14 additions & 23 deletions mdio/utils/trim.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,34 +67,18 @@ Future<void> TrimDataset(std::string dataset_path,
MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.at(varIdentifier))
var.set_metadata_publish_flag(true);

if (var.dimensions().labels().back() == "") {
auto spec = var.spec();
if (!spec.status().ok()) {
// Something went wrong with Tensorstore retrieving the spec
return spec.status();
}
auto specJsonResult = spec.value().ToJson(IncludeDefaults{});
if (!specJsonResult.status().ok()) {
return specJsonResult.status();
}
// This will fall over if the first dtype is itself structured data
nlohmann::json specJson =
specJsonResult.value()["metadata"]["dtype"][0][0];
std::string field = specJson.get<std::string>();
// If the variable is structured data we will pick the first dimension
// arbitrarially
auto selection = ds.SelectField(varIdentifier, field);
if (!selection.status().ok()) {
return selection.status();
}
MDIO_ASSIGN_OR_RETURN(var, ds.variables.at(varIdentifier))
}
bool wasStruct = var.dimensions().labels().back() == "";

auto varStore = var.get_store();
std::vector<tensorstore::Index> implicitDims;
std::vector<tensorstore::Index> newShape;

for (size_t i = 0; i < var.dimensions().shape().size(); i++) {
auto dims = var.dimensions().shape().size();
if (wasStruct) {
--dims;
}

for (size_t i = 0; i < dims; i++) {
implicitDims.push_back(tensorstore::kImplicit);
if (shapeDescriptors.count(var.dimensions().labels()[i]) > 0) {
newShape.push_back(shapeDescriptors[var.dimensions().labels()[i]]);
Expand All @@ -103,9 +87,16 @@ Future<void> TrimDataset(std::string dataset_path,
}
}

if (wasStruct) {
implicitDims.push_back(tensorstore::kImplicit);
newShape.push_back(tensorstore::kImplicit);
}

tensorstore::ResizeOptions resizeOptions;
if (delete_sliced_out_chunks) {
resizeOptions.mode = tensorstore::ResizeMode::resize_tied_bounds;
} else if (wasStruct) {
resizeOptions.mode = tensorstore::ResizeMode::resize_metadata_only;
} else {
resizeOptions.mode = tensorstore::ResizeMode::resize_metadata_only;
}
Expand Down
49 changes: 49 additions & 0 deletions mdio/utils/trim_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,55 @@ TEST(TrimDataset, oneSliceData) {
}
}

TEST(TrimDataset, oneSliceDataNoDelete) {
// Set up the dataset
ASSERT_TRUE(SETUP(kTestPath).status().ok());
auto dsRes = mdio::Dataset::Open(kTestPath, mdio::constants::kOpen);
ASSERT_TRUE(dsRes.status().ok()) << dsRes.status();
auto ds = dsRes.value();

// Write some data to the inline variable
auto inlineVarRes = ds.variables.get<mdio::dtypes::uint32_t>("inline");
ASSERT_TRUE(inlineVarRes.status().ok()) << inlineVarRes.status();
auto inlineVar = inlineVarRes.value();

auto inlineVarFuture = inlineVar.Read();
ASSERT_TRUE(inlineVarFuture.status().ok()) << inlineVarFuture.status();
auto inlineVarData = inlineVarFuture.value();

auto inlineDataAccessor = inlineVarData.get_data_accessor();

for (int i = 0; i < 256; ++i) {
inlineDataAccessor({i}) = i + 256;
}

auto writeFuture = inlineVar.Write(inlineVarData);
ASSERT_TRUE(writeFuture.status().ok()) << writeFuture.status();

// Trim outside of a chunk boundry
mdio::RangeDescriptor<mdio::Index> slice = {"inline", 0, 128, 1};
auto res = mdio::utils::TrimDataset(kTestPath, false, slice);
ASSERT_TRUE(res.status().ok()) << res.status();

auto newDsRes = mdio::Dataset::Open(kTestPath, mdio::constants::kOpen);
ASSERT_TRUE(newDsRes.status().ok()) << newDsRes.status();
auto newDs = newDsRes.value();

std::string name = "inline";
auto varRes = newDs.get_variable(name);
ASSERT_TRUE(varRes.status().ok()) << varRes.status();
auto var = varRes.value();
auto varFuture = var.Read();
ASSERT_TRUE(varFuture.status().ok()) << varFuture.status();
auto varData = varFuture.value();

auto varDataAccessor = reinterpret_cast<mdio::dtypes::uint32_t*>(
varData.get_data_accessor().data());
for (int i = 0; i < 128; ++i) {
EXPECT_EQ(varDataAccessor[i], i + 256) << "i: " << i;
}
}

TEST(TrimDataset, metadataConsistency) {
ASSERT_TRUE(SETUP(kTestPath).status().ok());
nlohmann::json imageData;
Expand Down
Loading