From fcb8b5127eb633d45f2ae9d812038515408148fe Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 21 Oct 2024 20:56:46 +0000 Subject: [PATCH 1/5] Add typed selection for structured data and returns properly sliced Variable --- mdio/dataset.h | 155 ++++++++++++++++++++++++------------------- mdio/dataset_test.cc | 53 +++++++++++++++ 2 files changed, 138 insertions(+), 70 deletions(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index c4db168..ce75a5e 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -1097,69 +1097,67 @@ class Dataset { auto all_done_future = tensorstore::WaitAllFuture(futures); auto pair = tensorstore::PromiseFuturePair::Make(); - all_done_future.ExecuteWhenReady( - [promise = std::move(pair.promise), variables = std::move(variables), - metadata](tensorstore::ReadyFuture readyFut) { - mdio::VariableCollection collection; - mdio::coordinate_map coords; - std::unordered_map shape_size; - - for (const auto& fvar : variables) { - // we should have waited for this to be ready so it's not blocking - // ... - auto _var = fvar.result(); - if (!_var.ok()) { - promise.SetResult(_var.status()); - continue; - } - auto var = _var.value(); - - collection.add(var.get_variable_name(), std::move(var)); - // update coordinates if any: - auto meta = var.getMetadata(); - if (meta.contains("coordinates")) { - // Because of how Variable is set up, we need to break down a - // space delimited string to the vector - std::string coords_str = meta["coordinates"].get(); - std::vector coords_vec = - absl::StrSplit(coords_str, ' '); - coords[var.get_variable_name()] = coords_vec; - } + all_done_future.ExecuteWhenReady([promise = std::move(pair.promise), + variables = std::move(variables), + metadata](tensorstore::ReadyFuture + readyFut) { + mdio::VariableCollection collection; + mdio::coordinate_map coords; + std::unordered_map shape_size; + + for (const auto& fvar : variables) { + // we should have waited for this to be ready so it's not blocking + // ... + auto _var = fvar.result(); + if (!_var.ok()) { + promise.SetResult(_var.status()); + continue; + } + auto var = _var.value(); + + collection.add(var.get_variable_name(), std::move(var)); + // update coordinates if any: + auto meta = var.getMetadata(); + if (meta.contains("coordinates")) { + // Because of how Variable is set up, we need to break down a + // space delimited string to the vector + std::string coords_str = meta["coordinates"].get(); + std::vector coords_vec = absl::StrSplit(coords_str, ' '); + coords[var.get_variable_name()] = coords_vec; + } - auto domain = var.dimensions(); - auto shape = domain.shape().cbegin(); - for (const auto& label : domain.labels()) { - // FIXME check that if exists shape is the same ... - shape_size[label] = *shape; - ++shape; - } - } + auto domain = var.dimensions(); + auto shape = domain.shape().cbegin(); + for (const auto& label : domain.labels()) { + // FIXME check that if exists shape is the same ... + shape_size[label] = *shape; + ++shape; + } + } - std::vector keys; - std::vector values; + std::vector keys; + std::vector values; - keys.reserve(shape_size.size()); - values.reserve(shape_size.size()); - for (const auto& pair : shape_size) { - keys.push_back(std::move(pair.first)); - values.push_back(pair.second); - } + keys.reserve(shape_size.size()); + values.reserve(shape_size.size()); + for (const auto& pair : shape_size) { + keys.push_back(std::move(pair.first)); + values.push_back(pair.second); + } - auto dataset_domain = - tensorstore::IndexDomainBuilder<>(shape_size.size()) - .shape(values) - .labels(keys) - .Finalize(); + auto dataset_domain = tensorstore::IndexDomainBuilder<>(shape_size.size()) + .shape(values) + .labels(keys) + .Finalize(); - if (!dataset_domain.ok()) { - promise.SetResult(dataset_domain.status()); - return; - } + if (!dataset_domain.ok()) { + promise.SetResult(dataset_domain.status()); + return; + } - Dataset new_dataset{metadata, collection, coords, - dataset_domain.value()}; - promise.SetResult(std::move(new_dataset)); - }); + Dataset new_dataset{metadata, collection, coords, dataset_domain.value()}; + promise.SetResult(std::move(new_dataset)); + }); return pair.future; } @@ -1174,8 +1172,10 @@ class Dataset { * @return An `mdio::Future` if the selection was valid and successful, or an * error if the selection was invalid. */ - Future> SelectField(const std::string variableName, - const std::string fieldName) { + template + Future> SelectField(const std::string variableName, + const std::string fieldName) { // Ensure that the variable exists in the Dataset if (!variables.contains_key(variableName)) { return absl::Status( @@ -1184,11 +1184,11 @@ class Dataset { } // Grab the Variable from the Dataset - auto varRes = variables.get(variableName); - if (!varRes.status().ok()) { - return varRes.status(); - } - mdio::Variable var = varRes.value(); + MDIO_ASSIGN_OR_RETURN(auto var, variables.at(variableName)); + // Preserve the intervals so it can be re-sliced to the same dimensions + MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); + intervals.pop_back(); // Remove the byte dimension (Doesn't matter if it + // doesn't exist) // Ensure that the Variable is of dtype structarray auto spec = var.spec(); @@ -1258,18 +1258,33 @@ class Dataset { base["kvstore"]["path"] = cloudPath; } - auto fieldedVar = mdio::Variable<>::Open(base, constants::kOpen); + auto fieldedVar = mdio::Variable::Open(base, constants::kOpen); + // mdio::Future> fieldedVar = mdio::Variable::Open(base, constants::kOpen); - auto pair = tensorstore::PromiseFuturePair>::Make(); + auto pair = tensorstore::PromiseFuturePair>::Make(); fieldedVar.ExecuteWhenReady( - [this, promise = pair.promise, - variableName](tensorstore::ReadyFuture> readyFut) { + [this, promise = pair.promise, variableName, intervals]( + tensorstore::ReadyFuture> readyFut) { auto ready_result = readyFut.result(); if (!ready_result.ok()) { promise.SetResult(ready_result.status()); } else { - this->variables.add(variableName, ready_result.value()); - promise.SetResult(ready_result); + // Re-slice the Variable to the same dimensions as the original + std::vector> slices; + slices.reserve(intervals.size() - 1); + for (const auto& interval : intervals) { + slices.emplace_back(mdio::RangeDescriptor( + {interval.label, interval.inclusive_min, + interval.exclusive_max, 1})); + } + auto slicedVarRes = ready_result.value().slice(slices); + if (!slicedVarRes.status().ok()) { + promise.SetResult(slicedVarRes.status()); + } else { + this->variables.add(variableName, ready_result.value()); + promise.SetResult(slicedVarRes); + } } }); return pair.future; diff --git a/mdio/dataset_test.cc b/mdio/dataset_test.cc index a97f0fb..a44d9c3 100644 --- a/mdio/dataset_test.cc +++ b/mdio/dataset_test.cc @@ -656,6 +656,59 @@ TEST(Dataset, selRepeatedRangeStop) { ASSERT_FALSE(sliceRes.status().ok()); } +TEST(Dataset, selectField) { + auto json_var = GetToyExample(); + + auto dataset = mdio::Dataset::from_json(json_var, "zarrs/acceptance", + mdio::constants::kCreateClean); + + ASSERT_TRUE(dataset.status().ok()) << dataset.status(); + auto ds = dataset.value(); + + // std::cout << ds << std::endl; + + mdio::RangeDescriptor desc1 = {"inline", 0, 5, 1}; + mdio::RangeDescriptor desc2 = {"crossline", 0, 5, 1}; + mdio::RangeDescriptor desc3 = {"depth", 0, 5, 1}; + + auto sliceRes = ds.isel(desc1, desc2, desc3); + ASSERT_TRUE(sliceRes.status().ok()) << sliceRes.status(); + auto slicedDs = sliceRes.value(); + + auto structImageHeadersRes = + slicedDs.variables.get("image_headers"); + ASSERT_TRUE(structImageHeadersRes.status().ok()) + << structImageHeadersRes.status(); + auto structImageHeaders = structImageHeadersRes.value(); + auto asdf = structImageHeaders.get_intervals(); + ASSERT_TRUE(asdf.status().ok()) << asdf.status(); + auto structIntervals = asdf.value(); + ASSERT_EQ(structIntervals.size(), 3); + + auto selectedVarFut = + slicedDs.SelectField("image_headers", "cdp-x"); + ASSERT_TRUE(selectedVarFut.status().ok()) << selectedVarFut.status(); + // std::cout << selectedVarFut.value() << std::endl; + + auto typedInervalsRes = selectedVarFut.value().get_intervals(); + ASSERT_TRUE(typedInervalsRes.status().ok()) << typedInervalsRes.status(); + auto typedIntervals = typedInervalsRes.value(); + ASSERT_EQ(typedIntervals.size(), 2); + + EXPECT_EQ(typedIntervals[0].label, structIntervals[0].label) + << "Dimension 0 labels did not match"; + EXPECT_EQ(typedIntervals[1].label, structIntervals[1].label) + << "Dimension 1 labels did not match"; + EXPECT_EQ(typedIntervals[0].inclusive_min, structIntervals[0].inclusive_min) + << "Dimension 0 min did not match"; + EXPECT_EQ(typedIntervals[1].inclusive_min, structIntervals[1].inclusive_min) + << "Dimension 1 min did not match"; + EXPECT_EQ(typedIntervals[0].exclusive_max, structIntervals[0].exclusive_max) + << "Dimension 0 max did not match"; + EXPECT_EQ(typedIntervals[1].exclusive_max, structIntervals[1].exclusive_max) + << "Dimension 1 max did not match"; +} + TEST(Dataset, fromConsolidatedMeta) { auto json_vars = GetToyExample(); From 1406974910ca761a481b870dde3be3f491241ed2 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 22 Oct 2024 13:24:10 +0000 Subject: [PATCH 2/5] Formatting --- mdio/dataset.h | 112 +++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 55 deletions(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index ce75a5e..077c00e 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -1097,67 +1097,69 @@ class Dataset { auto all_done_future = tensorstore::WaitAllFuture(futures); auto pair = tensorstore::PromiseFuturePair::Make(); - all_done_future.ExecuteWhenReady([promise = std::move(pair.promise), - variables = std::move(variables), - metadata](tensorstore::ReadyFuture - readyFut) { - mdio::VariableCollection collection; - mdio::coordinate_map coords; - std::unordered_map shape_size; - - for (const auto& fvar : variables) { - // we should have waited for this to be ready so it's not blocking - // ... - auto _var = fvar.result(); - if (!_var.ok()) { - promise.SetResult(_var.status()); - continue; - } - auto var = _var.value(); - - collection.add(var.get_variable_name(), std::move(var)); - // update coordinates if any: - auto meta = var.getMetadata(); - if (meta.contains("coordinates")) { - // Because of how Variable is set up, we need to break down a - // space delimited string to the vector - std::string coords_str = meta["coordinates"].get(); - std::vector coords_vec = absl::StrSplit(coords_str, ' '); - coords[var.get_variable_name()] = coords_vec; - } + all_done_future.ExecuteWhenReady( + [promise = std::move(pair.promise), variables = std::move(variables), + metadata](tensorstore::ReadyFuture readyFut) { + mdio::VariableCollection collection; + mdio::coordinate_map coords; + std::unordered_map shape_size; + + for (const auto& fvar : variables) { + // we should have waited for this to be ready so it's not blocking + // ... + auto _var = fvar.result(); + if (!_var.ok()) { + promise.SetResult(_var.status()); + continue; + } + auto var = _var.value(); + + collection.add(var.get_variable_name(), std::move(var)); + // update coordinates if any: + auto meta = var.getMetadata(); + if (meta.contains("coordinates")) { + // Because of how Variable is set up, we need to break down a + // space delimited string to the vector + std::string coords_str = meta["coordinates"].get(); + std::vector coords_vec = + absl::StrSplit(coords_str, ' '); + coords[var.get_variable_name()] = coords_vec; + } - auto domain = var.dimensions(); - auto shape = domain.shape().cbegin(); - for (const auto& label : domain.labels()) { - // FIXME check that if exists shape is the same ... - shape_size[label] = *shape; - ++shape; - } - } + auto domain = var.dimensions(); + auto shape = domain.shape().cbegin(); + for (const auto& label : domain.labels()) { + // FIXME check that if exists shape is the same ... + shape_size[label] = *shape; + ++shape; + } + } - std::vector keys; - std::vector values; + std::vector keys; + std::vector values; - keys.reserve(shape_size.size()); - values.reserve(shape_size.size()); - for (const auto& pair : shape_size) { - keys.push_back(std::move(pair.first)); - values.push_back(pair.second); - } + keys.reserve(shape_size.size()); + values.reserve(shape_size.size()); + for (const auto& pair : shape_size) { + keys.push_back(std::move(pair.first)); + values.push_back(pair.second); + } - auto dataset_domain = tensorstore::IndexDomainBuilder<>(shape_size.size()) - .shape(values) - .labels(keys) - .Finalize(); + auto dataset_domain = + tensorstore::IndexDomainBuilder<>(shape_size.size()) + .shape(values) + .labels(keys) + .Finalize(); - if (!dataset_domain.ok()) { - promise.SetResult(dataset_domain.status()); - return; - } + if (!dataset_domain.ok()) { + promise.SetResult(dataset_domain.status()); + return; + } - Dataset new_dataset{metadata, collection, coords, dataset_domain.value()}; - promise.SetResult(std::move(new_dataset)); - }); + Dataset new_dataset{metadata, collection, coords, + dataset_domain.value()}; + promise.SetResult(std::move(new_dataset)); + }); return pair.future; } From f2683f21f33b2118aef57facc6b7567479c784a2 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 22 Oct 2024 13:25:48 +0000 Subject: [PATCH 3/5] Pass parameters by reference --- mdio/dataset.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index 077c00e..7b5a9a9 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -1176,8 +1176,8 @@ class Dataset { */ template - Future> SelectField(const std::string variableName, - const std::string fieldName) { + Future> SelectField(const std::string& variableName, + const std::string& fieldName) { // Ensure that the variable exists in the Dataset if (!variables.contains_key(variableName)) { return absl::Status( From bf1a7d646d0dc28cbfbf6023aeeaf7bdea64f8a3 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 22 Oct 2024 14:44:55 +0000 Subject: [PATCH 4/5] Fix missing Variable name --- mdio/dataset.h | 7 +++++-- mdio/dataset_test.cc | 19 ++++++++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index 7b5a9a9..075f3d5 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -1250,6 +1250,11 @@ class Dataset { } base["kvstore"]["driver"] = specJson["kvstore"]["driver"]; base["kvstore"]["path"] = specJson["kvstore"]["path"]; + // Remove trailing slashes. This causes issue #130 + while (base["kvstore"]["path"].get().back() == '/') { + base["kvstore"]["path"] = base["kvstore"]["path"].get().substr( + 0, base["kvstore"]["path"].get().size() - 1); + } // Handle cloud stores if (specJson["kvstore"].contains("bucket")) { @@ -1261,8 +1266,6 @@ class Dataset { } auto fieldedVar = mdio::Variable::Open(base, constants::kOpen); - // mdio::Future> fieldedVar = mdio::Variable::Open(base, constants::kOpen); auto pair = tensorstore::PromiseFuturePair>::Make(); fieldedVar.ExecuteWhenReady( diff --git a/mdio/dataset_test.cc b/mdio/dataset_test.cc index a44d9c3..bf17b50 100644 --- a/mdio/dataset_test.cc +++ b/mdio/dataset_test.cc @@ -665,8 +665,6 @@ TEST(Dataset, selectField) { ASSERT_TRUE(dataset.status().ok()) << dataset.status(); auto ds = dataset.value(); - // std::cout << ds << std::endl; - mdio::RangeDescriptor desc1 = {"inline", 0, 5, 1}; mdio::RangeDescriptor desc2 = {"crossline", 0, 5, 1}; mdio::RangeDescriptor desc3 = {"depth", 0, 5, 1}; @@ -688,7 +686,6 @@ TEST(Dataset, selectField) { auto selectedVarFut = slicedDs.SelectField("image_headers", "cdp-x"); ASSERT_TRUE(selectedVarFut.status().ok()) << selectedVarFut.status(); - // std::cout << selectedVarFut.value() << std::endl; auto typedInervalsRes = selectedVarFut.value().get_intervals(); ASSERT_TRUE(typedInervalsRes.status().ok()) << typedInervalsRes.status(); @@ -709,6 +706,22 @@ TEST(Dataset, selectField) { << "Dimension 1 max did not match"; } +TEST(Dataset, selectFieldName) { + auto json_var = GetToyExample(); + + auto dataset = mdio::Dataset::from_json(json_var, "zarrs/acceptance", + mdio::constants::kCreateClean); + + ASSERT_TRUE(dataset.status().ok()) << dataset.status(); + auto ds = dataset.value(); + + auto selectedVarFut = + ds.SelectField("image_headers", "cdp-x"); + ASSERT_TRUE(selectedVarFut.status().ok()) << selectedVarFut.status(); + EXPECT_EQ(selectedVarFut.value().get_variable_name(), "image_headers") + << "Expected selected variable to be named image_headers"; +} + TEST(Dataset, fromConsolidatedMeta) { auto json_vars = GetToyExample(); From 27397136b7849601db5071a8010654fd8a7e2209 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 22 Oct 2024 14:46:27 +0000 Subject: [PATCH 5/5] Formatting --- mdio/dataset.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index 075f3d5..b637a12 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -1252,8 +1252,9 @@ class Dataset { base["kvstore"]["path"] = specJson["kvstore"]["path"]; // Remove trailing slashes. This causes issue #130 while (base["kvstore"]["path"].get().back() == '/') { - base["kvstore"]["path"] = base["kvstore"]["path"].get().substr( - 0, base["kvstore"]["path"].get().size() - 1); + base["kvstore"]["path"] = + base["kvstore"]["path"].get().substr( + 0, base["kvstore"]["path"].get().size() - 1); } // Handle cloud stores