From a61d9974c9f65fad29e698c3e981a28c70aff42b Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Wed, 5 Feb 2025 22:50:26 +0100 Subject: [PATCH 01/38] Add `RowIdColumnDescriptor` --- Cargo.lock | 1 + crates/store/re_sorbet/Cargo.toml | 1 + crates/store/re_sorbet/src/lib.rs | 2 + .../re_sorbet/src/row_id_column_schema.rs | 61 +++++++++++++++++++ 4 files changed, 65 insertions(+) create mode 100644 crates/store/re_sorbet/src/row_id_column_schema.rs diff --git a/Cargo.lock b/Cargo.lock index ee0ad1e76ede..e31e7a0b8f91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6588,6 +6588,7 @@ dependencies = [ "arrow", "re_log", "re_log_types", + "re_tuid", "re_types_core", "thiserror 1.0.65", ] diff --git a/crates/store/re_sorbet/Cargo.toml b/crates/store/re_sorbet/Cargo.toml index 9cbedebbf68e..f23822371aee 100644 --- a/crates/store/re_sorbet/Cargo.toml +++ b/crates/store/re_sorbet/Cargo.toml @@ -22,6 +22,7 @@ all-features = true [dependencies] re_log_types.workspace = true re_log.workspace = true +re_tuid.workspace = true re_types_core.workspace = true arrow.workspace = true diff --git a/crates/store/re_sorbet/src/lib.rs b/crates/store/re_sorbet/src/lib.rs index adfd5c216ac3..fcd4cd150958 100644 --- a/crates/store/re_sorbet/src/lib.rs +++ b/crates/store/re_sorbet/src/lib.rs @@ -7,6 +7,7 @@ mod data_column_schema; mod index_column_schema; mod ipc; mod metadata; +mod row_id_column_schema; pub use self::{ column_schema::{ColumnDescriptor, ColumnError}, @@ -17,4 +18,5 @@ pub use self::{ ArrowBatchMetadata, ArrowFieldMetadata, MetadataExt, MissingFieldMetadata, MissingMetadataKey, }, + row_id_column_schema::{RowIdColumnDescriptor, WrongDatatypeError}, }; diff --git a/crates/store/re_sorbet/src/row_id_column_schema.rs b/crates/store/re_sorbet/src/row_id_column_schema.rs new file mode 100644 index 000000000000..0346283ca213 --- /dev/null +++ b/crates/store/re_sorbet/src/row_id_column_schema.rs @@ -0,0 +1,61 @@ +use arrow::datatypes::{DataType as ArrowDatatype, Field as ArrowField}; +use re_types_core::{Component as _, Loggable as _, RowId}; + +#[derive(thiserror::Error, Debug)] +#[error("Wrong datatype. 
Expected {expected:?}, got {actual:?}")]
+pub struct WrongDatatypeError {
+    pub expected: ArrowDatatype,
+    pub actual: ArrowDatatype,
+}
+
+/// Describes the [`RowId`]
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct RowIdColumnDescriptor {}
+
+impl RowIdColumnDescriptor {
+    #[inline]
+    pub fn to_arrow_field(&self) -> ArrowField {
+        let Self {} = self;
+
+        let nullable = false; // All rows have an id
+
+        let metadata = [
+            Some(("rerun.kind".to_owned(), "control".to_owned())),
+            // This ensures the RowId/Tuid is formatted correctly:
+            Some((
+                "ARROW:extension:name".to_owned(),
+                re_tuid::Tuid::ARROW_EXTENSION_NAME.to_owned(),
+            )),
+        ]
+        .into_iter()
+        .flatten()
+        .collect();
+
+        ArrowField::new(
+            RowId::descriptor().to_string(),
+            RowId::arrow_datatype(),
+            nullable,
+        )
+        .with_metadata(metadata)
+    }
+
+    #[allow(clippy::unused_self)]
+    pub fn datatype(&self) -> ArrowDatatype {
+        RowId::arrow_datatype()
+    }
+}
+
+impl TryFrom<&ArrowField> for RowIdColumnDescriptor {
+    type Error = WrongDatatypeError;
+
+    fn try_from(field: &ArrowField) -> Result<Self, Self::Error> {
+        if field.data_type() == &RowId::arrow_datatype() {
+            Ok(Self {})
+        } else {
+            Err(WrongDatatypeError {
+                expected: RowId::arrow_datatype(),
+                actual: field.data_type().clone(),
+            })
+        }
+    }
+}

From 65a90df8bc5a97f88eab4a8dc8787c5a0aea1f6d Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Thu, 6 Feb 2025 08:24:36 +0100
Subject: [PATCH 02/38] Add `struct ChunkSchema`

---
 crates/store/re_sorbet/src/chunk_schema.rs | 59 ++++++++++++++++++++++
 crates/store/re_sorbet/src/lib.rs          |  2 +
 2 files changed, 61 insertions(+)
 create mode 100644 crates/store/re_sorbet/src/chunk_schema.rs

diff --git a/crates/store/re_sorbet/src/chunk_schema.rs b/crates/store/re_sorbet/src/chunk_schema.rs
new file mode 100644
index 000000000000..00d7a13775c9
--- /dev/null
+++ b/crates/store/re_sorbet/src/chunk_schema.rs
@@ -0,0 +1,59 @@
+use arrow::datatypes::{Field as ArrowField, Schema as ArrowSchema};
+
+use re_log_types::EntityPath;
+use re_types_core::ChunkId;
+
+use crate::{ArrowBatchMetadata, ComponentColumnDescriptor, RowIdColumnDescriptor};
+
+/// The parsed schema of a Rerun chunk, i.e. multiple columns of data for a single entity.
+///
+/// This does NOT preserve custom arrow metadata.
+/// It only contains the metadata used by Rerun.
+pub struct ChunkSchema {
+    /// The globally unique ID of this chunk.
+    chunk_id: ChunkId,
+
+    /// Which entity is this chunk for?
+    entity_path: EntityPath,
+
+    /// Are we sorted by the row id column?
+    is_sorted: bool,
+
+    /// The primary row id column.
+    row_id_column: RowIdColumnDescriptor,
+
+    /// All other columns (indices and data).
+    columns: Vec<ComponentColumnDescriptor>,
+}
+
+impl From<ChunkSchema> for ArrowSchema {
+    fn from(chunk_schema: ChunkSchema) -> Self {
+        let ChunkSchema {
+            chunk_id,
+            entity_path,
+            is_sorted,
+            row_id_column,
+            columns,
+        } = chunk_schema;
+
+        let mut metadata = ArrowBatchMetadata::from([
+            ("rerun.id".to_owned(), format!("{:X}", chunk_id.as_u128())),
+            ("rerun.entity_path".to_owned(), entity_path.to_string()),
+            // TODO: heap_size_bytes ?
+ ]); + if is_sorted { + metadata.insert("rerun.is_sorted".to_owned(), "true".to_owned()); + } + + let mut fields: Vec = Vec::with_capacity(1 + columns.len()); + fields.push(row_id_column.to_arrow_field()); + fields.extend(columns.iter().map(|column| column.to_arrow_field())); + + Self { + metadata, + fields: fields.into(), + } + } +} + +// impl TryFrom for ChunkSchema { } diff --git a/crates/store/re_sorbet/src/lib.rs b/crates/store/re_sorbet/src/lib.rs index fcd4cd150958..fc0cf0260a16 100644 --- a/crates/store/re_sorbet/src/lib.rs +++ b/crates/store/re_sorbet/src/lib.rs @@ -2,6 +2,7 @@ //! //! Handles the structure of arrow record batches and their meta data for different use cases for Rerun. +mod chunk_schema; mod column_schema; mod data_column_schema; mod index_column_schema; @@ -10,6 +11,7 @@ mod metadata; mod row_id_column_schema; pub use self::{ + chunk_schema::ChunkSchema, column_schema::{ColumnDescriptor, ColumnError}, data_column_schema::ComponentColumnDescriptor, index_column_schema::{TimeColumnDescriptor, UnsupportedTimeType}, From a5b597f01d3816a1f0360ecc4617e5a2eccd4fba Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Thu, 6 Feb 2025 10:17:25 +0100 Subject: [PATCH 03/38] Add converted from arrow schema --- crates/store/re_sorbet/src/chunk_schema.rs | 99 ++++++++++++++++++- .../re_sorbet/src/row_id_column_schema.rs | 1 + 2 files changed, 97 insertions(+), 3 deletions(-) diff --git a/crates/store/re_sorbet/src/chunk_schema.rs b/crates/store/re_sorbet/src/chunk_schema.rs index 00d7a13775c9..b9d889ef3036 100644 --- a/crates/store/re_sorbet/src/chunk_schema.rs +++ b/crates/store/re_sorbet/src/chunk_schema.rs @@ -3,7 +3,36 @@ use arrow::datatypes::{Field as ArrowField, Schema as ArrowSchema}; use re_log_types::EntityPath; use re_types_core::ChunkId; -use crate::{ArrowBatchMetadata, ComponentColumnDescriptor, RowIdColumnDescriptor}; +use crate::{ + ArrowBatchMetadata, ComponentColumnDescriptor, MetadataExt as _, MissingFieldMetadata, + MissingMetadataKey, RowIdColumnDescriptor, WrongDatatypeError, +}; + +#[derive(thiserror::Error, Debug)] +pub enum ChunkSchemaError { + #[error(transparent)] + MissingMetadataKey(#[from] MissingMetadataKey), + + #[error("Bad RowId columns: {0}")] + BadRowIdColumn(WrongDatatypeError), + + #[error("Bad column '{field_name}': {error}")] + BadColumn { + field_name: String, + error: MissingFieldMetadata, + }, + + #[error("Bad chunk schema: {reason}")] + Custom { reason: String }, +} + +impl ChunkSchemaError { + fn custom(reason: impl Into) -> Self { + Self::Custom { + reason: reason.into(), + } + } +} /// The parsed schema of a Rerun chunk, i.e. multiple columns of data for a single entity. /// @@ -16,6 +45,9 @@ pub struct ChunkSchema { /// Which entity is this chunk for? entity_path: EntityPath, + /// The heap size of this chunk in bytes, if known. + heap_size_bytes: Option, + /// Are we sorted by the row id column? is_sorted: bool, @@ -31,6 +63,7 @@ impl From for ArrowSchema { let ChunkSchema { chunk_id, entity_path, + heap_size_bytes, is_sorted, row_id_column, columns, @@ -39,8 +72,13 @@ impl From for ArrowSchema { let mut metadata = ArrowBatchMetadata::from([ ("rerun.id".to_owned(), format!("{:X}", chunk_id.as_u128())), ("rerun.entity_path".to_owned(), entity_path.to_string()), - // TODO: heap_size_bytes ? 
         ]);
+        if let Some(heap_size_bytes) = heap_size_bytes {
+            metadata.insert(
+                "rerun.heap_size_bytes".to_owned(),
+                heap_size_bytes.to_string(),
+            );
+        }
         if is_sorted {
             metadata.insert("rerun.is_sorted".to_owned(), "true".to_owned());
         }
@@ -56,4 +94,59 @@ impl From<ChunkSchema> for ArrowSchema {
     }
 }
 
-// impl TryFrom<ArrowSchema> for ChunkSchema { }
+impl TryFrom<ArrowSchema> for ChunkSchema {
+    type Error = ChunkSchemaError;
+
+    fn try_from(arrow_schema: ArrowSchema) -> Result<Self, Self::Error> {
+        let ArrowSchema { metadata, fields } = arrow_schema;
+
+        let chunk_id = {
+            let chunk_id = metadata.get_or_err("rerun.id")?;
+            let chunk_id = u128::from_str_radix(chunk_id, 16).map_err(|err| {
+                ChunkSchemaError::custom(format!(
+                    "Failed to deserialize chunk id {chunk_id:?}: {err}"
+                ))
+            })?;
+            ChunkId::from_u128(chunk_id)
+        };
+
+        let entity_path = EntityPath::parse_forgiving(metadata.get_or_err("rerun.entity_path")?);
+        let is_sorted = metadata.get_bool("rerun.is_sorted");
+        let heap_size_bytes = if let Some(heap_size_bytes) = metadata.get("rerun.heap_size_bytes") {
+            heap_size_bytes.parse().ok() // TODO: log error
+        } else {
+            None
+        };
+
+        // The first field must be the row id column:
+        let Some(first_field) = fields.first() else {
+            return Err(ChunkSchemaError::custom("No fields in schema"));
+        };
+
+        let row_id_column = RowIdColumnDescriptor::try_from(first_field.as_ref())
+            .map_err(ChunkSchemaError::BadRowIdColumn)?;
+
+        let columns: Result<Vec<_>, _> = fields
+            .iter()
+            .skip(1)
+            .map(|field| {
+                ComponentColumnDescriptor::try_from(field.as_ref()).map_err(|err| {
+                    ChunkSchemaError::BadColumn {
+                        field_name: field.name().to_owned(),
+                        error: err,
+                    }
+                })
+            })
+            .collect();
+        let columns = columns?;
+
+        Ok(Self {
+            chunk_id,
+            entity_path,
+            heap_size_bytes,
+            is_sorted,
+            row_id_column,
+            columns,
+        })
+    }
+}
diff --git a/crates/store/re_sorbet/src/row_id_column_schema.rs b/crates/store/re_sorbet/src/row_id_column_schema.rs
index 0346283ca213..3f8746434704 100644
--- a/crates/store/re_sorbet/src/row_id_column_schema.rs
+++ b/crates/store/re_sorbet/src/row_id_column_schema.rs
@@ -49,6 +49,7 @@ impl TryFrom<&ArrowField> for RowIdColumnDescriptor {
     type Error = WrongDatatypeError;
 
     fn try_from(field: &ArrowField) -> Result<Self, Self::Error> {
+        // TODO: check `rerun.kind`
         if field.data_type() == &RowId::arrow_datatype() {
             Ok(Self {})
         } else {

From b095d591079b901e3e9cfddffbe52dc42d606247 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Thu, 6 Feb 2025 16:25:52 +0100
Subject: [PATCH 04/38] Add `re_sorbet::ChunkBatch`

---
 Cargo.lock                                 |   1 +
 crates/store/re_sorbet/Cargo.toml          |   1 +
 crates/store/re_sorbet/src/chunk_batch.rs  | 128 +++++++++++----------
 crates/store/re_sorbet/src/chunk_schema.rs | 105 +++++++++++++----
 crates/store/re_sorbet/src/lib.rs          |   2 +
 5 files changed, 159 insertions(+), 78 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e31e7a0b8f91..82477f8c0d33 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6586,6 +6586,7 @@ name = "re_sorbet"
 version = "0.23.0-alpha.1+dev"
 dependencies = [
  "arrow",
+ "re_format_arrow",
  "re_log",
  "re_log_types",
  "re_tuid",
diff --git a/crates/store/re_sorbet/Cargo.toml b/crates/store/re_sorbet/Cargo.toml
index f23822371aee..42d50270c915 100644
--- a/crates/store/re_sorbet/Cargo.toml
+++ b/crates/store/re_sorbet/Cargo.toml
@@ -20,6 +20,7 @@ all-features = true
 
 
 [dependencies]
+re_format_arrow.workspace = true
 re_log_types.workspace = true
 re_log.workspace = true
 re_tuid.workspace = true
 re_types_core.workspace = true
 
 arrow.workspace = true
diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs
index e91d7500b835..24fd116b5364 100644
---
a/crates/store/re_sorbet/src/chunk_batch.rs +++ b/crates/store/re_sorbet/src/chunk_batch.rs @@ -1,26 +1,67 @@ -use arrow::array::RecordBatch as ArrowRecordBatch; -use arrow::datatypes::Schema as ArrowSchema; +use arrow::{ + array::RecordBatch as ArrowRecordBatch, + datatypes::{Fields as ArrowFields, Schema as ArrowSchema}, +}; use re_log_types::EntityPath; +use re_types_core::ChunkId; -#[derive(thiserror::Error, Debug)] -pub enum ChunkBatchError { - #[error("Missing record batch metadata key `{0}`")] - MissingRecordBatchMetadataKey(&'static str), -} +use crate::{chunk_schema::InvalidChunkSchema, ArrowBatchMetadata, ChunkSchema}; /// The arrow [`ArrowRecordBatch`] representation of a Rerun chunk. /// /// This is a wrapper around a [`ArrowRecordBatch`]. /// /// Each [`ChunkBatch`] contains logging data for a single [`EntityPath`]. -/// It walwyas have a [`RowId`] column. +/// It always has a [`RowId`] column. #[derive(Debug, Clone)] pub struct ChunkBatch { - entity_path: EntityPath, + schema: ChunkSchema, batch: ArrowRecordBatch, } +impl ChunkBatch { + /// The parsed rerun schema of this chunk. + #[inline] + pub fn chunk_schema(&self) -> &ChunkSchema { + &self.schema + } + + /// The globally unique ID of this chunk. + #[inline] + pub fn chunk_id(&self) -> ChunkId { + self.schema.chunk_id() + } + + /// Which entity is this chunk for? + #[inline] + pub fn entity_path(&self) -> &EntityPath { + self.schema.entity_path() + } + + /// The heap size of this chunk in bytes, if known. + #[inline] + pub fn heap_size_bytes(&self) -> Option { + self.schema.heap_size_bytes() + } + + /// Are we sorted by the row id column? + #[inline] + pub fn is_sorted(&self) -> bool { + self.schema.is_sorted() + } + + #[inline] + pub fn fields(&self) -> &ArrowFields { + &self.schema_ref().fields + } + + #[inline] + pub fn arrow_bacth_metadata(&self) -> &ArrowBatchMetadata { + &self.batch.schema_ref().metadata + } +} + impl std::fmt::Display for ChunkBatch { #[inline] fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -51,58 +92,29 @@ impl From for ArrowRecordBatch { } } -impl TryFrom for ChunkBatch { - type Error = ChunkBatchError; - - fn try_from(batch: ArrowRecordBatch) -> Result { - let mut schema = ArrowSchema::clone(&*batch.schema()); - let metadata = &mut schema.metadata; - - { - // Verify version - if let Some(batch_version) = metadata.get(Self::CHUNK_METADATA_KEY_VERSION) { - if batch_version != Self::CHUNK_METADATA_VERSION { - re_log::warn_once!( - "ChunkBatch version mismatch. Expected {:?}, got {batch_version:?}", - Self::CHUNK_METADATA_VERSION - ); - } - } - metadata.insert( - Self::CHUNK_METADATA_KEY_VERSION.to_owned(), - Self::CHUNK_METADATA_VERSION.to_owned(), - ); - } - - let entity_path = - if let Some(entity_path) = metadata.get(Self::CHUNK_METADATA_KEY_ENTITY_PATH) { - EntityPath::parse_forgiving(entity_path) - } else { - return Err(ChunkBatchError::MissingRecordBatchMetadataKey( - Self::CHUNK_METADATA_KEY_ENTITY_PATH, - )); - }; - - Ok(Self { entity_path, batch }) +impl From<&ChunkBatch> for ArrowRecordBatch { + #[inline] + fn from(chunk: &ChunkBatch) -> Self { + chunk.batch.clone() } } -/// ## Metadata keys for the record batch metadata -impl ChunkBatch { - /// The key used to identify the version of the Rerun schema. - const CHUNK_METADATA_KEY_VERSION: &'static str = "rerun.version"; - - /// The version of the Rerun schema. - const CHUNK_METADATA_VERSION: &'static str = "1"; - - /// The key used to identify a Rerun [`EntityPath`] in the record batch metadata. 
- const CHUNK_METADATA_KEY_ENTITY_PATH: &'static str = "rerun.entity_path"; -} +impl TryFrom for ChunkBatch { + type Error = InvalidChunkSchema; -impl ChunkBatch { - /// Returns the [`EntityPath`] of the batch. - #[inline] - pub fn entity_path(&self) -> &EntityPath { - &self.entity_path + fn try_from(batch: ArrowRecordBatch) -> Result { + let chunk_schema = ChunkSchema::try_from(batch.schema_ref().as_ref())?; + + // Extend with any metadata that might have been missing: + let mut arrow_schema = ArrowSchema::clone(batch.schema_ref().as_ref()); + arrow_schema + .metadata + .extend(chunk_schema.arrow_batch_metadata()); + let batch = batch.with_schema(arrow_schema.into()).expect("Can't fail"); + + Ok(Self { + schema: chunk_schema, + batch, + }) } } diff --git a/crates/store/re_sorbet/src/chunk_schema.rs b/crates/store/re_sorbet/src/chunk_schema.rs index b9d889ef3036..a10b40dcb6cd 100644 --- a/crates/store/re_sorbet/src/chunk_schema.rs +++ b/crates/store/re_sorbet/src/chunk_schema.rs @@ -9,7 +9,7 @@ use crate::{ }; #[derive(thiserror::Error, Debug)] -pub enum ChunkSchemaError { +pub enum InvalidChunkSchema { #[error(transparent)] MissingMetadataKey(#[from] MissingMetadataKey), @@ -26,7 +26,7 @@ pub enum ChunkSchemaError { Custom { reason: String }, } -impl ChunkSchemaError { +impl InvalidChunkSchema { fn custom(reason: impl Into) -> Self { Self::Custom { reason: reason.into(), @@ -38,6 +38,7 @@ impl ChunkSchemaError { /// /// This does NOT preserve custom arrow metadata. /// It only contains the metadata used by Rerun. +#[derive(Debug, Clone)] pub struct ChunkSchema { /// The globally unique ID of this chunk. chunk_id: ChunkId, @@ -58,31 +59,85 @@ pub struct ChunkSchema { columns: Vec, } -impl From for ArrowSchema { - fn from(chunk_schema: ChunkSchema) -> Self { - let ChunkSchema { +/// ## Metadata keys for the record batch metadata +impl ChunkSchema { + /// The key used to identify the version of the Rerun schema. + const CHUNK_METADATA_KEY_VERSION: &'static str = "rerun.version"; + + /// The version of the Rerun schema. + const CHUNK_METADATA_VERSION: &'static str = "1"; +} + +impl ChunkSchema { + /// The globally unique ID of this chunk. + + #[inline] + pub fn chunk_id(&self) -> ChunkId { + self.chunk_id + } + + /// Which entity is this chunk for? + + #[inline] + pub fn entity_path(&self) -> &EntityPath { + &self.entity_path + } + + /// The heap size of this chunk in bytes, if known. + + #[inline] + pub fn heap_size_bytes(&self) -> Option { + self.heap_size_bytes + } + + /// Are we sorted by the row id column? 
+ + #[inline] + pub fn is_sorted(&self) -> bool { + self.is_sorted + } + + pub fn arrow_batch_metadata(&self) -> ArrowBatchMetadata { + let Self { chunk_id, entity_path, heap_size_bytes, is_sorted, - row_id_column, - columns, - } = chunk_schema; - - let mut metadata = ArrowBatchMetadata::from([ + row_id_column: _, + columns: _, + } = self; + + let mut arrow_metadata = ArrowBatchMetadata::from([ + ( + Self::CHUNK_METADATA_KEY_VERSION.to_owned(), + Self::CHUNK_METADATA_VERSION.to_owned(), + ), ("rerun.id".to_owned(), format!("{:X}", chunk_id.as_u128())), ("rerun.entity_path".to_owned(), entity_path.to_string()), ]); if let Some(heap_size_bytes) = heap_size_bytes { - metadata.insert( + arrow_metadata.insert( "rerun.heap_size_bytes".to_owned(), heap_size_bytes.to_string(), ); } - if is_sorted { - metadata.insert("rerun.is_sorted".to_owned(), "true".to_owned()); + if *is_sorted { + arrow_metadata.insert("rerun.is_sorted".to_owned(), "true".to_owned()); } + arrow_metadata + } +} + +impl From for ArrowSchema { + fn from(chunk_schema: ChunkSchema) -> Self { + let metadata = chunk_schema.arrow_batch_metadata(); + + let ChunkSchema { + row_id_column, + columns, + .. + } = chunk_schema; let mut fields: Vec = Vec::with_capacity(1 + columns.len()); fields.push(row_id_column.to_arrow_field()); fields.extend(columns.iter().map(|column| column.to_arrow_field())); @@ -94,16 +149,16 @@ impl From for ArrowSchema { } } -impl TryFrom for ChunkSchema { - type Error = ChunkSchemaError; +impl TryFrom<&ArrowSchema> for ChunkSchema { + type Error = InvalidChunkSchema; - fn try_from(arrow_schema: ArrowSchema) -> Result { + fn try_from(arrow_schema: &ArrowSchema) -> Result { let ArrowSchema { metadata, fields } = arrow_schema; let chunk_id = { let chunk_id = metadata.get_or_err("rerun.id")?; let chunk_id = u128::from_str_radix(chunk_id, 16).map_err(|err| { - ChunkSchemaError::custom(format!( + InvalidChunkSchema::custom(format!( "Failed to deserialize chunk id {chunk_id:?}: {err}" )) })?; @@ -118,20 +173,30 @@ impl TryFrom for ChunkSchema { None }; + // Verify version + if let Some(batch_version) = metadata.get(Self::CHUNK_METADATA_KEY_VERSION) { + if batch_version != Self::CHUNK_METADATA_VERSION { + re_log::warn_once!( + "ChunkSchema version mismatch. Expected {:?}, got {batch_version:?}", + Self::CHUNK_METADATA_VERSION + ); + } + } + // The first field must be the row id column: let Some(first_field) = fields.first() else { - return Err(ChunkSchemaError::custom("No fields in schema")); + return Err(InvalidChunkSchema::custom("No fields in schema")); }; let row_id_column = RowIdColumnDescriptor::try_from(first_field.as_ref()) - .map_err(ChunkSchemaError::BadRowIdColumn)?; + .map_err(InvalidChunkSchema::BadRowIdColumn)?; let columns: Result, _> = fields .iter() .skip(1) .map(|field| { ComponentColumnDescriptor::try_from(field.as_ref()).map_err(|err| { - ChunkSchemaError::BadColumn { + InvalidChunkSchema::BadColumn { field_name: field.name().to_owned(), error: err, } diff --git a/crates/store/re_sorbet/src/lib.rs b/crates/store/re_sorbet/src/lib.rs index fc0cf0260a16..9544b32d6afe 100644 --- a/crates/store/re_sorbet/src/lib.rs +++ b/crates/store/re_sorbet/src/lib.rs @@ -2,6 +2,7 @@ //! //! Handles the structure of arrow record batches and their meta data for different use cases for Rerun. 
+mod chunk_batch; mod chunk_schema; mod column_schema; mod data_column_schema; @@ -11,6 +12,7 @@ mod metadata; mod row_id_column_schema; pub use self::{ + chunk_batch::ChunkBatch, chunk_schema::ChunkSchema, column_schema::{ColumnDescriptor, ColumnError}, data_column_schema::ComponentColumnDescriptor, From e90fdbeb55573ad96c212022515752d79c1b5413 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Fri, 7 Feb 2025 12:12:21 +0100 Subject: [PATCH 05/38] Convert Chunk to ChunkBatch --- Cargo.lock | 2 + crates/store/re_chunk/Cargo.toml | 5 +- crates/store/re_chunk/src/chunk.rs | 9 ++ crates/store/re_chunk/src/transport.rs | 143 +++++++++++++++++- crates/store/re_sorbet/Cargo.toml | 1 + crates/store/re_sorbet/src/chunk_batch.rs | 88 ++++++++++- crates/store/re_sorbet/src/chunk_schema.rs | 116 +++++++++++--- .../store/re_sorbet/src/data_column_schema.rs | 26 ++-- .../re_sorbet/src/index_column_schema.rs | 35 +++-- crates/store/re_sorbet/src/lib.rs | 2 +- .../re_sorbet/src/row_id_column_schema.rs | 43 +++++- 11 files changed, 412 insertions(+), 58 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 82477f8c0d33..a21a840c791a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5830,6 +5830,7 @@ dependencies = [ "re_format_arrow", "re_log", "re_log_types", + "re_sorbet", "re_tracing", "re_tuid", "re_types_core", @@ -6586,6 +6587,7 @@ name = "re_sorbet" version = "0.23.0-alpha.1+dev" dependencies = [ "arrow", + "itertools 0.13.0", "re_format_arrow", "re_log", "re_log_types", diff --git a/crates/store/re_chunk/Cargo.toml b/crates/store/re_chunk/Cargo.toml index e8b617cac5fe..cc4c3873e0c4 100644 --- a/crates/store/re_chunk/Cargo.toml +++ b/crates/store/re_chunk/Cargo.toml @@ -32,10 +32,11 @@ serde = ["re_log_types/serde", "re_tuid/serde", "re_types_core/serde"] re_arrow_util.workspace = true re_byte_size.workspace = true re_error.workspace = true -re_format.workspace = true re_format_arrow.workspace = true -re_log.workspace = true +re_format.workspace = true re_log_types.workspace = true +re_log.workspace = true +re_sorbet.workspace = true re_tracing.workspace = true re_tuid.workspace = true re_types_core.workspace = true diff --git a/crates/store/re_chunk/src/chunk.rs b/crates/store/re_chunk/src/chunk.rs index 436914b46042..7d63e7e39f58 100644 --- a/crates/store/re_chunk/src/chunk.rs +++ b/crates/store/re_chunk/src/chunk.rs @@ -45,6 +45,15 @@ pub enum ChunkError { #[error("Deserialization: {0}")] Deserialization(#[from] DeserializationError), + + #[error(transparent)] + UnsupportedTimeType(#[from] re_sorbet::UnsupportedTimeType), + + #[error(transparent)] + WrongDatatypeError(#[from] re_sorbet::WrongDatatypeError), + + #[error(transparent)] + MismatchedChunkSchemaError(#[from] re_sorbet::MismatchedChunkSchemaError), } pub type ChunkResult = Result; diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs index 19cd375a72e3..8669e86d64ba 100644 --- a/crates/store/re_chunk/src/transport.rs +++ b/crates/store/re_chunk/src/transport.rs @@ -390,6 +390,116 @@ impl TransportChunk { } } +impl Chunk { + /// Prepare the [`Chunk`] for transport. + /// + /// It is probably a good idea to sort the chunk first. 
+ pub fn to_chunk_batch(&self) -> ChunkResult { + self.sanity_check()?; + + re_tracing::profile_function!(format!( + "num_columns={} num_rows={}", + self.num_columns(), + self.num_rows() + )); + + let heap_size_bytes = self.heap_size_bytes(); + let Self { + id, + entity_path, + heap_size_bytes: _, // use the method instead because of lazy initialization + is_sorted, + row_ids, + timelines, + components, + } = self; + + let row_id_schema = re_sorbet::RowIdColumnDescriptor::try_from(RowId::arrow_datatype())?; + + let (index_schemas, index_arrays): (Vec<_>, Vec<_>) = { + re_tracing::profile_scope!("timelines"); + + let mut timelines = timelines + .iter() + .map(|(timeline, info)| { + let TimeColumn { + timeline: _, + times: _, + is_sorted, + time_range: _, + } = info; + + let array = info.times_array(); + let schema = re_sorbet::TimeColumnDescriptor { + timeline: *timeline, + datatype: array.data_type().clone(), + is_sorted: *is_sorted, + }; + + (schema, into_arrow_ref(array)) + }) + .collect_vec(); + + timelines.sort_by(|(schema_a, _), (schema_b, _)| schema_a.cmp(schema_b)); + + timelines.into_iter().unzip() + }; + + let (data_schemas, data_arrays): (Vec<_>, Vec<_>) = { + re_tracing::profile_scope!("components"); + + let mut components = components + .values() + .flat_map(|per_desc| per_desc.iter()) + .map(|(component_desc, list_array)| { + let list_array = ArrowListArray::from(list_array.clone()); + let ComponentDescriptor { + archetype_name, + archetype_field_name, + component_name, + } = *component_desc; + + let schema = re_sorbet::ComponentColumnDescriptor { + store_datatype: list_array.data_type().clone(), + entity_path: entity_path.clone(), + + archetype_name, + archetype_field_name, + component_name, + + is_static: false, // TODO + is_indicator: false, // TODO + is_tombstone: false, // TODO + is_semantically_empty: false, // TODO + }; + (schema, into_arrow_ref(list_array)) + }) + .collect_vec(); + + components.sort_by(|(schema_a, _), (schema_b, _)| schema_a.cmp(schema_b)); + + components.into_iter().unzip() + }; + + let schema = re_sorbet::ChunkSchema::new( + *id, + entity_path.clone(), + row_id_schema, + index_schemas, + data_schemas, + ) + .with_heap_size_bytes(heap_size_bytes) + .with_sorted(*is_sorted); + + Ok(re_sorbet::ChunkBatch::try_new( + schema, + into_arrow_ref(row_ids.clone()), + index_arrays, + data_arrays, + )?) + } +} + impl Chunk { /// Prepare the [`Chunk`] for transport. 
/// @@ -735,7 +845,7 @@ mod tests { let entity_path = EntityPath::parse_forgiving("a/b/c"); let timeline1 = Timeline::new_temporal("log_time"); - let timelines1 = std::iter::once(( + let timelines1: IntMap<_, _> = std::iter::once(( timeline1, TimeColumn::new(Some(true), timeline1, vec![42, 43, 44, 45].into()), )) @@ -788,7 +898,7 @@ mod tests { let row_ids = vec![RowId::new(), RowId::new(), RowId::new(), RowId::new()]; - for timelines in [timelines1, timelines2] { + for timelines in [timelines1.clone(), timelines2.clone()] { let chunk_original = Chunk::from_native_row_ids( ChunkId::new(), entity_path.clone(), @@ -860,6 +970,35 @@ mod tests { } } + for timelines in [timelines1, timelines2] { + let chunk_before = Chunk::from_native_row_ids( + ChunkId::new(), + entity_path.clone(), + None, + &row_ids, + timelines.clone(), + components.clone().into_iter().collect(), + ) + .unwrap(); + + let chunk_batch = chunk_before.to_chunk_batch().unwrap(); + + assert_eq!(chunk_before.num_columns(), chunk_batch.num_columns()); + assert_eq!(chunk_before.num_rows(), chunk_batch.num_rows()); + + let chunk_after = Chunk::from_record_batch(chunk_batch.into()).unwrap(); + + assert_eq!(chunk_before.entity_path(), chunk_after.entity_path()); + assert_eq!( + chunk_before.heap_size_bytes(), + chunk_after.heap_size_bytes(), + ); + assert_eq!(chunk_before.num_columns(), chunk_after.num_columns()); + assert_eq!(chunk_before.num_rows(), chunk_after.num_rows()); + assert!(chunk_before.are_equal(&chunk_after)); + assert_eq!(chunk_before, chunk_after); + } + Ok(()) } } diff --git a/crates/store/re_sorbet/Cargo.toml b/crates/store/re_sorbet/Cargo.toml index 42d50270c915..929fcfa20188 100644 --- a/crates/store/re_sorbet/Cargo.toml +++ b/crates/store/re_sorbet/Cargo.toml @@ -27,6 +27,7 @@ re_tuid.workspace = true re_types_core.workspace = true arrow.workspace = true +itertools.workspace = true thiserror.workspace = true # Keep this crate simple, with few dependencies. diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs index 24fd116b5364..e6cc158a6fad 100644 --- a/crates/store/re_sorbet/src/chunk_batch.rs +++ b/crates/store/re_sorbet/src/chunk_batch.rs @@ -1,12 +1,29 @@ use arrow::{ - array::RecordBatch as ArrowRecordBatch, + array::{ArrayRef as ArrowArrayRef, RecordBatch as ArrowRecordBatch, RecordBatchOptions}, datatypes::{Fields as ArrowFields, Schema as ArrowSchema}, }; use re_log_types::EntityPath; use re_types_core::ChunkId; -use crate::{chunk_schema::InvalidChunkSchema, ArrowBatchMetadata, ChunkSchema}; +use crate::{ + chunk_schema::InvalidChunkSchema, ArrowBatchMetadata, ChunkSchema, WrongDatatypeError, +}; + +#[derive(thiserror::Error, Debug)] +pub enum MismatchedChunkSchemaError { + #[error("{0}")] + Custom(String), + + #[error(transparent)] + WrongDatatypeError(#[from] WrongDatatypeError), +} + +impl MismatchedChunkSchemaError { + pub fn custom(s: impl Into) -> Self { + Self::Custom(s.into()) + } +} /// The arrow [`ArrowRecordBatch`] representation of a Rerun chunk. /// @@ -17,9 +34,76 @@ use crate::{chunk_schema::InvalidChunkSchema, ArrowBatchMetadata, ChunkSchema}; #[derive(Debug, Clone)] pub struct ChunkBatch { schema: ChunkSchema, + + // TODO: should we store a record batch here, or just the parsed columns? 
     batch: ArrowRecordBatch,
 }
 
+impl ChunkBatch {
+    pub fn try_new(
+        schema: ChunkSchema,
+        row_ids: ArrowArrayRef,
+        index_arrays: Vec<ArrowArrayRef>,
+        data_arrays: Vec<ArrowArrayRef>,
+    ) -> Result<Self, MismatchedChunkSchemaError> {
+        let row_count = row_ids.len();
+
+        WrongDatatypeError::compare_expected_actual(
+            &schema.row_id_column.datatype(),
+            row_ids.data_type(),
+        )?;
+
+        if index_arrays.len() != schema.index_columns.len() {
+            return Err(MismatchedChunkSchemaError::custom(format!(
+                "Schema had {} index columns, but got {}",
+                schema.index_columns.len(),
+                index_arrays.len()
+            )));
+        }
+        for (schema, array) in itertools::izip!(&schema.index_columns, &index_arrays) {
+            WrongDatatypeError::compare_expected_actual(schema.datatype(), array.data_type())?;
+            if array.len() != row_count {
+                return Err(MismatchedChunkSchemaError::custom(format!(
+                    "Index column {:?} had {} rows, but we got {} row IDs",
+                    schema.name(),
+                    array.len(),
+                    row_count
+                )));
+            }
+        }
+
+        if data_arrays.len() != schema.data_columns.len() {
+            return Err(MismatchedChunkSchemaError::custom(format!(
+                "Schema had {} data columns, but got {}",
+                schema.data_columns.len(),
+                data_arrays.len()
+            )));
+        }
+        for (schema, array) in itertools::izip!(&schema.data_columns, &data_arrays) {
+            WrongDatatypeError::compare_expected_actual(&schema.store_datatype, array.data_type())?;
+            if array.len() != row_count {
+                return Err(MismatchedChunkSchemaError::custom(format!(
+                    "Data column {:?} had {} rows, but we got {} row IDs",
+                    schema.column_name(),
+                    array.len(),
+                    row_count
+                )));
+            }
+        }
+
+        let arrow_columns = itertools::chain!(Some(row_ids), index_arrays, data_arrays).collect();
+
+        let batch = ArrowRecordBatch::try_new_with_options(
+            std::sync::Arc::new(ArrowSchema::from(&schema)),
+            arrow_columns,
+            &RecordBatchOptions::default().with_row_count(Some(row_count)),
+        )
+        .unwrap();
+
+        Ok(Self { schema, batch })
+    }
+}
+
 impl ChunkBatch {
     /// The parsed rerun schema of this chunk.
     #[inline]
diff --git a/crates/store/re_sorbet/src/chunk_schema.rs b/crates/store/re_sorbet/src/chunk_schema.rs
index a10b40dcb6cd..b80784621244 100644
--- a/crates/store/re_sorbet/src/chunk_schema.rs
+++ b/crates/store/re_sorbet/src/chunk_schema.rs
@@ -1,11 +1,12 @@
 use arrow::datatypes::{Field as ArrowField, Schema as ArrowSchema};
+use itertools::Itertools as _;
 
 use re_log_types::EntityPath;
 use re_types_core::ChunkId;
 
 use crate::{
-    ArrowBatchMetadata, ComponentColumnDescriptor, MetadataExt as _, MissingFieldMetadata,
-    MissingMetadataKey, RowIdColumnDescriptor, WrongDatatypeError,
+    ArrowBatchMetadata, ColumnDescriptor, ColumnError, ComponentColumnDescriptor, MetadataExt as _,
+    MissingMetadataKey, RowIdColumnDescriptor, TimeColumnDescriptor, WrongDatatypeError,
 };
 
 #[derive(thiserror::Error, Debug)]
@@ -19,11 +20,14 @@ pub enum InvalidChunkSchema {
     #[error("Bad column '{field_name}': {error}")]
     BadColumn {
         field_name: String,
-        error: MissingFieldMetadata,
+        error: ColumnError,
     },
 
     #[error("Bad chunk schema: {reason}")]
     Custom { reason: String },
+
+    #[error("The data columns were not the last columns. Index columns must come before any data columns.")]
+    UnorderedIndexAndDataColumns,
 }
 
 impl InvalidChunkSchema {
@@ -41,22 +45,25 @@ impl InvalidChunkSchema {
 #[derive(Debug, Clone)]
 pub struct ChunkSchema {
     /// The globally unique ID of this chunk.
-    chunk_id: ChunkId,
+    pub chunk_id: ChunkId,
 
     /// Which entity is this chunk for?
-    entity_path: EntityPath,
+    pub entity_path: EntityPath,
 
     /// The heap size of this chunk in bytes, if known.
- heap_size_bytes: Option, + pub heap_size_bytes: Option, /// Are we sorted by the row id column? - is_sorted: bool, + pub is_sorted: bool, /// The primary row id column. - row_id_column: RowIdColumnDescriptor, + pub row_id_column: RowIdColumnDescriptor, + + /// Index columns (timelines). + pub index_columns: Vec, - /// All other columns (indices and data). - columns: Vec, + /// The actual component data + pub data_columns: Vec, } /// ## Metadata keys for the record batch metadata @@ -68,35 +75,72 @@ impl ChunkSchema { const CHUNK_METADATA_VERSION: &'static str = "1"; } +/// ## Builders impl ChunkSchema { - /// The globally unique ID of this chunk. + pub fn new( + chunk_id: ChunkId, + entity_path: EntityPath, + row_id_column: RowIdColumnDescriptor, + index_columns: Vec, + data_columns: Vec, + ) -> Self { + Self { + chunk_id, + entity_path, + heap_size_bytes: None, + is_sorted: false, // assume the worst + row_id_column, + index_columns, + data_columns, + } + } + + #[inline] + pub fn with_heap_size_bytes(mut self, heap_size_bytes: u64) -> Self { + self.heap_size_bytes = Some(heap_size_bytes); + self + } + + #[inline] + pub fn with_sorted(mut self, sorted: bool) -> Self { + self.is_sorted = sorted; + self + } +} +/// ## Accessors +impl ChunkSchema { + /// The globally unique ID of this chunk. #[inline] pub fn chunk_id(&self) -> ChunkId { self.chunk_id } /// Which entity is this chunk for? - #[inline] pub fn entity_path(&self) -> &EntityPath { &self.entity_path } /// The heap size of this chunk in bytes, if known. - #[inline] pub fn heap_size_bytes(&self) -> Option { self.heap_size_bytes } /// Are we sorted by the row id column? - #[inline] pub fn is_sorted(&self) -> bool { self.is_sorted } + /// Total number of columns in this chunk, + /// including the row id column, the index columns, + /// and the data columns. + pub fn num_columns(&self) -> usize { + 1 + self.index_columns.len() + self.data_columns.len() + } + pub fn arrow_batch_metadata(&self) -> ArrowBatchMetadata { let Self { chunk_id, @@ -104,7 +148,8 @@ impl ChunkSchema { heap_size_bytes, is_sorted, row_id_column: _, - columns: _, + index_columns: _, + data_columns: _, } = self; let mut arrow_metadata = ArrowBatchMetadata::from([ @@ -129,18 +174,22 @@ impl ChunkSchema { } } -impl From for ArrowSchema { - fn from(chunk_schema: ChunkSchema) -> Self { +impl From<&ChunkSchema> for ArrowSchema { + fn from(chunk_schema: &ChunkSchema) -> Self { let metadata = chunk_schema.arrow_batch_metadata(); + let num_columns = chunk_schema.num_columns(); let ChunkSchema { row_id_column, - columns, + index_columns, + data_columns, .. 
} = chunk_schema; - let mut fields: Vec = Vec::with_capacity(1 + columns.len()); + + let mut fields: Vec = Vec::with_capacity(num_columns); fields.push(row_id_column.to_arrow_field()); - fields.extend(columns.iter().map(|column| column.to_arrow_field())); + fields.extend(index_columns.iter().map(|column| column.to_arrow_field())); + fields.extend(data_columns.iter().map(|column| column.to_arrow_field())); Self { metadata, @@ -195,7 +244,7 @@ impl TryFrom<&ArrowSchema> for ChunkSchema { .iter() .skip(1) .map(|field| { - ComponentColumnDescriptor::try_from(field.as_ref()).map_err(|err| { + ColumnDescriptor::try_from(field.as_ref()).map_err(|err| { InvalidChunkSchema::BadColumn { field_name: field.name().to_owned(), error: err, @@ -205,13 +254,36 @@ impl TryFrom<&ArrowSchema> for ChunkSchema { .collect(); let columns = columns?; + // Index columns should always come first: + let num_index_columns = columns.partition_point(|p| matches!(p, ColumnDescriptor::Time(_))); + + let index_columns = columns[0..num_index_columns] + .iter() + .filter_map(|c| match c { + ColumnDescriptor::Time(column) => Some(column.clone()), + ColumnDescriptor::Component(_) => None, + }) + .collect_vec(); + let data_columns = columns[0..num_index_columns] + .iter() + .filter_map(|c| match c { + ColumnDescriptor::Time(_) => None, + ColumnDescriptor::Component(column) => Some(column.clone()), + }) + .collect_vec(); + + if index_columns.len() + data_columns.len() < columns.len() { + return Err(InvalidChunkSchema::UnorderedIndexAndDataColumns); + } + Ok(Self { chunk_id, entity_path, heap_size_bytes, is_sorted, row_id_column, - columns, + index_columns, + data_columns, }) } } diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs index 3c24c9c61494..26659ec158f6 100644 --- a/crates/store/re_sorbet/src/data_column_schema.rs +++ b/crates/store/re_sorbet/src/data_column_schema.rs @@ -196,8 +196,7 @@ impl ComponentColumnDescriptor { self.store_datatype.clone() } - #[inline] - pub fn to_arrow_field(&self) -> ArrowField { + pub fn column_name(&self) -> String { let entity_path = &self.entity_path; let descriptor = ComponentDescriptor { archetype_name: self.archetype_name, @@ -205,16 +204,19 @@ impl ComponentColumnDescriptor { component_name: self.component_name, }; - ArrowField::new( - // NOTE: Uncomment this to expose fully-qualified names in the Dataframe APIs! - // I'm not doing that right now, to avoid breaking changes (and we need to talk about - // what the syntax for these fully-qualified paths need to look like first). - format!("{}:{}", entity_path, descriptor.component_name.short_name()), - // format!("{entity_path}@{}", descriptor.short_name()), - self.returned_datatype(), - true, /* nullable */ - ) - .with_metadata(self.metadata()) + format!("{}:{}", entity_path, descriptor.component_name.short_name()) + + // NOTE: Uncomment this to expose fully-qualified names in the Dataframe APIs! + // I'm not doing that right now, to avoid breaking changes (and we need to talk about + // what the syntax for these fully-qualified paths need to look like first). 
+ // format!("{entity_path}@{}", descriptor.short_name()) + } + + #[inline] + pub fn to_arrow_field(&self) -> ArrowField { + let nullable = true; + ArrowField::new(self.column_name(), self.returned_datatype(), nullable) + .with_metadata(self.metadata()) } } diff --git a/crates/store/re_sorbet/src/index_column_schema.rs b/crates/store/re_sorbet/src/index_column_schema.rs index c782edf0ecaf..29ca0db2a3c1 100644 --- a/crates/store/re_sorbet/src/index_column_schema.rs +++ b/crates/store/re_sorbet/src/index_column_schema.rs @@ -1,6 +1,9 @@ use arrow::datatypes::{DataType as ArrowDatatype, Field as ArrowField}; + use re_log_types::{Timeline, TimelineName}; +use crate::MetadataExt as _; + #[derive(thiserror::Error, Debug)] #[error("Unsupported time type: {datatype:?}")] pub struct UnsupportedTimeType { @@ -15,6 +18,9 @@ pub struct TimeColumnDescriptor { /// The Arrow datatype of the column. pub datatype: ArrowDatatype, + + /// Are the indices in this column sorted? + pub is_sorted: bool, } impl PartialOrd for TimeColumnDescriptor { @@ -30,6 +36,7 @@ impl Ord for TimeColumnDescriptor { let Self { timeline, datatype: _, + is_sorted: _, } = self; timeline.cmp(&other.timeline) } @@ -44,6 +51,7 @@ impl TimeColumnDescriptor { // It doesn't matter, only the name will remain in the Arrow schema anyhow. timeline: Timeline::new_sequence(name), datatype: ArrowDatatype::Null, + is_sorted: true, } } @@ -69,17 +77,21 @@ impl TimeColumnDescriptor { #[inline] pub fn to_arrow_field(&self) -> ArrowField { - let Self { timeline, datatype } = self; + let Self { + timeline, + datatype, + is_sorted, + } = self; let nullable = true; // Time column must be nullable since static data doesn't have a time. - let metadata = [ - Some(("rerun.kind".to_owned(), "index".to_owned())), - Some(("rerun.index_name".to_owned(), timeline.name().to_string())), - ] - .into_iter() - .flatten() - .collect(); + let mut metadata = std::collections::HashMap::from([ + ("rerun.kind".to_owned(), "index".to_owned()), + ("rerun.index_name".to_owned(), timeline.name().to_string()), + ]); + if *is_sorted { + metadata.insert("rerun.is_sorted".to_owned(), "true".to_owned()); + } ArrowField::new(timeline.name().to_string(), datatype.clone(), nullable) .with_metadata(metadata) @@ -91,6 +103,7 @@ impl From for TimeColumnDescriptor { Self { timeline, datatype: timeline.datatype(), + is_sorted: false, // assume the worst } } } @@ -114,6 +127,10 @@ impl TryFrom<&ArrowField> for TimeColumnDescriptor { let timeline = Timeline::new(name, time_type); - Ok(Self { timeline, datatype }) + Ok(Self { + timeline, + datatype, + is_sorted: field.metadata().get_bool("rerun.is_sorted"), + }) } } diff --git a/crates/store/re_sorbet/src/lib.rs b/crates/store/re_sorbet/src/lib.rs index 9544b32d6afe..fb1c754d9604 100644 --- a/crates/store/re_sorbet/src/lib.rs +++ b/crates/store/re_sorbet/src/lib.rs @@ -12,7 +12,7 @@ mod metadata; mod row_id_column_schema; pub use self::{ - chunk_batch::ChunkBatch, + chunk_batch::{ChunkBatch, MismatchedChunkSchemaError}, chunk_schema::ChunkSchema, column_schema::{ColumnDescriptor, ColumnError}, data_column_schema::ComponentColumnDescriptor, diff --git a/crates/store/re_sorbet/src/row_id_column_schema.rs b/crates/store/re_sorbet/src/row_id_column_schema.rs index 3f8746434704..fae2a3b84f11 100644 --- a/crates/store/re_sorbet/src/row_id_column_schema.rs +++ b/crates/store/re_sorbet/src/row_id_column_schema.rs @@ -8,6 +8,22 @@ pub struct WrongDatatypeError { pub actual: ArrowDatatype, } +impl WrongDatatypeError { + pub fn 
compare_expected_actual(
+        expected: &ArrowDatatype,
+        actual: &ArrowDatatype,
+    ) -> Result<(), Self> {
+        if expected == actual {
+            Ok(())
+        } else {
+            Err(Self {
+                expected: expected.clone(),
+                actual: actual.clone(),
+            })
+        }
+    }
+}
+
 /// Describes the [`RowId`]
 #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub struct RowIdColumnDescriptor {}
@@ -50,13 +66,24 @@ impl TryFrom<&ArrowField> for RowIdColumnDescriptor {
     type Error = WrongDatatypeError;
 
     fn try_from(field: &ArrowField) -> Result<Self, Self::Error> {
         // TODO: check `rerun.kind`
-        if field.data_type() == &RowId::arrow_datatype() {
-            Ok(Self {})
-        } else {
-            Err(WrongDatatypeError {
-                expected: RowId::arrow_datatype(),
-                actual: field.data_type().clone(),
-            })
-        }
+        Self::try_from(field.data_type())
+    }
+}
+
+impl TryFrom<&ArrowDatatype> for RowIdColumnDescriptor {
+    type Error = WrongDatatypeError;
+
+    fn try_from(data_type: &ArrowDatatype) -> Result<Self, Self::Error> {
+        WrongDatatypeError::compare_expected_actual(&RowId::arrow_datatype(), data_type)?;
+        Ok(Self {})
+    }
+}
+
+impl TryFrom<ArrowDatatype> for RowIdColumnDescriptor {
+    type Error = WrongDatatypeError;
+
+    fn try_from(data_type: ArrowDatatype) -> Result<Self, Self::Error> {
+        WrongDatatypeError::compare_expected_actual(&RowId::arrow_datatype(), &data_type)?;
+        Ok(Self {})
+    }
 }

From 42d3a207dd21c1a6efa306e403fe857c52721bc8 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Sat, 8 Feb 2025 13:06:04 +0100
Subject: [PATCH 06/38] Roundtrip it

---
 Cargo.lock                                 |   1 +
 crates/store/re_chunk/src/transport.rs     | 115 ++++++++++++++++++++-
 crates/store/re_sorbet/Cargo.toml          |   1 +
 crates/store/re_sorbet/src/chunk_batch.rs  |  41 +++++++-
 crates/store/re_sorbet/src/chunk_schema.rs |  15 +--
 5 files changed, 160 insertions(+), 13 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a21a840c791a..8a2d9ab3cc65 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6588,6 +6588,7 @@ version = "0.23.0-alpha.1+dev"
 dependencies = [
  "arrow",
  "itertools 0.13.0",
+ "re_arrow_util",
  "re_format_arrow",
  "re_log",
  "re_log_types",
diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs
index 8669e86d64ba..a0ad43438a63 100644
--- a/crates/store/re_chunk/src/transport.rs
+++ b/crates/store/re_chunk/src/transport.rs
@@ -657,6 +657,102 @@ impl Chunk {
         Ok(TransportChunk::from(record_batch))
     }
 
+    pub fn from_chunk_batch(batch: &re_sorbet::ChunkBatch) -> ChunkResult<Self> {
+        re_tracing::profile_function!(format!(
+            "num_columns={} num_rows={}",
+            batch.num_columns(),
+            batch.num_rows()
+        ));
+
+        // Metadata
+        let (id, entity_path, is_sorted) = (
+            batch.chunk_id(),
+            batch.entity_path().clone(),
+            batch.is_sorted(),
+        );
+
+        let row_ids = batch.row_id_column().1.clone();
+
+        let timelines = {
+            re_tracing::profile_scope!("timelines");
+
+            let mut timelines = IntMap::default();
+
+            for (schema, column) in batch.index_columns() {
+                let timeline = schema.timeline();
+
+                let times =
+                    TimeColumn::read_array(&as_array_ref(column.clone())).map_err(|err| {
+                        ChunkError::Malformed {
+                            reason: format!("Bad time column '{}': {err}", schema.name()),
+                        }
+                    })?;
+
+                let time_column =
+                    TimeColumn::new(schema.is_sorted.then_some(true), timeline, times);
+                if timelines.insert(timeline, time_column).is_some() {
+                    return Err(ChunkError::Malformed {
+                        reason: format!(
+                            "time column '{}' was specified more than once",
+                            schema.name(),
+                        ),
+                    });
+                }
+            }
+
+            timelines
+        };
+
+        let components = {
+            let mut components = ChunkComponents::default();
+
+            for (schema, column) in batch.data_columns() {
+                let column = column
+                    .downcast_array_ref::<ArrowListArray>()
+                    .ok_or_else(|| ChunkError::Malformed {
+
reason: format!( + "The outer array in a chunked component batch must be a sparse list, got {:?}", + column.data_type(), + ), + })?; + + let component_desc = ComponentDescriptor { + archetype_name: schema.archetype_name, + archetype_field_name: schema.archetype_field_name, + component_name: schema.component_name, + }; + + if components + .insert_descriptor(component_desc, column.clone()) + .is_some() + { + return Err(ChunkError::Malformed { + reason: format!( + "component column '{schema:?}' was specified more than once" + ), + }); + } + } + + components + }; + + let mut res = Self::new( + id, + entity_path, + is_sorted.then_some(true), + row_ids, + timelines, + components, + )?; + + if let Some(heap_size_bytes) = batch.heap_size_bytes() { + res.heap_size_bytes = heap_size_bytes.into(); + } + + Ok(res) + } + pub fn from_record_batch(batch: ArrowRecordBatch) -> ChunkResult { Self::from_transport(&TransportChunk::from(batch)) } @@ -831,6 +927,7 @@ impl Chunk { #[cfg(test)] mod tests { use nohash_hasher::IntMap; + use similar_asserts::assert_eq; use re_arrow_util::arrow_util; use re_log_types::{ @@ -981,12 +1078,22 @@ mod tests { ) .unwrap(); - let chunk_batch = chunk_before.to_chunk_batch().unwrap(); + let chunk_batch_before = chunk_before.to_chunk_batch().unwrap(); + + assert_eq!(chunk_before.num_columns(), chunk_batch_before.num_columns()); + assert_eq!(chunk_before.num_rows(), chunk_batch_before.num_rows()); - assert_eq!(chunk_before.num_columns(), chunk_batch.num_columns()); - assert_eq!(chunk_before.num_rows(), chunk_batch.num_rows()); + let arrow_record_batch = ArrowRecordBatch::from(&chunk_batch_before); + + let chunk_batch_after = re_sorbet::ChunkBatch::try_from(arrow_record_batch).unwrap(); + + assert_eq!( + chunk_batch_before.chunk_schema(), + chunk_batch_after.chunk_schema() + ); + assert_eq!(chunk_batch_before.num_rows(), chunk_batch_after.num_rows()); - let chunk_after = Chunk::from_record_batch(chunk_batch.into()).unwrap(); + let chunk_after = Chunk::from_chunk_batch(&chunk_batch_after).unwrap(); assert_eq!(chunk_before.entity_path(), chunk_after.entity_path()); assert_eq!( diff --git a/crates/store/re_sorbet/Cargo.toml b/crates/store/re_sorbet/Cargo.toml index 929fcfa20188..be87b024fd93 100644 --- a/crates/store/re_sorbet/Cargo.toml +++ b/crates/store/re_sorbet/Cargo.toml @@ -20,6 +20,7 @@ all-features = true [dependencies] +re_arrow_util.workspace = true re_format_arrow.workspace = true re_log_types.workspace = true re_log.workspace = true diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs index e6cc158a6fad..791672b88ad1 100644 --- a/crates/store/re_sorbet/src/chunk_batch.rs +++ b/crates/store/re_sorbet/src/chunk_batch.rs @@ -1,5 +1,8 @@ use arrow::{ - array::{ArrayRef as ArrowArrayRef, RecordBatch as ArrowRecordBatch, RecordBatchOptions}, + array::{ + Array as _, ArrayRef as ArrowArrayRef, AsArray, RecordBatch as ArrowRecordBatch, + RecordBatchOptions, StructArray as ArrowStructArray, + }, datatypes::{Fields as ArrowFields, Schema as ArrowSchema}, }; @@ -7,7 +10,8 @@ use re_log_types::EntityPath; use re_types_core::ChunkId; use crate::{ - chunk_schema::InvalidChunkSchema, ArrowBatchMetadata, ChunkSchema, WrongDatatypeError, + chunk_schema::InvalidChunkSchema, ArrowBatchMetadata, ChunkSchema, ComponentColumnDescriptor, + RowIdColumnDescriptor, TimeColumnDescriptor, WrongDatatypeError, }; #[derive(thiserror::Error, Debug)] @@ -144,6 +148,37 @@ impl ChunkBatch { pub fn arrow_bacth_metadata(&self) -> &ArrowBatchMetadata { 
        &self.batch.schema_ref().metadata
     }
+
+    pub fn row_id_column(&self) -> (&RowIdColumnDescriptor, &ArrowStructArray) {
+        // The first column is always the row IDs.
+        (
+            &self.schema.row_id_column,
+            self.batch.columns()[0]
+                .as_struct_opt()
+                .expect("Row IDs should be encoded as struct"),
+        )
+    }
+
+    /// The columns of the indices (timelines).
+    pub fn index_columns(&self) -> impl Iterator<Item = (&TimeColumnDescriptor, &ArrowArrayRef)> {
+        itertools::izip!(
+            &self.schema.index_columns,
+            self.batch.columns().iter().skip(1) // skip row IDs
+        )
+    }
+
+    /// The columns of the component data.
+    pub fn data_columns(
+        &self,
+    ) -> impl Iterator<Item = (&ComponentColumnDescriptor, &ArrowArrayRef)> {
+        itertools::izip!(
+            &self.schema.data_columns,
+            self.batch
+                .columns()
+                .iter()
+                .skip(1 + self.schema.index_columns.len()) // skip row IDs and indices
+        )
+    }
 }
 
 impl std::fmt::Display for ChunkBatch {
@@ -189,6 +224,8 @@ impl TryFrom<ArrowRecordBatch> for ChunkBatch {
     fn try_from(batch: ArrowRecordBatch) -> Result<Self, Self::Error> {
         let chunk_schema = ChunkSchema::try_from(batch.schema_ref().as_ref())?;
 
+        // TODO: sanity check that the schema matches the columns
+
         // Extend with any metadata that might have been missing:
         let mut arrow_schema = ArrowSchema::clone(batch.schema_ref().as_ref());
         arrow_schema
diff --git a/crates/store/re_sorbet/src/chunk_schema.rs b/crates/store/re_sorbet/src/chunk_schema.rs
index b80784621244..3a73149d3f8e 100644
--- a/crates/store/re_sorbet/src/chunk_schema.rs
+++ b/crates/store/re_sorbet/src/chunk_schema.rs
@@ -42,7 +42,7 @@ impl InvalidChunkSchema {
 ///
 /// This does NOT preserve custom arrow metadata.
 /// It only contains the metadata used by Rerun.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ChunkSchema {
     /// The globally unique ID of this chunk.
     pub chunk_id: ChunkId,
@@ -240,7 +240,7 @@ impl TryFrom<&ArrowSchema> for ChunkSchema {
         let row_id_column = RowIdColumnDescriptor::try_from(first_field.as_ref())
             .map_err(InvalidChunkSchema::BadRowIdColumn)?;
 
-        let columns: Result<Vec<_>, _> = fields
+        let index_and_data_columns: Result<Vec<_>, _> = fields
             .iter()
             .skip(1)
             .map(|field| {
                 ColumnDescriptor::try_from(field.as_ref()).map_err(|err| {
                     InvalidChunkSchema::BadColumn {
                         field_name: field.name().to_owned(),
                         error: err,
                     }
                 })
             })
             .collect();
-        let columns = columns?;
+        let index_and_data_columns = index_and_data_columns?;
 
         // Index columns should always come first:
-        let num_index_columns = columns.partition_point(|p| matches!(p, ColumnDescriptor::Time(_)));
+        let num_index_columns =
+            index_and_data_columns.partition_point(|p| matches!(p, ColumnDescriptor::Time(_)));
 
-        let index_columns = columns[0..num_index_columns]
+        let index_columns = index_and_data_columns[0..num_index_columns]
             .iter()
             .filter_map(|c| match c {
                 ColumnDescriptor::Time(column) => Some(column.clone()),
                 ColumnDescriptor::Component(_) => None,
             })
             .collect_vec();
-        let data_columns = columns[0..num_index_columns]
+        let data_columns = index_and_data_columns[num_index_columns..]
.iter() .filter_map(|c| match c { ColumnDescriptor::Time(_) => None, @@ -272,7 +273,7 @@ impl TryFrom<&ArrowSchema> for ChunkSchema { }) .collect_vec(); - if index_columns.len() + data_columns.len() < columns.len() { + if index_columns.len() + data_columns.len() != index_and_data_columns.len() { return Err(InvalidChunkSchema::UnorderedIndexAndDataColumns); } From 40fd804336fac89d0968228ce269f9224a89a81a Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 10 Feb 2025 09:53:32 +0100 Subject: [PATCH 07/38] Less use of `TransportChunk` --- crates/viewer/re_chunk_store_ui/src/chunk_ui.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs b/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs index b61273cf22e1..32fbab0d5e8e 100644 --- a/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs +++ b/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs @@ -1,12 +1,11 @@ use std::collections::BTreeMap; use std::sync::Arc; -use arrow::array::Array as _; +use arrow::array::{Array as _, RecordBatch as ArrowRecordBatch}; use egui_extras::{Column, TableRow}; use itertools::Itertools; use re_byte_size::SizeBytes; -use re_chunk_store::external::re_chunk::TransportChunk; use re_chunk_store::Chunk; use re_log_types::{TimeZone, Timeline}; use re_types::datatypes::TimeInt; @@ -204,7 +203,7 @@ impl ChunkUi { } }; - let fields_ui = |ui: &mut egui::Ui, transport: &TransportChunk| { + let fields_ui = |ui: &mut egui::Ui, transport: &ArrowRecordBatch| { for field in &transport.schema_ref().fields { ui.push_id(field.name().clone(), |ui| { ui.list_item_collapsible_noninteractive_label(field.name(), false, |ui| { From 8a4b5d6d3309737f2c44f6cbb58f478c1651abfd Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 10 Feb 2025 09:54:43 +0100 Subject: [PATCH 08/38] Remove unwrap; fix compilation of tests --- Cargo.lock | 1 - crates/store/re_sorbet/Cargo.toml | 1 - crates/store/re_sorbet/src/chunk_batch.rs | 6 +++++- crates/store/re_sorbet/src/column_schema.rs | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8a2d9ab3cc65..a21a840c791a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6588,7 +6588,6 @@ version = "0.23.0-alpha.1+dev" dependencies = [ "arrow", "itertools 0.13.0", - "re_arrow_util", "re_format_arrow", "re_log", "re_log_types", diff --git a/crates/store/re_sorbet/Cargo.toml b/crates/store/re_sorbet/Cargo.toml index be87b024fd93..929fcfa20188 100644 --- a/crates/store/re_sorbet/Cargo.toml +++ b/crates/store/re_sorbet/Cargo.toml @@ -20,7 +20,6 @@ all-features = true [dependencies] -re_arrow_util.workspace = true re_format_arrow.workspace = true re_log_types.workspace = true re_log.workspace = true diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs index 791672b88ad1..7071f2eb48b2 100644 --- a/crates/store/re_sorbet/src/chunk_batch.rs +++ b/crates/store/re_sorbet/src/chunk_batch.rs @@ -102,7 +102,11 @@ impl ChunkBatch { arrow_columns, &RecordBatchOptions::default().with_row_count(Some(row_count)), ) - .unwrap(); + .map_err(|err| { + MismatchedChunkSchemaError::custom(format!( + "Failed to create arrow record batch: {err}" + )) + })?; Ok(Self { schema, batch }) } diff --git a/crates/store/re_sorbet/src/column_schema.rs b/crates/store/re_sorbet/src/column_schema.rs index 9f7cf34ac02e..f22253cbdabe 100644 --- a/crates/store/re_sorbet/src/column_schema.rs +++ b/crates/store/re_sorbet/src/column_schema.rs @@ -114,6 +114,7 @@ fn test_schema_over_ipc() { 
arrow::datatypes::TimeUnit::Nanosecond, None, ), + is_sorted: true, }), ColumnDescriptor::Component(ComponentColumnDescriptor { entity_path: re_log_types::EntityPath::from("/some/path"), From 538b9b311e87fbfd5661bc5df17386883f27d304 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 10 Feb 2025 13:26:34 +0100 Subject: [PATCH 09/38] Remove one use of TransportChunk --- Cargo.lock | 1 + crates/top/rerun/Cargo.toml | 1 + crates/top/rerun/src/commands/rrd/filter.rs | 19 +++++++------------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a21a840c791a..3bd1059c03e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7502,6 +7502,7 @@ dependencies = [ "re_sdk", "re_sdk_comms", "re_smart_channel", + "re_sorbet", "re_tracing", "re_types", "re_video", diff --git a/crates/top/rerun/Cargo.toml b/crates/top/rerun/Cargo.toml index 974fddd12dcf..a6f1233c01f5 100644 --- a/crates/top/rerun/Cargo.toml +++ b/crates/top/rerun/Cargo.toml @@ -137,6 +137,7 @@ re_log_types.workspace = true re_log.workspace = true re_memory.workspace = true re_smart_channel.workspace = true +re_sorbet.workspace = true re_tracing.workspace = true re_video.workspace = true diff --git a/crates/top/rerun/src/commands/rrd/filter.rs b/crates/top/rerun/src/commands/rrd/filter.rs index 6b2256d4056a..6f8c84f8dd96 100644 --- a/crates/top/rerun/src/commands/rrd/filter.rs +++ b/crates/top/rerun/src/commands/rrd/filter.rs @@ -58,7 +58,7 @@ impl FilterCommand { let now = std::time::Instant::now(); re_log::info!(srcs = ?path_to_input_rrds, ?dropped_timelines, "filter started"); - let dropped_timelines: HashSet<_> = dropped_timelines.iter().collect(); + let dropped_timelines: HashSet<_> = dropped_timelines.iter().cloned().collect(); let dropped_entity_paths: HashSet = dropped_entity_paths .iter() .map(|s| EntityPath::parse_forgiving(s)) @@ -120,7 +120,7 @@ impl FilterCommand { msg.batch.columns() ) .filter(|(field, _col)| { - should_keep_timeline(&dropped_timelines, field) + !is_field_timeline_of(field, &dropped_timelines) }) .map(|(field, col)| (field.clone(), col.clone())) .unzip(); @@ -201,16 +201,11 @@ impl FilterCommand { // --- -fn should_keep_timeline(dropped_timelines: &HashSet<&String>, field: &ArrowField) -> bool { - let is_timeline = field - .metadata() - .get(TransportChunk::FIELD_METADATA_KEY_KIND) - .map(|s| s.as_str()) - == Some(TransportChunk::FIELD_METADATA_VALUE_KIND_TIME); - - let is_dropped = dropped_timelines.contains(field.name()); - - !is_timeline || !is_dropped +// Does the given field represent a timeline that is in the given set? 
+fn is_field_timeline_of(field: &ArrowField, dropped_timelines: &HashSet<String>) -> bool {
+    re_sorbet::TimeColumnDescriptor::try_from(field)
+        .ok()
+        .is_some_and(|schema| dropped_timelines.contains(schema.name().as_str()))
+}

 fn should_keep_entity_path(

From 851c7fed2add0463104b025e93158ea78405dd Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 13:30:57 +0100
Subject: [PATCH 10/38] Use re_sorbet in rerun cli filtering

---
 crates/top/rerun/src/commands/rrd/filter.rs | 77 +++++++++------------
 1 file changed, 34 insertions(+), 43 deletions(-)

diff --git a/crates/top/rerun/src/commands/rrd/filter.rs b/crates/top/rerun/src/commands/rrd/filter.rs
index 6f8c84f8dd96..fc9a30a3244e 100644
--- a/crates/top/rerun/src/commands/rrd/filter.rs
+++ b/crates/top/rerun/src/commands/rrd/filter.rs
@@ -8,7 +8,7 @@ use arrow::{
 use itertools::Either;

 use re_build_info::CrateVersion;
-use re_chunk::{external::crossbeam, TransportChunk};
+use re_chunk::external::crossbeam;
 use re_sdk::{external::arrow, EntityPath};

 use crate::commands::read_rrd_streams_from_file_or_stdin;
@@ -111,32 +111,40 @@ impl FilterCommand {
                 Ok(msg) => {
                     let msg = match msg {
                         re_log_types::LogMsg::ArrowMsg(store_id, mut msg) => {
-                            if !should_keep_entity_path(&dropped_entity_paths, &msg.batch.schema())
+                            match re_sorbet::ChunkSchema::try_from(msg.batch.schema_ref().as_ref())
                             {
-                                None
-                            } else {
-                                let (fields, columns): (Vec<_>, Vec<_>) = itertools::izip!(
-                                    &msg.batch.schema().fields,
-                                    msg.batch.columns()
-                                )
-                                .filter(|(field, _col)| {
-                                    !is_field_timeline_of(field, &dropped_timelines)
-                                })
-                                .map(|(field, col)| (field.clone(), col.clone()))
-                                .unzip();
-
-                                if let Ok(new_batch) = ArrowRecordBatch::try_new(
-                                    ArrowSchema::new_with_metadata(
-                                        fields,
-                                        msg.batch.schema().metadata().clone(),
-                                    )
-                                    .into(),
-                                    columns,
-                                ) {
-                                    msg.batch = new_batch;
-                                    Some(re_log_types::LogMsg::ArrowMsg(store_id, msg))
-                                } else {
-                                    None // Probably failed because we filtered out everything
+                                Ok(schema) => {
+                                    if dropped_entity_paths.contains(&schema.entity_path) {
+                                        None
+                                    } else {
+                                        let (fields, columns): (Vec<_>, Vec<_>) = itertools::izip!(
+                                            &msg.batch.schema().fields,
+                                            msg.batch.columns()
+                                        )
+                                        .filter(|(field, _col)| {
+                                            !is_field_timeline_of(field, &dropped_timelines)
+                                        })
+                                        .map(|(field, col)| (field.clone(), col.clone()))
+                                        .unzip();
+
+                                        if let Ok(new_batch) = ArrowRecordBatch::try_new(
+                                            ArrowSchema::new_with_metadata(
+                                                fields,
+                                                msg.batch.schema().metadata().clone(),
+                                            )
+                                            .into(),
+                                            columns,
+                                        ) {
+                                            msg.batch = new_batch;
+                                            Some(re_log_types::LogMsg::ArrowMsg(store_id, msg))
+                                        } else {
+                                            None // Probably failed because we filtered out everything
+                                        }
+                                    }
+                                }
+                                Err(err) => {
+                                    re_log::warn_once!("Failed to parse chunk schema: {err}");
+                                    None
                                 }
                             }
                         }
@@ -207,20 +215,3 @@ fn is_field_timeline_of(field: &ArrowField, dropped_timelines: &HashSet<String>
         .ok()
         .is_some_and(|schema| dropped_timelines.contains(schema.name().as_str()))
 }
-
-fn should_keep_entity_path(
-    dropped_entity_paths: &HashSet<EntityPath>,
-    schema: &ArrowSchema,
-) -> bool {
-    let Some(entity_path) = schema
-        .metadata
-        .get(TransportChunk::CHUNK_METADATA_KEY_ENTITY_PATH)
-        .map(|s| EntityPath::parse_forgiving(s))
-    else {
-        return true;
-    };
-
-    let is_dropped = dropped_entity_paths.contains(&entity_path);
-
-    !is_dropped
-}

From c42d842c7e9736402b3610f85d01b90d31ffa43e Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 13:45:43 +0100
Subject: [PATCH 11/38] No TransportChunk in re_grpc_client

---
 crates/store/re_grpc_client/src/lib.rs | 23 ++++++++++-------------
 rerun_py/src/remote.rs                 |  5 +----
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/crates/store/re_grpc_client/src/lib.rs b/crates/store/re_grpc_client/src/lib.rs
index 9c063d094497..38df97ee48e7 100644
--- a/crates/store/re_grpc_client/src/lib.rs
+++ b/crates/store/re_grpc_client/src/lib.rs
@@ -223,8 +223,7 @@ async fn stream_recording_async(
         }));
     }

-    let store_info =
-        store_info_from_catalog_chunk(&TransportChunk::from(resp[0].clone()), &recording_id)?;
+    let store_info = store_info_from_catalog_chunk(&resp[0], &recording_id)?;
     let store_id = store_info.store_id.clone();

     re_log::debug!("Fetching {recording_id}…");
@@ -281,39 +280,37 @@
 }

 pub fn store_info_from_catalog_chunk(
-    tc: &TransportChunk,
+    record_batch: &ArrowRecordBatch,
     recording_id: &str,
 ) -> Result<StoreInfo, StreamError> {
     let store_id = StoreId::from_string(StoreKind::Recording, recording_id.to_owned());

-    let (_field, data) = tc
-        .components()
-        .find(|(f, _)| f.name() == CATALOG_APP_ID_FIELD_NAME)
+    let data = record_batch
+        .column_by_name(CATALOG_APP_ID_FIELD_NAME)
         .ok_or(StreamError::ChunkError(re_chunk::ChunkError::Malformed {
-            reason: "no {CATALOG_APP_ID_FIELD_NAME} field found".to_owned(),
+            reason: format!("no {CATALOG_APP_ID_FIELD_NAME} field found"),
         }))?;
     let app_id = data
         .downcast_array_ref::<arrow::array::StringArray>()
         .ok_or(StreamError::ChunkError(re_chunk::ChunkError::Malformed {
             reason: format!(
                 "{CATALOG_APP_ID_FIELD_NAME} must be a utf8 array: {:?}",
-                tc.schema_ref()
+                record_batch.schema_ref()
             ),
         }))?
         .value(0);

-    let (_field, data) = tc
-        .components()
-        .find(|(f, _)| f.name() == CATALOG_START_TIME_FIELD_NAME)
+    let data = record_batch
+        .column_by_name(CATALOG_START_TIME_FIELD_NAME)
         .ok_or(StreamError::ChunkError(re_chunk::ChunkError::Malformed {
-            reason: "no {CATALOG_START_TIME_FIELD_NAME}} field found".to_owned(),
+            reason: format!("no {CATALOG_START_TIME_FIELD_NAME} field found"),
         }))?;
     let start_time = data
         .downcast_array_ref::<arrow::array::TimestampNanosecondArray>()
         .ok_or(StreamError::ChunkError(re_chunk::ChunkError::Malformed {
             reason: format!(
                 "{CATALOG_START_TIME_FIELD_NAME} must be a Timestamp array: {:?}",
-                tc.schema_ref()
+                record_batch.schema_ref()
             ),
         }))?
.value(0);

diff --git a/rerun_py/src/remote.rs b/rerun_py/src/remote.rs
index c5b5afc0dc5a..48febc779f14 100644
--- a/rerun_py/src/remote.rs
+++ b/rerun_py/src/remote.rs
@@ -138,10 +138,7 @@ impl PyStorageNodeClient {
                 ));
             }

-            re_grpc_client::store_info_from_catalog_chunk(
-                &re_chunk::TransportChunk::from(resp[0].clone()),
-                id,
-            )
+            re_grpc_client::store_info_from_catalog_chunk(&resp[0], id)
         })
         .map_err(|err| PyRuntimeError::new_err(err.to_string()))?;

From b4ed1e09cbf491f4913e572587f5da1ef61c1a1a Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 13:58:02 +0100
Subject: [PATCH 12/38] Remove `Chunk::from_transport`

---
 crates/store/re_chunk/src/chunk.rs     |   3 +
 crates/store/re_chunk/src/transport.rs | 148 +-----------------
 .../tests/arrow_encode_roundtrip.rs    |   4 +-
 crates/store/re_sorbet/src/lib.rs      |   2 +-
 4 files changed, 12 insertions(+), 145 deletions(-)

diff --git a/crates/store/re_chunk/src/chunk.rs b/crates/store/re_chunk/src/chunk.rs
index 7d63e7e39f58..14bc14e2ec6a 100644
--- a/crates/store/re_chunk/src/chunk.rs
+++ b/crates/store/re_chunk/src/chunk.rs
@@ -54,6 +54,9 @@ pub enum ChunkError {

     #[error(transparent)]
     MismatchedChunkSchemaError(#[from] re_sorbet::MismatchedChunkSchemaError),
+
+    #[error(transparent)]
+    InvalidChunkSchema(#[from] re_sorbet::InvalidChunkSchema),
 }

 pub type ChunkResult<T> = Result<T, ChunkError>;

diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs
index a0ad43438a63..7db98a534425 100644
--- a/crates/store/re_chunk/src/transport.rs
+++ b/crates/store/re_chunk/src/transport.rs
@@ -1,12 +1,9 @@
 use arrow::{
     array::{
         Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
-        RecordBatch as ArrowRecordBatch, StructArray as ArrowStructArray,
-    },
-    datatypes::{
-        DataType as ArrowDatatype, Field as ArrowField, Fields as ArrowFields,
-        Schema as ArrowSchema, TimeUnit as ArrowTimeUnit,
+        RecordBatch as ArrowRecordBatch,
     },
+    datatypes::{Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema},
 };
 use itertools::Itertools;
 use nohash_hasher::IntMap;
 use tap::Tap as _;

 use re_arrow_util::{arrow_util::into_arrow_ref, ArrowArrayDowncastRef as _};
 use re_byte_size::SizeBytes as _;
-use re_log_types::{EntityPath, Timeline};
+use re_log_types::EntityPath;
 use re_types_core::{
     arrow_helpers::as_array_ref, Component as _, ComponentDescriptor, Loggable as _,
 };
@@ -754,145 +751,12 @@ impl Chunk {
     }

     pub fn from_record_batch(batch: ArrowRecordBatch) -> ChunkResult<Self> {
-        Self::from_transport(&TransportChunk::from(batch))
-    }
-
-    pub fn from_transport(transport: &TransportChunk) -> ChunkResult<Self> {
         re_tracing::profile_function!(format!(
             "num_columns={} num_rows={}",
-            transport.num_columns(),
-            transport.num_rows()
+            batch.num_columns(),
+            batch.num_rows()
         ));
-
-        // Metadata
-        let (id, entity_path, is_sorted) = {
-            re_tracing::profile_scope!("metadata");
-            (
-                transport.id()?,
-                transport.entity_path()?,
-                transport.is_sorted(),
-            )
-        };
-
-        // Row IDs
-        let row_ids = {
-            re_tracing::profile_scope!("row ids");
-
-            let Some(row_ids) = transport.controls().find_map(|(field, column)| {
-                // TODO(cmc): disgusting, but good enough for now.
-                (field.name() == RowId::descriptor().component_name.as_str()).then_some(column)
-            }) else {
-                return Err(ChunkError::Malformed {
-                    reason: format!("missing row_id column ({:?})", transport.schema_ref()),
-                });
-            };
-
-            ArrowArrayRef::from(row_ids.clone())
-                .downcast_array_ref::<ArrowStructArray>()
-                .ok_or_else(|| ChunkError::Malformed {
-                    reason: format!(
-                        "RowId data has the wrong datatype: expected {:?} but got {:?} instead",
-                        RowId::arrow_datatype(),
-                        *row_ids.data_type(),
-                    ),
-                })?
-                .clone()
-        };
-
-        // Timelines
-        let timelines = {
-            re_tracing::profile_scope!("timelines");
-
-            let mut timelines = IntMap::default();
-
-            for (field, column) in transport.timelines() {
-                // See also [`Timeline::datatype`]
-                let timeline = match column.data_type() {
-                    ArrowDatatype::Int64 => Timeline::new_sequence(field.name().as_str()),
-                    ArrowDatatype::Timestamp(ArrowTimeUnit::Nanosecond, None) => {
-                        Timeline::new_temporal(field.name().as_str())
-                    }
-                    _ => {
-                        return Err(ChunkError::Malformed {
-                            reason: format!(
-                                "time column '{}' is not deserializable ({:?})",
-                                field.name(),
-                                column.data_type()
-                            ),
-                        });
-                    }
-                };
-
-                let times = TimeColumn::read_array(&ArrowArrayRef::from(column.clone())).map_err(
-                    |err| ChunkError::Malformed {
-                        reason: format!("Bad time column '{}': {err}", field.name()),
-                    },
-                )?;
-
-                let is_sorted = field
-                    .metadata()
-                    .contains_key(TransportChunk::FIELD_METADATA_MARKER_IS_SORTED_BY_TIME);
-
-                let time_column = TimeColumn::new(is_sorted.then_some(true), timeline, times);
-                if timelines.insert(timeline, time_column).is_some() {
-                    return Err(ChunkError::Malformed {
-                        reason: format!(
-                            "time column '{}' was specified more than once",
-                            field.name(),
-                        ),
-                    });
-                }
-            }
-
-            timelines
-        };
-
-        // Components
-        let components = {
-            let mut components = ChunkComponents::default();
-
-            for (field, column) in transport.components() {
-                let column = column
-                    .downcast_array_ref::<ArrowListArray>()
-                    .ok_or_else(|| ChunkError::Malformed {
-                        reason: format!(
-                            "The outer array in a chunked component batch must be a sparse list, got {:?}",
-                            column.data_type(),
-                        ),
-                    })?;
-
-                let component_desc = TransportChunk::component_descriptor_from_field(field);
-
-                if components
-                    .insert_descriptor(component_desc, column.clone())
-                    .is_some()
-                {
-                    return Err(ChunkError::Malformed {
-                        reason: format!(
-                            "component column '{}' was specified more than once",
-                            field.name(),
-                        ),
-                    });
-                }
-            }
-
-            components
-        };
-
-        let mut res = Self::new(
-            id,
-            entity_path,
-            is_sorted.then_some(true),
-            row_ids,
-            timelines,
-            components,
-        )?;
-
-        if let Some(heap_size_bytes) = transport.heap_size_bytes() {
-            res.heap_size_bytes = heap_size_bytes.into();
-        }
-
-        Ok(res)
+        Self::from_chunk_batch(&re_sorbet::ChunkBatch::try_from(batch)?)
    }
 }

diff --git a/crates/store/re_log_encoding/tests/arrow_encode_roundtrip.rs b/crates/store/re_log_encoding/tests/arrow_encode_roundtrip.rs
index de3fb567958b..2e80fc16a2f9 100644
--- a/crates/store/re_log_encoding/tests/arrow_encode_roundtrip.rs
+++ b/crates/store/re_log_encoding/tests/arrow_encode_roundtrip.rs
@@ -30,8 +30,8 @@ fn encode_roundtrip() {
         .build()
         .unwrap();

-    let transport = chunk.to_transport().unwrap();
-    assert_eq!(Chunk::from_transport(&transport).unwrap(), chunk);
+    let record_batch = chunk.to_record_batch().unwrap();
+    assert_eq!(Chunk::from_record_batch(record_batch).unwrap(), chunk);

     let arrow_msg = chunk.to_arrow_msg().unwrap();
     assert_eq!(Chunk::from_arrow_msg(&arrow_msg).unwrap(), chunk);

diff --git a/crates/store/re_sorbet/src/lib.rs b/crates/store/re_sorbet/src/lib.rs
index fb1c754d9604..4bff398ecc02 100644
--- a/crates/store/re_sorbet/src/lib.rs
+++ b/crates/store/re_sorbet/src/lib.rs
@@ -13,7 +13,7 @@ mod row_id_column_schema;

 pub use self::{
     chunk_batch::{ChunkBatch, MismatchedChunkSchemaError},
-    chunk_schema::ChunkSchema,
+    chunk_schema::{ChunkSchema, InvalidChunkSchema},
     column_schema::{ColumnDescriptor, ColumnError},
     data_column_schema::ComponentColumnDescriptor,
     index_column_schema::{TimeColumnDescriptor, UnsupportedTimeType},

From 7fc72935fb0bf0e2c0e83502e0e344520db4b243 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 14:24:18 +0100
Subject: [PATCH 13/38] More robust parsing of ComponentColumnDescriptor

---
 crates/store/re_sorbet/src/chunk_schema.rs    |  8 ++---
 crates/store/re_sorbet/src/column_schema.rs   | 29 ++++++++++++++-----
 .../store/re_sorbet/src/data_column_schema.rs | 29 +++++++++++++++----
 rerun_py/src/remote.rs                        | 17 +++++++++--
 4 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/crates/store/re_sorbet/src/chunk_schema.rs b/crates/store/re_sorbet/src/chunk_schema.rs
index 3a73149d3f8e..b03abe34f6ca 100644
--- a/crates/store/re_sorbet/src/chunk_schema.rs
+++ b/crates/store/re_sorbet/src/chunk_schema.rs
@@ -244,12 +244,12 @@ impl TryFrom<&ArrowSchema> for ChunkSchema {
             .iter()
             .skip(1)
             .map(|field| {
-                ColumnDescriptor::try_from(field.as_ref()).map_err(|err| {
-                    InvalidChunkSchema::BadColumn {
+                ColumnDescriptor::try_from_arrow_field(Some(&entity_path), field.as_ref()).map_err(
+                    |err| InvalidChunkSchema::BadColumn {
                         field_name: field.name().to_owned(),
                         error: err,
-                    }
-                })
+                    },
+                )
             })
             .collect();
         let index_and_data_columns = index_and_data_columns?;

diff --git a/crates/store/re_sorbet/src/column_schema.rs b/crates/store/re_sorbet/src/column_schema.rs
index f22253cbdabe..68b99e71965b 100644
--- a/crates/store/re_sorbet/src/column_schema.rs
+++ b/crates/store/re_sorbet/src/column_schema.rs
@@ -80,22 +80,34 @@ impl ColumnDescriptor {
         columns.iter().map(|c| c.to_arrow_field()).collect()
     }

-    pub fn from_arrow_fields(fields: &[ArrowFieldRef]) -> Result<Vec<Self>, ColumnError> {
+    /// `chunk_entity_path`: if this column is part of a chunk batch,
+    /// what is its entity path (so we can set [`ComponentColumnDescriptor::entity_path`])?
+    pub fn from_arrow_fields(
+        chunk_entity_path: Option<&EntityPath>,
+        fields: &[ArrowFieldRef],
+    ) -> Result<Vec<Self>, ColumnError> {
         fields
             .iter()
-            .map(|field| Self::try_from(field.as_ref()))
+            .map(|field| Self::try_from_arrow_field(chunk_entity_path, field.as_ref()))
             .collect()
     }
 }

-impl TryFrom<&ArrowField> for ColumnDescriptor {
-    type Error = ColumnError;
-
-    fn try_from(field: &ArrowField) -> Result<Self, Self::Error> {
+impl ColumnDescriptor {
+    /// `chunk_entity_path`: if this column is part of a chunk batch,
+    /// what is its entity path (so we can set [`ComponentColumnDescriptor::entity_path`])?
+    pub fn try_from_arrow_field(
+        chunk_entity_path: Option<&EntityPath>,
+        field: &ArrowField,
+    ) -> Result<Self, ColumnError> {
         let kind = field.get_or_err("rerun.kind")?;
         match kind {
             "index" | "time" => Ok(Self::Time(TimeColumnDescriptor::try_from(field)?)),
-            "data" => Ok(Self::Component(ComponentColumnDescriptor::try_from(field)?)),
+
+            "data" => Ok(Self::Component(
+                ComponentColumnDescriptor::try_from_arrow_field(chunk_entity_path, field)?,
+            )),
+
             _ => Err(ColumnError::UnsupportedColumnKind {
                 kind: kind.to_owned(),
             }),
@@ -136,6 +148,7 @@ fn test_schema_over_ipc() {
     let recovered_schema = crate::schema_from_ipc(&ipc_bytes).unwrap();
     assert_eq!(recovered_schema.as_ref(), &original_schema);

-    let recovered_columns = ColumnDescriptor::from_arrow_fields(&recovered_schema.fields).unwrap();
+    let recovered_columns =
+        ColumnDescriptor::from_arrow_fields(None, &recovered_schema.fields).unwrap();
     assert_eq!(recovered_columns, original_columns);
 }

diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs
index 26659ec158f6..46fc97dbdeca 100644
--- a/crates/store/re_sorbet/src/data_column_schema.rs
+++ b/crates/store/re_sorbet/src/data_column_schema.rs
@@ -21,6 +21,10 @@ pub struct ComponentColumnDescriptor {
     pub store_datatype: ArrowDatatype,

     /// The path of the entity.
+    ///
+    /// If this column is part of a chunk batch,
+    /// this is the same for all columns in the batch,
+    /// and will also be set in the schema for the whole chunk.
     pub entity_path: EntityPath,

     /// Optional name of the `Archetype` associated with this data.
@@ -220,16 +224,31 @@ impl ComponentColumnDescriptor {
     }
 }

-impl TryFrom<&ArrowField> for ComponentColumnDescriptor {
-    type Error = MissingFieldMetadata;
+impl ComponentColumnDescriptor {
+    /// `chunk_entity_path`: if this column is part of a chunk batch,
+    /// what is its entity path (so we can set [`ComponentColumnDescriptor::entity_path`])?
+    pub fn try_from_arrow_field(
+        chunk_entity_path: Option<&EntityPath>,
+        field: &ArrowField,
+    ) -> Result<Self, MissingFieldMetadata> {
+        let entity_path = if let Some(chunk_entity_path) = chunk_entity_path {
+            chunk_entity_path.clone()
+        } else {
+            EntityPath::parse_forgiving(field.get_or_err("rerun.entity_path")?)
+        };
+
+        let component_name = if let Some(component_name) = field.get_opt("rerun.component") {
+            ComponentName::from(component_name)
+        } else {
+            ComponentName::new(field.name()) // Legacy fallback
+        };

-    fn try_from(field: &ArrowField) -> Result<Self, Self::Error> {
         Ok(Self {
             store_datatype: field.data_type().clone(),
-            entity_path: EntityPath::parse_forgiving(field.get_or_err("rerun.entity_path")?),
+            entity_path,
             archetype_name: field.get_opt("rerun.archetype").map(|x| x.into()),
             archetype_field_name: field.get_opt("rerun.archetype_field").map(|x| x.into()),
-            component_name: field.get_or_err("rerun.component")?.into(),
+            component_name,
             is_static: field.get_bool("rerun.is_static"),
             is_indicator: field.get_bool("rerun.is_indicator"),
             is_tombstone: field.get_bool("rerun.is_tombstone"),

diff --git a/rerun_py/src/remote.rs b/rerun_py/src/remote.rs
index 48febc779f14..1b6db9ad1fd1 100644
--- a/rerun_py/src/remote.rs
+++ b/rerun_py/src/remote.rs
@@ -289,9 +289,20 @@ impl PyStorageNodeClient {
             let arrow_schema = ArrowSchema::try_from(&schema)
                 .map_err(|err| PyRuntimeError::new_err(err.to_string()))?;

-            let column_descriptors =
-                re_sorbet::ColumnDescriptor::from_arrow_fields(&arrow_schema.fields)
-                    .map_err(|err| PyRuntimeError::new_err(err.to_string()))?;
+            let chunk_schema = re_sorbet::ChunkSchema::try_from(&arrow_schema)
+                .map_err(|err| PyRuntimeError::new_err(err.to_string()))?;
+
+            let column_descriptors = itertools::chain!(
+                chunk_schema
+                    .index_columns
+                    .into_iter()
+                    .map(re_sorbet::ColumnDescriptor::Time),
+                chunk_schema
+                    .data_columns
+                    .into_iter()
+                    .map(re_sorbet::ColumnDescriptor::Component),
+            )
+            .collect();

             Ok(PySchema {
                 schema: column_descriptors,

From d0b92c9ee2d4b7c5ce3ce288c5c28dff6cfee12d Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 14:24:27 +0100
Subject: [PATCH 14/38] Fix roundtripping

---
 crates/store/re_sorbet/src/chunk_batch.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs
index 7071f2eb48b2..54377d75d5e7 100644
--- a/crates/store/re_sorbet/src/chunk_batch.rs
+++ b/crates/store/re_sorbet/src/chunk_batch.rs
@@ -235,7 +235,13 @@ impl TryFrom<ArrowRecordBatch> for ChunkBatch {
         arrow_schema
             .metadata
             .extend(chunk_schema.arrow_batch_metadata());
-        let batch = batch.with_schema(arrow_schema.into()).expect("Can't fail");
+
+        let batch = ArrowRecordBatch::try_new_with_options(
+            arrow_schema.into(),
+            batch.columns().to_vec(),
+            &RecordBatchOptions::default().with_row_count(Some(batch.num_rows())),
+        )
+        .expect("Can't fail");

         Ok(Self {
             schema: chunk_schema,

From ab2ad6f0c056de38d9f58b29dea9444e1b352ec0 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 14:31:21 +0100
Subject: [PATCH 15/38] Use "true" instead of "yes", to make it more JSON-like

---
 ...e_dataframe__query__tests__async_barebones_static.snap | 4 ++--
 ...dataframe__query__tests__async_barebones_temporal.snap | 4 ++--
 .../re_dataframe__query__tests__barebones-2.snap          | 4 ++--
 .../snapshots/re_dataframe__query__tests__barebones.snap  | 4 ++--
 .../snapshots/re_dataframe__query__tests__clears-2.snap   | 4 ++--
 .../src/snapshots/re_dataframe__query__tests__clears.snap | 4 ++--
 .../re_dataframe__query__tests__filtered_index_range.snap | 4 ++--
 ...re_dataframe__query__tests__filtered_index_values.snap | 4 ++--
 ...e_dataframe__query__tests__filtered_is_not_null-2.snap | 4 ++--
 ...e_dataframe__query__tests__filtered_is_not_null-3.snap | 4 ++--
...e_dataframe__query__tests__filtered_is_not_null-4.snap | 4 ++-- .../re_dataframe__query__tests__filtered_is_not_null.snap | 4 ++-- .../re_dataframe__query__tests__selection-2.snap | 5 +++-- .../re_dataframe__query__tests__selection-4.snap | 4 ++-- ...query__tests__sparse_fill_strategy_latestatglobal.snap | 4 ++-- .../re_dataframe__query__tests__using_index_values-2.snap | 4 ++-- .../re_dataframe__query__tests__using_index_values.snap | 4 ++-- .../re_dataframe__query__tests__view_contents-2.snap | 4 ++-- ...aframe__query__tests__view_contents_and_selection.snap | 6 +++--- crates/store/re_sorbet/src/data_column_schema.rs | 8 ++++---- 20 files changed, 44 insertions(+), 43 deletions(-) diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__async_barebones_static.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__async_barebones_static.snap index db957933033a..271d97e97da8 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__async_barebones_static.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__async_barebones_static.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ null ┆ null ┆ null ┆ [c] ┆ null │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__async_barebones_temporal.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__async_barebones_temporal.snap index 7ce18178b1a0..e0f9c9d8b04b 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__async_barebones_temporal.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__async_barebones_temporal.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ 
kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 10 ┆ 1970-01-01T00:00:00.000000010 ┆ null ┆ [c] ┆ [{x: 0.0, y: 0.0}] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__barebones-2.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__barebones-2.snap index 7ce18178b1a0..e0f9c9d8b04b 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__barebones-2.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__barebones-2.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 10 ┆ 1970-01-01T00:00:00.000000010 ┆ null ┆ [c] ┆ [{x: 0.0, y: 0.0}] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__barebones.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__barebones.snap index db957933033a..271d97e97da8 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__barebones.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__barebones.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ null ┆ null ┆ null ┆ [c] ┆ null │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__clears-2.snap 
b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__clears-2.snap index 99889208a5c1..f145b121fe1b 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__clears-2.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__clears-2.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 10 ┆ 1970-01-01T00:00:00.000000010 ┆ null ┆ [c] ┆ [{x: 0.0, y: 0.0}] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__clears.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__clears.snap index 99889208a5c1..f145b121fe1b 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__clears.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__clears.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 10 ┆ 1970-01-01T00:00:00.000000010 ┆ null ┆ [c] ┆ [{x: 0.0, y: 0.0}] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_index_range.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_index_range.snap index b1b085b65851..a2ef8ebd40c1 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_index_range.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_index_range.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: 
"TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 30 ┆ null ┆ [2] ┆ [c] ┆ [{x: 2.0, y: 2.0}] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_index_values.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_index_values.snap index 14ff3c194dd9..cf59831c6c90 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_index_values.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_index_values.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 30 ┆ null ┆ [2] ┆ [c] ┆ [{x: 2.0, y: 2.0}] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-2.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-2.snap index 961f4883fa04..ebb8736328a9 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-2.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-2.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌────────────────────────┬────────────────────────┬──────────────────────────────┬──────────────────────────────┬──────────────────────────────┐ │ frame_nr ┆ log_time ┆ /this/that:example.MyColor ┆ /this/that:example.MyLabel ┆ /this/that:example.MyPoint │ @@ -8,7 +8,7 @@ expression: "TransportChunk::from(dataframe)" │ type: "i64" ┆ 
type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ -│ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ +│ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ ┆ ┆ ┆ kind: "data" ┆ │ ╞════════════════════════╪════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ └────────────────────────┴────────────────────────┴──────────────────────────────┴──────────────────────────────┴──────────────────────────────┘ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-3.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-3.snap index 7ce18178b1a0..e0f9c9d8b04b 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-3.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-3.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 10 ┆ 1970-01-01T00:00:00.000000010 ┆ null ┆ [c] ┆ [{x: 0.0, y: 0.0}] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-4.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-4.snap index f764102f654c..3a99fcafeee9 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-4.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null-4.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" 
┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 30 ┆ null ┆ [2] ┆ [c] ┆ [{x: 2.0, y: 2.0}] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null.snap index 961f4883fa04..ebb8736328a9 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__filtered_is_not_null.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌────────────────────────┬────────────────────────┬──────────────────────────────┬──────────────────────────────┬──────────────────────────────┐ │ frame_nr ┆ log_time ┆ /this/that:example.MyColor ┆ /this/that:example.MyLabel ┆ /this/that:example.MyPoint │ @@ -8,7 +8,7 @@ expression: "TransportChunk::from(dataframe)" │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ -│ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ +│ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ ┆ ┆ ┆ kind: "data" ┆ │ ╞════════════════════════╪════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ └────────────────────────┴────────────────────────┴──────────────────────────────┴──────────────────────────────┴──────────────────────────────┘ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__selection-2.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__selection-2.snap index fc9abb00c569..4c40256a40bd 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__selection-2.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__selection-2.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -11,7 +11,8 @@ expression: "TransportChunk::from(dataframe)" │ │ --- ┆ --- ┆ --- │ │ │ │ type: "i64" ┆ type: "i64" ┆ type: "null" │ │ │ │ index_name: "frame_nr" ┆ index_name: "frame_nr" ┆ index_name: "ATimeColumnThatDoesntExist" │ │ -│ │ kind: "index" ┆ kind: "index" ┆ kind: "index" │ │ +│ │ kind: "index" ┆ kind: "index" ┆ is_sorted: "true" │ │ +│ │ ┆ ┆ kind: "index" │ │ │ ╞════════════════════════╪════════════════════════╪══════════════════════════════════════════╡ │ │ │ 10 ┆ 10 ┆ null │ │ │ ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__selection-4.snap 
b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__selection-4.snap index ad3cdcde7a7b..4b65d98f4a7e 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__selection-4.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__selection-4.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -17,7 +17,7 @@ expression: "TransportChunk::from(dataframe)" │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ entity_path: │ │ │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ "/this/that" │ │ │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is_static: │ │ -│ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ "yes" │ │ +│ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ "true" │ │ │ │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ kind: "data" │ │ │ ╞════════════════╪════════════════╪════════════════╪════════════════╪════════════════╪════════════════╪════════════════╪════════════════╪════════════════╪════════════════╪════════════════╡ │ │ │ 10 ┆ 10 ┆ 10 ┆ 10 ┆ 10 ┆ 10 ┆ 10 ┆ 10 ┆ 10 ┆ 10 ┆ [c] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__sparse_fill_strategy_latestatglobal.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__sparse_fill_strategy_latestatglobal.snap index 611b3867fcb0..76f2eff49bb4 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__sparse_fill_strategy_latestatglobal.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__sparse_fill_strategy_latestatglobal.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 10 ┆ 1970-01-01T00:00:00.000000010 ┆ null ┆ [c] ┆ [{x: 0.0, y: 0.0}] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__using_index_values-2.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__using_index_values-2.snap index 0c0c7ebc3a62..932dc1b111fb 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__using_index_values-2.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__using_index_values-2.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- 
┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 0 ┆ null ┆ null ┆ [c] ┆ null │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__using_index_values.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__using_index_values.snap index bad2cb956e13..f0d9abdc6deb 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__using_index_values.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__using_index_values.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" ┆ type: "List[Struct[2]]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" ┆ component: "example.MyPoint" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" ┆ kind: "data" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" ┆ kind: "data" │ │ │ │ ┆ ┆ ┆ kind: "data" ┆ │ │ │ ╞════════════════════════╪════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 0 ┆ null ┆ null ┆ [c] ┆ null │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__view_contents-2.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__view_contents-2.snap index a81573d92315..3e96d3b4e3e9 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__view_contents-2.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__view_contents-2.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -12,7 +12,7 @@ expression: "TransportChunk::from(dataframe)" │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[Utf8]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" │ │ │ │ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ 
entity_path: "/this/that" │ │ -│ │ ┆ ┆ kind: "data" ┆ is_static: "yes" │ │ +│ │ ┆ ┆ kind: "data" ┆ is_static: "true" │ │ │ │ ┆ ┆ ┆ kind: "data" │ │ │ ╞════════════════════════╪════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 30 ┆ null ┆ [2] ┆ [c] │ │ diff --git a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__view_contents_and_selection.snap b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__view_contents_and_selection.snap index 88cb601328a1..32c026cfed50 100644 --- a/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__view_contents_and_selection.snap +++ b/crates/store/re_dataframe/src/snapshots/re_dataframe__query__tests__view_contents_and_selection.snap @@ -1,6 +1,6 @@ --- source: crates/store/re_dataframe/src/query.rs -expression: "TransportChunk::from(dataframe)" +expression: DisplayRB(dataframe) --- ┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ CHUNK METADATA: │ @@ -11,8 +11,8 @@ expression: "TransportChunk::from(dataframe)" │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ │ │ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "null" ┆ type: "null" ┆ type: "List[u32]" ┆ type: "List[Utf8]" │ │ │ │ index_name: "frame_nr" ┆ index_name: "log_time" ┆ index_name: "log_tick" ┆ component: "example.MyPoint" ┆ component: "example.MyColor" ┆ component: "example.MyLabel" │ │ -│ │ kind: "index" ┆ kind: "index" ┆ kind: "index" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ -│ │ ┆ ┆ ┆ kind: "data" ┆ kind: "data" ┆ is_static: "yes" │ │ +│ │ kind: "index" ┆ kind: "index" ┆ is_sorted: "true" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │ +│ │ ┆ ┆ kind: "index" ┆ kind: "data" ┆ kind: "data" ┆ is_static: "true" │ │ │ │ ┆ ┆ ┆ ┆ ┆ kind: "data" │ │ │ ╞════════════════════════╪════════════════════════╪════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │ │ │ 30 ┆ null ┆ null ┆ null ┆ [2] ┆ [c] │ │ diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs index 46fc97dbdeca..45f886886448 100644 --- a/crates/store/re_sorbet/src/data_column_schema.rs +++ b/crates/store/re_sorbet/src/data_column_schema.rs @@ -184,11 +184,11 @@ impl ComponentColumnDescriptor { "rerun.component".to_owned(), component_name.short_name().to_owned(), )), - (*is_static).then_some(("rerun.is_static".to_owned(), "yes".to_owned())), - (*is_indicator).then_some(("rerun.is_indicator".to_owned(), "yes".to_owned())), - (*is_tombstone).then_some(("rerun.is_tombstone".to_owned(), "yes".to_owned())), + (*is_static).then_some(("rerun.is_static".to_owned(), "true".to_owned())), + (*is_indicator).then_some(("rerun.is_indicator".to_owned(), "true".to_owned())), + (*is_tombstone).then_some(("rerun.is_tombstone".to_owned(), "true".to_owned())), (*is_semantically_empty) - .then_some(("rerun.is_semantically_empty".to_owned(), "yes".to_owned())), + .then_some(("rerun.is_semantically_empty".to_owned(), "true".to_owned())), ] .into_iter() .flatten() From f79bdc26894d3a4999cfb7ea0ca9608c412f23f3 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 10 Feb 2025 14:44:33 +0100 Subject: [PATCH 16/38] Use similar_asserts --- Cargo.lock | 1 + crates/store/re_grpc_server/Cargo.toml | 3 +++ crates/store/re_grpc_server/src/lib.rs | 3 ++- 3 files 
 changed, 6 insertions(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3bd1059c03e1..fea1d86f2af6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6184,6 +6184,7 @@ dependencies = [
 "re_protos",
 "re_tracing",
 "re_types",
+ "similar-asserts",
 "tokio",
 "tokio-stream",
 "tokio-util",
diff --git a/crates/store/re_grpc_server/Cargo.toml b/crates/store/re_grpc_server/Cargo.toml
index 538b56257975..22dcf8553d31 100644
--- a/crates/store/re_grpc_server/Cargo.toml
+++ b/crates/store/re_grpc_server/Cargo.toml
@@ -39,3 +39,6 @@ tokio-util.workspace = true
 tonic = { workspace = true, default-features = false, features = ["transport"] }
 tonic-web.workspace = true
 tower-http = { workspace = true, features = ["cors"] }
+
+[dev-dependencies]
+similar-asserts.workspace = true
diff --git a/crates/store/re_grpc_server/src/lib.rs b/crates/store/re_grpc_server/src/lib.rs
index 020ba2a4559e..7df9f98aa939 100644
--- a/crates/store/re_grpc_server/src/lib.rs
+++ b/crates/store/re_grpc_server/src/lib.rs
@@ -273,6 +273,7 @@ mod tests {
     use re_protos::sdk_comms::v0::{
         message_proxy_client::MessageProxyClient, message_proxy_server::MessageProxyServer,
     };
+    use similar_asserts::assert_eq;
     use std::net::SocketAddr;
     use std::sync::Arc;
     use std::time::Duration;
@@ -541,7 +542,7 @@ mod tests {

         for log_stream in &mut log_streams {
             let actual = read_log_stream(log_stream, expected.len()).await;
-            assert_eq!(expected, actual);
+            assert_eq!(actual, expected);
         }

         completion.finish();

From 52be0a46ffe0ee66491d92e254ef16338806ae56 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 14:49:25 +0100
Subject: [PATCH 17/38] Less TransportChunk

---
 Cargo.lock | 7 -------
 crates/store/re_chunk/Cargo.toml | 1 -
 crates/store/re_chunk/src/chunk.rs | 4 ++--
 crates/store/re_chunk/src/transport.rs | 3 +--
 crates/store/re_log_encoding/src/codec/wire/mod.rs | 2 +-
 crates/viewer/re_chunk_store_ui/src/chunk_ui.rs | 12 ++++++------
 6 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index fea1d86f2af6..cf9781718cca 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5835,7 +5835,6 @@ dependencies = [
 "re_tuid",
 "re_types_core",
 "similar-asserts",
- "tap",
 "thiserror 1.0.65",
 ]
@@ -8692,12 +8691,6 @@ dependencies = [
 "libc",
 ]
-[[package]]
-name = "tap"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
-
 [[package]]
 name = "target-lexicon"
 version = "0.12.16"
diff --git a/crates/store/re_chunk/Cargo.toml b/crates/store/re_chunk/Cargo.toml
index cc4c3873e0c4..1407b1ffcb18 100644
--- a/crates/store/re_chunk/Cargo.toml
+++ b/crates/store/re_chunk/Cargo.toml
@@ -52,7 +52,6 @@ itertools.workspace = true
 nohash-hasher.workspace = true
 rand = { workspace = true, features = ["std_rng"] }
 similar-asserts.workspace = true
-tap.workspace = true
 thiserror.workspace = true

 # Native dependencies:
diff --git a/crates/store/re_chunk/src/chunk.rs b/crates/store/re_chunk/src/chunk.rs
index 14bc14e2ec6a..93105f2c19bd 100644
--- a/crates/store/re_chunk/src/chunk.rs
+++ b/crates/store/re_chunk/src/chunk.rs
@@ -1274,11 +1274,11 @@ impl Chunk {
 impl std::fmt::Display for Chunk {
     #[inline]
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let transport_chunk = self.to_transport().map_err(|err| {
+        let batch = self.to_record_batch().map_err(|err| {
             re_log::error_once!("couldn't display Chunk: {err}");
             std::fmt::Error
         })?;
-        transport_chunk.fmt(f)
+        re_format_arrow::format_record_batch_with_width(&batch, f.width()).fmt(f)
     }
 }
diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs
index 7db98a534425..3a32e69a8ed8 100644
--- a/crates/store/re_chunk/src/transport.rs
+++ b/crates/store/re_chunk/src/transport.rs
@@ -778,11 +778,10 @@ impl Chunk {
         re_tracing::profile_function!();
         self.sanity_check()?;
-        let transport = self.to_transport()?;
         Ok(re_log_types::ArrowMsg {
             chunk_id: re_tuid::Tuid::from_u128(self.id().as_u128()),
             timepoint_max: self.timepoint_max(),
-            batch: transport.into(),
+            batch: self.to_record_batch()?,
             on_release: None,
         })
     }
diff --git a/crates/store/re_log_encoding/src/codec/wire/mod.rs b/crates/store/re_log_encoding/src/codec/wire/mod.rs
index 7f2ea2624100..6f511a93b67a 100644
--- a/crates/store/re_log_encoding/src/codec/wire/mod.rs
+++ b/crates/store/re_log_encoding/src/codec/wire/mod.rs
@@ -56,7 +56,7 @@ mod tests {
         let encoded: DataframePart = expected_chunk
             .clone()
-            .to_transport()
+            .to_record_batch()
             .unwrap()
             .encode()
             .unwrap();
diff --git a/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs b/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs
index 32fbab0d5e8e..efa7ec106bf9 100644
--- a/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs
+++ b/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs
@@ -203,8 +203,8 @@ impl ChunkUi {
             }
         };

-        let fields_ui = |ui: &mut egui::Ui, transport: &ArrowRecordBatch| {
-            for field in &transport.schema_ref().fields {
+        let fields_ui = |ui: &mut egui::Ui, batch: &ArrowRecordBatch| {
+            for field in &batch.schema_ref().fields {
                 ui.push_id(field.name().clone(), |ui| {
                     ui.list_item_collapsible_noninteractive_label(field.name(), false, |ui| {
                         ui.list_item_collapsible_noninteractive_label("Data type", false, |ui| {
@@ -276,14 +276,14 @@ impl ChunkUi {
         list_item::list_item_scope(ui, "chunk_stats", |ui| {
             ui.list_item_collapsible_noninteractive_label("Stats", false, chunk_stats_ui);

-            match self.chunk.to_transport() {
-                Ok(transport) => {
+            match self.chunk.to_record_batch() {
+                Ok(batch) => {
                     ui.list_item_collapsible_noninteractive_label("Transport", false, |ui| {
                         ui.list_item_collapsible_noninteractive_label("Metadata", false, |ui| {
-                            metadata_ui(ui, &transport.schema_ref().metadata);
+                            metadata_ui(ui, &batch.schema_ref().metadata);
                         });
                         ui.list_item_collapsible_noninteractive_label("Fields", false, |ui| {
-                            fields_ui(ui, &transport);
+                            fields_ui(ui, &batch);
                         });
                     });
                 }

From 3519a747c078f953411a1e6ac527f9afc0bc2a7e Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 15:10:54 +0100
Subject: [PATCH 18/38] Remove some use of TransportChunk

---
 Cargo.lock | 2 +
 crates/store/re_chunk/src/transport.rs | 231 +------------------
 crates/store/re_grpc_client/Cargo.toml | 2 +
 crates/store/re_grpc_client/src/lib.rs | 113 ++++++------
 4 files changed, 72 insertions(+), 276 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cf9781718cca..c7dfecccad65 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6150,6 +6150,7 @@ version = "0.23.0-alpha.1+dev"
 dependencies = [
 "arrow",
 "async-stream",
+ "itertools 0.13.0",
 "re_arrow_util",
 "re_chunk",
 "re_error",
@@ -6158,6 +6159,7 @@ dependencies = [
 "re_log_types",
 "re_protos",
 "re_smart_channel",
+ "re_sorbet",
 "re_types",
 "thiserror 1.0.65",
 "tokio",
diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs
index 3a32e69a8ed8..a931c5c46b73 100644
--- a/crates/store/re_chunk/src/transport.rs
+++ b/crates/store/re_chunk/src/transport.rs
@@ -3,18 +3,15 @@ use arrow::{
     array::{
         Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
         RecordBatch as ArrowRecordBatch,
     },
-    datatypes::{Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema},
+    datatypes::{Field as ArrowField, Fields as ArrowFields},
 };
 use itertools::Itertools;
 use nohash_hasher::IntMap;
-use tap::Tap as _;

 use re_arrow_util::{arrow_util::into_arrow_ref, ArrowArrayDowncastRef as _};
 use re_byte_size::SizeBytes as _;
 use re_log_types::EntityPath;
-use re_types_core::{
-    arrow_helpers::as_array_ref, Component as _, ComponentDescriptor, Loggable as _,
-};
+use re_types_core::{arrow_helpers::as_array_ref, ComponentDescriptor, Loggable as _};

 use crate::{chunk::ChunkComponents, Chunk, ChunkError, ChunkId, ChunkResult, RowId, TimeColumn};

@@ -502,156 +499,7 @@ impl Chunk {
     ///
     /// It is probably a good idea to sort the chunk first.
     pub fn to_record_batch(&self) -> ChunkResult<ArrowRecordBatch> {
-        self.sanity_check()?;
-
-        re_tracing::profile_function!(format!(
-            "num_columns={} num_rows={}",
-            self.num_columns(),
-            self.num_rows()
-        ));
-
-        let Self {
-            id,
-            entity_path,
-            heap_size_bytes: _, // use the method instead because of lazy initialization
-            is_sorted,
-            row_ids,
-            timelines,
-            components,
-        } = self;
-
-        let mut fields: Vec<ArrowField> = vec![];
-        let mut metadata = std::collections::HashMap::default();
-        let mut columns: Vec<ArrowArrayRef> =
-            Vec::with_capacity(1 /* row_ids */ + timelines.len() + components.len());
-
-        // Chunk-level metadata
-        {
-            re_tracing::profile_scope!("metadata");
-
-            metadata.extend(TransportChunk::chunk_metadata_id(*id));
-
-            metadata.extend(TransportChunk::chunk_metadata_entity_path(entity_path));
-
-            metadata.extend(TransportChunk::chunk_metadata_heap_size_bytes(
-                self.heap_size_bytes(),
-            ));
-
-            if *is_sorted {
-                metadata.extend(TransportChunk::chunk_metadata_is_sorted());
-            }
-        }
-
-        // Row IDs
-        {
-            re_tracing::profile_scope!("row ids");
-
-            fields.push(
-                ArrowField::new(
-                    RowId::descriptor().to_string(),
-                    RowId::arrow_datatype().clone(),
-                    false,
-                )
-                .with_metadata(
-                    TransportChunk::field_metadata_control_column().tap_mut(|metadata| {
-                        // This ensures the RowId/Tuid is formatted correctly
-                        metadata.insert(
-                            "ARROW:extension:name".to_owned(),
-                            re_tuid::Tuid::ARROW_EXTENSION_NAME.to_owned(),
-                        );
-                    }),
-                ),
-            );
-            columns.push(into_arrow_ref(row_ids.clone()));
-        }
-
-        // Timelines
-        {
-            re_tracing::profile_scope!("timelines");
-
-            let mut timelines = timelines
-                .iter()
-                .map(|(timeline, info)| {
-                    let TimeColumn {
-                        timeline: _,
-                        times: _,
-                        is_sorted,
-                        time_range: _,
-                    } = info;
-
-                    let nullable = false; // timelines within a single chunk are always dense
-                    let field =
-                        ArrowField::new(timeline.name().to_string(), timeline.datatype(), nullable)
-                            .with_metadata({
-                                let mut metadata = TransportChunk::field_metadata_time_column();
-                                if *is_sorted {
-                                    metadata.extend(TransportChunk::field_metadata_is_sorted());
-                                }
-                                metadata
-                            });
-
-                    let times = info.times_array();
-
-                    (field, times)
-                })
-                .collect_vec();
-
-            timelines
-                .sort_by(|(field1, _times1), (field2, _times2)| field1.name().cmp(field2.name()));
-
-            for (field, times) in timelines {
-                fields.push(field);
-                columns.push(times);
-            }
-        }
-
-        // Components
-        {
-            re_tracing::profile_scope!("components");
-
-            let mut components = components
-                .values()
-                .flat_map(|per_desc| per_desc.iter())
-                .map(|(component_desc, list_array)| {
-                    let list_array = ArrowListArray::from(list_array.clone());
-
-                    let field = ArrowField::new(
-                        component_desc.component_name.to_string(),
-                        list_array.data_type().clone(),
-                        true,
-                    )
-                    .with_metadata({
-                        let mut metadata = TransportChunk::field_metadata_data_column();
-                        metadata.extend(TransportChunk::field_metadata_component_descriptor(
-                            component_desc,
-                        ));
-                        metadata
-                    });
-
-                    (field, as_array_ref(list_array))
-                })
-                .collect_vec();
-
-            components
-                .sort_by(|(field1, _data1), (field2, _data2)| field1.name().cmp(field2.name()));
-
-            for (field, data) in components {
-                fields.push(field);
-                columns.push(data);
-            }
-        }
-
-        let schema = ArrowSchema::new_with_metadata(fields, metadata);
-
-        Ok(ArrowRecordBatch::try_new(schema.into(), columns)?)
-    }
-
-    /// Prepare the [`Chunk`] for transport.
-    ///
-    /// It is probably a good idea to sort the chunk first.
-    pub fn to_transport(&self) -> ChunkResult<TransportChunk> {
-        let record_batch = self.to_record_batch()?;
-        Ok(TransportChunk::from(record_batch))
+        Ok(self.to_chunk_batch()?.into())
     }

     pub fn from_chunk_batch(batch: &re_sorbet::ChunkBatch) -> ChunkResult<Self> {
@@ -797,6 +645,7 @@ mod tests {
         example_components::{MyColor, MyPoint},
         Timeline,
     };
+    use re_types_core::Component as _;

     use super::*;

@@ -858,78 +707,6 @@ mod tests {
         let row_ids = vec![RowId::new(), RowId::new(), RowId::new(), RowId::new()];

-        for timelines in [timelines1.clone(), timelines2.clone()] {
-            let chunk_original = Chunk::from_native_row_ids(
-                ChunkId::new(),
-                entity_path.clone(),
-                None,
-                &row_ids,
-                timelines.clone(),
-                components.clone().into_iter().collect(),
-            )?;
-            let mut chunk_before = chunk_original.clone();
-
-            for _ in 0..3 {
-                let chunk_in_transport = chunk_before.to_transport()?;
-                let chunk_after = Chunk::from_record_batch(chunk_in_transport.clone().into())?;
-
-                assert_eq!(
-                    chunk_in_transport.entity_path()?,
-                    *chunk_original.entity_path()
-                );
-                assert_eq!(
-                    chunk_in_transport.entity_path()?,
-                    *chunk_after.entity_path()
-                );
-                assert_eq!(
-                    chunk_in_transport.heap_size_bytes(),
-                    Some(chunk_after.heap_size_bytes()),
-                );
-                assert_eq!(
-                    chunk_in_transport.num_columns(),
-                    chunk_original.num_columns()
-                );
-                assert_eq!(chunk_in_transport.num_columns(), chunk_after.num_columns());
-                assert_eq!(chunk_in_transport.num_rows(), chunk_original.num_rows());
-                assert_eq!(chunk_in_transport.num_rows(), chunk_after.num_rows());
-
-                assert_eq!(
-                    chunk_in_transport.num_controls(),
-                    chunk_original.num_controls()
-                );
-                assert_eq!(
-                    chunk_in_transport.num_controls(),
-                    chunk_after.num_controls()
-                );
-                assert_eq!(
-                    chunk_in_transport.num_timelines(),
-                    chunk_original.num_timelines()
-                );
-                assert_eq!(
-                    chunk_in_transport.num_timelines(),
-                    chunk_after.num_timelines()
-                );
-                assert_eq!(
-                    chunk_in_transport.num_components(),
-                    chunk_original.num_components()
-                );
-                assert_eq!(
-                    chunk_in_transport.num_components(),
-                    chunk_after.num_components()
-                );
-
-                eprintln!("{chunk_before}");
-                eprintln!("{chunk_in_transport}");
-                eprintln!("{chunk_after}");
-
-                assert_eq!(chunk_before, chunk_after);
-
-                assert!(chunk_before.are_equal(&chunk_after));
-
-                chunk_before = chunk_after;
-            }
-        }
-
         for timelines in [timelines1, timelines2] {
             let chunk_before = Chunk::from_native_row_ids(
                 ChunkId::new(),
diff --git a/crates/store/re_grpc_client/Cargo.toml b/crates/store/re_grpc_client/Cargo.toml
index 56a65f476ad4..256a29fc1c00 100644
--- a/crates/store/re_grpc_client/Cargo.toml
+++ b/crates/store/re_grpc_client/Cargo.toml
@@ -28,10 +28,12 @@ re_log_encoding = { workspace = true, features = ["encoder", "decoder"] }
 re_log_types.workspace = true
 re_protos.workspace = true
 re_smart_channel.workspace = true
+re_sorbet.workspace = true
 re_types.workspace = true

 arrow.workspace = true
 async-stream.workspace = true
+itertools.workspace = true
 thiserror.workspace = true
 tokio-stream.workspace = true
 url.workspace = true
diff --git a/crates/store/re_grpc_client/src/lib.rs b/crates/store/re_grpc_client/src/lib.rs
index 38df97ee48e7..f456cdbd8f1b 100644
--- a/crates/store/re_grpc_client/src/lib.rs
+++ b/crates/store/re_grpc_client/src/lib.rs
@@ -399,7 +399,7 @@ async fn stream_catalog_async(
     re_log::info!("Starting to read...");
     while let Some(result) = resp.next().await {
-        let input = TransportChunk::from(result.map_err(TonicStatusError)?);
+        let record_batch = result.map_err(TonicStatusError)?;

         // Catalog received from the ReDap server isn't suitable for direct conversion to a Rerun Chunk:
         // - conversion expects "data" columns to be ListArrays, hence we need to convert any individual row column data to ListArray
@@ -407,57 +407,72 @@ async fn stream_catalog_async(
         let mut fields: Vec<ArrowField> = Vec::new();
         let mut columns: Vec<ArrowArrayRef> = Vec::new();

-        // add the (row id) control field
-        let (row_id_field, row_id_data) = input.controls().next().ok_or(
-            StreamError::ChunkError(re_chunk::ChunkError::Malformed {
-                reason: "no control field found".to_owned(),
-            }),
-        )?;
-
-        fields.push(
-            ArrowField::new(
-                RowId::name().to_string(), // need to rename to Rerun Chunk expected control field
-                row_id_field.data_type().clone(),
-                false, /* not nullable */
-            )
-            .with_metadata(TransportChunk::field_metadata_control_column()),
-        );
-        columns.push(row_id_data.clone());

-        // next add any timeline field
-        for (field, data) in input.timelines() {
-            fields.push(field.clone());
-            columns.push(data.clone());
-        }
+        for (field, data) in
+            itertools::izip!(record_batch.schema().fields(), record_batch.columns())
+        {
+            // TODO: move this switch to re_sorbet
+            match field.metadata().get("rerun.kind").map(|s| s.as_str()) {
+                Some("control") => {
+                    fields.push(
+                        ArrowField::new(
+                            RowId::name().to_string(), // need to rename to Rerun Chunk expected control field
+                            field.data_type().clone(),
+                            false, /* not nullable */
+                        )
+                        .with_metadata(TransportChunk::field_metadata_control_column()),
+                    );

-        // now add all the 'data' fields - we slice each column array into individual arrays and then convert the whole lot into a ListArray
-        for (field, data) in input.components() {
-            let data_field_inner =
-                ArrowField::new("item", field.data_type().clone(), true /* nullable */);
-
-            let data_field = ArrowField::new(
-                field.name().clone(),
-                ArrowDataType::List(Arc::new(data_field_inner.clone())),
-                false, /* not nullable */
-            )
-            .with_metadata(TransportChunk::field_metadata_data_column());
-
-            let mut sliced: Vec<ArrowArrayRef> = Vec::new();
-            for idx in 0..data.len() {
-                sliced.push(data.clone().slice(idx, 1));
-            }
+                    columns.push(data.clone());
+                }
+                Some("index" | "time") => {
+                    fields.push(ArrowField::clone(field.as_ref()));
+                    columns.push(data.clone());
+                }
+                Some("data" | "component") => {
+                    // We slice each column array into individual arrays and then convert the whole lot into a ListArray
+
+                    let data_field_inner = ArrowField::new(
+                        "item",
+                        field.data_type().clone(),
+                        true, /* nullable */
+                    );

-            let data_arrays = sliced.iter().map(|e| Some(e.as_ref())).collect::<Vec<_>>();
-            #[allow(clippy::unwrap_used)] // we know we've given the right field type
-            let data_field_array: arrow::array::ListArray =
-                re_arrow_util::arrow_util::arrays_to_list_array(
-                    data_field_inner.data_type().clone(),
-                    &data_arrays,
-                )
-                .unwrap();
-
-            fields.push(data_field);
-            columns.push(as_array_ref(data_field_array));
+                    let data_field = ArrowField::new(
+                        field.name().clone(),
+                        ArrowDataType::List(Arc::new(data_field_inner.clone())),
+                        false, /* not nullable */
+                    )
+                    .with_metadata(TransportChunk::field_metadata_data_column());
+
+                    let mut sliced: Vec<ArrowArrayRef> = Vec::new();
+                    for idx in 0..data.len() {
+                        sliced.push(data.clone().slice(idx, 1));
+                    }
+
+                    let data_arrays = sliced.iter().map(|e| Some(e.as_ref())).collect::<Vec<_>>();
+                    #[allow(clippy::unwrap_used)] // we know we've given the right field type
+                    let data_field_array: arrow::array::ListArray =
+                        re_arrow_util::arrow_util::arrays_to_list_array(
+                            data_field_inner.data_type().clone(),
+                            &data_arrays,
+                        )
+                        .unwrap();
+
+                    fields.push(data_field);
+                    columns.push(as_array_ref(data_field_array));
+                }
+                Some(kind) => {
+                    return Err(StreamError::ChunkError(re_chunk::ChunkError::Malformed {
+                        reason: format!("Unknown rerun.kind {kind:?}"),
+                    }));
+                }
+                None => {
+                    return Err(StreamError::ChunkError(re_chunk::ChunkError::Malformed {
+                        reason: "Missing or unknown rerun.kind".to_owned(),
+                    }));
+                }
+            }
         }

         let schema = {

From 3a51b621fece7c89b3c88f1b6dc2e0e3920ffd8c Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 15:12:19 +0100
Subject: [PATCH 19/38] Remove `TransportChunk` as a concrete type

---
 crates/store/re_chunk/src/transport.rs | 169 +-----------------
 .../formatting__format_chunk_store.snap | 36 ++--
 2 files changed, 23 insertions(+), 182 deletions(-)

diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs
index a931c5c46b73..a38c2dbeb4b7 100644
--- a/crates/store/re_chunk/src/transport.rs
+++ b/crates/store/re_chunk/src/transport.rs
@@ -1,9 +1,6 @@
 use arrow::{
-    array::{
-        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
-        RecordBatch as ArrowRecordBatch,
-    },
-    datatypes::{Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema},
+    array::{Array as ArrowArray, ListArray as ArrowListArray, RecordBatch as ArrowRecordBatch},
+    datatypes::Field as ArrowField,
 };
 use itertools::Itertools;
 use nohash_hasher::IntMap;
@@ -19,7 +16,7 @@ pub type ArrowMetadata = std::collections::HashMap<String, String>;

 // ---

-/// A [`Chunk`] that is ready for transport. Obtained by calling [`Chunk::to_transport`].
+/// A [`Chunk`] that is ready for transport.
 ///
 /// It contains a schema with a matching number of columns, all of the same length.
 ///
 /// This is just a wrapper around an [`ArrowRecordBatch`], with some helper functions on top.
 /// It can be converted to and from [`ArrowRecordBatch`] without overhead.
 ///
 /// Use the `Display` implementation to dump the chunk as a nicely formatted table.
 ///
 /// This has a stable ABI! The entire point of this type is to allow users to send raw arrow data
 /// into Rerun.
 ///
 /// This means we have to be very careful when checking the validity of the data: slipping corrupt
 /// data into the store could silently break all the index search logic (e.g. think of a chunk
 /// claiming to be sorted while it is in fact not).
 // TODO(emilk): remove this, and replace it with a trait extension type for `ArrowRecordBatch`.
 #[derive(Debug, Clone)]
-pub struct TransportChunk {
-    batch: ArrowRecordBatch,
-}
-
-impl std::fmt::Display for TransportChunk {
-    #[inline]
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        re_format_arrow::format_record_batch_with_width(self, f.width()).fmt(f)
-    }
-}
-
-impl AsRef<ArrowRecordBatch> for TransportChunk {
-    #[inline]
-    fn as_ref(&self) -> &ArrowRecordBatch {
-        &self.batch
-    }
-}
-
-impl std::ops::Deref for TransportChunk {
-    type Target = ArrowRecordBatch;
-
-    #[inline]
-    fn deref(&self) -> &ArrowRecordBatch {
-        &self.batch
-    }
-}
-
-impl From<ArrowRecordBatch> for TransportChunk {
-    #[inline]
-    fn from(batch: ArrowRecordBatch) -> Self {
-        Self { batch }
-    }
-}
-
-impl From<TransportChunk> for ArrowRecordBatch {
-    #[inline]
-    fn from(chunk: TransportChunk) -> Self {
-        chunk.batch
-    }
-}
+pub struct TransportChunk {}

 // TODO(#6572): Relying on Arrow's native schema metadata feature is bound to fail, we need to
 // switch to something more powerful asap.
@@ -265,125 +223,6 @@ impl TransportChunk {
     }
 }

-impl TransportChunk {
-    #[inline]
-    pub fn id(&self) -> ChunkResult<ChunkId> {
-        if let Some(id) = self.metadata().get(Self::CHUNK_METADATA_KEY_ID) {
-            let id = u128::from_str_radix(id, 16).map_err(|err| ChunkError::Malformed {
-                reason: format!("cannot deserialize chunk id: {err}"),
-            })?;
-            Ok(ChunkId::from_u128(id))
-        } else {
-            Err(crate::ChunkError::Malformed {
-                reason: format!("chunk id missing from metadata ({:?})", self.metadata()),
-            })
-        }
-    }
-
-    #[inline]
-    pub fn entity_path(&self) -> ChunkResult<EntityPath> {
-        match self
-            .schema_ref()
-            .metadata
-            .get(Self::CHUNK_METADATA_KEY_ENTITY_PATH)
-        {
-            Some(entity_path) => Ok(EntityPath::parse_forgiving(entity_path)),
-            None => Err(crate::ChunkError::Malformed {
-                reason: format!(
-                    "entity path missing from metadata ({:?})",
-                    self.schema_ref().metadata
-                ),
-            }),
-        }
-    }
-
-    /// The size in bytes of the data, once loaded in memory, in chunk-level.
-    ///
-    /// This is stored in the metadata. Returns `None` if that key is not set.
-    #[inline]
-    pub fn heap_size_bytes(&self) -> Option<u64> {
-        self.metadata()
-            .get(Self::CHUNK_METADATA_KEY_HEAP_SIZE_BYTES)
-            .and_then(|s| s.parse::<u64>().ok())
-    }
-
-    #[inline]
-    pub fn fields(&self) -> &ArrowFields {
-        &self.schema_ref().fields
-    }
-
-    pub fn metadata(&self) -> &std::collections::HashMap<String, String> {
-        &self.batch.schema_ref().metadata
-    }
-
-    /// Looks in the chunk metadata for the `IS_SORTED` marker.
-    ///
-    /// It is possible that a chunk is sorted but didn't set that marker.
-    /// This is fine, although wasteful.
-    #[inline]
-    pub fn is_sorted(&self) -> bool {
-        self.metadata()
-            .contains_key(Self::CHUNK_METADATA_MARKER_IS_SORTED_BY_ROW_ID)
-    }
-
-    /// Iterates all columns of the specified `kind`.
-    ///
-    /// See:
-    /// * [`Self::FIELD_METADATA_VALUE_KIND_TIME`]
-    /// * [`Self::FIELD_METADATA_VALUE_KIND_CONTROL`]
-    /// * [`Self::FIELD_METADATA_VALUE_KIND_DATA`]
-    #[inline]
-    fn columns_of_kind<'a>(
-        &'a self,
-        kind: &'a str,
-    ) -> impl Iterator<Item = (&ArrowField, &ArrowArrayRef)> + 'a {
-        self.fields().iter().enumerate().filter_map(|(i, field)| {
-            let actual_kind = field.metadata().get(Self::FIELD_METADATA_KEY_KIND);
-            (actual_kind.map(|s| s.as_str()) == Some(kind))
-                .then(|| {
-                    self.batch
-                        .columns()
-                        .get(i)
-                        .map(|column| (field.as_ref(), column))
-                })
-                .flatten()
-        })
-    }
-
-    /// Iterates all control columns present in this chunk.
-    #[inline]
-    pub fn controls(&self) -> impl Iterator<Item = (&ArrowField, &ArrowArrayRef)> {
-        self.columns_of_kind(Self::FIELD_METADATA_VALUE_KIND_CONTROL)
-    }
-
-    /// Iterates all data columns present in this chunk.
-    #[inline]
-    pub fn components(&self) -> impl Iterator<Item = (&ArrowField, &ArrowArrayRef)> {
-        self.columns_of_kind(Self::FIELD_METADATA_VALUE_KIND_DATA)
-    }
-
-    /// Iterates all timeline columns present in this chunk.
-    #[inline]
-    pub fn timelines(&self) -> impl Iterator<Item = (&ArrowField, &ArrowArrayRef)> {
-        self.columns_of_kind(Self::FIELD_METADATA_VALUE_KIND_TIME)
-    }
-
-    #[inline]
-    pub fn num_controls(&self) -> usize {
-        self.controls().count()
-    }
-
-    #[inline]
-    pub fn num_timelines(&self) -> usize {
-        self.timelines().count()
-    }
-
-    #[inline]
-    pub fn num_components(&self) -> usize {
-        self.components().count()
-    }
-}
-
 impl Chunk {
     /// Prepare the [`Chunk`] for transport.
     ///
diff --git a/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap b/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap
index 229e882a9439..690ec61f453b 100644
--- a/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap
+++ b/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap
@@ -12,22 +12,24 @@ ChunkStore {
     num_events: 2
 }
 chunks: [
-    ┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
-    │ CHUNK METADATA: │
-    │ * entity_path: "/this/that" │
-    │ * heap_size_bytes: "1072" │
-    │ * id: "661EFDF2E3B19F7C045F15" │
-    │ * is_sorted: "" │
-    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    │ ┌──────────────────────────────────┬───────────────┬───────────────────────────────┬───────────────────┬───────────────────┐ │
-    │ │ RowId ┆ frame_nr ┆ log_time ┆ example.MyColor ┆ example.MyIndex │ │
-    │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │
-    │ │ type: "Struct[2]" ┆ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[u64]" │ │
-    │ │ ARROW:extension:name: "TUID" ┆ is_sorted: "" ┆ is_sorted: "" ┆ kind: "data" ┆ kind: "data" │ │
-    │ │ kind: "control" ┆ kind: "time" ┆ kind: "time" ┆ ┆ │ │
-    │ ╞══════════════════════════════════╪═══════════════╪═══════════════════════════════╪═══════════════════╪═══════════════════╡ │
-    │ │ 0000000067816A6BB4B8C1254D40007B ┆ 1 ┆ 2025-01-10T18:43:42.123456789 ┆ [0, 1, 2] ┆ [0, 1, 2] │ │
-    │ └──────────────────────────────────┴───────────────┴───────────────────────────────┴───────────────────┴───────────────────┘ │
-    └──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+    ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+    │ CHUNK METADATA: │
+    │ * entity_path: "/this/that" │
+    │ * heap_size_bytes: "1072" │
+    │ * id: "661EFDF2E3B19F7C045F15" │
+    │ * is_sorted: "true" │
+    │ * version: "1" │
+    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    │ ┌──────────────────────────────────┬────────────────────────┬───────────────────────────────┬──────────────────────────────┬──────────────────────────────┐ │
+    │ │ RowId ┆ frame_nr ┆ log_time ┆ /this/that:example.MyColor ┆ /this/that:example.MyIndex │ │
+    │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │
+    │ │ type: "Struct[2]" ┆ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[u64]" │ │
+    │ │ ARROW:extension:name: "TUID" ┆ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyIndex" │ │
+    │ │ kind: "control" ┆ is_sorted: "true" ┆ is_sorted: "true" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │
+    │ │ ┆ kind: "index" ┆ kind: "index" ┆ kind: "data" ┆ kind: "data" │ │
+    │ ╞══════════════════════════════════╪════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │
+    │ │ 0000000067816A6BB4B8C1254D40007B ┆ 1 ┆ 2025-01-10T18:43:42.123456789 ┆ [0, 1, 2] ┆ [0, 1, 2] │ │
+    │ └──────────────────────────────────┴────────────────────────┴───────────────────────────────┴──────────────────────────────┴──────────────────────────────┘ │
+    └─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
 ]
 }

From f01efda8e19cd9af8f42b285794e76793d1a98e3 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 15:56:20 +0100
Subject: [PATCH 20/38] Remove the last of TransportChunk

---
 Cargo.lock | 3 +
 crates/store/re_chunk/src/chunk.rs | 2 +-
 crates/store/re_chunk/src/lib.rs | 1 -
 crates/store/re_chunk/src/transport.rs | 207 ------------------
 crates/store/re_dataframe/src/lib.rs | 2 -
 crates/store/re_grpc_client/Cargo.toml | 1 +
 crates/store/re_grpc_client/src/lib.rs | 111 ++--------
 .../tests/arrow_encode_roundtrip.rs | 2 +
 crates/store/re_sorbet/Cargo.toml | 2 +
 crates/store/re_sorbet/src/chunk_batch.rs | 78 ++++++-
 crates/store/re_sorbet/src/chunk_schema.rs | 12 +-
 .../store/re_sorbet/src/data_column_schema.rs | 2 +
 .../re_sorbet/src/row_id_column_schema.rs | 7 +-
 crates/top/re_sdk/src/lib.rs | 1 -
 14 files changed, 126 insertions(+), 305 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c7dfecccad65..bcdcb0c7d891 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6154,6 +6154,7 @@ dependencies = [
 "re_arrow_util",
 "re_chunk",
 "re_error",
+ "re_format_arrow",
 "re_log",
 "re_log_encoding",
 "re_log_types",
@@ -6590,9 +6591,11 @@ version = "0.23.0-alpha.1+dev"
 dependencies = [
 "arrow",
 "itertools 0.13.0",
+ "re_arrow_util",
 "re_format_arrow",
 "re_log",
 "re_log_types",
+ "re_tracing",
 "re_tuid",
 "re_types_core",
 "thiserror 1.0.65",
 ]
diff --git a/crates/store/re_chunk/src/chunk.rs b/crates/store/re_chunk/src/chunk.rs
index 93105f2c19bd..287649b4ee65 100644
--- a/crates/store/re_chunk/src/chunk.rs
+++ b/crates/store/re_chunk/src/chunk.rs
@@ -182,7 +182,7 @@ impl FromIterator<(ComponentName, ArrowListArray)> for ChunkComponents {
 /// Its time columns might or might not be ascendingly sorted, depending on how the data was logged.
 ///
 /// This is the in-memory representation of a chunk, optimized for efficient manipulation of the
-/// data within. For transport, see [`crate::TransportChunk`] instead.
+/// data within. For transport, see [`re_sorbet::ChunkBatch`] instead.
 #[derive(Debug)]
 pub struct Chunk {
     pub(crate) id: ChunkId,
diff --git a/crates/store/re_chunk/src/lib.rs b/crates/store/re_chunk/src/lib.rs
index f0792e930cec..38e155e68ff7 100644
--- a/crates/store/re_chunk/src/lib.rs
+++ b/crates/store/re_chunk/src/lib.rs
@@ -29,7 +29,6 @@ pub use self::iter::{
 };
 pub use self::latest_at::LatestAtQuery;
 pub use self::range::{RangeQuery, RangeQueryOptions};
-pub use self::transport::TransportChunk;

 #[cfg(not(target_arch = "wasm32"))]
 pub use self::batcher::{
diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs
index a38c2dbeb4b7..ae0acdd98bdd 100644
--- a/crates/store/re_chunk/src/transport.rs
+++ b/crates/store/re_chunk/src/transport.rs
@@ -16,213 +16,6 @@ pub type ArrowMetadata = std::collections::HashMap<String, String>;

 // ---

-/// A [`Chunk`] that is ready for transport.
-///
-/// It contains a schema with a matching number of columns, all of the same length.
-///
-/// This is just a wrapper around an [`ArrowRecordBatch`], with some helper functions on top.
-/// It can be converted to and from [`ArrowRecordBatch`] without overhead.
-///
-/// Use the `Display` implementation to dump the chunk as a nicely formatted table.
-///
-/// This has a stable ABI! The entire point of this type is to allow users to send raw arrow data
-/// into Rerun.
-/// This means we have to be very careful when checking the validity of the data: slipping corrupt
-/// data into the store could silently break all the index search logic (e.g. think of a chunk
-/// claiming to be sorted while it is in fact not).
-// TODO(emilk): remove this, and replace it with a trait extension type for `ArrowRecordBatch`.
-#[derive(Debug, Clone)]
-pub struct TransportChunk {}
-
-// TODO(#6572): Relying on Arrow's native schema metadata feature is bound to fail, we need to
-// switch to something more powerful asap.
-impl TransportChunk {
-    /// The key used to identify a Rerun [`ChunkId`] in chunk-level [`ArrowSchema`] metadata.
-    pub const CHUNK_METADATA_KEY_ID: &'static str = "rerun.id";
-
-    /// The key used to identify a Rerun [`EntityPath`] in chunk-level [`ArrowSchema`] metadata.
-    pub const CHUNK_METADATA_KEY_ENTITY_PATH: &'static str = "rerun.entity_path";
-
-    /// The key used to identify the size in bytes of the data, once loaded in memory, in chunk-level
-    /// [`ArrowSchema`] metadata.
-    pub const CHUNK_METADATA_KEY_HEAP_SIZE_BYTES: &'static str = "rerun.heap_size_bytes";
-
-    /// The marker used to identify whether a chunk is sorted in chunk-level [`ArrowSchema`] metadata.
-    ///
-    /// The associated value is irrelevant -- if this marker is present, then it is true.
-    ///
-    /// Chunks are ascendingly sorted by their `RowId` column.
-    pub const CHUNK_METADATA_MARKER_IS_SORTED_BY_ROW_ID: &'static str = "rerun.is_sorted";
-
-    /// The key used to identify the kind of a Rerun column in field-level [`ArrowSchema`] metadata.
-    ///
-    /// That is: control columns (e.g. `row_id`), time columns or component columns.
-    pub const FIELD_METADATA_KEY_KIND: &'static str = "rerun.kind";
-
-    /// The value used to identify a Rerun time column in field-level [`ArrowSchema`] metadata.
-    pub const FIELD_METADATA_VALUE_KIND_TIME: &'static str = "time";
-
-    /// The value used to identify a Rerun control column in field-level [`ArrowSchema`] metadata.
-    pub const FIELD_METADATA_VALUE_KIND_CONTROL: &'static str = "control";
-
-    /// The value used to identify a Rerun data column in field-level [`ArrowSchema`] metadata.
-    pub const FIELD_METADATA_VALUE_KIND_DATA: &'static str = "data";
-
-    /// The key used to identify the [`crate::ArchetypeName`] in field-level [`ArrowSchema`] metadata.
-    pub const FIELD_METADATA_KEY_ARCHETYPE_NAME: &'static str = "rerun.archetype_name";
-
-    /// The key used to identify the [`crate::ArchetypeFieldName`] in field-level [`ArrowSchema`] metadata.
-    pub const FIELD_METADATA_KEY_ARCHETYPE_FIELD_NAME: &'static str = "rerun.archetype_field_name";
-
-    /// The marker used to identify whether a column is sorted in field-level [`ArrowSchema`] metadata.
-    ///
-    /// The associated value is irrelevant -- if this marker is present, then it is true.
-    ///
-    /// Chunks are ascendingly sorted by their `RowId` column but, depending on whether the data
-    /// was logged out of order or not for a given time column, that column might follow the global
-    /// `RowId` while still being unsorted relative to its own time order.
-    pub const FIELD_METADATA_MARKER_IS_SORTED_BY_TIME: &'static str =
-        Self::CHUNK_METADATA_MARKER_IS_SORTED_BY_ROW_ID;
-
-    /// Returns the appropriate chunk-level [`ArrowSchema`] metadata for a Rerun [`ChunkId`].
-    #[inline]
-    pub fn chunk_metadata_id(id: ChunkId) -> ArrowMetadata {
-        [
-            (
-                Self::CHUNK_METADATA_KEY_ID.to_owned(),
-                format!("{:X}", id.as_u128()),
-            ), //
-        ]
-        .into()
-    }
-
-    /// Returns the appropriate chunk-level [`ArrowSchema`] metadata for the in-memory size in bytes.
-    #[inline]
-    pub fn chunk_metadata_heap_size_bytes(heap_size_bytes: u64) -> ArrowMetadata {
-        [
-            (
-                Self::CHUNK_METADATA_KEY_HEAP_SIZE_BYTES.to_owned(),
-                heap_size_bytes.to_string(),
-            ), //
-        ]
-        .into()
-    }
-
-    /// Returns the appropriate chunk-level [`ArrowSchema`] metadata for a Rerun [`EntityPath`].
-    #[inline]
-    pub fn chunk_metadata_entity_path(entity_path: &EntityPath) -> ArrowMetadata {
-        [
-            (
-                Self::CHUNK_METADATA_KEY_ENTITY_PATH.to_owned(),
-                entity_path.to_string(),
-            ), //
-        ]
-        .into()
-    }
-
-    /// Returns the appropriate chunk-level [`ArrowSchema`] metadata for an `IS_SORTED` marker.
-    #[inline]
-    pub fn chunk_metadata_is_sorted() -> ArrowMetadata {
-        [
-            (
-                Self::CHUNK_METADATA_MARKER_IS_SORTED_BY_ROW_ID.to_owned(),
-                String::new(),
-            ), //
-        ]
-        .into()
-    }
-
-    /// Returns the appropriate field-level [`ArrowSchema`] metadata for a Rerun time column.
-    #[inline]
-    pub fn field_metadata_time_column() -> ArrowMetadata {
-        [
-            (
-                Self::FIELD_METADATA_KEY_KIND.to_owned(),
-                Self::FIELD_METADATA_VALUE_KIND_TIME.to_owned(),
-            ), //
-        ]
-        .into()
-    }
-
-    /// Returns the appropriate field-level [`ArrowSchema`] metadata for a Rerun control column.
-    #[inline]
-    pub fn field_metadata_control_column() -> ArrowMetadata {
-        [
-            (
-                Self::FIELD_METADATA_KEY_KIND.to_owned(),
-                Self::FIELD_METADATA_VALUE_KIND_CONTROL.to_owned(),
-            ), //
-        ]
-        .into()
-    }
-
-    /// Returns the appropriate field-level [`ArrowSchema`] metadata for a Rerun data column.
-    #[inline]
-    pub fn field_metadata_data_column() -> ArrowMetadata {
-        [
-            (
-                Self::FIELD_METADATA_KEY_KIND.to_owned(),
-                Self::FIELD_METADATA_VALUE_KIND_DATA.to_owned(),
-            ), //
-        ]
-        .into()
-    }
-
-    /// Returns the appropriate field-level [`ArrowSchema`] metadata for an `IS_SORTED` marker.
-    #[inline]
-    pub fn field_metadata_is_sorted() -> ArrowMetadata {
-        [
-            (
-                Self::FIELD_METADATA_MARKER_IS_SORTED_BY_TIME.to_owned(),
-                String::new(),
-            ), //
-        ]
-        .into()
-    }
-
-    #[inline]
-    pub fn field_metadata_component_descriptor(
-        component_desc: &ComponentDescriptor,
-    ) -> ArrowMetadata {
-        component_desc
-            .archetype_name
-            .iter()
-            .copied()
-            .map(|archetype_name| {
-                (
-                    Self::FIELD_METADATA_KEY_ARCHETYPE_NAME.to_owned(),
-                    archetype_name.to_string(),
-                )
-            })
-            .chain(component_desc.archetype_field_name.iter().copied().map(
-                |archetype_field_name| {
-                    (
-                        Self::FIELD_METADATA_KEY_ARCHETYPE_FIELD_NAME.to_owned(),
-                        archetype_field_name.to_string(),
-                    )
-                },
-            ))
-            .collect()
-    }
-
-    #[inline]
-    pub fn component_descriptor_from_field(field: &ArrowField) -> ComponentDescriptor {
-        ComponentDescriptor {
-            archetype_name: field
-                .metadata()
-                .get(Self::FIELD_METADATA_KEY_ARCHETYPE_NAME)
-                .cloned()
-                .map(Into::into),
-            component_name: field.name().clone().into(),
-            archetype_field_name: field
-                .metadata()
-                .get(Self::FIELD_METADATA_KEY_ARCHETYPE_FIELD_NAME)
-                .cloned()
-                .map(Into::into),
-        }
-    }
-}
-
 impl Chunk {
     /// Prepare the [`Chunk`] for transport.
     ///
diff --git a/crates/store/re_dataframe/src/lib.rs b/crates/store/re_dataframe/src/lib.rs
index ab7a5b2c60ce..44f1e891dcd7 100644
--- a/crates/store/re_dataframe/src/lib.rs
+++ b/crates/store/re_dataframe/src/lib.rs
@@ -6,8 +6,6 @@ mod query;
 pub use self::engine::QueryEngine;
 pub use self::query::QueryHandle;

-#[doc(no_inline)]
-pub use self::external::re_chunk::TransportChunk;
 #[doc(no_inline)]
 pub use self::external::re_chunk_store::{
     ChunkStoreConfig, ChunkStoreHandle, ColumnSelector, ComponentColumnSelector, Index, IndexRange,
diff --git a/crates/store/re_grpc_client/Cargo.toml b/crates/store/re_grpc_client/Cargo.toml
index 256a29fc1c00..106b79cbf653 100644
--- a/crates/store/re_grpc_client/Cargo.toml
+++ b/crates/store/re_grpc_client/Cargo.toml
@@ -23,6 +23,7 @@ all-features = true
 re_arrow_util.workspace = true
 re_chunk.workspace = true
 re_error.workspace = true
+re_format_arrow.workspace = true
 re_log.workspace = true
 re_log_encoding = { workspace = true, features = ["encoder", "decoder"] }
 re_log_types.workspace = true
diff --git a/crates/store/re_grpc_client/src/lib.rs b/crates/store/re_grpc_client/src/lib.rs
index f456cdbd8f1b..f222510b7bed 100644
--- a/crates/store/re_grpc_client/src/lib.rs
+++ b/crates/store/re_grpc_client/src/lib.rs
@@ -14,7 +14,7 @@ use arrow::{
     array::{
         ArrayRef as ArrowArrayRef, RecordBatch as ArrowRecordBatch, StringArray as ArrowStringArray,
     },
     datatypes::{DataType as ArrowDataType, Field as ArrowField},
 };
 use url::Url;

 use re_arrow_util::ArrowArrayDowncastRef as _;
-use re_chunk::{Chunk, ChunkBuilder, ChunkId, EntityPath, RowId, Timeline, TransportChunk};
+use re_chunk::{Chunk, ChunkBuilder, ChunkId, EntityPath, RowId, Timeline};
 use re_log_encoding::codec::{wire::decoder::Decode, CodecError};
 use re_log_types::{
     external::re_types_core::ComponentDescriptor, ApplicationId, BlueprintActivationCommand,
@@ -96,6 +96,9 @@ pub enum StreamError {

     #[error("Invalid URI: {0}")]
     InvalidUri(String),
+
+    #[error(transparent)]
+    InvalidChunkSchema(#[from] re_sorbet::InvalidChunkSchema),
 }

 // ----------------------------------------------------------------------------

@@ -399,99 +402,31 @@ async fn stream_catalog_async(
     re_log::info!("Starting to read...");
     while let Some(result) = resp.next().await {
-        let record_batch = result.map_err(TonicStatusError)?;
-
-        // Catalog received from the ReDap server isn't suitable for direct conversion to a Rerun Chunk:
-        // - conversion expects "data" columns to be ListArrays, hence we need to convert any individual row column data to ListArray
-        // - conversion expects the input TransportChunk to have a ChunkId so we need to add that piece of metadata
-
-        let mut fields: Vec<ArrowField> = Vec::new();
-        let mut columns: Vec<ArrowArrayRef> = Vec::new();
+        let entity_path = EntityPath::parse_forgiving("catalog");

-        for (field, data) in
-            itertools::izip!(record_batch.schema().fields(), record_batch.columns())
-        {
-            // TODO: move this switch to re_sorbet
-            match field.metadata().get("rerun.kind").map(|s| s.as_str()) {
-                Some("control") => {
-                    fields.push(
-                        ArrowField::new(
-                            RowId::name().to_string(), // need to rename to Rerun Chunk expected control field
-                            field.data_type().clone(),
-                            false, /* not nullable */
-                        )
-                        .with_metadata(TransportChunk::field_metadata_control_column()),
-                    );
+        let record_batch = result.map_err(TonicStatusError)?;

-                    columns.push(data.clone());
-                }
-                Some("index" | "time") => {
-                    fields.push(ArrowField::clone(field.as_ref()));
-                    columns.push(data.clone());
-                }
-                Some("data" | "component") => {
-                    // We slice each column array into individual arrays and then convert the whole lot into a ListArray
+        let mut metadata = record_batch.schema_ref().metadata.clone();

-                    let data_field_inner = ArrowField::new(
-                        "item",
-                        field.data_type().clone(),
-                        true, /* nullable */
-                    );
+        for (key, value) in [
+            re_sorbet::ChunkSchema::chunk_id_metadata(&ChunkId::new()),
+            re_sorbet::ChunkSchema::entity_path_metadata(&entity_path),
+        ] {
+            if !metadata.contains_key(&key) {
+                metadata.insert(key, value);
            }
        }

-                    let data_field = ArrowField::new(
-                        field.name().clone(),
-                        ArrowDataType::List(Arc::new(data_field_inner.clone())),
-                        false, /* not nullable */
-                    )
-                    .with_metadata(TransportChunk::field_metadata_data_column());
-
-                    let mut sliced: Vec<ArrowArrayRef> = Vec::new();
-                    for idx in 0..data.len() {
-                        sliced.push(data.clone().slice(idx, 1));
-                    }
-
-                    let data_arrays = sliced.iter().map(|e| Some(e.as_ref())).collect::<Vec<_>>();
-                    #[allow(clippy::unwrap_used)] // we know we've given the right field type
-                    let data_field_array: arrow::array::ListArray =
-                        re_arrow_util::arrow_util::arrays_to_list_array(
-                            data_field_inner.data_type().clone(),
-                            &data_arrays,
-                        )
-                        .unwrap();
-
-                    fields.push(data_field);
-                    columns.push(as_array_ref(data_field_array));
-                }
-                Some(kind) => {
-                    return Err(StreamError::ChunkError(re_chunk::ChunkError::Malformed {
-                        reason: format!("Unknown rerun.kind {kind:?}"),
-                    }));
-                }
-                None => {
-                    return Err(StreamError::ChunkError(re_chunk::ChunkError::Malformed {
-                        reason: "Missing or unknown rerun.kind".to_owned(),
-                    }));
-                }
-            }
-        }
-
-        let schema = {
-            let metadata = HashMap::from_iter([
-                (
-                    TransportChunk::CHUNK_METADATA_KEY_ENTITY_PATH.to_owned(),
-                    "catalog".to_owned(),
-                ),
-                (
-                    TransportChunk::CHUNK_METADATA_KEY_ID.to_owned(),
-                    ChunkId::new().to_string(),
-                ),
-            ]);
-            arrow::datatypes::Schema::new_with_metadata(fields, metadata)
-        };
-
-        let record_batch = ArrowRecordBatch::try_new(schema.into(), columns)
-            .map_err(re_chunk::ChunkError::from)?;
-        let mut chunk = Chunk::from_record_batch(record_batch)?;
+        let schema_with_more_metadata = arrow::datatypes::Schema::clone(record_batch.schema_ref())
+            .with_metadata(metadata)
+            .into();
+        let record_batch = record_batch
+            .with_schema(schema_with_more_metadata)
+            .expect("Can't fail, because we only added metadata");
+
+        let chunk_batch = re_sorbet::ChunkBatch::try_from(record_batch.clone())?;
+
+        let mut chunk = Chunk::from_chunk_batch(&chunk_batch)?;

         // finally, enrich catalog data with RecordingUri that's based on the ReDap endpoint (that we know)
         // and the recording id (that we have in the catalog data)
diff --git a/crates/store/re_log_encoding/tests/arrow_encode_roundtrip.rs b/crates/store/re_log_encoding/tests/arrow_encode_roundtrip.rs
index 2e80fc16a2f9..80eb680b915f 100644
--- a/crates/store/re_log_encoding/tests/arrow_encode_roundtrip.rs
+++ b/crates/store/re_log_encoding/tests/arrow_encode_roundtrip.rs
@@ -1,3 +1,5 @@
+use similar_asserts::assert_eq;
+
 use re_build_info::CrateVersion;
 use re_chunk::{Chunk, RowId, TimePoint, Timeline};
 use re_log_encoding::{
diff --git a/crates/store/re_sorbet/Cargo.toml b/crates/store/re_sorbet/Cargo.toml
index 929fcfa20188..136f3f5282b0 100644
--- a/crates/store/re_sorbet/Cargo.toml
+++ b/crates/store/re_sorbet/Cargo.toml
@@ -20,9 +20,11 @@ all-features = true

 [dependencies]
+re_arrow_util.workspace = true
 re_format_arrow.workspace = true
 re_log_types.workspace = true
 re_log.workspace = true
+re_tracing.workspace = true
 re_tuid.workspace = true
 re_types_core.workspace = true

diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs
index 54377d75d5e7..e9b6a4fcbb3f 100644
--- a/crates/store/re_sorbet/src/chunk_batch.rs
+++ b/crates/store/re_sorbet/src/chunk_batch.rs
@@ -1,11 +1,17 @@
+use std::sync::Arc;
+
 use arrow::{
     array::{
         Array as _, ArrayRef as ArrowArrayRef, AsArray, RecordBatch as ArrowRecordBatch,
         RecordBatchOptions, StructArray as ArrowStructArray,
     },
-    datatypes::{Fields as ArrowFields, Schema as ArrowSchema},
+    datatypes::{
+        DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef,
+        Fields as ArrowFields, Schema as ArrowSchema,
+    },
 };

+use re_arrow_util::{arrow_util::into_arrow_ref, ArrowArrayDowncastRef};
 use re_log_types::EntityPath;
 use re_types_core::ChunkId;

@@ -225,7 +231,12 @@ impl From<&ChunkBatch> for ArrowRecordBatch {
 impl TryFrom<ArrowRecordBatch> for ChunkBatch {
     type Error = InvalidChunkSchema;

+    /// Will automatically wrap data columns in `ListArrays` if they are not already.
     fn try_from(batch: ArrowRecordBatch) -> Result<Self, Self::Error> {
+        re_tracing::profile_function!();
+
+        let batch = make_all_data_columns_list_arrays(&batch);
+
         let chunk_schema = ChunkSchema::try_from(batch.schema_ref().as_ref())?;

         // TODO: sanity check that the schema matches the columns
@@ -249,3 +260,68 @@ impl TryFrom<ArrowRecordBatch> for ChunkBatch {
         })
     }
 }
+
+/// Make sure all data columns are `ListArrays`.
+fn make_all_data_columns_list_arrays(batch: &ArrowRecordBatch) -> ArrowRecordBatch {
+    re_tracing::profile_function!();
+
+    let num_columns = batch.num_columns();
+    let mut fields: Vec<ArrowFieldRef> = Vec::with_capacity(num_columns);
+    let mut columns: Vec<ArrowArrayRef> = Vec::with_capacity(num_columns);
+
+    for (field, data) in itertools::izip!(batch.schema().fields(), batch.columns()) {
+        if data
+            .downcast_array_ref::<arrow::array::ListArray>()
+            .is_some()
+        {
+            // Already fine
+            fields.push(field.clone());
+            columns.push(data.clone());
+        } else if field
+            .metadata()
+            .get("rerun.kind")
+            .is_some_and(|kind| kind == "data")
+        {
+            // We slice each column array into individual arrays and then convert the whole lot into a ListArray
+
+            let data_field_inner =
+                ArrowField::new("item", field.data_type().clone(), true /* nullable */);
+
+            let data_field = ArrowField::new(
+                field.name().clone(),
+                ArrowDataType::List(Arc::new(data_field_inner.clone())),
+                false, /* not nullable */
+            )
+            .with_metadata(field.metadata().clone());
+
+            let mut sliced: Vec<ArrowArrayRef> = Vec::new();
+            for idx in 0..data.len() {
+                sliced.push(data.clone().slice(idx, 1));
+            }
+
+            let data_arrays = sliced.iter().map(|e| Some(e.as_ref())).collect::<Vec<_>>();
+            #[allow(clippy::unwrap_used)] // we know we've given the right field type
+            let list_array: arrow::array::ListArray =
+                re_arrow_util::arrow_util::arrays_to_list_array(
+                    data_field_inner.data_type().clone(),
+                    &data_arrays,
+                )
+                .unwrap();
+
+            fields.push(data_field.into());
+            columns.push(into_arrow_ref(list_array));
+        } else {
+            fields.push(field.clone());
+            columns.push(data.clone());
+        }
+    }
+
+    let schema = ArrowSchema::new_with_metadata(fields, batch.schema().metadata.clone());
+
+    ArrowRecordBatch::try_new_with_options(
+        schema.into(),
+        columns,
+        &RecordBatchOptions::default().with_row_count(Some(batch.num_rows())),
+    )
+    .unwrap()
+}
diff --git a/crates/store/re_sorbet/src/chunk_schema.rs b/crates/store/re_sorbet/src/chunk_schema.rs
index b03abe34f6ca..46e4c7a6efe0 100644
--- a/crates/store/re_sorbet/src/chunk_schema.rs
+++ b/crates/store/re_sorbet/src/chunk_schema.rs
@@ -141,6 +141,14 @@ impl ChunkSchema {
         1 + self.index_columns.len() + self.data_columns.len()
     }

+    pub fn chunk_id_metadata(chunk_id: &ChunkId) -> (String, String) {
+        ("rerun.id".to_owned(), format!("{:X}", chunk_id.as_u128()))
+    }
+
+    pub fn entity_path_metadata(entity_path: &EntityPath) -> (String, String) {
+        ("rerun.entity_path".to_owned(), entity_path.to_string())
+    }
+
     pub fn arrow_batch_metadata(&self) -> ArrowBatchMetadata {
         let Self {
             chunk_id,
@@ -157,8 +165,8 @@ impl ChunkSchema {
                 Self::CHUNK_METADATA_KEY_VERSION.to_owned(),
                 Self::CHUNK_METADATA_VERSION.to_owned(),
             ),
-            ("rerun.id".to_owned(), format!("{:X}", chunk_id.as_u128())),
-            ("rerun.entity_path".to_owned(), entity_path.to_string()),
+            Self::chunk_id_metadata(chunk_id),
+            Self::entity_path_metadata(entity_path),
         ]);
         if let Some(heap_size_bytes) = heap_size_bytes {
             arrow_metadata.insert(
diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs
index 45f886886448..b2bc95df19d2 100644
--- a/crates/store/re_sorbet/src/data_column_schema.rs
+++ b/crates/store/re_sorbet/src/data_column_schema.rs
@@ -243,6 +243,8 @@ impl ComponentColumnDescriptor {
             ComponentName::new(field.name()) // Legacy fallback
         };

+        // TODO: check that the data type is a list array and at least warn if it is not
+
         Ok(Self {
             store_datatype: field.data_type().clone(),
             entity_path,
diff --git a/crates/store/re_sorbet/src/row_id_column_schema.rs b/crates/store/re_sorbet/src/row_id_column_schema.rs
index fae2a3b84f11..5a9007f0b458 100644
--- a/crates/store/re_sorbet/src/row_id_column_schema.rs
+++ b/crates/store/re_sorbet/src/row_id_column_schema.rs
@@ -29,12 +29,14 @@ impl WrongDatatypeError {
 pub struct RowIdColumnDescriptor {}

 impl RowIdColumnDescriptor {
+    pub fn new() -> Self {
+        Self {}
+    }
+
     #[inline]
     pub fn to_arrow_field(&self) -> ArrowField {
         let Self {} = self;

-        let nullable = false; // All rows has an id
-
         let metadata = [
             Some(("rerun.kind".to_owned(), "control".to_owned())),
             // This ensures the RowId/Tuid is formatted correctly:
@@ -47,6 +49,7 @@ impl RowIdColumnDescriptor {
         .flatten()
         .collect();

+        let nullable = false; // All rows have an id
         ArrowField::new(
             RowId::descriptor().to_string(),
             RowId::arrow_datatype(),
diff --git a/crates/top/re_sdk/src/lib.rs b/crates/top/re_sdk/src/lib.rs
index 0a2d442ac185..6397d4008793 100644
--- a/crates/top/re_sdk/src/lib.rs
+++ b/crates/top/re_sdk/src/lib.rs
@@ -82,7 +82,6 @@ pub mod log {
     pub use re_chunk::{
         Chunk, ChunkBatcher, ChunkBatcherConfig, ChunkBatcherError, ChunkBatcherResult,
         ChunkComponents, ChunkError, ChunkId, ChunkResult, PendingRow, RowId, TimeColumn,
-        TransportChunk,
     };
     pub use re_log_types::LogMsg;
 }

From 8326fae19aba2ecea809a6d6ae2e8d926ce67761 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 16:12:09 +0100
Subject: [PATCH 21/38] Cleanup

---
 crates/store/re_chunk/src/transport.rs | 20 ++++------
 crates/store/re_grpc_client/src/lib.rs | 40 +++++++++----------
 .../re_log_encoding/src/codec/file/decoder.rs | 2 +-
 .../re_log_encoding/src/codec/wire/mod.rs | 2 +-
 .../src/protobuf_conversions.rs | 2 +-
 .../tests/arrow_encode_roundtrip.rs | 2 +-
 crates/store/re_sorbet/src/chunk_batch.rs | 8 ++-
 .../re_sorbet/src/row_id_column_schema.rs | 3 +-
 rerun_py/src/remote.rs | 2 +-
 9 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs
index ae0acdd98bdd..b55c5b3b02d5 100644
--- a/crates/store/re_chunk/src/transport.rs
+++ b/crates/store/re_chunk/src/transport.rs
@@ -1,18 +1,14 @@
-use arrow::{
-    array::{Array as ArrowArray, ListArray as ArrowListArray, RecordBatch as ArrowRecordBatch},
-    datatypes::Field as ArrowField,
+use arrow::array::{
+    Array as ArrowArray, ListArray as ArrowListArray, RecordBatch as ArrowRecordBatch,
 };
 use itertools::Itertools;
 use nohash_hasher::IntMap;

 use re_arrow_util::{arrow_util::into_arrow_ref, ArrowArrayDowncastRef as _};
 use re_byte_size::SizeBytes as _;
-use re_log_types::EntityPath;
 use re_types_core::{arrow_helpers::as_array_ref, ComponentDescriptor, Loggable as _};

-use crate::{chunk::ChunkComponents, Chunk, ChunkError, ChunkId, ChunkResult, RowId, TimeColumn};
-
-pub type ArrowMetadata = std::collections::HashMap<String, String>;
+use crate::{chunk::ChunkComponents, Chunk, ChunkError, ChunkResult, RowId, TimeColumn};

 // ---

@@ -230,7 +226,7 @@ impl Chunk {
         Ok(res)
     }

-    pub fn from_record_batch(batch: ArrowRecordBatch) -> ChunkResult<Self> {
+    pub fn from_record_batch(batch: &ArrowRecordBatch) -> ChunkResult<Self> {
         re_tracing::profile_function!(format!(
             "num_columns={} num_rows={}",
             batch.num_columns(),
             batch.num_rows()
@@ -250,7 +246,7 @@ impl Chunk {
             on_release: _,
         } = msg;

-        Self::from_record_batch(batch.clone())
+        Self::from_record_batch(batch)
     }

     #[inline]
@@ -275,9 +271,9 @@ mod tests {
     use re_arrow_util::arrow_util;
     use re_log_types::{
         example_components::{MyColor, MyPoint},
-        Timeline,
+        EntityPath, Timeline,
     };
-    use re_types_core::Component as _;
+    use re_types_core::{ChunkId, Component as _};

     use super::*;

@@ -357,7 +353,7 @@ mod tests {
         let arrow_record_batch = ArrowRecordBatch::from(&chunk_batch_before);

-        let chunk_batch_after = re_sorbet::ChunkBatch::try_from(arrow_record_batch).unwrap();
+        let chunk_batch_after = re_sorbet::ChunkBatch::try_from(&arrow_record_batch).unwrap();

         assert_eq!(
             chunk_batch_before.chunk_schema(),
diff --git a/crates/store/re_grpc_client/src/lib.rs b/crates/store/re_grpc_client/src/lib.rs
index f222510b7bed..05d92ea1777b 100644
--- a/crates/store/re_grpc_client/src/lib.rs
+++ b/crates/store/re_grpc_client/src/lib.rs
@@ -2,12 +2,11 @@
 pub mod message_proxy;

-use std::{collections::HashMap, error::Error, sync::Arc};
+use std::error::Error;

 use arrow::{
     array::{
-        Array as ArrowArray, ArrayRef as ArrowArrayRef, RecordBatch as ArrowRecordBatch,
-        StringArray as ArrowStringArray,
+        ArrayRef as ArrowArrayRef, RecordBatch as ArrowRecordBatch, StringArray as ArrowStringArray,
     },
     datatypes::{DataType as ArrowDataType, Field as ArrowField},
 };
@@ -264,7 +263,7 @@ async fn stream_recording_async(
     re_log::info!("Starting to read...");
     while let Some(result) = resp.next().await {
         let batch = result.map_err(TonicStatusError)?;
-        let chunk = Chunk::from_record_batch(batch)?;
+        let chunk = Chunk::from_record_batch(&batch)?;

         if tx
             .send(LogMsg::ArrowMsg(store_id.clone(), chunk.to_arrow_msg()?))
@@ -404,27 +403,28 @@ async fn stream_catalog_async(
     while let Some(result) = resp.next().await {
         let entity_path = EntityPath::parse_forgiving("catalog");

-        let record_batch = result.map_err(TonicStatusError)?;
+        let mut record_batch = result.map_err(TonicStatusError)?;

-        let mut metadata = record_batch.schema_ref().metadata.clone();
+        {
+            let mut metadata = record_batch.schema_ref().metadata.clone();

-        for (key, value) in [
-            re_sorbet::ChunkSchema::chunk_id_metadata(&ChunkId::new()),
-            re_sorbet::ChunkSchema::entity_path_metadata(&entity_path),
-        ] {
-            if !metadata.contains_key(&key) {
-                metadata.insert(key, value);
+            for (key, value) in [
+                re_sorbet::ChunkSchema::chunk_id_metadata(&ChunkId::new()),
+                re_sorbet::ChunkSchema::entity_path_metadata(&entity_path),
+            ] {
+                metadata.entry(key).or_insert(value);
             }
-        }

-        let schema_with_more_metadata = arrow::datatypes::Schema::clone(record_batch.schema_ref())
-            .with_metadata(metadata)
-            .into();
-        let record_batch = record_batch
-            .with_schema(schema_with_more_metadata)
-            .expect("Can't fail, because we only added metadata");
+            let schema_with_more_metadata =
+                arrow::datatypes::Schema::clone(record_batch.schema_ref())
+                    .with_metadata(metadata)
+                    .into();
+            record_batch = record_batch
+                .with_schema(schema_with_more_metadata)
+                .expect("Can't fail, because we only added metadata");
+        }

-        let chunk_batch = re_sorbet::ChunkBatch::try_from(record_batch.clone())?;
+        let chunk_batch = re_sorbet::ChunkBatch::try_from(&record_batch)?;

         let mut chunk = Chunk::from_chunk_batch(&chunk_batch)?;

diff --git a/crates/store/re_log_encoding/src/codec/file/decoder.rs b/crates/store/re_log_encoding/src/codec/file/decoder.rs
index a8721fd4882d..224e95a02d97 100644
--- a/crates/store/re_log_encoding/src/codec/file/decoder.rs
+++ b/crates/store/re_log_encoding/src/codec/file/decoder.rs
@@ -44,7 +44,7 @@ pub fn decode_bytes(message_kind: MessageKind, buf: &[u8]) -> Result
+    ///
+    /// It is probably a good idea to sort the chunk first.
+    pub fn to_record_batch(&self) -> ChunkResult<ArrowRecordBatch> {
+        re_tracing::profile_function!();
+        Ok(self.to_chunk_batch()?.into())
+    }
+
     /// Prepare the [`Chunk`] for transport.
     ///
     /// It is probably a good idea to sort the chunk first.
     pub fn to_chunk_batch(&self) -> ChunkResult<re_sorbet::ChunkBatch> {
+        re_tracing::profile_function!();
         self.sanity_check()?;

         re_tracing::profile_function!(format!(
             "num_columns={} num_rows={}",
             self.num_columns(),
             self.num_rows()
         ));
@@ -120,14 +129,14 @@ impl Chunk {
             data_arrays,
         )?)
     }
-}

-impl Chunk {
-    /// Prepare the [`Chunk`] for transport.
-    ///
-    /// It is probably a good idea to sort the chunk first.
-    pub fn to_record_batch(&self) -> ChunkResult<ArrowRecordBatch> {
-        Ok(self.to_chunk_batch()?.into())
+    pub fn from_record_batch(batch: &ArrowRecordBatch) -> ChunkResult<Self> {
+        re_tracing::profile_function!(format!(
+            "num_columns={} num_rows={}",
+            batch.num_columns(),
+            batch.num_rows()
+        ));
+        Self::from_chunk_batch(&re_sorbet::ChunkBatch::try_from(batch)?)
     }

     pub fn from_chunk_batch(batch: &re_sorbet::ChunkBatch) -> ChunkResult<Self> {
@@ -225,15 +234,6 @@ impl Chunk {
         Ok(res)
     }
-
-    pub fn from_record_batch(batch: &ArrowRecordBatch) -> ChunkResult<Self> {
-        re_tracing::profile_function!(format!(
-            "num_columns={} num_rows={}",
-            batch.num_columns(),
-            batch.num_rows()
-        ));
-        Self::from_chunk_batch(&re_sorbet::ChunkBatch::try_from(batch)?)
-    }
 }

 impl Chunk {
diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs
index b2bc95df19d2..377eb772e008 100644
--- a/crates/store/re_sorbet/src/data_column_schema.rs
+++ b/crates/store/re_sorbet/src/data_column_schema.rs
@@ -173,16 +173,17 @@ impl ComponentColumnDescriptor {
         } = self;

         // TODO(#6889): This needs some proper sorbetization -- I just threw these names randomly.
+        // We use the long names for the archetype and component names so that they roundtrip properly!
[ Some(("rerun.kind".to_owned(), "data".to_owned())), Some(("rerun.entity_path".to_owned(), entity_path.to_string())), - archetype_name.map(|name| ("rerun.archetype".to_owned(), name.short_name().to_owned())), + archetype_name.map(|name| ("rerun.archetype".to_owned(), name.full_name().to_owned())), archetype_field_name .as_ref() .map(|name| ("rerun.archetype_field".to_owned(), name.to_string())), Some(( "rerun.component".to_owned(), - component_name.short_name().to_owned(), + component_name.full_name().to_owned(), )), (*is_static).then_some(("rerun.is_static".to_owned(), "true".to_owned())), (*is_indicator).then_some(("rerun.is_indicator".to_owned(), "true".to_owned())), From 2058129df9adb677baf7361d331ef61aa2a204ed Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 10 Feb 2025 16:45:09 +0100 Subject: [PATCH 23/38] Fix TODOs --- crates/store/re_chunk/src/transport.rs | 8 ++--- crates/store/re_sorbet/src/chunk_batch.rs | 6 ++-- crates/store/re_sorbet/src/chunk_schema.rs | 32 +++++++++++-------- .../store/re_sorbet/src/data_column_schema.rs | 2 -- .../re_sorbet/src/row_id_column_schema.rs | 1 - 5 files changed, 26 insertions(+), 23 deletions(-) diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs index da688f72df31..bf10014d10de 100644 --- a/crates/store/re_chunk/src/transport.rs +++ b/crates/store/re_chunk/src/transport.rs @@ -98,10 +98,10 @@ impl Chunk { archetype_field_name, component_name, - is_static: false, // TODO - is_indicator: false, // TODO - is_tombstone: false, // TODO - is_semantically_empty: false, // TODO + is_static: false, // TODO(#8744): figure out what to do here + is_indicator: false, // TODO(#8744): figure out what to do here + is_tombstone: false, // TODO(#8744): figure out what to do here + is_semantically_empty: false, // TODO(#8744): figure out what to do here }; (schema, into_arrow_ref(list_array)) }) diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs index a1d34647c1da..5b16155ba250 100644 --- a/crates/store/re_sorbet/src/chunk_batch.rs +++ b/crates/store/re_sorbet/src/chunk_batch.rs @@ -44,8 +44,6 @@ impl MismatchedChunkSchemaError { #[derive(Debug, Clone)] pub struct ChunkBatch { schema: ChunkSchema, - - // TODO: should we store a record batch here, or just the parsed columns? batch: ArrowRecordBatch, } @@ -239,7 +237,9 @@ impl TryFrom<&ArrowRecordBatch> for ChunkBatch { let chunk_schema = ChunkSchema::try_from(batch.schema_ref().as_ref())?; - // TODO: sanity check that the schema matches the columns + for (field, column) in itertools::izip!(chunk_schema.arrow_fields(), batch.columns()) { + debug_assert_eq!(field.data_type(), column.data_type()); + } // Extend with any metadata that might have been missing: let mut arrow_schema = ArrowSchema::clone(batch.schema_ref().as_ref()); diff --git a/crates/store/re_sorbet/src/chunk_schema.rs b/crates/store/re_sorbet/src/chunk_schema.rs index 46e4c7a6efe0..9f528afefab8 100644 --- a/crates/store/re_sorbet/src/chunk_schema.rs +++ b/crates/store/re_sorbet/src/chunk_schema.rs @@ -180,28 +180,27 @@ impl ChunkSchema { arrow_metadata } -} - -impl From<&ChunkSchema> for ArrowSchema { - fn from(chunk_schema: &ChunkSchema) -> Self { - let metadata = chunk_schema.arrow_batch_metadata(); - let num_columns = chunk_schema.num_columns(); - let ChunkSchema { + pub fn arrow_fields(&self) -> Vec { + let Self { row_id_column, index_columns, data_columns, .. 
- } = chunk_schema;
-
- let mut fields: Vec<ArrowField> = Vec::with_capacity(num_columns);
+ } = self;
+ let mut fields: Vec<ArrowField> = Vec::with_capacity(self.num_columns());
 fields.push(row_id_column.to_arrow_field());
 fields.extend(index_columns.iter().map(|column| column.to_arrow_field()));
 fields.extend(data_columns.iter().map(|column| column.to_arrow_field()));
+ fields
+ }
+}
 
+impl From<&ChunkSchema> for ArrowSchema {
+ fn from(chunk_schema: &ChunkSchema) -> Self {
 Self {
- metadata,
- fields: fields.into(),
+ metadata: chunk_schema.arrow_batch_metadata(),
+ fields: chunk_schema.arrow_fields().into(),
 }
 }
 }
@@ -225,7 +224,14 @@ impl TryFrom<&ArrowSchema> for ChunkSchema {
 let entity_path = EntityPath::parse_forgiving(metadata.get_or_err("rerun.entity_path")?);
 let is_sorted = metadata.get_bool("rerun.is_sorted");
 let heap_size_bytes = if let Some(heap_size_bytes) = metadata.get("rerun.heap_size_bytes") {
- heap_size_bytes.parse().ok() // TODO: log error
+ heap_size_bytes
+ .parse()
+ .map_err(|err| {
+ re_log::warn_once!(
+ "Failed to parse heap_size_bytes {heap_size_bytes:?} in chunk: {err}"
+ );
+ })
+ .ok()
 } else {
 None
 };
diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs
index 377eb772e008..c1642b7a5957 100644
--- a/crates/store/re_sorbet/src/data_column_schema.rs
+++ b/crates/store/re_sorbet/src/data_column_schema.rs
@@ -244,8 +244,6 @@ impl ComponentColumnDescriptor {
 ComponentName::new(field.name()) // Legacy fallback
 };
 
- // TODO: check that the data type is a list array and at least warn if it is not
-
 Ok(Self {
 store_datatype: field.data_type().clone(),
 entity_path,
diff --git a/crates/store/re_sorbet/src/row_id_column_schema.rs b/crates/store/re_sorbet/src/row_id_column_schema.rs
index 028753218e3a..18b99403a337 100644
--- a/crates/store/re_sorbet/src/row_id_column_schema.rs
+++ b/crates/store/re_sorbet/src/row_id_column_schema.rs
@@ -69,7 +69,6 @@ impl TryFrom<&ArrowField> for RowIdColumnDescriptor {
 type Error = WrongDatatypeError;
 
 fn try_from(field: &ArrowField) -> Result<Self, Self::Error> {
- // TODO: check `rerun.kind`
 Self::try_from(field.data_type())
 }
 }

From cb4c498c55946f08cb329f9e68669984a3247cf8 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 17:23:05 +0100
Subject: [PATCH 24/38] Fix doclink

---
 crates/store/re_sorbet/src/chunk_batch.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs
index 5b16155ba250..e200ddd405bd 100644
--- a/crates/store/re_sorbet/src/chunk_batch.rs
+++ b/crates/store/re_sorbet/src/chunk_batch.rs
@@ -40,7 +40,7 @@ impl MismatchedChunkSchemaError {
 /// This is a wrapper around a [`ArrowRecordBatch`].
 ///
 /// Each [`ChunkBatch`] contains logging data for a single [`EntityPath`].
-/// It always has a [`RowId`] column.
+/// It always has a [`re_types_core::RowId`] column. 
#[derive(Debug, Clone)]
 pub struct ChunkBatch {
 schema: ChunkSchema,

From 66a6013f4d4b97807895713ec4b7c14baf0bfedc Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 17:35:23 +0100
Subject: [PATCH 25/38] Only include entity path in column name in dataframe batches

---
 Cargo.lock | 1 +
 .../formatting__format_chunk_store.snap | 2 +-
 crates/store/re_dataframe/Cargo.toml | 1 +
 crates/store/re_dataframe/src/query.rs | 2 +-
 crates/store/re_sorbet/src/chunk_batch.rs | 2 +-
 crates/store/re_sorbet/src/chunk_schema.rs | 6 ++-
 crates/store/re_sorbet/src/column_schema.rs | 17 ++++---
 .../store/re_sorbet/src/data_column_schema.rs | 45 +++++++++++--------
 crates/store/re_sorbet/src/lib.rs | 9 ++++
 9 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bcdcb0c7d891..a4b741fce4d0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6048,6 +6048,7 @@ dependencies = [
 "re_log_encoding",
 "re_log_types",
 "re_query",
+ "re_sorbet",
 "re_tracing",
 "re_types",
 "re_types_core",
diff --git a/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap b/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap
index 690ec61f453b..124630814753 100644
--- a/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap
+++ b/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap
@@ -21,7 +21,7 @@ ChunkStore {
 │ * version: "1" │
 ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
 │ ┌──────────────────────────────────┬────────────────────────┬───────────────────────────────┬──────────────────────────────┬──────────────────────────────┐ │
- │ │ RowId ┆ frame_nr ┆ log_time ┆ /this/that:example.MyColor ┆ /this/that:example.MyIndex │ │
+ │ │ RowId ┆ frame_nr ┆ log_time ┆ example.MyColor ┆ example.MyIndex │ │
 │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │
 │ │ type: "Struct[2]" ┆ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[u64]" │ │
 │ │ ARROW:extension:name: "TUID" ┆ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyIndex" │ │
diff --git a/crates/store/re_dataframe/Cargo.toml b/crates/store/re_dataframe/Cargo.toml
index 36f5247dc317..3f1873ca4292 100644
--- a/crates/store/re_dataframe/Cargo.toml
+++ b/crates/store/re_dataframe/Cargo.toml
@@ -34,6 +34,7 @@ re_log.workspace = true
 re_log_encoding.workspace = true
 re_log_types.workspace = true
 re_query.workspace = true
+re_sorbet.workspace = true
 re_tracing.workspace = true
 re_types_core.workspace = true

diff --git a/crates/store/re_dataframe/src/query.rs b/crates/store/re_dataframe/src/query.rs
index 016ce25fb147..054b6259cd3b 100644
--- a/crates/store/re_dataframe/src/query.rs
+++ b/crates/store/re_dataframe/src/query.rs
@@ -193,7 +193,7 @@ impl QueryHandle {
 let arrow_schema = ArrowSchemaRef::from(ArrowSchema::new_with_metadata(
 selected_contents
 .iter()
- .map(|(_, descr)| descr.to_arrow_field())
+ .map(|(_, descr)| descr.to_arrow_field(re_sorbet::BatchType::Dataframe))
 .collect::<ArrowFields>(),
 Default::default(),
 ));
diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs
index e200ddd405bd..1eb820cb9160 100644
--- a/crates/store/re_sorbet/src/chunk_batch.rs
+++ b/crates/store/re_sorbet/src/chunk_batch.rs
@@ -92,7 +92,7 @@ impl ChunkBatch {
 if array.len() != row_count {
 return Err(MismatchedChunkSchemaError::custom(format!(
 "Data column {:?} had {} rows, but we got {} row IDs",
- schema.column_name(),
+ schema.column_name(crate::BatchType::Chunk),
 array.len(),
 row_count
 )));
diff --git a/crates/store/re_sorbet/src/chunk_schema.rs b/crates/store/re_sorbet/src/chunk_schema.rs
index 9f528afefab8..b090e76bc6bd 100644
--- a/crates/store/re_sorbet/src/chunk_schema.rs
+++ b/crates/store/re_sorbet/src/chunk_schema.rs
@@ -191,7 +191,11 @@ impl ChunkSchema {
 let mut fields: Vec<ArrowField> = Vec::with_capacity(self.num_columns());
 fields.push(row_id_column.to_arrow_field());
 fields.extend(index_columns.iter().map(|column| column.to_arrow_field()));
- fields.extend(data_columns.iter().map(|column| column.to_arrow_field()));
+ fields.extend(
+ data_columns
+ .iter()
+ .map(|column| column.to_arrow_field(crate::BatchType::Chunk)),
+ );
 fields
 }
}
diff --git a/crates/store/re_sorbet/src/column_schema.rs b/crates/store/re_sorbet/src/column_schema.rs
index 68b99e71965b..14b2b937cfb0 100644
--- a/crates/store/re_sorbet/src/column_schema.rs
+++ b/crates/store/re_sorbet/src/column_schema.rs
@@ -68,16 +68,19 @@ impl ColumnDescriptor {
 }
 
 #[inline]
- pub fn to_arrow_field(&self) -> ArrowField {
+ pub fn to_arrow_field(&self, batch_type: crate::BatchType) -> ArrowField {
 match self {
 Self::Time(descr) => descr.to_arrow_field(),
- Self::Component(descr) => descr.to_arrow_field(),
+ Self::Component(descr) => descr.to_arrow_field(batch_type),
 }
 }
 
 #[inline]
- pub fn to_arrow_fields(columns: &[Self]) -> ArrowFields {
- columns.iter().map(|c| c.to_arrow_field()).collect()
+ pub fn to_arrow_fields(columns: &[Self], batch_type: crate::BatchType) -> ArrowFields {
+ columns
+ .iter()
+ .map(|c| c.to_arrow_field(batch_type))
+ .collect()
 }
 
 /// `chunk_entity_path`: if this column is part of a chunk batch,
@@ -141,8 +144,10 @@ fn test_schema_over_ipc() {
 }),
 ];
 
- let original_schema =
- arrow::datatypes::Schema::new(ColumnDescriptor::to_arrow_fields(&original_columns));
+ let original_schema = arrow::datatypes::Schema::new(ColumnDescriptor::to_arrow_fields(
+ &original_columns,
+ crate::BatchType::Dataframe,
+ ));
 let ipc_bytes = crate::ipc_from_schema(&original_schema).unwrap();
 
 let recovered_schema = crate::schema_from_ipc(&ipc_bytes).unwrap();
diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs
index c1642b7a5957..dbcc83b2ea1d 100644
--- a/crates/store/re_sorbet/src/data_column_schema.rs
+++ b/crates/store/re_sorbet/src/data_column_schema.rs
@@ -3,7 +3,7 @@ use arrow::datatypes::{DataType as ArrowDatatype, Field as ArrowField};
 use re_log_types::{ComponentPath, EntityPath};
 use re_types_core::{ArchetypeFieldName, ArchetypeName, ComponentDescriptor, ComponentName};
 
-use crate::{ArrowFieldMetadata, MetadataExt as _, MissingFieldMetadata};
+use crate::{ArrowFieldMetadata, BatchType, MetadataExt as _, MissingFieldMetadata};
 
 /// Describes a data/component column, such as `Position3D`, in a dataframe.
 ///
@@ -201,27 +201,36 @@ impl ComponentColumnDescriptor {
 self.store_datatype.clone()
 }
 
- pub fn column_name(&self) -> String {
- let entity_path = &self.entity_path;
- let descriptor = ComponentDescriptor {
- archetype_name: self.archetype_name,
- archetype_field_name: self.archetype_field_name,
- component_name: self.component_name,
- };
-
- format!("{}:{}", entity_path, descriptor.component_name.short_name())
-
- // NOTE: Uncomment this to expose fully-qualified names in the Dataframe APIs! 
- // I'm not doing that right now, to avoid breaking changes (and we need to talk about
- // what the syntax for these fully-qualified paths need to look like first).
- // format!("{entity_path}@{}", descriptor.short_name())
+ pub fn column_name(&self, batch_type: BatchType) -> String {
+ match batch_type {
+ BatchType::Chunk => self.component_name.short_name().to_owned(),
+ BatchType::Dataframe => {
+ let entity_path = &self.entity_path;
+ let descriptor = ComponentDescriptor {
+ archetype_name: self.archetype_name,
+ archetype_field_name: self.archetype_field_name,
+ component_name: self.component_name,
+ };
+
+ format!("{}:{}", entity_path, descriptor.component_name.short_name())
+
+ // NOTE: Uncomment this to expose fully-qualified names in the Dataframe APIs!
+ // I'm not doing that right now, to avoid breaking changes (and we need to talk about
+ // what the syntax for these fully-qualified paths need to look like first).
+ // format!("{entity_path}@{}", descriptor.short_name())
+ }
+ }
 }
 
 #[inline]
- pub fn to_arrow_field(&self) -> ArrowField {
+ pub fn to_arrow_field(&self, batch_type: BatchType) -> ArrowField {
 let nullable = true;
- ArrowField::new(self.column_name(), self.returned_datatype(), nullable)
- .with_metadata(self.metadata())
+ ArrowField::new(
+ self.column_name(batch_type),
+ self.returned_datatype(),
+ nullable,
+ )
+ .with_metadata(self.metadata())
 }
}
diff --git a/crates/store/re_sorbet/src/lib.rs b/crates/store/re_sorbet/src/lib.rs
index 4bff398ecc02..b9bf625980c1 100644
--- a/crates/store/re_sorbet/src/lib.rs
+++ b/crates/store/re_sorbet/src/lib.rs
@@ -24,3 +24,12 @@ pub use self::{
 },
 row_id_column_schema::{RowIdColumnDescriptor, WrongDatatypeError},
 };
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum BatchType {
+ /// Data for one entity
+ Chunk,
+
+ /// Potentially multiple entities
+ Dataframe,
+}

From 0216e84e8589d2c50145423149bf890f388c13ca Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 17:42:39 +0100
Subject: [PATCH 26/38] Only set entity_path-metadata per column in dataframes

---
 .../formatting__format_chunk_store.snap | 4 +-
 .../store/re_sorbet/src/data_column_schema.rs | 68 +++++++++++++------
 2 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap b/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap
index 124630814753..eed4b3c5ee97 100644
--- a/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap
+++ b/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap
@@ -25,8 +25,8 @@ ChunkStore {
 │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │
 │ │ type: "Struct[2]" ┆ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[u64]" │ │
 │ │ ARROW:extension:name: "TUID" ┆ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyIndex" │ │
- │ │ kind: "control" ┆ is_sorted: "true" ┆ is_sorted: "true" ┆ entity_path: "/this/that" ┆ entity_path: "/this/that" │ │
- │ │ ┆ kind: "index" ┆ kind: "index" ┆ kind: "data" ┆ kind: "data" │ │
+ │ │ kind: "control" ┆ is_sorted: "true" ┆ is_sorted: "true" ┆ kind: "data" ┆ kind: "data" │ │
+ │ │ ┆ kind: "index" ┆ kind: "index" ┆ ┆ │ │
 │ ╞══════════════════════════════════╪════════════════════════╪═══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡ │
 │ │ 0000000067816A6BB4B8C1254D40007B ┆ 1 ┆ 2025-01-10T18:43:42.123456789 ┆ [0, 1, 2] ┆ [0, 1, 2] │ │ 
│ └──────────────────────────────────┴────────────────────────┴───────────────────────────────┴──────────────────────────────┴──────────────────────────────┘ │
diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs
index dbcc83b2ea1d..7f9e4c5fbce0 100644
--- a/crates/store/re_sorbet/src/data_column_schema.rs
+++ b/crates/store/re_sorbet/src/data_column_schema.rs
@@ -159,7 +159,7 @@ impl ComponentColumnDescriptor {
 &self.entity_path == entity_path && self.component_name.matches(component_name)
 }
 
- fn metadata(&self) -> ArrowFieldMetadata {
+ fn metadata(&self, batch_type: BatchType) -> ArrowFieldMetadata {
 let Self {
 entity_path,
 archetype_name,
@@ -173,27 +173,53 @@ impl ComponentColumnDescriptor {
 } = self;
 
 // TODO(#6889): This needs some proper sorbetization -- I just threw these names randomly.
- // We use the long namesa for the archetype and component names so that they roundtrip properly!
- [
- Some(("rerun.kind".to_owned(), "data".to_owned())),
- Some(("rerun.entity_path".to_owned(), entity_path.to_string())),
- archetype_name.map(|name| ("rerun.archetype".to_owned(), name.full_name().to_owned())),
- archetype_field_name
- .as_ref()
- .map(|name| ("rerun.archetype_field".to_owned(), name.to_string())),
- Some((
+ // We use the long names for the archetype and component names so that they roundtrip properly!
+ let mut metadata = std::collections::HashMap::from([
+ ("rerun.kind".to_owned(), "data".to_owned()),
+ (
 "rerun.component".to_owned(),
 component_name.full_name().to_owned(),
- )),
- (*is_static).then_some(("rerun.is_static".to_owned(), "true".to_owned())),
- (*is_indicator).then_some(("rerun.is_indicator".to_owned(), "true".to_owned())),
- (*is_tombstone).then_some(("rerun.is_tombstone".to_owned(), "true".to_owned())),
- (*is_semantically_empty)
- .then_some(("rerun.is_semantically_empty".to_owned(), "true".to_owned())),
- ]
- .into_iter()
- .flatten()
- .collect()
+ ),
+ ]);
+
+ match batch_type {
+ BatchType::Dataframe => {
+ metadata.insert("rerun.entity_path".to_owned(), entity_path.to_string());
+ }
+ BatchType::Chunk => {
+ // The whole chunk is for the same entity, which is set in the record batch metadata.
+ // No need to repeat it here. 
+ }
+ }
+
+ if let Some(archetype_name) = archetype_name {
+ metadata.insert(
+ "rerun.archetype".to_owned(),
+ archetype_name.full_name().to_owned(),
+ );
+ }
+
+ if let Some(archetype_field_name) = archetype_field_name {
+ metadata.insert(
+ "rerun.archetype_field".to_owned(),
+ archetype_field_name.to_string(),
+ );
+ }
+
+ if *is_static {
+ metadata.insert("rerun.is_static".to_owned(), "true".to_owned());
+ }
+ if *is_indicator {
+ metadata.insert("rerun.is_indicator".to_owned(), "true".to_owned());
+ }
+ if *is_tombstone {
+ metadata.insert("rerun.is_tombstone".to_owned(), "true".to_owned());
+ }
+ if *is_semantically_empty {
+ metadata.insert("rerun.is_semantically_empty".to_owned(), "true".to_owned());
+ }
+
+ metadata
 }
 
 #[inline]
@@ -230,7 +256,7 @@ impl ComponentColumnDescriptor {
 self.returned_datatype(),
 nullable,
 )
- .with_metadata(self.metadata())
+ .with_metadata(self.metadata(batch_type))
 }
 }

From 3e73478740b3c2c1db4f11c52e94e55462a40dfb Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 17:44:21 +0100
Subject: [PATCH 27/38] Remove unused dependencies

---
 Cargo.lock | 2 --
 crates/store/re_grpc_client/Cargo.toml | 2 --
 2 files changed, 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a4b741fce4d0..f838463b2061 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6151,11 +6151,9 @@ version = "0.23.0-alpha.1+dev"
 dependencies = [
 "arrow",
 "async-stream",
- "itertools 0.13.0",
 "re_arrow_util",
 "re_chunk",
 "re_error",
- "re_format_arrow",
 "re_log",
 "re_log_encoding",
 "re_log_types",
diff --git a/crates/store/re_grpc_client/Cargo.toml b/crates/store/re_grpc_client/Cargo.toml
index 106b79cbf653..c7cf18683615 100644
--- a/crates/store/re_grpc_client/Cargo.toml
+++ b/crates/store/re_grpc_client/Cargo.toml
@@ -23,7 +23,6 @@ all-features = true
 re_arrow_util.workspace = true
 re_chunk.workspace = true
 re_error.workspace = true
-re_format_arrow.workspace = true
 re_log.workspace = true
 re_log_encoding = { workspace = true, features = ["encoder", "decoder"] }
 re_log_types.workspace = true
@@ -34,7 +33,6 @@ re_types.workspace = true
 
 arrow.workspace = true
 async-stream.workspace = true
-itertools.workspace = true
 thiserror.workspace = true
 tokio-stream.workspace = true
 url.workspace = true

From 11c1e7240bdc9bcdc705896815921660431323c9 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 17:59:27 +0100
Subject: [PATCH 28/38] Improve docstrings

---
 crates/store/re_sorbet/src/chunk_batch.rs | 4 ++--
 crates/store/re_sorbet/src/lib.rs | 4 ++++
 crates/store/re_sorbet/src/row_id_column_schema.rs | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs
index 1eb820cb9160..14bafe07c046 100644
--- a/crates/store/re_sorbet/src/chunk_batch.rs
+++ b/crates/store/re_sorbet/src/chunk_batch.rs
@@ -35,9 +35,9 @@ impl MismatchedChunkSchemaError {
 }
 }
 
-/// The arrow [`ArrowRecordBatch`] representation of a Rerun chunk.
+/// The [`ArrowRecordBatch`] representation of a Rerun chunk.
 ///
-/// This is a wrapper around a [`ArrowRecordBatch`].
+/// This is a wrapper around a [`ChunkSchema`] and an [`ArrowRecordBatch`].
 ///
 /// Each [`ChunkBatch`] contains logging data for a single [`EntityPath`].
 /// It always has a [`re_types_core::RowId`] column.
diff --git a/crates/store/re_sorbet/src/lib.rs b/crates/store/re_sorbet/src/lib.rs
index b9bf625980c1..18e20c9b937f 100644
--- a/crates/store/re_sorbet/src/lib.rs
+++ b/crates/store/re_sorbet/src/lib.rs
@@ -1,6 +1,10 @@
 //! Rerun arrow metadata and record batch definitions.
 //!
 //! Handles the structure of arrow record batches and their meta data for different use cases for Rerun.
+//!
+//! An arrow record batch needs to follow a specific schema to be be compatible with Rerun,
+//! and that schema is defined in [`ChunkSchema`].
+//! If a record batch matches the schema, it can be converted to a [`ChunkBatch`].
 
 mod chunk_batch;
 mod chunk_schema;
diff --git a/crates/store/re_sorbet/src/row_id_column_schema.rs b/crates/store/re_sorbet/src/row_id_column_schema.rs
index 18b99403a337..d772d03b87fe 100644
--- a/crates/store/re_sorbet/src/row_id_column_schema.rs
+++ b/crates/store/re_sorbet/src/row_id_column_schema.rs
@@ -24,7 +24,7 @@ impl WrongDatatypeError {
 }
 }
 
-/// Describes the [`RowId`]
+/// Describes the schema of the primary [`RowId`] column.
 #[derive(Clone, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub struct RowIdColumnDescriptor {}

From ef6f1cdc276d2131c4b06fcde4508a36a941a04c Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Mon, 10 Feb 2025 18:04:32 +0100
Subject: [PATCH 29/38] typos

---
 crates/store/re_sorbet/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/store/re_sorbet/src/lib.rs b/crates/store/re_sorbet/src/lib.rs
index 18e20c9b937f..a9934c5bd9eb 100644
--- a/crates/store/re_sorbet/src/lib.rs
+++ b/crates/store/re_sorbet/src/lib.rs
@@ -2,7 +2,7 @@
 //!
 //! Handles the structure of arrow record batches and their meta data for different use cases for Rerun.
 //!
-//! An arrow record batch needs to follow a specific schema to be be compatible with Rerun,
+//! An arrow record batch needs to follow a specific schema to be compatible with Rerun,
 //! and that schema is defined in [`ChunkSchema`].
 //! If a record batch matches the schema, it can be converted to a [`ChunkBatch`].

From 3d9147c25f7e89808833b757ddf122aa7f70ccd4 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Tue, 11 Feb 2025 09:15:58 +0100
Subject: [PATCH 30/38] Add sanity checks to find source of doubly-prefixed `rerun.components.`

---
 crates/store/re_chunk/src/chunk.rs | 6 +++-
 crates/store/re_chunk/src/transport.rs | 2 ++
 crates/store/re_chunk_store/src/dataframe.rs | 4 +++
 crates/store/re_dataframe/src/query.rs | 30 +++++++++++++++----
 crates/store/re_sorbet/src/column_schema.rs | 10 +++++++
 .../store/re_sorbet/src/data_column_schema.rs | 24 +++++++++++++--
 crates/store/re_types_core/src/archetype.rs | 13 ++++++++
 .../re_types_core/src/component_descriptor.rs | 17 +++++++++--
 crates/store/re_types_core/src/loggable.rs | 15 ++++++++++
 rerun_py/rerun_sdk/rerun/_baseclasses.py | 4 +++
 rerun_py/rerun_sdk/rerun/_log.py | 1 +
 rerun_py/src/arrow.rs | 6 ++--
 12 files changed, 119 insertions(+), 13 deletions(-)

diff --git a/crates/store/re_chunk/src/chunk.rs b/crates/store/re_chunk/src/chunk.rs
index 287649b4ee65..85e234eb0d89 100644
--- a/crates/store/re_chunk/src/chunk.rs
+++ b/crates/store/re_chunk/src/chunk.rs
@@ -1447,6 +1447,7 @@ impl Chunk {
 /// Returns an error if the Chunk's invariants are not upheld.
 ///
 /// Costly checks are only run in debug builds. 
+ #[track_caller] pub fn sanity_check(&self) -> ChunkResult<()> { re_tracing::profile_function!(); @@ -1516,8 +1517,10 @@ impl Chunk { } // Components - for (_component_name, per_desc) in components.iter() { + for (component_name, per_desc) in components.iter() { + component_name.sanity_check(); for (component_desc, list_array) in per_desc { + component_desc.component_name.sanity_check(); if !matches!(list_array.data_type(), arrow::datatypes::DataType::List(_)) { return Err(ChunkError::Malformed { reason: format!( @@ -1568,6 +1571,7 @@ impl TimeColumn { /// Returns an error if the Chunk's invariants are not upheld. /// /// Costly checks are only run in debug builds. + #[track_caller] pub fn sanity_check(&self) -> ChunkResult<()> { let Self { timeline: _, diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs index bf10014d10de..0dda84d32656 100644 --- a/crates/store/re_chunk/src/transport.rs +++ b/crates/store/re_chunk/src/transport.rs @@ -90,6 +90,8 @@ impl Chunk { component_name, } = *component_desc; + component_name.sanity_check(); + let schema = re_sorbet::ComponentColumnDescriptor { store_datatype: list_array.data_type().clone(), entity_path: entity_path.clone(), diff --git a/crates/store/re_chunk_store/src/dataframe.rs b/crates/store/re_chunk_store/src/dataframe.rs index ab0a82c56f65..ab4183412a63 100644 --- a/crates/store/re_chunk_store/src/dataframe.rs +++ b/crates/store/re_chunk_store/src/dataframe.rs @@ -405,6 +405,8 @@ impl ChunkStore { is_semantically_empty, } = metadata; + component_descr.component_name.sanity_check(); + ComponentColumnDescriptor { // NOTE: The data is always a at least a list, whether it's latest-at or range. // It might be wrapped further in e.g. a dict, but at the very least @@ -477,6 +479,8 @@ impl ChunkStore { let component_name = component_descr.map_or(selected_component_name, |descr| descr.component_name); + component_name.sanity_check(); + let ColumnMetadata { is_static, is_indicator, diff --git a/crates/store/re_dataframe/src/query.rs b/crates/store/re_dataframe/src/query.rs index 054b6259cd3b..4219a1a41a04 100644 --- a/crates/store/re_dataframe/src/query.rs +++ b/crates/store/re_dataframe/src/query.rs @@ -234,6 +234,8 @@ impl QueryHandle { continue; }; + descr.sanity_check(); + // NOTE: It would be tempting to concatenate all these individual clear chunks into one // single big chunk, but that'd be a mistake: 1) it's costly to do so but more // importantly 2) that would lead to likely very large chunk overlap, which is very bad @@ -333,6 +335,8 @@ impl QueryHandle { .map(|(_view_idx, descr)| match descr { ColumnDescriptor::Time(_) => None, ColumnDescriptor::Component(descr) => { + descr.sanity_check(); + let query = re_chunk::LatestAtQuery::new( Timeline::new_sequence(""), TimeInt::STATIC, @@ -350,6 +354,14 @@ impl QueryHandle { .collect_vec() }; + for descr in &view_contents { + descr.sanity_check(); + } + + for (_, descr) in &selected_contents { + descr.sanity_check(); + } + QueryHandleState { view_contents, selected_contents, @@ -403,6 +415,11 @@ impl QueryHandle { component_name: selected_component_name, } = selected_column; + debug_assert!( + !selected_component_name.starts_with("rerun.components.rerun.components."), + "Found component with full name {selected_component_name:?}. Maybe some bad round-tripping?" 
+ );
+
 view_contents
 .iter()
 .enumerate()
@@ -1166,11 +1183,14 @@ impl QueryHandle {
 StreamingJoinState::Retrofilled(unit) => {
 let component_desc =
 state.view_contents.get(view_idx).and_then(|col| match col {
- ColumnDescriptor::Component(descr) => Some(re_types_core::ComponentDescriptor {
- component_name: descr.component_name,
- archetype_name: descr.archetype_name,
- archetype_field_name: descr.archetype_field_name,
- }),
+ ColumnDescriptor::Component(descr) => {
+ descr.component_name.sanity_check();
+ Some(re_types_core::ComponentDescriptor {
+ component_name: descr.component_name,
+ archetype_name: descr.archetype_name,
+ archetype_field_name: descr.archetype_field_name,
+ })
+ },
 ColumnDescriptor::Time(_) => None,
 })?;
 unit.components().get_by_descriptor(&component_desc).cloned()
diff --git a/crates/store/re_sorbet/src/column_schema.rs b/crates/store/re_sorbet/src/column_schema.rs
index 14b2b937cfb0..5294dc6e032e 100644
--- a/crates/store/re_sorbet/src/column_schema.rs
+++ b/crates/store/re_sorbet/src/column_schema.rs
@@ -35,6 +35,16 @@ pub enum ColumnDescriptor {
 }
 
 impl ColumnDescriptor {
+ /// Debug-only sanity check.
+ #[inline]
+ #[track_caller]
+ pub fn sanity_check(&self) {
+ match self {
+ Self::Time(_) => {}
+ Self::Component(descr) => descr.sanity_check(),
+ }
+ }
+
 #[inline]
 pub fn entity_path(&self) -> Option<&EntityPath> {
 match self {
diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs
index 7f9e4c5fbce0..d77dc4add0be 100644
--- a/crates/store/re_sorbet/src/data_column_schema.rs
+++ b/crates/store/re_sorbet/src/data_column_schema.rs
@@ -127,6 +127,7 @@ impl std::fmt::Display for ComponentColumnDescriptor {
 impl From<ComponentColumnDescriptor> for re_types_core::ComponentDescriptor {
 #[inline]
 fn from(descr: ComponentColumnDescriptor) -> Self {
+ descr.sanity_check();
 Self {
 archetype_name: descr.archetype_name,
 archetype_field_name: descr.archetype_field_name,
@@ -138,6 +139,7 @@ impl From<&ComponentColumnDescriptor> for re_types_core::ComponentDescriptor {
 #[inline]
 fn from(descr: &ComponentColumnDescriptor) -> Self {
+ descr.sanity_check();
 Self {
 archetype_name: descr.archetype_name,
 archetype_field_name: descr.archetype_field_name,
@@ -147,6 +149,16 @@ impl From<&ComponentColumnDescriptor> for re_types_core::ComponentDescriptor {
 }
 
 impl ComponentColumnDescriptor {
+ /// Debug-only sanity check. 
+ #[inline] + #[track_caller] + pub fn sanity_check(&self) { + self.component_name.sanity_check(); + if let Some(archetype_name) = &self.archetype_name { + archetype_name.sanity_check(); + } + } + pub fn component_path(&self) -> ComponentPath { ComponentPath { entity_path: self.entity_path.clone(), @@ -160,6 +172,8 @@ impl ComponentColumnDescriptor { } fn metadata(&self, batch_type: BatchType) -> ArrowFieldMetadata { + self.sanity_check(); + let Self { entity_path, archetype_name, @@ -228,6 +242,8 @@ impl ComponentColumnDescriptor { } pub fn column_name(&self, batch_type: BatchType) -> String { + self.sanity_check(); + match batch_type { BatchType::Chunk => self.component_name.short_name().to_owned(), BatchType::Dataframe => { @@ -279,7 +295,7 @@ impl ComponentColumnDescriptor { ComponentName::new(field.name()) // Legacy fallback }; - Ok(Self { + let schema = Self { store_datatype: field.data_type().clone(), entity_path, archetype_name: field.get_opt("rerun.archetype").map(|x| x.into()), @@ -289,6 +305,10 @@ impl ComponentColumnDescriptor { is_indicator: field.get_bool("rerun.is_indicator"), is_tombstone: field.get_bool("rerun.is_tombstone"), is_semantically_empty: field.get_bool("rerun.is_semantically_empty"), - }) + }; + + schema.sanity_check(); + + Ok(schema) } } diff --git a/crates/store/re_types_core/src/archetype.rs b/crates/store/re_types_core/src/archetype.rs index 356fc62af86f..a6f39d8a6ef2 100644 --- a/crates/store/re_types_core/src/archetype.rs +++ b/crates/store/re_types_core/src/archetype.rs @@ -139,11 +139,23 @@ re_string_interner::declare_new_type!( ); impl ArchetypeName { + /// Runs some asserts in debug mode to make sure the name is not weird. + #[inline] + #[track_caller] + pub fn sanity_check(&self) { + let full_name = self.0.as_str(); + debug_assert!( + !full_name.starts_with("rerun.archetypes.rerun.archetypes.") || full_name.contains(':'), + "DEBUG ASSERT: Found archetype with full name {full_name:?}. Maybe some bad round-tripping?" + ); + } + /// Returns the fully-qualified name, e.g. `rerun.archetypes.Points3D`. /// /// This is the default `Display` implementation for [`ArchetypeName`]. #[inline] pub fn full_name(&self) -> &'static str { + self.sanity_check(); self.0.as_str() } @@ -157,6 +169,7 @@ impl ArchetypeName { /// ``` #[inline] pub fn short_name(&self) -> &'static str { + self.sanity_check(); let full_name = self.0.as_str(); if let Some(short_name) = full_name.strip_prefix("rerun.archetypes.") { short_name diff --git a/crates/store/re_types_core/src/component_descriptor.rs b/crates/store/re_types_core/src/component_descriptor.rs index 3d0b95b136e2..0d2574c0e302 100644 --- a/crates/store/re_types_core/src/component_descriptor.rs +++ b/crates/store/re_types_core/src/component_descriptor.rs @@ -102,17 +102,25 @@ impl ComponentDescriptor { } } + #[inline] + #[track_caller] + pub fn sanity_check(&self) { + self.component_name.sanity_check(); + } + /// Returns the fully-qualified name, e.g. `rerun.archetypes.Points3D:rerun.components.Position3D#positions`. /// /// This is the default `Display` implementation for [`ComponentDescriptor`]. #[inline] pub fn full_name(&self) -> String { + self.sanity_check(); self.to_string() } /// Returns the unqualified name, e.g. `Points3D:Position3D#positions`. 
#[inline]
 pub fn short_name(&self) -> String {
+ self.sanity_check();
 self.to_any_string(true)
 }
 }
@@ -135,6 +143,7 @@ impl ComponentDescriptor {
 #[inline]
 pub fn new(component_name: impl Into<ComponentName>) -> Self {
 let component_name = component_name.into();
+ component_name.sanity_check();
 Self {
 archetype_name: None,
 archetype_field_name: None,
@@ -197,7 +206,7 @@ impl From<arrow::datatypes::Field> for ComponentDescriptor {
 fn from(field: arrow::datatypes::Field) -> Self {
 let md = field.metadata();
 
- Self {
+ let descr = Self {
 archetype_name: md
 .get(FIELD_METADATA_KEY_ARCHETYPE_NAME)
 .cloned()
@@ -206,7 +215,9 @@ impl From<arrow::datatypes::Field> for ComponentDescriptor {
 .get(FIELD_METADATA_KEY_ARCHETYPE_FIELD_NAME)
 .cloned()
 .map(Into::into),
- component_name: field.name().clone().into(),
- }
+ component_name: field.name().to_string().into(),
+ };
+ descr.sanity_check();
+ descr
 }
 }
diff --git a/crates/store/re_types_core/src/loggable.rs b/crates/store/re_types_core/src/loggable.rs
index 38a9f7d2df7b..11b2d59f4afa 100644
--- a/crates/store/re_types_core/src/loggable.rs
+++ b/crates/store/re_types_core/src/loggable.rs
@@ -131,6 +131,7 @@ re_string_interner::declare_new_type!(
 impl From<ComponentName> for Cow<'static, ComponentDescriptor> {
 #[inline]
 fn from(name: ComponentName) -> Self {
+ name.sanity_check();
 Cow::Owned(ComponentDescriptor::new(name))
 }
 }
@@ -141,16 +142,29 @@ impl From<ComponentName> for Cow<'static, ComponentDescriptor> {
 impl From<&ComponentName> for Cow<'static, ComponentDescriptor> {
 #[inline]
 fn from(name: &ComponentName) -> Self {
+ name.sanity_check();
 Cow::Owned(ComponentDescriptor::new(*name))
 }
 }
 
 impl ComponentName {
+ /// Runs some asserts in debug mode to make sure the name is not weird.
+ #[inline]
+ #[track_caller]
+ pub fn sanity_check(&self) {
+ let full_name = self.0.as_str();
+ debug_assert!(
+ !full_name.starts_with("rerun.components.rerun.components.") || full_name.contains(':'),
+ "DEBUG ASSERT: Found component with full name {full_name:?}. Maybe some bad round-tripping?"
+ );
+ }
+
 /// Returns the fully-qualified name, e.g. `rerun.components.Position2D`.
 ///
 /// This is the default `Display` implementation for [`ComponentName`]. 
#[inline]
 pub fn full_name(&self) -> &'static str {
+ self.sanity_check();
 self.0.as_str()
 }
 
@@ -164,6 +178,7 @@ impl ComponentName {
 /// ```
 #[inline]
 pub fn short_name(&self) -> &'static str {
+ self.sanity_check();
 let full_name = self.0.as_str();
 if let Some(short_name) = full_name.strip_prefix("rerun.blueprint.components.") {
 short_name
diff --git a/rerun_py/rerun_sdk/rerun/_baseclasses.py b/rerun_py/rerun_sdk/rerun/_baseclasses.py
index 891b3f75bd40..43c10db70552 100644
--- a/rerun_py/rerun_sdk/rerun/_baseclasses.py
+++ b/rerun_py/rerun_sdk/rerun/_baseclasses.py
@@ -54,6 +54,10 @@ def __init__(
 archetype_name: str | None = None,
 archetype_field_name: str | None = None,
 ) -> None:
+ assert not component_name.startswith("rerun.components.rerun.components."), f"Bad component name: {component_name}'"
+ if archetype_name is not None:
+ assert not archetype_name.startswith("rerun.archetypes.rerun.archetypes."), f"Bad archetype name '{archetype_name}'"
+
 self.archetype_name = archetype_name
 self.archetype_field_name = archetype_field_name
 self.component_name = component_name
diff --git a/rerun_py/rerun_sdk/rerun/_log.py b/rerun_py/rerun_sdk/rerun/_log.py
index bac403522eb6..4c364b1fa679 100644
--- a/rerun_py/rerun_sdk/rerun/_log.py
+++ b/rerun_py/rerun_sdk/rerun/_log.py
@@ -36,6 +36,7 @@ def __init__(self, archetype_name: str) -> None:
 """
 self.data = pa.nulls(1, type=pa.null())
+ assert not archetype_name.startswith("rerun.archetypes.rerun.archetypes."), f"Bad archetype name '{archetype_name}' in IndicatorComponentBatch"
 self._archetype_name = archetype_name
 
 def component_name(self) -> str:
diff --git a/rerun_py/src/arrow.rs b/rerun_py/src/arrow.rs
index 271beda23cec..c920044ef42d 100644
--- a/rerun_py/src/arrow.rs
+++ b/rerun_py/src/arrow.rs
@@ -44,11 +44,13 @@ pub fn descriptor_to_rust(component_descr: &Bound<'_, PyAny>) -> PyResult<ComponentDescriptor> {
 let component_name: Cow<'_, str> = component_name.extract()?;
 
- Ok(ComponentDescriptor {
+ let descr = ComponentDescriptor {
 archetype_name: archetype_name.map(|s| s.as_ref().into()),
 archetype_field_name: archetype_field_name.map(|s| s.as_ref().into()),
 component_name: component_name.as_ref().into(),
- })
+ };
+ descr.sanity_check();
+ Ok(descr)
 }
 
 /// Perform conversion between a pyarrow array to arrow types. 
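Every assert added in this patch looks for the same signature: a name that is already fully qualified getting qualified a second time somewhere on a round-trip through arrow metadata. A minimal sketch of that failure mode, using a hypothetical `qualify` helper that is not part of this series:

    fn qualify(name: &str) -> String {
        // Buggy: assumes `name` is always a short name such as "Position3D".
        format!("rerun.components.{name}")
    }

    fn main() {
        let already_qualified = "rerun.components.Position3D";
        let round_tripped = qualify(already_qualified);
        // Exactly the prefix pattern the new sanity checks reject:
        assert_eq!(
            round_tripped,
            "rerun.components.rerun.components.Position3D"
        );
    }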
From 0842f842a6881a5abbaedac52af41b781565c38d Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Tue, 11 Feb 2025 09:48:21 +0100
Subject: [PATCH 31/38] Fix full/short name errors

---
 crates/store/re_chunk_store/src/dataframe.rs | 6 +++---
 .../store/re_sorbet/src/data_column_schema.rs | 20 ++++++++++---------
 rerun_py/rerun_sdk/rerun/dataframe.py | 6 +++---
 rerun_py/src/dataframe.rs | 2 +-
 4 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/crates/store/re_chunk_store/src/dataframe.rs b/crates/store/re_chunk_store/src/dataframe.rs
index ab4183412a63..a9b729f1ca6b 100644
--- a/crates/store/re_chunk_store/src/dataframe.rs
+++ b/crates/store/re_chunk_store/src/dataframe.rs
@@ -95,7 +95,7 @@ impl From<ComponentColumnDescriptor> for ComponentColumnSelector {
 fn from(desc: ComponentColumnDescriptor) -> Self {
 Self {
 entity_path: desc.entity_path.clone(),
- component_name: desc.component_name.to_string(),
+ component_name: desc.component_name.short_name().to_owned(),
 }
 }
 }
@@ -106,7 +106,7 @@ impl ComponentColumnSelector {
 pub fn new<C: Component>(entity_path: EntityPath) -> Self {
 Self {
 entity_path,
- component_name: C::name().to_string(),
+ component_name: C::name().short_name().to_owned(),
 }
 }
 
@@ -115,7 +115,7 @@ impl ComponentColumnSelector {
 pub fn new_for_component_name(entity_path: EntityPath, component_name: ComponentName) -> Self {
 Self {
 entity_path,
- component_name: component_name.to_string(),
+ component_name: component_name.short_name().to_owned(),
 }
 }
 }
diff --git a/crates/store/re_sorbet/src/data_column_schema.rs b/crates/store/re_sorbet/src/data_column_schema.rs
index d77dc4add0be..5c7120d50a8d 100644
--- a/crates/store/re_sorbet/src/data_column_schema.rs
+++ b/crates/store/re_sorbet/src/data_column_schema.rs
@@ -245,20 +245,22 @@ impl ComponentColumnDescriptor {
 self.sanity_check();
 
 match batch_type {
- BatchType::Chunk => self.component_name.short_name().to_owned(),
+ BatchType::Chunk => {
+ // All columns are of the same entity
+ self.component_name.short_name().to_owned()
+ }
 BatchType::Dataframe => {
- let entity_path = &self.entity_path;
- let descriptor = ComponentDescriptor {
- archetype_name: self.archetype_name,
- archetype_field_name: self.archetype_field_name,
- component_name: self.component_name,
- };
-
- format!("{}:{}", entity_path, descriptor.component_name.short_name())
+ // Each column can be of a different entity
+ format!("{}:{}", self.entity_path, self.component_name.short_name())
 
 // NOTE: Uncomment this to expose fully-qualified names in the Dataframe APIs!
 // I'm not doing that right now, to avoid breaking changes (and we need to talk about
 // what the syntax for these fully-qualified paths need to look like first). 
+ // let descriptor = ComponentDescriptor { + // archetype_name: self.archetype_name, + // archetype_field_name: self.archetype_field_name, + // component_name: self.component_name, + // }; // format!("{entity_path}@{}", descriptor.short_name()) } } diff --git a/rerun_py/rerun_sdk/rerun/dataframe.py b/rerun_py/rerun_sdk/rerun/dataframe.py index 89c72647e21d..7e16b27b2832 100644 --- a/rerun_py/rerun_sdk/rerun/dataframe.py +++ b/rerun_py/rerun_sdk/rerun/dataframe.py @@ -61,9 +61,9 @@ def __init__(self, metadata: dict[bytes, bytes], col: pa.Array): def component_descriptor(self) -> ComponentDescriptor: kwargs = {} if SORBET_ARCHETYPE_NAME in self.metadata: - kwargs["archetype_name"] = "rerun.archetypes" + self.metadata[SORBET_ARCHETYPE_NAME].decode("utf-8") + kwargs["archetype_name"] = self.metadata[SORBET_ARCHETYPE_NAME].decode("utf-8") if SORBET_COMPONENT_NAME in self.metadata: - kwargs["component_name"] = "rerun.components." + self.metadata[SORBET_COMPONENT_NAME].decode("utf-8") + kwargs["component_name"] = self.metadata[SORBET_COMPONENT_NAME].decode("utf-8") if SORBET_ARCHETYPE_FIELD in self.metadata: kwargs["archetype_field_name"] = self.metadata[SORBET_ARCHETYPE_FIELD].decode("utf-8") @@ -105,7 +105,7 @@ def send_record_batch(batch: pa.RecordBatch, rec: Optional[RecordingStream] = No archetypes[entity_path].add(metadata[SORBET_ARCHETYPE_NAME].decode("utf-8")) for entity_path, archetype_set in archetypes.items(): for archetype in archetype_set: - data[entity_path].append(IndicatorComponentBatch("rerun.archetypes." + archetype)) + data[entity_path].append(IndicatorComponentBatch(archetype)) for entity_path, columns in data.items(): send_columns( diff --git a/rerun_py/src/dataframe.rs b/rerun_py/src/dataframe.rs index 9d03268e5519..5b8f4c94d792 100644 --- a/rerun_py/src/dataframe.rs +++ b/rerun_py/src/dataframe.rs @@ -188,7 +188,7 @@ impl PyComponentColumnDescriptor { /// This property is read-only. #[getter] fn component_name(&self) -> &str { - &self.0.component_name + self.0.component_name.full_name() } /// Whether the column is static. From 861e496287de4a80046e64adcb2eaa589df4c262 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Tue, 11 Feb 2025 09:48:38 +0100 Subject: [PATCH 32/38] Add `#[track_caller]` to sanity checks --- crates/store/re_types/src/archetypes/mesh3d_ext.rs | 1 + crates/store/re_types/src/components/view_coordinates_ext.rs | 1 + crates/viewer/re_renderer/src/mesh.rs | 1 + 3 files changed, 3 insertions(+) diff --git a/crates/store/re_types/src/archetypes/mesh3d_ext.rs b/crates/store/re_types/src/archetypes/mesh3d_ext.rs index 853a8e3676d7..70c58c392c0b 100644 --- a/crates/store/re_types/src/archetypes/mesh3d_ext.rs +++ b/crates/store/re_types/src/archetypes/mesh3d_ext.rs @@ -52,6 +52,7 @@ impl Mesh3D { /// and that we have the same number of positions and normals (if any). /// /// Only use this when logging a whole new mesh. Not meaningful for field updates! + #[track_caller] pub fn sanity_check(&self) -> Result<(), Mesh3DError> { let num_vertices = self.num_vertices(); diff --git a/crates/store/re_types/src/components/view_coordinates_ext.rs b/crates/store/re_types/src/components/view_coordinates_ext.rs index f5eaa0e1a3e0..eb8629208b23 100644 --- a/crates/store/re_types/src/components/view_coordinates_ext.rs +++ b/crates/store/re_types/src/components/view_coordinates_ext.rs @@ -37,6 +37,7 @@ impl ViewCoordinates { } /// Returns an error if this does not span all three dimensions. 
+ #[track_caller] pub fn sanity_check(&self) -> Result<(), String> { let mut dims = [false; 3]; for dir in *self.0 { diff --git a/crates/viewer/re_renderer/src/mesh.rs b/crates/viewer/re_renderer/src/mesh.rs index fb61b7d8ca3c..e2e78f086154 100644 --- a/crates/viewer/re_renderer/src/mesh.rs +++ b/crates/viewer/re_renderer/src/mesh.rs @@ -71,6 +71,7 @@ pub struct CpuMesh { } impl CpuMesh { + #[track_caller] pub fn sanity_check(&self) -> Result<(), MeshError> { re_tracing::profile_function!(); From b094db79731ec969684e1ce3280f3fdc5ca061b7 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Tue, 11 Feb 2025 09:50:35 +0100 Subject: [PATCH 33/38] py-fmt --- rerun_py/rerun_sdk/rerun/_baseclasses.py | 8 ++++++-- rerun_py/rerun_sdk/rerun/_log.py | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/rerun_py/rerun_sdk/rerun/_baseclasses.py b/rerun_py/rerun_sdk/rerun/_baseclasses.py index 43c10db70552..030038781bd3 100644 --- a/rerun_py/rerun_sdk/rerun/_baseclasses.py +++ b/rerun_py/rerun_sdk/rerun/_baseclasses.py @@ -54,9 +54,13 @@ def __init__( archetype_name: str | None = None, archetype_field_name: str | None = None, ) -> None: - assert not component_name.startswith("rerun.components.rerun.components."), f"Bad component name: {component_name}'" + assert not component_name.startswith( + "rerun.components.rerun.components." + ), f"Bad component name: {component_name}'" if archetype_name is not None: - assert not archetype_name.startswith("rerun.archetypes.rerun.archetypes."), f"Bad archetype name '{archetype_name}'" + assert not archetype_name.startswith( + "rerun.archetypes.rerun.archetypes." + ), f"Bad archetype name '{archetype_name}'" self.archetype_name = archetype_name self.archetype_field_name = archetype_field_name diff --git a/rerun_py/rerun_sdk/rerun/_log.py b/rerun_py/rerun_sdk/rerun/_log.py index 4c364b1fa679..aef830373cb9 100644 --- a/rerun_py/rerun_sdk/rerun/_log.py +++ b/rerun_py/rerun_sdk/rerun/_log.py @@ -36,7 +36,9 @@ def __init__(self, archetype_name: str) -> None: """ self.data = pa.nulls(1, type=pa.null()) - assert not archetype_name.startswith("rerun.archetypes.rerun.archetypes."), f"Bad archetype name '{archetype_name}' in IndicatorComponentBatch" + assert not archetype_name.startswith( + "rerun.archetypes.rerun.archetypes." + ), f"Bad archetype name '{archetype_name}' in IndicatorComponentBatch" self._archetype_name = archetype_name def component_name(self) -> str: From f9df88e6bffca29a01520586533dddb5cdc7e923 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Tue, 11 Feb 2025 10:06:39 +0100 Subject: [PATCH 34/38] Fix bad sanity checks --- crates/store/re_types_core/src/archetype.rs | 2 +- crates/store/re_types_core/src/loggable.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/store/re_types_core/src/archetype.rs b/crates/store/re_types_core/src/archetype.rs index a6f39d8a6ef2..429d7fa69ff9 100644 --- a/crates/store/re_types_core/src/archetype.rs +++ b/crates/store/re_types_core/src/archetype.rs @@ -145,7 +145,7 @@ impl ArchetypeName { pub fn sanity_check(&self) { let full_name = self.0.as_str(); debug_assert!( - !full_name.starts_with("rerun.archetypes.rerun.archetypes.") || full_name.contains(':'), + !full_name.starts_with("rerun.archetypes.rerun.archetypes.") && !full_name.contains(':'), "DEBUG ASSERT: Found archetype with full name {full_name:?}. Maybe some bad round-tripping?" 
);
 }
 
diff --git a/crates/store/re_types_core/src/loggable.rs b/crates/store/re_types_core/src/loggable.rs
index 11b2d59f4afa..d4be8a487f92 100644
--- a/crates/store/re_types_core/src/loggable.rs
+++ b/crates/store/re_types_core/src/loggable.rs
@@ -154,7 +154,7 @@ impl ComponentName {
 pub fn sanity_check(&self) {
 let full_name = self.0.as_str();
 debug_assert!(
- !full_name.starts_with("rerun.components.rerun.components.") || full_name.contains(':'),
+ !full_name.starts_with("rerun.components.rerun.components.") && !full_name.contains(':'),
 "DEBUG ASSERT: Found component with full name {full_name:?}. Maybe some bad round-tripping?"
 );
 }

From 39ecc473fb1960b9cabdac25bfcffa8fa4ea1d34 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Tue, 11 Feb 2025 10:06:50 +0100
Subject: [PATCH 35/38] Explain some TODOs

---
 crates/store/re_chunk/src/transport.rs | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/crates/store/re_chunk/src/transport.rs b/crates/store/re_chunk/src/transport.rs
index 0dda84d32656..9cc3623329d5 100644
--- a/crates/store/re_chunk/src/transport.rs
+++ b/crates/store/re_chunk/src/transport.rs
@@ -100,10 +100,13 @@ impl Chunk {
 archetype_field_name,
 component_name,
 
- is_static: false, // TODO(#8744): figure out what to do here
- is_indicator: false, // TODO(#8744): figure out what to do here
- is_tombstone: false, // TODO(#8744): figure out what to do here
- is_semantically_empty: false, // TODO(#8744): figure out what to do here
+ // These are a consequence of using `ComponentColumnDescriptor` both for chunk batches and dataframe batches.
+ // Setting them all to `false` at least ensures they aren't written to the arrow metadata:
+ // TODO(#8744): figure out what to do here
+ is_static: false,
+ is_indicator: false,
+ is_tombstone: false,
+ is_semantically_empty: false,
 };
 (schema, into_arrow_ref(list_array))
 })

From 1c13ee4a2706e34d6375cddb53b23f087f4aa390 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Tue, 11 Feb 2025 14:14:21 +0100
Subject: [PATCH 36/38] Add a TODO

---
 crates/store/re_sorbet/src/chunk_batch.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs
index 14bafe07c046..f3943ab45329 100644
--- a/crates/store/re_sorbet/src/chunk_batch.rs
+++ b/crates/store/re_sorbet/src/chunk_batch.rs
@@ -299,6 +299,7 @@ fn make_all_data_columns_list_arrays(batch: &ArrowRecordBatch) -> ArrowRecordBat
 sliced.push(data.clone().slice(idx, 1));
 }
 
+ // TODO(teh-cmc): we can do something faster/simpler here; see https://github.com/rerun-io/rerun/pull/8945#discussion_r1950689060
 let data_arrays = sliced.iter().map(|e| Some(e.as_ref())).collect::<Vec<_>>();
 #[allow(clippy::unwrap_used)] // we know we've given the right field type
 let list_array: arrow::array::ListArray =

From 163f7fb373da9c76507c2a5194471661747f84d3 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Tue, 11 Feb 2025 14:28:16 +0100
Subject: [PATCH 37/38] Clean up the list-array-wrapping code

---
 crates/store/re_sorbet/src/chunk_batch.rs | 85 +++++++++++------------
 1 file changed, 42 insertions(+), 43 deletions(-)

diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs
index f3943ab45329..05a322ba9481 100644
--- a/crates/store/re_sorbet/src/chunk_batch.rs
+++ b/crates/store/re_sorbet/src/chunk_batch.rs
@@ -2,8 +2,8 @@ use std::sync::Arc;
 
 use arrow::{
 array::{
- Array as _, ArrayRef as ArrowArrayRef, AsArray, RecordBatch as ArrowRecordBatch,
- RecordBatchOptions, StructArray as ArrowStructArray,
+ Array as ArrowArray, ArrayRef as ArrowArrayRef, AsArray, ListArray as ArrowListArray,
+ RecordBatch as ArrowRecordBatch, RecordBatchOptions, StructArray as ArrowStructArray,
 },
 datatypes::{
 DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef,
@@ -269,51 +269,19 @@ fn make_all_data_columns_list_arrays(batch: &ArrowRecordBatch) -> ArrowRecordBat
 let mut fields: Vec<ArrowFieldRef> = Vec::with_capacity(num_columns);
 let mut columns: Vec<ArrowArrayRef> = Vec::with_capacity(num_columns);
 
- for (field, data) in itertools::izip!(batch.schema().fields(), batch.columns()) {
- if data
- .downcast_array_ref::<arrow::array::ListArray>()
- .is_some()
- {
- // Already fine
- fields.push(field.clone());
- columns.push(data.clone());
- } else if field
+ for (field, array) in itertools::izip!(batch.schema().fields(), batch.columns()) {
+ let is_list_array = array.downcast_array_ref::<ArrowListArray>().is_some();
+ let is_data_column = field
 .metadata()
 .get("rerun.kind")
- .is_some_and(|kind| kind == "data")
- {
- // We slice each column array into individual arrays and then convert the whole lot into a ListArray
-
- let data_field_inner =
- ArrowField::new("item", field.data_type().clone(), true /* nullable */);
-
- let data_field = ArrowField::new(
- field.name().clone(),
- ArrowDataType::List(Arc::new(data_field_inner.clone())),
- false, /* not nullable */
- )
- .with_metadata(field.metadata().clone());
-
- let mut sliced: Vec<ArrowArrayRef> = Vec::new();
- for idx in 0..data.len() {
- sliced.push(data.clone().slice(idx, 1));
- }
-
- // TODO(teh-cmc): we can do something faster/simpler here; see https://github.com/rerun-io/rerun/pull/8945#discussion_r1950689060
- let data_arrays = sliced.iter().map(|e| Some(e.as_ref())).collect::<Vec<_>>();
- #[allow(clippy::unwrap_used)] // we know we've given the right field type
- let list_array: arrow::array::ListArray =
- re_arrow_util::arrow_util::arrays_to_list_array(
- data_field_inner.data_type().clone(),
- &data_arrays,
- )
- .unwrap();
-
- fields.push(data_field.into());
- columns.push(into_arrow_ref(list_array));
+ .is_some_and(|kind| kind == "data");
+
+ if is_data_column && !is_list_array {
+ let (field, array) = wrap_in_list_array(field, array);
+ fields.push(field.into());
+ columns.push(into_arrow_ref(array));
 } else {
 fields.push(field.clone());
 columns.push(array.clone());
 }
 }
 
@@ -326,3 +294,34 @@ fn make_all_data_columns_list_arrays(batch: &ArrowRecordBatch) -> ArrowRecordBat
 .expect("Can't fail")
 }
+
+// TODO(teh-cmc): we can do something faster/simpler here; see https://github.com/rerun-io/rerun/pull/8945#discussion_r1950689060
+fn wrap_in_list_array(field: &ArrowField, data: &dyn ArrowArray) -> (ArrowField, ArrowListArray) {
+ re_tracing::profile_function!();
+
+ // We slice each column array into individual arrays and then convert the whole lot into a ListArray
+
+ let data_field_inner =
+ ArrowField::new("item", field.data_type().clone(), true /* nullable */);
+
+ let data_field = ArrowField::new(
+ field.name().clone(),
+ ArrowDataType::List(Arc::new(data_field_inner.clone())),
+ false, /* not nullable */
+ )
+ .with_metadata(field.metadata().clone());
+
+ let mut sliced: Vec<ArrowArrayRef> = Vec::new();
+ for idx in 0..data.len() {
+ sliced.push(data.slice(idx, 1));
+ }
+
+ let data_arrays = sliced.iter().map(|e| Some(e.as_ref())).collect::<Vec<_>>();
+ #[allow(clippy::unwrap_used)] // we know we've given the right field type
+ let list_array: ArrowListArray = re_arrow_util::arrow_util::arrays_to_list_array(
+ data_field_inner.data_type().clone(),
+ &data_arrays,
+ )
+ .unwrap();
+
+ (data_field, list_array)
+}

From 8282fc56be042d50dd048421418efca396d87270 Mon Sep 17 00:00:00 2001
From: Emil Ernerfeldt
Date: Tue, 11 Feb 2025 14:33:45 +0100
Subject: [PATCH 38/38] Fix TODO-formatting

---
 crates/store/re_sorbet/src/chunk_batch.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs
index 05a322ba9481..47a8feb71d3b 100644
--- a/crates/store/re_sorbet/src/chunk_batch.rs
+++ b/crates/store/re_sorbet/src/chunk_batch.rs
@@ -295,7 +295,7 @@ fn make_all_data_columns_list_arrays(batch: &ArrowRecordBatch) -> ArrowRecordBat
 .expect("Can't fail")
 }
 
-// TODO(teh-cmc): we can do something faster/simpler here; see https://github.com/rerun-io/rerun/pull/8945#discussion_r1950689060
+// TODO(cmc): we can do something faster/simpler here; see https://github.com/rerun-io/rerun/pull/8945#discussion_r1950689060
 fn wrap_in_list_array(field: &ArrowField, data: &dyn ArrowArray) -> (ArrowField, ArrowListArray) {
 re_tracing::profile_function!();
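The TODO above points at a faster/simpler alternative to the per-row slicing in `wrap_in_list_array`: since every row becomes a single-element list, a `ListArray` can be built directly over the original array with a unit-length offset buffer. A rough sketch of that idea against plain arrow-rs APIs; this is an assumption about the direction of the linked discussion, not the suggestion itself:

    use std::sync::Arc;

    use arrow::array::{Array, ArrayRef, Int32Array, ListArray};
    use arrow::buffer::OffsetBuffer;
    use arrow::datatypes::Field;

    /// Wraps every row of `values` in its own single-element list,
    /// without slicing the input row by row.
    fn wrap_each_row_in_list(values: ArrayRef) -> ListArray {
        let field = Arc::new(Field::new("item", values.data_type().clone(), true));
        // Offsets [0, 1, 2, ..., len] yield exactly one child value per list entry.
        let offsets = OffsetBuffer::from_lengths(std::iter::repeat(1).take(values.len()));
        ListArray::new(field, offsets, values, None)
    }

    fn main() {
        let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let list = wrap_each_row_in_list(values);
        assert_eq!(list.len(), 3); // three lists, one element each
    }

The child values buffer is shared rather than copied, so this stays O(1) in the data size instead of O(rows) allocations.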