From 884c4f145ec752447f7cb6f8a073cc2071ebac8d Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 8 Jan 2025 18:03:39 -0500 Subject: [PATCH] impl FromAvro/ToAvro for all encodings --- Cargo.lock | 10 +- encodings/bytebool/Cargo.toml | 1 + encodings/bytebool/src/array.rs | 4 +- encodings/datetime-parts/Cargo.toml | 1 + encodings/datetime-parts/src/array.rs | 4 +- encodings/dict/Cargo.toml | 1 + encodings/dict/src/array.rs | 12 +- encodings/fastlanes/Cargo.toml | 1 + encodings/fastlanes/src/bitpacking/mod.rs | 4 +- encodings/fastlanes/src/delta/mod.rs | 3 +- encodings/fastlanes/src/for/mod.rs | 4 +- encodings/fsst/Cargo.toml | 1 + encodings/fsst/src/array.rs | 13 +- encodings/roaring/Cargo.toml | 1 + encodings/roaring/src/boolean/mod.rs | 3 +- encodings/roaring/src/integer/mod.rs | 4 +- encodings/runend-bool/Cargo.toml | 1 + encodings/runend-bool/src/array.rs | 19 +-- encodings/runend/Cargo.toml | 1 + encodings/runend/src/array.rs | 11 +- encodings/zigzag/Cargo.toml | 1 + vortex-array/Cargo.toml | 1 - vortex-array/src/array/sparse/mod.rs | 6 +- vortex-avro-derive/src/fromavro.rs | 8 +- vortex-avro-derive/src/schema.rs | 8 +- vortex-avro-derive/src/toavro.rs | 8 +- vortex-avro/src/array.rs | 14 +-- vortex-avro/src/lib.rs | 144 ++++++++++++---------- vortex-avro/src/option.rs | 22 ++-- vortex-avro/src/prim.rs | 38 +++--- vortex-avro/src/string.rs | 9 +- vortex-avro/src/vec.rs | 51 ++++++++ vortex-dtype/src/nullability.rs | 6 +- vortex-scalar/src/pvalue.rs | 49 +++++++- 34 files changed, 301 insertions(+), 163 deletions(-) create mode 100644 vortex-avro/src/vec.rs diff --git a/Cargo.lock b/Cargo.lock index fd8ac87fb9..9c337def74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4987,7 +4987,6 @@ dependencies = [ name = "vortex-array" version = "0.21.1" dependencies = [ - "apache-avro", "arbitrary", "arrow-arith", "arrow-array", @@ -5068,6 +5067,7 @@ dependencies = [ "num-traits", "serde", "vortex-array", + "vortex-avro", "vortex-buffer", "vortex-dtype", "vortex-error", @@ -5125,6 +5125,7 @@ version = "0.21.1" dependencies = [ "serde", "vortex-array", + "vortex-avro", "vortex-buffer", "vortex-datetime-dtype", "vortex-dtype", @@ -5143,6 +5144,7 @@ dependencies = [ "rand", "serde", "vortex-array", + "vortex-avro", "vortex-buffer", "vortex-dtype", "vortex-error", @@ -5217,6 +5219,7 @@ dependencies = [ "rand", "serde", "vortex-array", + "vortex-avro", "vortex-buffer", "vortex-dtype", "vortex-error", @@ -5268,6 +5271,7 @@ dependencies = [ "fsst-rs", "serde", "vortex-array", + "vortex-avro", "vortex-buffer", "vortex-dtype", "vortex-error", @@ -5361,6 +5365,7 @@ dependencies = [ "num-traits", "serde", "vortex-array", + "vortex-avro", "vortex-buffer", "vortex-dtype", "vortex-error", @@ -5376,6 +5381,7 @@ dependencies = [ "num-traits", "serde", "vortex-array", + "vortex-avro", "vortex-buffer", "vortex-dtype", "vortex-error", @@ -5394,6 +5400,7 @@ dependencies = [ "rstest", "serde", "vortex-array", + "vortex-avro", "vortex-buffer", "vortex-dtype", "vortex-error", @@ -5471,6 +5478,7 @@ version = "0.21.1" dependencies = [ "serde", "vortex-array", + "vortex-avro", "vortex-buffer", "vortex-dtype", "vortex-error", diff --git a/encodings/bytebool/Cargo.toml b/encodings/bytebool/Cargo.toml index a82d47afc8..419e9f8064 100644 --- a/encodings/bytebool/Cargo.toml +++ b/encodings/bytebool/Cargo.toml @@ -21,6 +21,7 @@ arrow-buffer = { workspace = true } num-traits = { workspace = true } serde = { workspace = true, features = ["derive"] } vortex-array = { workspace = true } +vortex-avro = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } diff --git a/encodings/bytebool/src/array.rs b/encodings/bytebool/src/array.rs index 12a9af7750..1324e6f2f8 100644 --- a/encodings/bytebool/src/array.rs +++ b/encodings/bytebool/src/array.rs @@ -10,13 +10,13 @@ use vortex_array::validity::{LogicalValidity, Validity, ValidityMetadata, Validi use vortex_array::variants::{BoolArrayTrait, VariantsVTable}; use vortex_array::visitor::{ArrayVisitor, VisitorVTable}; use vortex_array::{impl_encoding, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoCanonical}; +use vortex_avro::{FromAvro, ToAvro}; use vortex_buffer::ByteBuffer; use vortex_dtype::DType; use vortex_error::{VortexExpect as _, VortexResult}; - impl_encoding!("vortex.bytebool", ids::BYTE_BOOL, ByteBool); -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize, FromAvro, ToAvro)] pub struct ByteBoolMetadata { validity: ValidityMetadata, } diff --git a/encodings/datetime-parts/Cargo.toml b/encodings/datetime-parts/Cargo.toml index 04530e0482..8d73f3bbe7 100644 --- a/encodings/datetime-parts/Cargo.toml +++ b/encodings/datetime-parts/Cargo.toml @@ -19,6 +19,7 @@ workspace = true [dependencies] serde = { workspace = true, features = ["derive"] } vortex-array = { workspace = true } +vortex-avro = { workspace = true } vortex-buffer = { workspace = true } vortex-datetime-dtype = { workspace = true } vortex-dtype = { workspace = true } diff --git a/encodings/datetime-parts/src/array.rs b/encodings/datetime-parts/src/array.rs index 9ee15a1caa..bdf0426508 100644 --- a/encodings/datetime-parts/src/array.rs +++ b/encodings/datetime-parts/src/array.rs @@ -12,14 +12,14 @@ use vortex_array::{ impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData, IntoCanonical, }; +use vortex_avro::{FromAvro, ToAvro}; use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult, VortexUnwrap}; use crate::compute::decode_to_temporal; - impl_encoding!("vortex.datetimeparts", ids::DATE_TIME_PARTS, DateTimeParts); -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize, FromAvro, ToAvro)] pub struct DateTimePartsMetadata { // Validity lives in the days array // TODO(ngates): we should actually model this with a Tuple array when we have one. diff --git a/encodings/dict/Cargo.toml b/encodings/dict/Cargo.toml index 89924fea65..e586e6d145 100644 --- a/encodings/dict/Cargo.toml +++ b/encodings/dict/Cargo.toml @@ -19,6 +19,7 @@ hashbrown = { workspace = true } num-traits = { workspace = true } serde = { workspace = true } vortex-array = { workspace = true } +vortex-avro = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index 1858409952..4dc30b170e 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -13,15 +13,15 @@ use vortex_array::{ impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData, IntoArrayVariant, IntoCanonical, }; +use vortex_avro::{FromAvro, ToAvro}; use vortex_dtype::{match_each_integer_ptype, DType, PType}; use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult}; - impl_encoding!("vortex.dict", ids::DICT, Dict); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct DictMetadata { codes_ptype: PType, - values_len: usize, + values_len: u64, } impl Display for DictMetadata { @@ -41,7 +41,7 @@ impl DictArray { DictMetadata { codes_ptype: PType::try_from(codes.dtype()) .vortex_expect("codes dtype must be uint"), - values_len: values.len(), + values_len: values.len() as u64, }, [codes, values].into(), StatsSet::default(), @@ -58,7 +58,7 @@ impl DictArray { #[inline] pub fn values(&self) -> ArrayData { self.as_ref() - .child(1, self.dtype(), self.metadata().values_len) + .child(1, self.dtype(), self.metadata().values_len as usize) .vortex_expect("DictArray is missing its values child array") } } @@ -135,7 +135,7 @@ mod test { "dict.metadata", DictMetadata { codes_ptype: PType::U64, - values_len: usize::MAX, + values_len: u64::MAX, }, ); } diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 4eab5d61c3..32b793957b 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -24,6 +24,7 @@ itertools = { workspace = true } num-traits = { workspace = true } serde = { workspace = true } vortex-array = { workspace = true } +vortex-avro = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } diff --git a/encodings/fastlanes/src/bitpacking/mod.rs b/encodings/fastlanes/src/bitpacking/mod.rs index 4be720ad0d..a34d1cf422 100644 --- a/encodings/fastlanes/src/bitpacking/mod.rs +++ b/encodings/fastlanes/src/bitpacking/mod.rs @@ -14,16 +14,16 @@ use vortex_array::visitor::{ArrayVisitor, VisitorVTable}; use vortex_array::{ impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoCanonical, }; +use vortex_avro::{FromAvro, ToAvro}; use vortex_buffer::ByteBuffer; use vortex_dtype::{DType, NativePType, PType}; use vortex_error::{vortex_bail, vortex_err, VortexExpect as _, VortexResult}; - mod compress; mod compute; impl_encoding!("fastlanes.bitpacked", ids::FL_BITPACKED, BitPacked); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct BitPackedMetadata { validity: ValidityMetadata, bit_width: u8, diff --git a/encodings/fastlanes/src/delta/mod.rs b/encodings/fastlanes/src/delta/mod.rs index 1f6a871753..1f62c80377 100644 --- a/encodings/fastlanes/src/delta/mod.rs +++ b/encodings/fastlanes/src/delta/mod.rs @@ -12,6 +12,7 @@ use vortex_array::{ impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData, IntoCanonical, }; +use vortex_avro::{FromAvro, ToAvro}; use vortex_buffer::Buffer; use vortex_dtype::{match_each_unsigned_integer_ptype, NativePType}; use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult}; @@ -21,7 +22,7 @@ mod compute; impl_encoding!("fastlanes.delta", ids::FL_DELTA, Delta); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct DeltaMetadata { validity: ValidityMetadata, deltas_len: u64, diff --git a/encodings/fastlanes/src/for/mod.rs b/encodings/fastlanes/src/for/mod.rs index c3d750c85a..c9f742db35 100644 --- a/encodings/fastlanes/src/for/mod.rs +++ b/encodings/fastlanes/src/for/mod.rs @@ -10,16 +10,16 @@ use vortex_array::visitor::{ArrayVisitor, VisitorVTable}; use vortex_array::{ impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoCanonical, }; +use vortex_avro::{FromAvro, ToAvro}; use vortex_dtype::DType; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; use vortex_scalar::{PValue, Scalar}; - mod compress; mod compute; impl_encoding!("fastlanes.for", ids::FL_FOR, FoR); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct FoRMetadata { reference: PValue, shift: u8, diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 7e4d9e113d..8efc8da82f 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -22,6 +22,7 @@ fsst-rs = { workspace = true } serde = { workspace = true } vortex-array = { workspace = true } +vortex-avro = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } diff --git a/encodings/fsst/src/array.rs b/encodings/fsst/src/array.rs index b2e671f1d7..d4faa9ebc5 100644 --- a/encodings/fsst/src/array.rs +++ b/encodings/fsst/src/array.rs @@ -10,6 +10,7 @@ use vortex_array::validity::{ArrayValidity, LogicalValidity, Validity, ValidityV use vortex_array::variants::{BinaryArrayTrait, Utf8ArrayTrait, VariantsVTable}; use vortex_array::visitor::{ArrayVisitor, VisitorVTable}; use vortex_array::{impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, IntoCanonical}; +use vortex_avro::{FromAvro, ToAvro}; use vortex_dtype::{DType, Nullability, PType}; use vortex_error::{vortex_bail, VortexExpect, VortexResult}; @@ -18,9 +19,9 @@ impl_encoding!("vortex.fsst", ids::FSST, FSST); static SYMBOLS_DTYPE: DType = DType::Primitive(PType::U64, Nullability::NonNullable); static SYMBOL_LENS_DTYPE: DType = DType::Primitive(PType::U8, Nullability::NonNullable); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct FSSTMetadata { - symbols_len: usize, + symbols_len: u64, codes_nullability: Nullability, uncompressed_lengths_ptype: PType, } @@ -95,7 +96,7 @@ impl FSSTArray { dtype, len, FSSTMetadata { - symbols_len, + symbols_len: symbols_len as u64, codes_nullability, uncompressed_lengths_ptype, }, @@ -107,14 +108,14 @@ impl FSSTArray { /// Access the symbol table array pub fn symbols(&self) -> ArrayData { self.as_ref() - .child(0, &SYMBOLS_DTYPE, self.metadata().symbols_len) + .child(0, &SYMBOLS_DTYPE, self.metadata().symbols_len as usize) .vortex_expect("FSSTArray symbols child") } /// Access the symbol table array pub fn symbol_lengths(&self) -> ArrayData { self.as_ref() - .child(1, &SYMBOL_LENS_DTYPE, self.metadata().symbols_len) + .child(1, &SYMBOL_LENS_DTYPE, self.metadata().symbols_len as usize) .vortex_expect("FSSTArray symbol_lengths child") } @@ -239,7 +240,7 @@ mod test { check_metadata( "fsst.metadata", FSSTMetadata { - symbols_len: usize::MAX, + symbols_len: u64::MAX, codes_nullability: Nullability::Nullable, uncompressed_lengths_ptype: PType::U64, }, diff --git a/encodings/roaring/Cargo.toml b/encodings/roaring/Cargo.toml index ed3c5c498a..a60e2595a4 100644 --- a/encodings/roaring/Cargo.toml +++ b/encodings/roaring/Cargo.toml @@ -19,6 +19,7 @@ croaring = { workspace = true } num-traits = { workspace = true } serde = { workspace = true } vortex-array = { workspace = true } +vortex-avro = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } diff --git a/encodings/roaring/src/boolean/mod.rs b/encodings/roaring/src/boolean/mod.rs index e3c114bece..ae7267d8a7 100644 --- a/encodings/roaring/src/boolean/mod.rs +++ b/encodings/roaring/src/boolean/mod.rs @@ -15,6 +15,7 @@ use vortex_array::visitor::{ArrayVisitor, VisitorVTable}; use vortex_array::{ impl_encoding, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData, IntoCanonical, }; +use vortex_avro::{FromAvro, ToAvro}; use vortex_buffer::ByteBuffer; use vortex_dtype::{DType, Nullability}; use vortex_error::{vortex_bail, vortex_err, VortexExpect as _, VortexResult}; @@ -25,7 +26,7 @@ mod stats; impl_encoding!("vortex.roaring_bool", ids::ROARING_BOOL, RoaringBool); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct RoaringBoolMetadata; impl Display for RoaringBoolMetadata { diff --git a/encodings/roaring/src/integer/mod.rs b/encodings/roaring/src/integer/mod.rs index de8489db2d..e94991dfe5 100644 --- a/encodings/roaring/src/integer/mod.rs +++ b/encodings/roaring/src/integer/mod.rs @@ -15,17 +15,17 @@ use vortex_array::{ impl_encoding, ArrayDType as _, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData, IntoCanonical, }; +use vortex_avro::{FromAvro, ToAvro}; use vortex_buffer::{Buffer, ByteBuffer}; use vortex_dtype::Nullability::NonNullable; use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; - mod compress; mod compute; impl_encoding!("vortex.roaring_int", ids::ROARING_INT, RoaringInt); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct RoaringIntMetadata { ptype: PType, } diff --git a/encodings/runend-bool/Cargo.toml b/encodings/runend-bool/Cargo.toml index d41945a7f6..85b52ca1be 100644 --- a/encodings/runend-bool/Cargo.toml +++ b/encodings/runend-bool/Cargo.toml @@ -19,6 +19,7 @@ itertools = { workspace = true } num-traits = { workspace = true } serde = { workspace = true } vortex-array = { workspace = true } +vortex-avro = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } diff --git a/encodings/runend-bool/src/array.rs b/encodings/runend-bool/src/array.rs index d79e635681..b578ecf217 100644 --- a/encodings/runend-bool/src/array.rs +++ b/encodings/runend-bool/src/array.rs @@ -12,6 +12,7 @@ use vortex_array::{ impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData, IntoArrayVariant, IntoCanonical, }; +use vortex_avro::{FromAvro, ToAvro}; use vortex_dtype::{match_each_integer_ptype, match_each_unsigned_integer_ptype, DType, PType}; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; use vortex_scalar::Scalar; @@ -20,13 +21,13 @@ use crate::compress::{runend_bool_decode_slice, runend_bool_encode_slice, trimme impl_encoding!("vortex.runendbool", ids::RUN_END_BOOL, RunEndBool); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct RunEndBoolMetadata { start: bool, validity: ValidityMetadata, ends_ptype: PType, - num_runs: usize, - offset: usize, + num_runs: u64, + offset: u64, } impl Display for RunEndBoolMetadata { @@ -77,8 +78,8 @@ impl RunEndBoolArray { start, validity: validity.to_metadata(length)?, ends_ptype, - num_runs: ends.len(), - offset, + num_runs: ends.len() as u64, + offset: offset as u64, }; let stats = if matches!(validity, Validity::AllValid | Validity::NonNullable) { @@ -118,7 +119,7 @@ impl RunEndBoolArray { #[inline] pub(crate) fn offset(&self) -> usize { - self.metadata().offset + self.metadata().offset as usize } #[inline] @@ -132,7 +133,7 @@ impl RunEndBoolArray { .child( 0, &self.metadata().ends_ptype.into(), - self.metadata().num_runs, + self.metadata().num_runs as usize, ) .vortex_expect("RunEndBoolArray is missing its run ends") } @@ -267,9 +268,9 @@ mod test { check_metadata( "runend_bool.metadata", RunEndBoolMetadata { - num_runs: usize::MAX, + num_runs: u64::MAX, ends_ptype: PType::U64, - offset: usize::MAX, + offset: u64::MAX, validity: ValidityMetadata::AllValid, start: true, }, diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml index 2209c1fb76..adb6907f13 100644 --- a/encodings/runend/Cargo.toml +++ b/encodings/runend/Cargo.toml @@ -19,6 +19,7 @@ itertools = { workspace = true } num-traits = { workspace = true } serde = { workspace = true } vortex-array = { workspace = true } +vortex-avro = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index d714c9dbc0..1fcb398d10 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -14,6 +14,7 @@ use vortex_array::{ impl_encoding, ArrayDType, ArrayData, ArrayLen, ArrayTrait, Canonical, IntoArrayData, IntoArrayVariant, IntoCanonical, }; +use vortex_avro::{FromAvro, ToAvro}; use vortex_buffer::Buffer; use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; @@ -23,11 +24,11 @@ use crate::compress::{runend_decode_bools, runend_decode_primitive, runend_encod impl_encoding!("vortex.runend", ids::RUN_END, RunEnd); -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct RunEndMetadata { ends_ptype: PType, - num_runs: usize, - offset: usize, + num_runs: u64, + offset: u64, } impl Display for RunEndMetadata { @@ -76,8 +77,8 @@ impl RunEndArray { let dtype = values.dtype().clone(); let metadata = RunEndMetadata { ends_ptype: PType::try_from(ends.dtype())?, - num_runs: ends.len(), - offset, + num_runs: ends.len() as u64, + offset: offset as u64, }; Self::try_from_parts( diff --git a/encodings/zigzag/Cargo.toml b/encodings/zigzag/Cargo.toml index a35f671a85..3d8ca0935d 100644 --- a/encodings/zigzag/Cargo.toml +++ b/encodings/zigzag/Cargo.toml @@ -16,6 +16,7 @@ readme = { workspace = true } [dependencies] serde = { workspace = true, features = ["derive"] } vortex-array = { workspace = true } +vortex-avro = { workspace = true } vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 4fb35a6976..1fa3784930 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -22,7 +22,6 @@ bench = false workspace = true [dependencies] -apache-avro = { workspace = true } arbitrary = { workspace = true, optional = true } arrow-arith = { workspace = true } arrow-array = { workspace = true } diff --git a/vortex-array/src/array/sparse/mod.rs b/vortex-array/src/array/sparse/mod.rs index 6914c6d29f..2c74471907 100644 --- a/vortex-array/src/array/sparse/mod.rs +++ b/vortex-array/src/array/sparse/mod.rs @@ -23,7 +23,7 @@ impl_encoding!("vortex.sparse", ids::SPARSE, Sparse); #[derive(Debug, Clone, Serialize, Deserialize, FromAvro, ToAvro)] pub struct SparseMetadata { // Offset value for patch indices as a result of slicing - pub(crate) indices_offset: usize, + pub(crate) indices_offset: u64, pub(crate) patches: PatchesMetadata, pub(crate) fill_value: ScalarValue, } @@ -92,7 +92,7 @@ impl SparseArray { patches.dtype().clone(), len, SparseMetadata { - indices_offset, + indices_offset: indices_offset as u64, patches: patches_metadata, fill_value: fill_value.into_value(), }, @@ -103,7 +103,7 @@ impl SparseArray { #[inline] pub fn indices_offset(&self) -> usize { - self.metadata().indices_offset + self.metadata().indices_offset as usize } #[inline] diff --git a/vortex-avro-derive/src/fromavro.rs b/vortex-avro-derive/src/fromavro.rs index 0ce379188c..495954e217 100644 --- a/vortex-avro-derive/src/fromavro.rs +++ b/vortex-avro-derive/src/fromavro.rs @@ -53,8 +53,8 @@ fn derive_from_avro_enum_unit(name: &syn::Ident, e: &syn::DataEnum) -> proc_macr // Generate the EnumSchema. let enum_schema = quote! { - apache_avro::Schema::Enum(apache_avro::schema::EnumSchema { - name: apache_avro::schema::Name { + vortex_avro::avro_private::Schema::Enum(vortex_avro::avro_private::schema::EnumSchema { + name: vortex_avro::avro_private::schema::Name { name: stringify!(#name).to_string(), namespace: None, }, @@ -95,7 +95,7 @@ fn derive_from_avro_enum_unit(name: &syn::Ident, e: &syn::DataEnum) -> proc_macr } impl FromAvro for #name { - fn read_schema() -> apache_avro::Schema { + fn read_schema() -> vortex_avro::avro_private::Schema { #enum_schema } } @@ -161,7 +161,7 @@ fn generate_try_from_avrovalue( } impl vortex_avro::FromAvro for #typename { - fn read_schema() -> apache_avro::Schema { + fn read_schema() -> vortex_avro::avro_private::Schema { #read_schema } } diff --git a/vortex-avro-derive/src/schema.rs b/vortex-avro-derive/src/schema.rs index aa451b067f..97717087ea 100644 --- a/vortex-avro-derive/src/schema.rs +++ b/vortex-avro-derive/src/schema.rs @@ -15,13 +15,13 @@ pub(crate) fn generate_schema_struct( let name = f.ident.clone().unwrap(); let typ = f.ty.clone(); quote! { - apache_avro::schema::RecordField { + vortex_avro::avro_private::schema::RecordField { name: stringify!(#name).to_string(), doc: None, schema: <#typ as vortex_avro::FromAvro>::read_schema(), aliases: core::default::Default::default(), default: core::default::Default::default(), - order: apache_avro::schema::RecordFieldOrder::Ignore, + order: vortex_avro::avro_private::schema::RecordFieldOrder::Ignore, position: #idx, custom_attributes: core::default::Default::default(), } @@ -31,8 +31,8 @@ pub(crate) fn generate_schema_struct( // Generate the RecordSchema quote! { - apache_avro::Schema::Record(apache_avro::schema::RecordSchema { - name: apache_avro::schema::Name { + vortex_avro::avro_private::Schema::Record(vortex_avro::avro_private::schema::RecordSchema { + name: vortex_avro::avro_private::schema::Name { name: stringify!(#typename).to_string(), namespace: None, }, diff --git a/vortex-avro-derive/src/toavro.rs b/vortex-avro-derive/src/toavro.rs index d05c8f44fc..7175e38d95 100644 --- a/vortex-avro-derive/src/toavro.rs +++ b/vortex-avro-derive/src/toavro.rs @@ -55,7 +55,7 @@ fn derive_toavro_struct(typename: &syn::Ident, fields: &FieldsNamed) -> proc_mac } impl vortex_avro::ToAvro for #typename { - fn write_schema() -> apache_avro::Schema { + fn write_schema() -> vortex_avro::avro_private::Schema { #schema } } @@ -90,8 +90,8 @@ fn derive_to_avro_enum_unit(typename: &syn::Ident, e: &syn::DataEnum) -> proc_ma .collect::>(); let enum_schema = quote! { - apache_avro::Schema::Enum(apache_avro::schema::EnumSchema { - name: apache_avro::schema::Name { + vortex_avro::avro_private::Schema::Enum(vortex_avro::avro_private::schema::EnumSchema { + name: vortex_avro::avro_private::schema::Name { name: stringify!(#typename).to_string(), namespace: None, }, @@ -115,7 +115,7 @@ fn derive_to_avro_enum_unit(typename: &syn::Ident, e: &syn::DataEnum) -> proc_ma } impl vortex_avro::ToAvro for #typename { - fn write_schema() -> apache_avro::Schema { + fn write_schema() -> vortex_avro::avro_private::Schema { #enum_schema } } diff --git a/vortex-avro/src/array.rs b/vortex-avro/src/array.rs index 6eab6cc87c..6a18e44493 100644 --- a/vortex-avro/src/array.rs +++ b/vortex-avro/src/array.rs @@ -1,9 +1,3 @@ -// -// Fixed-size primitive arrays. -// - -use apache_avro::schema::ArraySchema; -use apache_avro::Schema; use vortex_error::{vortex_bail, vortex_err, VortexError}; use crate::{AvroValue, FromAvro, ToAvro}; @@ -16,8 +10,8 @@ impl> From<[T; N]> for AvroValue { } impl ToAvro for [T; N] { - fn write_schema() -> Schema { - Schema::Array(ArraySchema { + fn write_schema() -> crate::avro_private::Schema { + crate::avro_private::Schema::Array(crate::avro_private::ArraySchema { items: Box::new(T::write_schema()), attributes: Default::default(), }) @@ -47,8 +41,8 @@ impl TryFrom for [T; N] { } impl FromAvro for [T; N] { - fn read_schema() -> Schema { - Schema::Array(ArraySchema { + fn read_schema() -> crate::avro_private::Schema { + crate::avro_private::Schema::Array(crate::avro_private::ArraySchema { items: Box::new(T::read_schema()), attributes: Default::default(), }) diff --git a/vortex-avro/src/lib.rs b/vortex-avro/src/lib.rs index d31512d77f..f4bac58712 100644 --- a/vortex-avro/src/lib.rs +++ b/vortex-avro/src/lib.rs @@ -2,8 +2,6 @@ use std::collections::HashMap; use std::io::Cursor; -use apache_avro::types::Value; -use apache_avro::{from_avro_datum, to_avro_datum, BigDecimal, Decimal, Duration, Schema}; use uuid::Uuid; pub use vortex_avro_derive::{FromAvro, ToAvro}; use vortex_error::{vortex_err, VortexError, VortexResult}; @@ -12,6 +10,13 @@ mod array; mod option; mod prim; mod string; +mod vec; + +pub mod avro_private { + pub use apache_avro::schema::*; + pub use apache_avro::types::*; + pub use apache_avro::*; +} /// AvroValue is based on `Value` from the Avro crate, but without the blanket impls. This is so we have control over how the /// conversions for primitives are implemented. @@ -30,8 +35,8 @@ pub enum AvroValue { Map(HashMap), Record(Vec<(String, AvroValue)>), Date(i32), - Decimal(Decimal), - BigDecimal(BigDecimal), + Decimal(avro_private::Decimal), + BigDecimal(avro_private::BigDecimal), TimeMillis(i32), TimeMicros(i64), TimestampMillis(i64), @@ -40,88 +45,90 @@ pub enum AvroValue { LocalTimestampMillis(i64), LocalTimestampMicros(i64), LocalTimestampNanos(i64), - Duration(Duration), + Duration(avro_private::Duration), Uuid(Uuid), Long(i64), String(String), } // Helper conversion into our AvroValue type from upstream `apache_avro::Value`. -impl From for AvroValue { - fn from(value: Value) -> Self { +impl From for AvroValue { + fn from(value: avro_private::Value) -> Self { match value { - Value::Long(i) => AvroValue::Long(i), - Value::String(s) => AvroValue::String(s), - Value::Int(i) => AvroValue::Int(i), - Value::Null => AvroValue::Null, - Value::Boolean(b) => AvroValue::Boolean(b), - Value::Float(f) => AvroValue::Float(f), - Value::Double(d) => AvroValue::Double(d), - Value::Bytes(b) => AvroValue::Bytes(b), - Value::Fixed(size, bytes) => AvroValue::Fixed(size, bytes), - Value::Enum(i, s) => AvroValue::Enum(i, s), - Value::Union(i, v) => AvroValue::Union(i, Box::new((*v).into())), - Value::Array(items) => { + avro_private::Value::Long(i) => AvroValue::Long(i), + avro_private::Value::String(s) => AvroValue::String(s), + avro_private::Value::Int(i) => AvroValue::Int(i), + avro_private::Value::Null => AvroValue::Null, + avro_private::Value::Boolean(b) => AvroValue::Boolean(b), + avro_private::Value::Float(f) => AvroValue::Float(f), + avro_private::Value::Double(d) => AvroValue::Double(d), + avro_private::Value::Bytes(b) => AvroValue::Bytes(b), + avro_private::Value::Fixed(size, bytes) => AvroValue::Fixed(size, bytes), + avro_private::Value::Enum(i, s) => AvroValue::Enum(i, s), + avro_private::Value::Union(i, v) => AvroValue::Union(i, Box::new((*v).into())), + avro_private::Value::Array(items) => { AvroValue::Array(items.into_iter().map(AvroValue::from).collect()) } - Value::Map(items) => { + avro_private::Value::Map(items) => { AvroValue::Map(items.into_iter().map(|(k, v)| (k, v.into())).collect()) } - Value::Record(items) => { + avro_private::Value::Record(items) => { AvroValue::Record(items.into_iter().map(|(k, v)| (k, v.into())).collect()) } - Value::Date(d) => AvroValue::Date(d), - Value::Decimal(d) => AvroValue::Decimal(d), - Value::BigDecimal(d) => AvroValue::BigDecimal(d), - Value::TimeMillis(t) => AvroValue::TimeMillis(t), - Value::TimeMicros(t) => AvroValue::TimeMicros(t), - Value::TimestampMillis(t) => AvroValue::TimestampMillis(t), - Value::TimestampMicros(t) => AvroValue::TimestampMicros(t), - Value::TimestampNanos(t) => AvroValue::TimestampNanos(t), - Value::LocalTimestampMillis(t) => AvroValue::LocalTimestampMillis(t), - Value::LocalTimestampMicros(t) => AvroValue::LocalTimestampMicros(t), - Value::LocalTimestampNanos(t) => AvroValue::LocalTimestampNanos(t), - Value::Duration(d) => AvroValue::Duration(d), - Value::Uuid(u) => AvroValue::Uuid(u), + avro_private::Value::Date(d) => AvroValue::Date(d), + avro_private::Value::Decimal(d) => AvroValue::Decimal(d), + avro_private::Value::BigDecimal(d) => AvroValue::BigDecimal(d), + avro_private::Value::TimeMillis(t) => AvroValue::TimeMillis(t), + avro_private::Value::TimeMicros(t) => AvroValue::TimeMicros(t), + avro_private::Value::TimestampMillis(t) => AvroValue::TimestampMillis(t), + avro_private::Value::TimestampMicros(t) => AvroValue::TimestampMicros(t), + avro_private::Value::TimestampNanos(t) => AvroValue::TimestampNanos(t), + avro_private::Value::LocalTimestampMillis(t) => AvroValue::LocalTimestampMillis(t), + avro_private::Value::LocalTimestampMicros(t) => AvroValue::LocalTimestampMicros(t), + avro_private::Value::LocalTimestampNanos(t) => AvroValue::LocalTimestampNanos(t), + avro_private::Value::Duration(d) => AvroValue::Duration(d), + avro_private::Value::Uuid(u) => AvroValue::Uuid(u), } } } // Helper conversion into upstream `apache_avro::Value` from our `AvroValue` type. -impl From for Value { +impl From for avro_private::Value { fn from(value: AvroValue) -> Self { match value { - AvroValue::Long(i) => Value::Long(i), - AvroValue::String(s) => Value::String(s), - AvroValue::Int(i) => Value::Int(i), - AvroValue::Null => Value::Null, - AvroValue::Boolean(b) => Value::Boolean(b), - AvroValue::Float(f) => Value::Float(f), - AvroValue::Double(d) => Value::Double(d), - AvroValue::Bytes(b) => Value::Bytes(b), - AvroValue::Fixed(size, bytes) => Value::Fixed(size, bytes), - AvroValue::Enum(i, s) => Value::Enum(i, s), - AvroValue::Union(i, v) => Value::Union(i, Box::new((*v).into())), - AvroValue::Array(items) => Value::Array(items.into_iter().map(Value::from).collect()), + AvroValue::Long(i) => avro_private::Value::Long(i), + AvroValue::String(s) => avro_private::Value::String(s), + AvroValue::Int(i) => avro_private::Value::Int(i), + AvroValue::Null => avro_private::Value::Null, + AvroValue::Boolean(b) => avro_private::Value::Boolean(b), + AvroValue::Float(f) => avro_private::Value::Float(f), + AvroValue::Double(d) => avro_private::Value::Double(d), + AvroValue::Bytes(b) => avro_private::Value::Bytes(b), + AvroValue::Fixed(size, bytes) => avro_private::Value::Fixed(size, bytes), + AvroValue::Enum(i, s) => avro_private::Value::Enum(i, s), + AvroValue::Union(i, v) => avro_private::Value::Union(i, Box::new((*v).into())), + AvroValue::Array(items) => avro_private::Value::Array( + items.into_iter().map(avro_private::Value::from).collect(), + ), AvroValue::Map(items) => { - Value::Map(items.into_iter().map(|(k, v)| (k, v.into())).collect()) + avro_private::Value::Map(items.into_iter().map(|(k, v)| (k, v.into())).collect()) } AvroValue::Record(items) => { - Value::Record(items.into_iter().map(|(k, v)| (k, v.into())).collect()) + avro_private::Value::Record(items.into_iter().map(|(k, v)| (k, v.into())).collect()) } - AvroValue::Date(d) => Value::Date(d), - AvroValue::Decimal(d) => Value::Decimal(d), - AvroValue::BigDecimal(d) => Value::BigDecimal(d), - AvroValue::TimeMillis(t) => Value::TimeMillis(t), - AvroValue::TimeMicros(t) => Value::TimeMicros(t), - AvroValue::TimestampMillis(t) => Value::TimestampMillis(t), - AvroValue::TimestampMicros(t) => Value::TimestampMicros(t), - AvroValue::TimestampNanos(t) => Value::TimestampNanos(t), - AvroValue::LocalTimestampMillis(t) => Value::LocalTimestampMillis(t), - AvroValue::LocalTimestampMicros(t) => Value::LocalTimestampMicros(t), - AvroValue::LocalTimestampNanos(t) => Value::LocalTimestampNanos(t), - AvroValue::Duration(d) => Value::Duration(d), - AvroValue::Uuid(u) => Value::Uuid(u), + AvroValue::Date(d) => avro_private::Value::Date(d), + AvroValue::Decimal(d) => avro_private::Value::Decimal(d), + AvroValue::BigDecimal(d) => avro_private::Value::BigDecimal(d), + AvroValue::TimeMillis(t) => avro_private::Value::TimeMillis(t), + AvroValue::TimeMicros(t) => avro_private::Value::TimeMicros(t), + AvroValue::TimestampMillis(t) => avro_private::Value::TimestampMillis(t), + AvroValue::TimestampMicros(t) => avro_private::Value::TimestampMicros(t), + AvroValue::TimestampNanos(t) => avro_private::Value::TimestampNanos(t), + AvroValue::LocalTimestampMillis(t) => avro_private::Value::LocalTimestampMillis(t), + AvroValue::LocalTimestampMicros(t) => avro_private::Value::LocalTimestampMicros(t), + AvroValue::LocalTimestampNanos(t) => avro_private::Value::LocalTimestampNanos(t), + AvroValue::Duration(d) => avro_private::Value::Duration(d), + AvroValue::Uuid(u) => avro_private::Value::Uuid(u), } } } @@ -134,7 +141,7 @@ impl From for Value { /// Additionally, types must provide a schema that can be used to write the type to the Avro binary format. pub trait ToAvro: Into { // TODO(aduffy): just have one schema instead of read/write. - fn write_schema() -> Schema; + fn write_schema() -> avro_private::Schema; } /// Types that can be deserialized from an Avro binary format. @@ -145,7 +152,7 @@ pub trait ToAvro: Into { /// Additionally, types must provide a schema that can be used to read the type from the Avro binary format. pub trait FromAvro: TryFrom { /// Retrieve the Avro schema that is used to read this type from the Avro binary format. - fn read_schema() -> Schema; + fn read_schema() -> avro_private::Schema; } /// Convert a type into the Avro binary format. @@ -153,15 +160,18 @@ pub trait FromAvro: TryFrom { /// This function will return an error if the type cannot be converted into the Avro binary format. pub fn to_avro_binary(value: T) -> VortexResult> { let avro_value: AvroValue = value.into(); - to_avro_datum(&T::write_schema(), avro_value) + avro_private::to_avro_datum(&T::write_schema(), avro_value) .map_err(|err| vortex_err!("Failed to convert type to Avro binary format: {err}")) } /// Read into a type from the Avro binary format. /// /// This function will return an error if the type cannot be read from the Avro binary format. -pub fn from_avro_binary(schema: &Schema, avro_bytes: Vec) -> VortexResult { - let value = from_avro_datum(schema, &mut Cursor::new(avro_bytes), None) +pub fn from_avro_binary( + schema: &avro_private::Schema, + avro_bytes: Vec, +) -> VortexResult { + let value = avro_private::from_avro_datum(schema, &mut Cursor::new(avro_bytes), None) .map_err(|err| vortex_err!("Failed to read type from Avro binary format: {err}"))?; >::try_from(value.into()) } diff --git a/vortex-avro/src/option.rs b/vortex-avro/src/option.rs index f745b8f5c5..7ff7afd370 100644 --- a/vortex-avro/src/option.rs +++ b/vortex-avro/src/option.rs @@ -1,5 +1,3 @@ -use apache_avro::schema::UnionSchema; -use apache_avro::Schema; use vortex_error::vortex_bail; use crate::{AvroValue, FromAvro, ToAvro}; @@ -40,9 +38,13 @@ where T: FromAvro, { #[allow(clippy::expect_used)] - fn read_schema() -> Schema { - Schema::Union( - UnionSchema::new(vec![Schema::Null, T::read_schema()]).expect("Option schema"), + fn read_schema() -> crate::avro_private::Schema { + crate::avro_private::Schema::Union( + crate::avro_private::UnionSchema::new(vec![ + crate::avro_private::Schema::Null, + T::read_schema(), + ]) + .expect("Option schema"), ) } } @@ -52,9 +54,13 @@ where T: ToAvro, { #[allow(clippy::expect_used)] - fn write_schema() -> Schema { - Schema::Union( - UnionSchema::new(vec![Schema::Null, T::write_schema()]).expect("Option schema"), + fn write_schema() -> crate::avro_private::Schema { + crate::avro_private::Schema::Union( + crate::avro_private::UnionSchema::new(vec![ + crate::avro_private::Schema::Null, + T::write_schema(), + ]) + .expect("Option schema"), ) } } diff --git a/vortex-avro/src/prim.rs b/vortex-avro/src/prim.rs index f3652245f6..d57fe1c751 100644 --- a/vortex-avro/src/prim.rs +++ b/vortex-avro/src/prim.rs @@ -1,6 +1,6 @@ use vortex_error::{vortex_err, VortexError}; -use crate::{AvroValue, FromAvro, Schema, ToAvro}; +use crate::{AvroValue, FromAvro}; macro_rules! impl_primitive { ($ty:ty, $inner:ty, $value_variant:path, $schema_variant:path) => { @@ -10,8 +10,8 @@ macro_rules! impl_primitive { } } - impl ToAvro for $ty { - fn write_schema() -> Schema { + impl $crate::ToAvro for $ty { + fn write_schema() -> $crate::avro_private::Schema { $schema_variant } } @@ -32,21 +32,31 @@ macro_rules! impl_primitive { } impl FromAvro for $ty { - fn read_schema() -> Schema { + fn read_schema() -> $crate::avro_private::Schema { $schema_variant } } }; } -impl_primitive!(i8, i32, AvroValue::Int, Schema::Int); -impl_primitive!(i16, i32, AvroValue::Int, Schema::Int); -impl_primitive!(i32, i32, AvroValue::Int, Schema::Int); -impl_primitive!(u8, i32, AvroValue::Int, Schema::Int); -impl_primitive!(u16, i32, AvroValue::Int, Schema::Int); -impl_primitive!(u32, i32, AvroValue::Int, Schema::Int); -impl_primitive!(i64, i64, AvroValue::Long, Schema::Long); -impl_primitive!(u64, i64, AvroValue::Long, Schema::Long); +impl_primitive!(i8, i32, AvroValue::Int, crate::avro_private::Schema::Int); +impl_primitive!(i16, i32, AvroValue::Int, crate::avro_private::Schema::Int); +impl_primitive!(i32, i32, AvroValue::Int, crate::avro_private::Schema::Int); +impl_primitive!(u8, i32, AvroValue::Int, crate::avro_private::Schema::Int); +impl_primitive!(u16, i32, AvroValue::Int, crate::avro_private::Schema::Int); +impl_primitive!(u32, i32, AvroValue::Int, crate::avro_private::Schema::Int); +impl_primitive!(i64, i64, AvroValue::Long, crate::avro_private::Schema::Long); +impl_primitive!(u64, i64, AvroValue::Long, crate::avro_private::Schema::Long); // TODO(aduffy): f16 support? -impl_primitive!(f32, f32, AvroValue::Float, Schema::Float); -impl_primitive!(f64, f64, AvroValue::Double, Schema::Double); +impl_primitive!( + f32, + f32, + AvroValue::Float, + crate::avro_private::Schema::Float +); +impl_primitive!( + f64, + f64, + AvroValue::Double, + crate::avro_private::Schema::Double +); diff --git a/vortex-avro/src/string.rs b/vortex-avro/src/string.rs index 1c2914ffd4..7d20cf9d0a 100644 --- a/vortex-avro/src/string.rs +++ b/vortex-avro/src/string.rs @@ -1,4 +1,3 @@ -use apache_avro::Schema; use vortex_error::{vortex_err, VortexError}; use crate::{AvroValue, FromAvro, ToAvro}; @@ -10,8 +9,8 @@ impl From for AvroValue { } impl ToAvro for String { - fn write_schema() -> Schema { - Schema::String + fn write_schema() -> crate::avro_private::Schema { + crate::avro_private::Schema::String } } @@ -28,7 +27,7 @@ impl TryFrom for String { } impl FromAvro for String { - fn read_schema() -> Schema { - Schema::String + fn read_schema() -> crate::avro_private::Schema { + crate::avro_private::Schema::String } } diff --git a/vortex-avro/src/vec.rs b/vortex-avro/src/vec.rs new file mode 100644 index 0000000000..61715bf89b --- /dev/null +++ b/vortex-avro/src/vec.rs @@ -0,0 +1,51 @@ +// +// Fixed-size primitive arrays. +// + +use apache_avro::schema::ArraySchema; +use apache_avro::Schema; +use vortex_error::{vortex_bail, VortexError}; + +use crate::{AvroValue, FromAvro, ToAvro}; + +// ToAvro +impl> From> for AvroValue { + fn from(value: Vec) -> Self { + AvroValue::Array(value.into_iter().map(Into::into).collect()) + } +} + +impl ToAvro for Vec { + fn write_schema() -> Schema { + Schema::Array(ArraySchema { + items: Box::new(T::write_schema()), + attributes: Default::default(), + }) + } +} + +// FromAvro +impl TryFrom for Vec { + type Error = VortexError; + + fn try_from(value: AvroValue) -> Result { + let AvroValue::Array(items) = value else { + vortex_bail!("Expected value to be an array but it was not"); + }; + let items: Vec = items + .into_iter() + .map(T::try_from) + .collect::, _>>()?; + + Ok(items) + } +} + +impl FromAvro for Vec { + fn read_schema() -> Schema { + Schema::Array(ArraySchema { + items: Box::new(T::read_schema()), + attributes: Default::default(), + }) + } +} diff --git a/vortex-dtype/src/nullability.rs b/vortex-dtype/src/nullability.rs index e30ee7df8e..ad2ebf23e8 100644 --- a/vortex-dtype/src/nullability.rs +++ b/vortex-dtype/src/nullability.rs @@ -1,7 +1,9 @@ use std::fmt::{Display, Formatter}; -/// Whether an instance of a DType can be `null or not -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash, Ord, PartialOrd)] +use vortex_avro::{FromAvro, ToAvro}; + +/// Whether an instance of a DType can be `null` or not +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash, Ord, PartialOrd, FromAvro, ToAvro)] pub enum Nullability { /// Instances of this DType are guaranteed to be non-nullable #[default] diff --git a/vortex-scalar/src/pvalue.rs b/vortex-scalar/src/pvalue.rs index 88d2eb97b8..abdc7ae227 100644 --- a/vortex-scalar/src/pvalue.rs +++ b/vortex-scalar/src/pvalue.rs @@ -4,10 +4,10 @@ use std::mem; use num_traits::NumCast; use paste::paste; +use vortex_avro::{AvroValue, FromAvro, ToAvro}; use vortex_dtype::half::f16; use vortex_dtype::{NativePType, PType}; -use vortex_error::{vortex_err, VortexError, VortexExpect}; - +use vortex_error::{vortex_bail, vortex_err, VortexError, VortexExpect}; #[derive(Debug, Clone, Copy)] pub enum PValue { U8(u8), @@ -23,6 +23,51 @@ pub enum PValue { F64(f64), } +// Impl of FromAvro for PValue +// In Avro, we will always serialize a PValue as an Int64, as that is the maximum width of PValue. +// We can then cast the bits accordingly on read. +impl TryFrom for PValue { + type Error = VortexError; + + fn try_from(value: AvroValue) -> Result { + let AvroValue::Long(v) = value else { + vortex_bail!("Expected AvroValue::Long, got {:?}", value); + }; + Ok(Self::I64(v)) + } +} + +impl FromAvro for PValue { + fn read_schema() -> vortex_avro::avro_private::Schema { + vortex_avro::avro_private::Schema::Long + } +} + +// Impl of ToAvro for PValue +impl From for AvroValue { + fn from(value: PValue) -> Self { + match value { + PValue::U8(v) => AvroValue::Long(v as i64), + PValue::U16(v) => AvroValue::Long(v as i64), + PValue::U32(v) => AvroValue::Long(v as i64), + PValue::U64(v) => AvroValue::Long(v as i64), + PValue::I8(v) => AvroValue::Long(v as i64), + PValue::I16(v) => AvroValue::Long(v as i64), + PValue::I32(v) => AvroValue::Long(v as i64), + PValue::I64(v) => AvroValue::Long(v), + PValue::F16(_) => unimplemented!("F16 serialization to avro not supported currently."), + PValue::F32(v) => AvroValue::Long(v.to_bits() as i64), + PValue::F64(v) => AvroValue::Long(v.to_bits() as i64), + } + } +} + +impl ToAvro for PValue { + fn write_schema() -> vortex_avro::avro_private::Schema { + vortex_avro::avro_private::Schema::Long + } +} + impl PartialEq for PValue { fn eq(&self, other: &Self) -> bool { match (self, other) {