
feat(sink): support deltalake sink with rust sdk #13600

Merged · 35 commits · Dec 14, 2023
e6bc0ee
add delta lake sink
xxhZs Nov 21, 2023
013eede
fix
xxhZs Nov 21, 2023
f0047ba
Merge branch 'main' into 13152-reimplement-delta-lake-sink-with-rust-sdk
xxhZs Nov 23, 2023
2ff021f
fmt
xxhZs Nov 23, 2023
7b85ea2
save
xxhZs Nov 23, 2023
6fbbac3
fix cargo lock
xxhZs Nov 24, 2023
c6a709c
fmt
xxhZs Nov 28, 2023
1ddf1f9
fix
xxhZs Nov 28, 2023
cdecda5
Merge branch 'main' into 13152-reimplement-delta-lake-sink-with-rust-sdk
xxhZs Nov 28, 2023
f521372
Merge branch 'main' into 13152-reimplement-delta-lake-sink-with-rust-sdk
xxhZs Nov 28, 2023
c890167
use separate arrow version for deltalake
wenym1 Dec 1, 2023
2a35a5e
Merge branch 'main' into 13152-reimplement-delta-lake-sink-with-rust-sdk
wenym1 Dec 1, 2023
0aa6785
fix timestamptz
xxhZs Dec 1, 2023
9b7b543
use mod path to avoid macro
wenym1 Dec 1, 2023
e460901
add license and comment
wenym1 Dec 1, 2023
13ff6ed
Merge branch 'main' into 13152-reimplement-delta-lake-sink-with-rust-sdk
wenym1 Dec 4, 2023
c187f72
update test
wenym1 Dec 5, 2023
2217f61
add comment
wenym1 Dec 5, 2023
4c503fa
Merge branch 'main' into 13152-reimplement-delta-lake-sink-with-rust-sdk
wenym1 Dec 5, 2023
1aaf516
use expect
wenym1 Dec 5, 2023
e1a3814
save
xxhZs Dec 5, 2023
76547e4
fix fmt
xxhZs Dec 7, 2023
b0b1b87
add ci
xxhZs Dec 7, 2023
50899c0
Merge branch 'main' into 13152-reimplement-delta-lake-sink-with-rust-sdk
xxhZs Dec 7, 2023
10b1ba6
Empty commit
xxhZs Dec 7, 2023
5a492a6
fix ci
xxhZs Dec 7, 2023
435eb73
fix
xxhZs Dec 8, 2023
cf96d64
ut timeout 20 -> 22
xxhZs Dec 12, 2023
4a83b8f
fix region
xxhZs Dec 12, 2023
814dcc9
reduce compile time and binary size
wenym1 Dec 12, 2023
f9605ad
Merge branch 'main' into 13152-reimplement-delta-lake-sink-with-rust-sdk
wenym1 Dec 12, 2023
899070a
temp add hakari third party
wenym1 Dec 13, 2023
86625ab
use new delta-rs commit
wenym1 Dec 13, 2023
5c365a9
Merge branch 'main' into 13152-reimplement-delta-lake-sink-with-rust-sdk
xxhZs Dec 14, 2023
a4023e5
fix ci
xxhZs Dec 14, 2023
1,011 changes: 873 additions & 138 deletions Cargo.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions Cargo.toml
@@ -128,6 +128,12 @@ arrow-flight = "49"
arrow-select = "49"
arrow-ord = "49"
arrow-row = "49"
arrow-array-deltalake = { package = "arrow-array", version = "48.0.1" }
arrow-buffer-deltalake = { package = "arrow-buffer", version = "48.0.1" }
arrow-cast-deltalake = { package = "arrow-cast", version = "48.0.1" }
arrow-schema-deltalake = { package = "arrow-schema", version = "48.0.1" }
deltalake = { git = "https://github.com/delta-io/delta-rs", rev = "72505449e9538371fe5fda35d545dbd662facd07", features = ["s3"] }
Contributor:
Is there a particular reason for downgrading arrow?

Contributor Author:
Because both Delta.rs and IceLake need to use Arrow, version 48.0.1 is the maximum version they both support.
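For illustration, a minimal sketch (hypothetical function; the renamed crate names come from the Cargo.toml change above) of how the two arrow versions coexist in one crate. The compiler treats the two RecordBatch types as unrelated, so nothing crosses the version boundary without an explicit conversion:

// arrow-array 49 under its canonical name; arrow-array 48.0.1 under its
// Cargo.toml alias. These are distinct types to the compiler.
fn row_counts(
    batch_v49: &arrow_array::RecordBatch,
    batch_v48: &arrow_array_deltalake::RecordBatch,
) -> (usize, usize) {
    // Neither batch can be passed to the other version's APIs directly.
    (batch_v49.num_rows(), batch_v48.num_rows())
}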

parquet = "49"
thiserror-ext = "0.0.8"
tikv-jemalloc-ctl = { git = "https://github.com/risingwavelabs/jemallocator.git", rev = "64a2d9" }
tikv-jemallocator = { git = "https://github.com/risingwavelabs/jemallocator.git", features = [
4 changes: 4 additions & 0 deletions src/common/Cargo.toml
@@ -17,9 +17,13 @@ normal = ["workspace-hack"]
anyhow = "1"
arc-swap = "1"
arrow-array = { workspace = true }
arrow-array-deltalake = { workspace = true }
arrow-buffer = { workspace = true }
arrow-buffer-deltalake = { workspace = true }
arrow-cast = { workspace = true }
arrow-cast-deltalake = { workspace = true }
arrow-schema = { workspace = true }
arrow-schema-deltalake = { workspace = true }
async-trait = "0.1"
auto_enums = "0.8"
auto_impl = "1"
@@ -15,21 +15,21 @@
//! Converts between arrays and Apache Arrow arrays.

use std::fmt::Write;
use std::sync::Arc;

use arrow_array::Array as ArrowArray;
use arrow_cast::cast;
use arrow_schema::{Field, Schema, SchemaRef, DECIMAL256_MAX_PRECISION};
use chrono::{NaiveDateTime, NaiveTime};
use itertools::Itertools;

use super::*;
use crate::types::{Int256, StructType};
use super::{arrow_array, arrow_buffer, arrow_cast, arrow_schema};
use crate::array::*;
use crate::buffer::Bitmap;
use crate::types::*;
use crate::util::iter_util::ZipEqFast;

/// Converts RisingWave array to Arrow array with the schema.
/// This function will try to convert the array if the type is not same with the schema.
pub fn to_record_batch_with_schema(
schema: SchemaRef,
schema: arrow_schema::SchemaRef,
chunk: &DataChunk,
) -> Result<arrow_array::RecordBatch, ArrayError> {
if !chunk.is_compacted() {
@@ -45,7 +45,7 @@ pub fn to_record_batch_with_schema(
if column.data_type() == field.data_type() {
Ok(column)
} else {
cast(&column, field.data_type())
arrow_cast::cast(&column, field.data_type())
.map_err(|err| ArrayError::FromArrow(err.to_string()))
}
})
@@ -73,14 +73,14 @@ impl TryFrom<&DataChunk> for arrow_array::RecordBatch {

let fields: Vec<_> = columns
.iter()
.map(|array: &Arc<dyn ArrowArray>| {
.map(|array: &Arc<dyn arrow_array::Array>| {
let nullable = array.null_count() > 0;
let data_type = array.data_type().clone();
Field::new("", data_type, nullable)
arrow_schema::Field::new("", data_type, nullable)
})
.collect();

let schema = Arc::new(Schema::new(fields));
let schema = Arc::new(arrow_schema::Schema::new(fields));
let opts =
arrow_array::RecordBatchOptions::default().with_row_count(Some(chunk.capacity()));
arrow_array::RecordBatch::try_new_with_options(schema, columns, &opts)
@@ -203,7 +203,7 @@ impl TryFrom<&StructType> for arrow_schema::Fields {
fn try_from(struct_type: &StructType) -> Result<Self, Self::Error> {
struct_type
.iter()
.map(|(name, ty)| Ok(Field::new(name, ty.try_into()?, true)))
.map(|(name, ty)| Ok(arrow_schema::Field::new(name, ty.try_into()?, true)))
.try_collect()
}
}
@@ -223,7 +223,7 @@ impl TryFrom<&DataType> for arrow_schema::DataType {
DataType::Int16 => Ok(Self::Int16),
DataType::Int32 => Ok(Self::Int32),
DataType::Int64 => Ok(Self::Int64),
DataType::Int256 => Ok(Self::Decimal256(DECIMAL256_MAX_PRECISION, 0)),
DataType::Int256 => Ok(Self::Decimal256(arrow_schema::DECIMAL256_MAX_PRECISION, 0)),
DataType::Float32 => Ok(Self::Float32),
DataType::Float64 => Ok(Self::Float64),
DataType::Date => Ok(Self::Date32),
@@ -241,10 +241,10 @@
DataType::Struct(struct_type) => Ok(Self::Struct(
struct_type
.iter()
.map(|(name, ty)| Ok(Field::new(name, ty.try_into()?, true)))
.map(|(name, ty)| Ok(arrow_schema::Field::new(name, ty.try_into()?, true)))
.try_collect::<_, _, ArrayError>()?,
)),
DataType::List(datatype) => Ok(Self::List(Arc::new(Field::new(
DataType::List(datatype) => Ok(Self::List(Arc::new(arrow_schema::Field::new(
"item",
datatype.as_ref().try_into()?,
true,
@@ -546,6 +546,20 @@ impl TryFrom<&arrow_array::LargeStringArray> for JsonbArray {
}
}

impl From<arrow_buffer::i256> for Int256 {
fn from(value: arrow_buffer::i256) -> Self {
let buffer = value.to_be_bytes();
Int256::from_be_bytes(buffer)
}
}

impl<'a> From<Int256Ref<'a>> for arrow_buffer::i256 {
fn from(val: Int256Ref<'a>) -> Self {
let buffer = val.to_be_bytes();
arrow_buffer::i256::from_be_bytes(buffer)
}
}

impl From<&Int256Array> for arrow_array::Decimal256Array {
fn from(array: &Int256Array) -> Self {
array
@@ -622,7 +636,7 @@ impl TryFrom<&ListArray> for arrow_array::ListArray {
array,
a,
Decimal256Builder::with_capacity(a.len()).with_data_type(
arrow_schema::DataType::Decimal256(DECIMAL256_MAX_PRECISION, 0),
arrow_schema::DataType::Decimal256(arrow_schema::DECIMAL256_MAX_PRECISION, 0),
),
|b, v| b.append_option(v.map(Into::into)),
),
@@ -682,7 +696,11 @@ impl TryFrom<&ListArray> for arrow_array::ListArray {
ArrayImpl::Struct(a) => {
let values = Arc::new(arrow_array::StructArray::try_from(a)?);
arrow_array::ListArray::new(
Arc::new(Field::new("item", a.data_type().try_into()?, true)),
Arc::new(arrow_schema::Field::new(
"item",
a.data_type().try_into()?,
true,
)),
arrow_buffer::OffsetBuffer::new(arrow_buffer::ScalarBuffer::from(
array
.offsets()
@@ -709,6 +727,7 @@ impl TryFrom<&arrow_array::ListArray> for ListArray {
type Error = ArrayError;

fn try_from(array: &arrow_array::ListArray) -> Result<Self, Self::Error> {
use arrow_array::Array;
Ok(ListArray {
value: Box::new(ArrayImpl::try_from(array.values())?),
bitmap: match array.nulls() {
@@ -886,7 +905,7 @@ mod tests {

#[test]
fn struct_array() {
use arrow_array::Array as _;
use super::arrow_array::Array as _;

// Empty array - risingwave to arrow conversion.
let test_arr = StructArray::new(StructType::empty(), vec![], Bitmap::ones(0));
6 changes: 6 additions & 0 deletions src/common/src/array/arrow/arrow_common.rs
@@ -0,0 +1,6 @@
pub use arrow_impl::to_record_batch_with_schema;
use {arrow_array, arrow_buffer, arrow_cast, arrow_schema};

#[allow(clippy::duplicate_mod)]
#[path = "./arrow.rs"]
mod arrow_impl;
9 changes: 9 additions & 0 deletions src/common/src/array/arrow/arrow_deltalake.rs
@@ -0,0 +1,9 @@
pub use arrow_impl::to_record_batch_with_schema as to_deltalake_record_batch_with_schema;
use {
arrow_array_deltalake as arrow_array, arrow_buffer_deltalake as arrow_buffer,
arrow_cast_deltalake as arrow_cast, arrow_schema_deltalake as arrow_schema,
};

#[allow(clippy::duplicate_mod)]
#[path = "./arrow.rs"]
mod arrow_impl;
Member:
Wow, this is quite interesting! 😮 How did you come up with this? Are any other projects using this trick?

FYI, I tried forking delta-rs and upgrading their datafusion version, but it cannot compile without some extra effort because there are some breaking changes in the upgrade.

... there will always be a time when icelake and delta.rs depend on an inconsistent arrow.
If we want to finally unify both of their dependencies on arrow, i.e. only one version of arrow exists in our system, I think we can totally use 48 in this pr...

Not only deltalake and icelake use arrow, but also our internal udf. If we upgrade the arrow version of udf in the future, it doesn't make sense to be blocked by the arrow version of some external connectors. I think it's acceptable to support several versions of arrow at the same time if we don't have to duplicate the code.

So my thoughts:

  • Agree that the upgrade effort might be non-trivial, so blocking everything until we unify the dependency versions might be ineffective.
  • At the beginning, I don't think we'll need to upgrade solely the arrow version of udf. Therefore, using the greatest common divisor SGTM, but it may indeed break someday when we want to use a feature in the latest version of one dependency that has already upgraded its arrow version. Then we will run into this problem again and have to consider how to live with multiple versions of arrow.
  • I'm generally OK with multiple versions. Only concerns are:
    1. maintainability. Current solution LGTM
    2. API surface: Will we ever need to e.g., pass arrow48 to arrow49?

Nits:

  • #[allow] -> #[expect]
  • prefer using the version number as the suffix, instead of common and deltalake (common is especially confusing to me: what does "common" mean?).

Changes requested:

  • Add some comments on arrow/mod.rs and/or above mod arrow_impl; about how this trick works.
  • Add some comments on arrow_* about where it is used. i.e., why do we have to pass arrow data around.

Member:
It's generally LGTM. Might also want to hear others' opinions.

@wenym1 (Contributor), Dec 4, 2023:
How did you come up with this? Are any other projects using this trick?

Initially, in c890167, I used a macro to generate the same code for each version with the same trick, which binds a different arrow version to the same arrow mod name for each copy of the conversion code. But I think that was too hacky, and the IDE would fail to index a large part of the code. I then looked for a way to include code from an outer file so that I wouldn't have to put the code inside a long macro, and found that the current approach is a perfect match 😄 I haven't seen any other project using this trick yet.

Will we ever need to e.g., pass arrow48 to arrow49?

Currently arrow data does not flow within our internal system. It's only used to interact with external systems, so in the short term I don't think there will be a feature requirement to pass arrow48 to arrow49.
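To make the mechanism concrete, here is a condensed sketch (hypothetical file and function names, mirroring arrow_common.rs and arrow_deltalake.rs above): the shared file never names a concrete arrow crate, and each wrapper module binds its arrow version to the agreed module name before compiling the shared file as a child module.

// ---- shared_impl.rs: version-agnostic code. `super::arrow_array` resolves
// to whichever crate the including module bound to that name.
use super::arrow_array;

pub fn num_rows(batch: &arrow_array::RecordBatch) -> usize {
    batch.num_rows()
}

// ---- arrow_v49.rs: bind the canonical arrow-array 49 crate, then include
// the shared implementation.
use arrow_array;

#[path = "./shared_impl.rs"]
mod arrow_impl; // compiled against arrow 49

// ---- arrow_v48.rs: bind the renamed 48.0.1 crate to the same module name.
use arrow_array_deltalake as arrow_array;

#[allow(clippy::duplicate_mod)]
#[path = "./shared_impl.rs"]
mod arrow_impl; // the same source, compiled a second time against arrow 48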

5 changes: 5 additions & 0 deletions src/common/src/array/arrow/mod.rs
@@ -0,0 +1,5 @@
mod arrow_common;
mod arrow_deltalake;

pub use arrow_common::to_record_batch_with_schema;
pub use arrow_deltalake::to_deltalake_record_batch_with_schema;
2 changes: 1 addition & 1 deletion src/common/src/array/mod.rs
@@ -15,7 +15,7 @@
//! `Array` defines all in-memory representations of vectorized execution framework.

mod arrow;
pub use arrow::to_record_batch_with_schema;
pub use arrow::{to_deltalake_record_batch_with_schema, to_record_batch_with_schema};
mod bool_array;
pub mod bytes_array;
mod chrono_array;
14 changes: 0 additions & 14 deletions src/common/src/types/num256.rs
@@ -326,20 +326,6 @@ impl Num for Int256 {
}
}

impl From<arrow_buffer::i256> for Int256 {
fn from(value: arrow_buffer::i256) -> Self {
let buffer = value.to_be_bytes();
Int256::from_be_bytes(buffer)
}
}

impl<'a> From<Int256Ref<'a>> for arrow_buffer::i256 {
fn from(val: Int256Ref<'a>) -> Self {
let buffer = val.to_be_bytes();
arrow_buffer::i256::from_be_bytes(buffer)
}
}

impl EstimateSize for Int256 {
fn estimated_heap_size(&self) -> usize {
mem::size_of::<i128>() * 2
1 change: 1 addition & 0 deletions src/connector/Cargo.toml
@@ -46,6 +46,7 @@ clickhouse = { git = "https://github.com/risingwavelabs/clickhouse.rs", rev = "6
"time",
] }
csv = "1.3"
deltalake = { workspace = true }
duration-str = "0.7.0"
easy-ext = "1"
enum-as-inner = "0.6"