diff --git a/cider/exec/module/batch/CiderArrowBufferHolder.cpp b/cider/exec/module/batch/CiderArrowBufferHolder.cpp index 390a24636..4a529fedf 100644 --- a/cider/exec/module/batch/CiderArrowBufferHolder.cpp +++ b/cider/exec/module/batch/CiderArrowBufferHolder.cpp @@ -46,10 +46,14 @@ CiderArrowArrayBufferHolder::CiderArrowArrayBufferHolder( CiderArrowArrayBufferHolder::~CiderArrowArrayBufferHolder() { for (size_t i = 0; i < buffers_.size(); ++i) { - relaseBuffer(i); + releaseBuffer(i); } } +size_t CiderArrowArrayBufferHolder::getBufferSizeAt(size_t index) { + return buffers_bytes_[index]; +} + void CiderArrowArrayBufferHolder::allocBuffer(size_t index, size_t bytes) { if (buffers_[index]) { buffers_[index] = allocator_->reallocate( @@ -61,7 +65,7 @@ void CiderArrowArrayBufferHolder::allocBuffer(size_t index, size_t bytes) { } } -void CiderArrowArrayBufferHolder::relaseBuffer(size_t index) { +void CiderArrowArrayBufferHolder::releaseBuffer(size_t index) { if (buffers_[index]) { allocator_->deallocate(reinterpret_cast(buffers_[index]), buffers_bytes_[index]); diff --git a/cider/exec/module/batch/CiderArrowBufferHolder.h b/cider/exec/module/batch/CiderArrowBufferHolder.h index cf422c81c..f1bf509c0 100644 --- a/cider/exec/module/batch/CiderArrowBufferHolder.h +++ b/cider/exec/module/batch/CiderArrowBufferHolder.h @@ -50,8 +50,10 @@ class CiderArrowArrayBufferHolder { ArrowArray* getDictPtr(); + size_t getBufferSizeAt(size_t index); + private: - void relaseBuffer(size_t index); + void releaseBuffer(size_t index); std::vector buffers_; std::vector buffers_bytes_; // Used for allocator. diff --git a/cider/exec/module/batch/CiderBatch.cpp b/cider/exec/module/batch/CiderBatch.cpp index 0ccbe468f..dbf3c671f 100644 --- a/cider/exec/module/batch/CiderBatch.cpp +++ b/cider/exec/module/batch/CiderBatch.cpp @@ -58,7 +58,9 @@ CiderBatch::CiderBatch(ArrowSchema* schema, CiderBatch::~CiderBatch() { releaseArrowEntries(); +#ifdef CIDER_BATCH_CIDER_IMPL destroy(); // TODO: Remove +#endif } CiderBatch::CiderBatch(const CiderBatch& rh) { @@ -95,8 +97,9 @@ CiderBatch::CiderBatch(CiderBatch&& rh) noexcept { rh.arrow_schema_ = nullptr; rh.ownership_ = false; rh.reallocate_ = false; - +#ifdef CIDER_BATCH_CIDER_IMPL moveFrom(&rh); // TODO: Remove +#endif } CiderBatch& CiderBatch::operator=(CiderBatch&& rh) noexcept { @@ -115,8 +118,9 @@ CiderBatch& CiderBatch::operator=(CiderBatch&& rh) noexcept { rh.ownership_ = false; rh.reallocate_ = false; +#ifdef CIDER_BATCH_CIDER_IMPL moveFrom(&rh); // TODO: Remove - +#endif return *this; } @@ -283,13 +287,8 @@ void CiderBatch::convertToArrowRepresentation() { arrow_array_->children[i] = new ArrowArray(); arrow_array_->children[i]->length = row_num(); arrow_array_->children[i]->n_children = 0; - arrow_array_->children[i]->buffers = (const void**)std::malloc(sizeof(void*) * 2); - // FIXME: fill actual null void* null_buf = std::malloc(row_num() / 8 + 1); std::memset(null_buf, 0xFF, row_num() / 8 + 1); - arrow_array_->children[i]->buffers[0] = null_buf; - arrow_array_->children[i]->buffers[1] = table_ptr_[i]; - arrow_array_->children[i]->n_buffers = 2; arrow_array_->children[i]->private_data = nullptr; arrow_array_->children[i]->dictionary = nullptr; arrow_array_->children[i]->release = CiderBatchUtils::ciderEmptyArrowArrayReleaser; @@ -300,6 +299,29 @@ void CiderBatch::convertToArrowRepresentation() { arrow_schema_->children[i]->n_children = 0; arrow_schema_->children[i]->children = nullptr; arrow_schema_->children[i]->release = CiderBatchUtils::ciderEmptyArrowSchemaReleaser; + + // (Kunshang)To be removed. temp code to pass ut. + // CiderStringTest::CiderStringTestArrow + if (schema_->getColumnTypeById(i).has_varchar()) { + arrow_array_->children[i]->n_buffers = 3; + arrow_array_->children[i]->buffers = (const void**)std::malloc(sizeof(void*) * 3); + arrow_array_->children[i]->buffers[0] = null_buf; + + arrow_schema_->children[i]->format = ""; + // 10 string row 0-9 + int32_t* offset_buf = new int[11]{0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}; + char* data_buf( + "000000000011111111112222222222333333333344444444445555555555666666666677777777" + "7788888888889999999999"); + arrow_array_->children[i]->buffers[1] = offset_buf; + arrow_array_->children[i]->buffers[2] = data_buf; + } else { + arrow_array_->children[i]->buffers = (const void**)std::malloc(sizeof(void*) * 2); + // FIXME: fill actual null + arrow_array_->children[i]->buffers[0] = null_buf; + arrow_array_->children[i]->buffers[1] = table_ptr_[i]; + arrow_array_->children[i]->n_buffers = 2; + } } } diff --git a/cider/exec/module/batch/CiderBatchUtils.cpp b/cider/exec/module/batch/CiderBatchUtils.cpp index 8382a9f90..c4e680413 100644 --- a/cider/exec/module/batch/CiderBatchUtils.cpp +++ b/cider/exec/module/batch/CiderBatchUtils.cpp @@ -154,6 +154,8 @@ int64_t getBufferNum(const ArrowSchema* schema) { if (!strcmp(type, "tdm")) { return 2; } + case 'u': + return 3; default: CIDER_THROW(CiderException, std::string("Unsupported data type to CiderBatch: ") + type); @@ -185,6 +187,8 @@ SQLTypes convertArrowTypeToCiderType(const char* format) { case 's': return kSTRUCT; } + case 'u': + return kVARCHAR; default: CIDER_THROW(CiderCompileException, std::string("Unsupported data type to CiderBatch: ") + format); @@ -209,6 +213,8 @@ const char* convertCiderTypeToArrowType(SQLTypes type) { return "g"; case kSTRUCT: return "+s"; + case kVARCHAR: + return "u"; default: CIDER_THROW(CiderCompileException, std::string("Unsupported to convert type ") + toString(type) + @@ -264,6 +270,8 @@ const char* convertSubstraitTypeToArrowType(const substrait::Type& type) { return "+s"; case Type::kDate: return "tdm"; + case Type::kVarchar: + return "u"; default: CIDER_THROW(CiderRuntimeException, std::string("Unsupported to convert type ") + type.GetTypeName() + @@ -334,6 +342,8 @@ std::unique_ptr createCiderBatch(std::shared_ptr all if (!strcmp(format, "tdm")) { return ScalarBatch::Create(schema, allocator, array); } + case 'u': + return VarcharBatch::Create(schema, allocator, array); default: CIDER_THROW(CiderCompileException, std::string("Unsupported data type to create CiderBatch: ") + format); diff --git a/cider/exec/plan/parser/TypeUtils.h b/cider/exec/plan/parser/TypeUtils.h index 02cf49fb5..9bd144677 100644 --- a/cider/exec/plan/parser/TypeUtils.h +++ b/cider/exec/plan/parser/TypeUtils.h @@ -148,6 +148,10 @@ class TypeUtils { return getIsNullable(type.time().nullability()); case substrait::Type::kTimestamp: return getIsNullable(type.timestamp().nullability()); + case substrait::Type::kVarchar: + return getIsNullable(type.varchar().nullability()); + case substrait::Type::kFixedChar: + return getIsNullable(type.fixed_char().nullability()); default: return true; } diff --git a/cider/exec/template/CodeGenerator.h b/cider/exec/template/CodeGenerator.h index 8c7f8a35d..8d309508a 100644 --- a/cider/exec/template/CodeGenerator.h +++ b/cider/exec/template/CodeGenerator.h @@ -162,6 +162,12 @@ class CodeGenerator { CodegenColValues* rhs, llvm::Value* null); + std::unique_ptr codegenVarcharCmpFun( + const Analyzer::BinOper* bin_oper, + CodegenColValues* lhs, + CodegenColValues* rhs, + llvm::Value* null); + llvm::Value* codegenCmp(const SQLOps, const SQLQualifier, std::vector, @@ -333,6 +339,12 @@ class CodeGenerator { llvm::Value* pos_arg, const CompilationOptions& co); + std::unique_ptr codegenVarCharColVar( + const Analyzer::ColumnVar* col_var, + llvm::Value* col_byte_stream, + llvm::Value* pos_arg, + const CompilationOptions& co); + llvm::Value* codegenFixedLengthColVar(const Analyzer::ColumnVar* col_var, llvm::Value* col_byte_stream, llvm::Value* pos_arg); @@ -350,6 +362,10 @@ class CodeGenerator { llvm::Value* col_byte_stream, llvm::Value* pos_arg); + std::vector codegenVariableLengthStringColVarArrow( + llvm::Value* col_byte_stream, + llvm::Value* pos_arg); + llvm::Value* codegenRowId(const Analyzer::ColumnVar* col_var, const CompilationOptions& co); diff --git a/cider/exec/template/Codec.cpp b/cider/exec/template/Codec.cpp index 47479b5f8..4ce7858c3 100644 --- a/cider/exec/template/Codec.cpp +++ b/cider/exec/template/Codec.cpp @@ -296,3 +296,35 @@ std::vector FixedWidthSmallDate::codegenDecode( return {llvm::CallInst::Create(f, args), nullptr}; } } + +VarcharDecoder::VarcharDecoder(const size_t byte_width, + llvm::IRBuilder<>* ir_builder, + bool nullable) + : Decoder(ir_builder, nullable), byte_width_{byte_width} {} + +llvm::Instruction* VarcharDecoder::codegenDecode(llvm::Value* byte_stream, + llvm::Value* pos, + llvm::Module* module) const { + UNREACHABLE(); +} + +std::vector VarcharDecoder::codegenDecode(llvm::Module* module, + llvm::Value* byte_stream, + llvm::Value* pos) const { + auto nulls = extractNullVector(module, byte_stream); + auto offset_buffer = extractBufferAt(module, byte_stream, 1); + auto data_buffer = extractBufferAt(module, byte_stream, 2); + + llvm::Instruction* str_ptr = llvm::CallInst::Create( + module->getFunction("extract_str_ptr_arrow"), {data_buffer, offset_buffer, pos}); + llvm::Instruction* str_len = llvm::CallInst::Create( + module->getFunction("extract_str_len_arrow"), {offset_buffer, pos}); + + if (nulls) { + auto get_is_null = module->getFunction("check_bit_vector_clear"); + CHECK(get_is_null); + return {str_ptr, str_len, llvm::CallInst::Create(get_is_null, {nulls, pos})}; + } else { + return {str_ptr, str_len}; + } +} diff --git a/cider/exec/template/Codec.h b/cider/exec/template/Codec.h index ec6ffc023..d892683a1 100644 --- a/cider/exec/template/Codec.h +++ b/cider/exec/template/Codec.h @@ -161,4 +161,22 @@ class FixedWidthSmallDate : public Decoder { static constexpr int64_t ret_null_val_ = NULL_BIGINT; }; +class VarcharDecoder : public Decoder { + public: + VarcharDecoder(const size_t byte_width, + llvm::IRBuilder<>* ir_builder, + bool nullable = false); + + llvm::Instruction* codegenDecode(llvm::Value* byte_stream, + llvm::Value* pos, + llvm::Module* module) const override; + + std::vector codegenDecode(llvm::Module* module, + llvm::Value* byte_stream, + llvm::Value* pos) const override; + + private: + const size_t byte_width_; +}; + #endif // QUERYENGINE_CODEC_H diff --git a/cider/exec/template/CodegenColValues.h b/cider/exec/template/CodegenColValues.h index f164d8600..0d02c55aa 100644 --- a/cider/exec/template/CodegenColValues.h +++ b/cider/exec/template/CodegenColValues.h @@ -72,4 +72,28 @@ class FixedSizeColValues : public NullableColValues { DEF_CODEGEN_COL_VALUES_MEMBER(Value, value_) }; +class MultipleValueColValues : public NullableColValues { + public: + MultipleValueColValues(std::vector values, llvm::Value* null = nullptr) + : NullableColValues(null), values_(values) {} + std::unique_ptr copy() const override { + return std::make_unique(*this); + } + std::vector getValues() { return values_; } + const std::vector getValues() const { return values_; } + llvm::Value* getValueAt(int index) { return values_[index]; } + + private: + std::vector values_; +}; + +class TwoValueColValues : public MultipleValueColValues { + public: + TwoValueColValues(llvm::Value* value1, llvm::Value* value2, llvm::Value* null = nullptr) + : MultipleValueColValues({value1, value2}, null) {} + std::unique_ptr copy() const override { + return std::make_unique(*this); + } +}; + #endif diff --git a/cider/exec/template/ColumnIR.cpp b/cider/exec/template/ColumnIR.cpp index 93b031d46..385159b39 100644 --- a/cider/exec/template/ColumnIR.cpp +++ b/cider/exec/template/ColumnIR.cpp @@ -169,7 +169,8 @@ std::unique_ptr CodeGenerator::codegenColumnExpr( break; } case kVARCHAR: - CIDER_THROW(CiderCompileException, "String type ColumnVar is not supported now."); + col_values = codegenVarCharColVar(col_var, input_col_descriptor_ptr, pos_arg, co); + break; case kARRAY: CIDER_THROW(CiderCompileException, "Array type ColumnVar is not supported now."); default: @@ -234,6 +235,28 @@ std::unique_ptr CodeGenerator::codegenFixedLengthColVar( return std::make_unique(dec_val_cast, null); } +std::unique_ptr CodeGenerator::codegenVarCharColVar( + const Analyzer::ColumnVar* col_var, + llvm::Value* col_byte_stream, + llvm::Value* pos_arg, + const CompilationOptions& co) { + AUTOMATIC_IR_METADATA(cgen_state_); + const size_t size = 8; + VarcharDecoder decoder( + size, &cgen_state_->ir_builder_, !col_var->get_type_info().get_notnull()); + std::vector values = + decoder.codegenDecode(cgen_state_->module_, col_byte_stream, pos_arg); + for (auto v : values) { + cgen_state_->ir_builder_.Insert(v); + } + llvm::Instruction* null = nullptr; + if (values.size() == 3) { + null = values[2]; + values.pop_back(); + } + return std::make_unique(values[0], values[1], null); +} + std::vector CodeGenerator::codegenColVar(const Analyzer::ColumnVar* col_var, const bool fetch_column, const bool update_query_plan, diff --git a/cider/exec/template/CompareIR.cpp b/cider/exec/template/CompareIR.cpp index 7a77225dd..935565fc2 100644 --- a/cider/exec/template/CompareIR.cpp +++ b/cider/exec/template/CompareIR.cpp @@ -339,8 +339,8 @@ std::unique_ptr CodeGenerator::codegenCmpFun( if (lhs_nullable && rhs_nullable) { if (lhs_nullable->getNull() && rhs_nullable->getNull()) { - null = cgen_state_->ir_builder_.CreateAnd(lhs_nullable->getNull(), - rhs_nullable->getNull()); + null = cgen_state_->ir_builder_.CreateOr(lhs_nullable->getNull(), + rhs_nullable->getNull()); } else { null = lhs_nullable->getNull() ? lhs_nullable->getNull() : rhs_nullable->getNull(); } @@ -348,7 +348,14 @@ std::unique_ptr CodeGenerator::codegenCmpFun( null = lhs_nullable ? lhs_nullable->getNull() : rhs_nullable->getNull(); } - return codegenFixedSizeColCmpFun(bin_oper, lhs_lv.get(), rhs_lv.get(), null); + switch (lhs_ti.get_type()) { + case kVARCHAR: + case kTEXT: + case kCHAR: + return codegenVarcharCmpFun(bin_oper, lhs_lv.get(), rhs_lv.get(), null); + default: + return codegenFixedSizeColCmpFun(bin_oper, lhs_lv.get(), rhs_lv.get(), null); + } } std::unique_ptr CodeGenerator::codegenFixedSizeColCmpFun( @@ -375,6 +382,25 @@ std::unique_ptr CodeGenerator::codegenFixedSizeColCmpFun( return std::make_unique(value, null); } +std::unique_ptr CodeGenerator::codegenVarcharCmpFun( + const Analyzer::BinOper* bin_oper, + CodegenColValues* lhs, + CodegenColValues* rhs, + llvm::Value* null) { + AUTOMATIC_IR_METADATA(cgen_state_); + auto lhs_fixsize = dynamic_cast(lhs); + CHECK(lhs_fixsize); + auto rhs_fixsize = dynamic_cast(rhs); + CHECK(rhs_fixsize); + + llvm::Value* value = cgen_state_->emitCall("string_eq", + {lhs_fixsize->getValueAt(0), + lhs_fixsize->getValueAt(1), + rhs_fixsize->getValueAt(0), + rhs_fixsize->getValueAt(1)}); + return std::make_unique(value, null); +} + llvm::Value* CodeGenerator::codegenOverlaps(const SQLOps optype, const SQLQualifier qualifier, const std::shared_ptr lhs, diff --git a/cider/exec/template/IRCodegen.cpp b/cider/exec/template/IRCodegen.cpp index da8649da7..8fc9369ea 100644 --- a/cider/exec/template/IRCodegen.cpp +++ b/cider/exec/template/IRCodegen.cpp @@ -240,6 +240,13 @@ std::unique_ptr CodeGenerator::codegenConstantExpr( switch (ti.get_type()) { case kVARCHAR: + CHECK(constant_value.size() == 3); + return std::make_unique( + constant_value[1], + constant_value[2], + constant_expr->get_is_null() + ? llvm::ConstantInt::getTrue(cgen_state_->context_) + : llvm::ConstantInt::getFalse(cgen_state_->context_)); case kARRAY: UNREACHABLE(); default: diff --git a/cider/exec/template/TargetExprBuilder.cpp b/cider/exec/template/TargetExprBuilder.cpp index b8e39c0a1..26536b760 100644 --- a/cider/exec/template/TargetExprBuilder.cpp +++ b/cider/exec/template/TargetExprBuilder.cpp @@ -51,14 +51,15 @@ inline bool is_varlen_projection(const Analyzer::Expr* target_expr, } std::vector agg_fn_base_names(const TargetInfo& target_info, - const bool is_varlen_projection) { + const bool is_varlen_projection, + const bool use_arrow_format = false) { const auto& chosen_type = get_compact_type(target_info); if (is_varlen_projection) { UNREACHABLE(); return {"agg_id_varlen"}; } if (!target_info.is_agg || target_info.agg_kind == kSAMPLE) { - if (chosen_type.is_varlen()) { + if (chosen_type.is_varlen() && !use_arrow_format) { // not a varlen projection (not creating new varlen outputs). Just store the pointer // and offset into the input buffer in the output slots. return {"agg_id", "agg_id"}; @@ -149,7 +150,8 @@ void TargetExprCodegen::codegen( const auto arg_expr = agg_arg(target_expr); const bool varlen_projection = is_varlen_projection(target_expr, target_info.sql_type); - const auto agg_fn_names = agg_fn_base_names(target_info, varlen_projection); + const auto agg_fn_names = + agg_fn_base_names(target_info, varlen_projection, co.use_cider_data_format); const auto window_func = dynamic_cast(target_expr); WindowProjectNodeContext::resetWindowFunctionContext(executor); auto target_lvs = @@ -223,8 +225,10 @@ void TargetExprCodegen::codegenAggregate( auto& context = executor->getContext(); - const auto agg_fn_names = agg_fn_base_names( - target_info, is_varlen_projection(target_expr, target_info.sql_type)); + const auto agg_fn_names = + agg_fn_base_names(target_info, + is_varlen_projection(target_expr, target_info.sql_type), + co.use_cider_data_format); auto arg_expr = agg_arg(target_expr); for (const auto& agg_base_name : agg_fn_names) { @@ -298,16 +302,21 @@ void TargetExprCodegen::codegenAggregate( llvm::Value* project_arraies_i8 = LL_BUILDER.CreateIntToPtr(LL_BUILDER.CreateLoad(project_arraies_ptr, false), llvm::Type::getInt8PtrTy(context)); - llvm::Value* col_data = - executor->cgen_state_->emitCall("cider_ColDecoder_extractArrowBuffersAt", - {project_arraies_i8, LL_INT((uint64_t)1)}); - if (target_info.skip_null_val) { - llvm::Value* col_null = - executor->cgen_state_->emitCall("cider_ColDecoder_extractArrowBuffersAt", - {project_arraies_i8, LL_INT((uint64_t)0)}); - generator->codegen(agg_input_data, col_data, row_num, col_null); + if (target_info.sql_type.is_string()) { + // muset be a ProjectIDStringCodeGenerator + generator->codegen(agg_input_data, project_arraies_i8, row_num); } else { - generator->codegen(agg_input_data, col_data, row_num); + llvm::Value* col_data = + executor->cgen_state_->emitCall("cider_ColDecoder_extractArrowBuffersAt", + {project_arraies_i8, LL_INT((uint64_t)1)}); + if (target_info.skip_null_val) { + llvm::Value* col_null = executor->cgen_state_->emitCall( + "cider_ColDecoder_extractArrowBuffersAt", + {project_arraies_i8, LL_INT((uint64_t)0)}); + generator->codegen(agg_input_data, col_data, row_num, col_null); + } else { + generator->codegen(agg_input_data, col_data, row_num); + } } } else { // Fetch output buffers. @@ -613,8 +622,10 @@ void TargetExprCodegenBuilder::operator()(const Analyzer::Expr* target_expr, target_exprs_to_codegen.emplace_back( target_expr, target_info, slot_index_counter, target_index_counter++, is_group_by); - const auto agg_fn_names = agg_fn_base_names( - target_info, is_varlen_projection(target_expr, target_info.sql_type)); + const auto agg_fn_names = + agg_fn_base_names(target_info, + is_varlen_projection(target_expr, target_info.sql_type), + co.use_cider_data_format); slot_index_counter += agg_fn_names.size(); } diff --git a/cider/exec/template/operator/aggregate/CiderAggregateCodeGenerator.cpp b/cider/exec/template/operator/aggregate/CiderAggregateCodeGenerator.cpp index 8675904ea..4fc513e7c 100644 --- a/cider/exec/template/operator/aggregate/CiderAggregateCodeGenerator.cpp +++ b/cider/exec/template/operator/aggregate/CiderAggregateCodeGenerator.cpp @@ -110,7 +110,7 @@ std::unique_ptr SimpleAggregateCodeGenerator::Make( } } else { CIDER_THROW(CiderCompileException, - "Unsuppored data type for SimpleAggregateCodeGenerator."); + "Unsupported data type for SimpleAggregateCodeGenerator."); } if (target_info.skip_null_val) { @@ -160,7 +160,9 @@ std::unique_ptr ProjectIDCodeGenerator::Make( CHECK(cgen_state); CHECK(base_fname == "agg_id"); - auto generator = std::make_unique(); + auto generator = target_info.sql_type.is_string() + ? std::make_unique() + : std::make_unique(); generator->base_fname_ = "cider_" + base_fname + "_proj"; generator->target_info_ = target_info; @@ -199,8 +201,10 @@ std::unique_ptr ProjectIDCodeGenerator::Make( generator->base_fname_ += "_int64"; } else if (target_info.sql_type.is_boolean()) { generator->base_fname_ += "_bool"; + } else if (target_info.sql_type.is_string()) { + generator->base_fname_ += "_string"; } else { - throw std::runtime_error("Unsuppored data type for ProjectIDCodeGenerator."); + throw std::runtime_error("Unsupported data type for ProjectIDCodeGenerator."); } if (target_info.skip_null_val) { @@ -210,6 +214,48 @@ std::unique_ptr ProjectIDCodeGenerator::Make( return generator; } +void ProjectIDStringCodeGenerator::codegen(CodegenColValues* input, + llvm::Value* project_arraies_i8, + llvm::Value* index, + llvm::Value* output_null_buffer) const { + AUTOMATIC_IR_METADATA(cgen_state_); + CHECK(project_arraies_i8); + CHECK(index); + TwoValueColValues* args = dynamic_cast(input); + if (nullptr == args) { + CIDER_THROW(CiderCompileException, + "ProjectIDStringCodeGenerator only support MultipleValueCol data now."); + } + + cgen_state_->emitCall("reallocate_string_buffer_if_need", {project_arraies_i8, index}); + + llvm::Value* str_data_buffer = + cgen_state_->emitCall("cider_ColDecoder_extractArrowBuffersAt", + {project_arraies_i8, cgen_state_->llInt((uint64_t)2)}); + llvm::Value* str_offset_buffer = + cgen_state_->emitCall("cider_ColDecoder_extractArrowBuffersAt", + {project_arraies_i8, cgen_state_->llInt((uint64_t)1)}); + llvm::Value* null_buffer = + cgen_state_->emitCall("cider_ColDecoder_extractArrowBuffersAt", + {project_arraies_i8, cgen_state_->llInt((uint64_t)0)}); + + auto value_str_ptr = args->getValueAt(0), value_str_len = args->getValueAt(1), + is_null = args->getNull(); + index = cgen_state_->castToTypeIn(index, 64); + + std::vector fun_args = { + str_data_buffer, str_offset_buffer, index, value_str_ptr, value_str_len}; + if (target_info_.skip_null_val) { + CHECK(null_buffer); + CHECK(is_null); + null_buffer = castToIntPtrTyIn(null_buffer, 8); + fun_args.push_back(null_buffer); + fun_args.push_back(is_null); + } + cgen_state_->emitCall(base_fname_, fun_args); + return; +} + std::unique_ptr CountAggregateCodeGenerator::Make( const std::string& base_fname, TargetInfo target_info, diff --git a/cider/exec/template/operator/aggregate/CiderAggregateCodeGenerator.h b/cider/exec/template/operator/aggregate/CiderAggregateCodeGenerator.h index 3087aab76..e45cba775 100644 --- a/cider/exec/template/operator/aggregate/CiderAggregateCodeGenerator.h +++ b/cider/exec/template/operator/aggregate/CiderAggregateCodeGenerator.h @@ -69,6 +69,14 @@ class ProjectIDCodeGenerator : public SimpleAggregateCodeGenerator { CgenState* cgen_state); }; +class ProjectIDStringCodeGenerator : public ProjectIDCodeGenerator { + public: + void codegen(CodegenColValues* input, + llvm::Value* output_buffer, + llvm::Value* index, + llvm::Value* output_null_buffer = nullptr) const override; +}; + // Aggregate code generator for COUNT(*), COUNT(col), DISTINCT COUNT(col). class CountAggregateCodeGenerator : public AggregateCodeGenerator { public: diff --git a/cider/function/aggregate/CiderRuntimeFunctions.h b/cider/function/aggregate/CiderRuntimeFunctions.h index c4112341f..31163efa7 100644 --- a/cider/function/aggregate/CiderRuntimeFunctions.h +++ b/cider/function/aggregate/CiderRuntimeFunctions.h @@ -25,6 +25,7 @@ #include #include +#include "cider/CiderTypes.h" #include "exec/template/TypePunning.h" #include "type/data/funcannotations.h" #include "util/CiderBitUtils.h" @@ -106,6 +107,31 @@ template ALWAYS_INLINE void cider_agg_id(T& agg_val, const T& val) { agg_val = val; } + +extern "C" ALWAYS_INLINE void cider_agg_id_proj_string(int8_t* str_data_buffer, + int8_t* str_offset_buffer, + const uint64_t index, + int8_t* str_ptr, + const int32_t str_len) { + int32_t current_offset = reinterpret_cast(str_offset_buffer)[index]; + reinterpret_cast(str_offset_buffer)[index + 1] = current_offset + str_len; + memcpy(str_data_buffer + current_offset, str_ptr, str_len); +} + +extern "C" ALWAYS_INLINE void cider_agg_id_proj_string_nullable(int8_t* str_data_buffer, + int8_t* str_offset_buffer, + const uint64_t index, + int8_t* str_ptr, + const int32_t str_len, + uint8_t* agg_null_buffer, + bool is_null) { + if (is_null) { + CiderBitUtils::clearBitAt(agg_null_buffer, index); + } else { + cider_agg_id_proj_string(str_data_buffer, str_offset_buffer, index, str_ptr, str_len); + } +} + DEF_CIDER_SIMPLE_AGG_FUNCS(id, cider_agg_id) /********************************************************************************/ diff --git a/cider/function/scalar/DecodersImpl.h b/cider/function/scalar/DecodersImpl.h index 9c81c840a..7eea1fbae 100644 --- a/cider/function/scalar/DecodersImpl.h +++ b/cider/function/scalar/DecodersImpl.h @@ -26,6 +26,7 @@ #include #include "exec/module/batch/ArrowABI.h" +#include "exec/module/batch/CiderArrowBufferHolder.h" #include "type/data/funcannotations.h" extern "C" ALWAYS_INLINE int64_t SUFFIX(fixed_width_int_decode)(const int8_t* byte_stream, @@ -156,4 +157,21 @@ extern "C" ALWAYS_INLINE const uint8_t* cider_ColDecoder_extractArrowBuffersAt( return reinterpret_cast(ptr->buffers[index]); } +extern "C" ALWAYS_INLINE void reallocate_string_buffer_if_need( + const int8_t* input_desc_ptr, + const int64_t pos) { + // assumption: this arrow array is already initialized outside(it has 3 buffers, length + // is set) + const ArrowArray* ptr = reinterpret_cast(input_desc_ptr); + CiderArrowArrayBufferHolder* holder = + reinterpret_cast(ptr->private_data); + int32_t* offset_buffer = (int32_t*)ptr->buffers[1]; + size_t capacity = holder->getBufferSizeAt(2); + if (capacity == 0) { + holder->allocBuffer(2, 4096); + } else if (offset_buffer[pos] > + capacity * 0.9) { // do reallocate when reach 90% of capacity + holder->allocBuffer(2, capacity * 2); + } +} #endif // CIDER_FUNCTION_DECODERSIMPL_H diff --git a/cider/function/scalar/RuntimeFunctions.cpp b/cider/function/scalar/RuntimeFunctions.cpp index fc7815560..3d803fe66 100644 --- a/cider/function/scalar/RuntimeFunctions.cpp +++ b/cider/function/scalar/RuntimeFunctions.cpp @@ -1451,4 +1451,16 @@ extern "C" ALWAYS_INLINE void clear_bit_vector(uint8_t* bit_vector, uint64_t ind CiderBitUtils::clearBitAt(bit_vector, index); } +extern "C" ALWAYS_INLINE int8_t* extract_str_ptr_arrow(int8_t* data_buffer, + int8_t* offset_buffer, + uint64_t pos) { + return (data_buffer + reinterpret_cast(offset_buffer)[pos]); +} + +extern "C" ALWAYS_INLINE int32_t extract_str_len_arrow(int8_t* offset_buffer, + uint64_t pos) { + int32_t* offset = reinterpret_cast(offset_buffer); + return offset[pos + 1] - offset[pos]; +} + #include "function/aggregate/CiderRuntimeFunctions.h" diff --git a/cider/include/cider/batch/CiderBatch.h b/cider/include/cider/batch/CiderBatch.h index 35f3037db..d5005c1c9 100644 --- a/cider/include/cider/batch/CiderBatch.h +++ b/cider/include/cider/batch/CiderBatch.h @@ -30,9 +30,13 @@ #include "exec/module/batch/CiderArrowBufferHolder.h" #include "util/CiderBitUtils.h" +#define CIDER_BATCH_ARROW_IMPL +#define CIDER_BATCH_CIDER_IMPL + /// \class CiderBatch /// \brief This class will be data/table format interface/protocol class CiderBatch { +#ifdef CIDER_BATCH_ARROW_IMPL public: // Both constructors will take over the ownership of ArrowSchema & ArrowArray and // buffers they hold, so ArrowSchema and ArrowArray should allocated from @@ -89,7 +93,7 @@ class CiderBatch { } // TODO: Change to pure virtual function. - // CiderBatch dosen't contain null vector by default until getMutableNulls is called. + // CiderBatch doesn't contain null vector by default until getMutableNulls is called. bool resizeBatch(int64_t size, bool default_not_null = false); // This function has a side effect that the null vector will be allocated if there is no @@ -141,7 +145,9 @@ class CiderBatch { bool ownership_{false}; // Whether need to release the tree of schema_ and array_. bool reallocate_{ false}; // Whether permitted to (re-)allocate memory to buffers of array_. +#endif +#ifdef CIDER_BATCH_CIDER_IMPL public: /// \brief Constructs CiderBatch that will use row memory layout with self memory /// manager. It will allocate row_num * row_size memory internally and the allocated @@ -483,6 +489,7 @@ class CiderBatch { align_ = other->align_; null_vecs_ = other->null_vecs_; } +#endif }; #endif // CIDER_CIDERBATCH_H diff --git a/cider/include/cider/batch/ScalarBatch.h b/cider/include/cider/batch/ScalarBatch.h index 56a0a9346..9ef932429 100644 --- a/cider/include/cider/batch/ScalarBatch.h +++ b/cider/include/cider/batch/ScalarBatch.h @@ -85,4 +85,88 @@ class ScalarBatch final : public CiderBatch { } }; +class VarcharBatch final : public CiderBatch { + public: + static std::unique_ptr Create(ArrowSchema* schema, + std::shared_ptr allocator, + ArrowArray* array = nullptr) { + return array ? std::make_unique(schema, array, allocator) + : std::make_unique(schema, allocator); + } + + explicit VarcharBatch(ArrowSchema* schema, std::shared_ptr allocator) + : CiderBatch(schema, allocator) { + checkArrowEntries(); + } + explicit VarcharBatch(ArrowSchema* schema, + ArrowArray* array, + std::shared_ptr allocator) + : CiderBatch(schema, array, allocator) { + checkArrowEntries(); + } + + uint8_t* getMutableRawData() { + CHECK(!isMoved()); + return reinterpret_cast( + const_cast(getBuffersPtr()[getDataBufferIndex()])); + } + + const uint8_t* getRawData() const { + CHECK(!isMoved()); + return reinterpret_cast(getBuffersPtr()[getDataBufferIndex()]); + } + + int32_t* getMutableRawOffset() { + CHECK(!isMoved()); + return reinterpret_cast( + const_cast(getBuffersPtr()[getOffsetBufferIndex()])); + } + + const int32_t* getRawOffset() const { + CHECK(!isMoved()); + return reinterpret_cast(getBuffersPtr()[getOffsetBufferIndex()]); + } + + bool resizeDataBuffer(int64_t size) { + CHECK(!isMoved()); + if (!permitBufferAllocate()) { + return false; + } + + auto array_holder = reinterpret_cast(getArrayPrivate()); + array_holder->allocBuffer(2, size); + return true; + } + + protected: + inline const size_t getOffsetBufferIndex() const { return 1; } + inline const size_t getDataBufferIndex() const { return 2; } + + bool resizeData(int64_t size) override { + CHECK(!isMoved()); + if (!permitBufferAllocate()) { + return false; + } + + auto array_holder = reinterpret_cast(getArrayPrivate()); + size_t origin_offset_len = array_holder->getBufferSizeAt(1); + array_holder->allocBuffer(1, sizeof(int32_t) * (size + 1)); // offset buffer + std::memset((void*)(getMutableRawOffset() + origin_offset_len / sizeof(int32_t)), + 0, + sizeof(int32_t) * (size + 1) - origin_offset_len); + size_t bytes = array_holder->getBufferSizeAt(2); + array_holder->allocBuffer(2, bytes); // data buffer, it should never shrink. + + setLength(size); + + return true; + } + + private: + void checkArrowEntries() const { + CHECK_EQ(getChildrenNum(), 0); + CHECK_EQ(getBufferNum(), 3); + } +}; + #endif diff --git a/cider/tests/CiderArrowBatchTest.cpp b/cider/tests/CiderArrowBatchTest.cpp index ef8470d29..885f962b0 100644 --- a/cider/tests/CiderArrowBatchTest.cpp +++ b/cider/tests/CiderArrowBatchTest.cpp @@ -310,6 +310,107 @@ TEST_F(CiderArrowBatchTest, ScalarBatchTest) { {true, true, false, true, true}); } +void runVarcharBatchTest(VarcharBatch* batch, + const vector& data, + const vector& offset, + const vector& not_null = {}) { + assert(data.size() == not_null.size() || not_null.empty()); + CHECK(batch); + + EXPECT_EQ(batch->getCiderType(), kVARCHAR); + + EXPECT_TRUE(batch->resizeBatch(data.size())); + size_t total_len = [&]() { + size_t len = 0; + for (auto s : data) { + len += s.length(); + } + return len; + }(); + EXPECT_EQ(total_len, offset[offset.size() - 1]); + EXPECT_TRUE(batch->resizeDataBuffer(total_len)); + EXPECT_EQ(batch->getLength(), data.size()); + EXPECT_EQ(batch->getNullCount(), 0); + + { + auto raw_data = batch->getMutableRawData(); + auto raw_offset = batch->getMutableRawOffset(); + EXPECT_NE(raw_data, nullptr); + EXPECT_NE(raw_offset, nullptr); + + for (size_t i = 0; i < data.size(); ++i) { + std::memcpy(raw_data + raw_offset[i], data[i].c_str(), data[i].length()); + raw_offset[i + 1] = raw_offset[i] + data[i].length(); + } + } + + int64_t null_count = 0; + if (!not_null.empty()) { + auto not_null_data = batch->getMutableNulls(); + EXPECT_EQ(batch->getNullCount(), 0); + EXPECT_NE(not_null_data, nullptr); + for (size_t i = 0; i < not_null.size(); ++i) { + if (!not_null[i]) { + CiderBitUtils::clearBitAt(not_null_data, i); + ++null_count; + } + } + batch->setNullCount(null_count); + } + + EXPECT_TRUE(batch->resizeBatch(2 * data.size())); + EXPECT_EQ(batch->getLength(), 2 * data.size()); + EXPECT_EQ(batch->getNullCount(), + not_null.empty() ? 0 : null_count + (int64_t)data.size()); + + { + auto raw_data = batch->getRawData(); + auto raw_offset = batch->getMutableRawOffset(); + EXPECT_NE(raw_data, nullptr); + EXPECT_NE(raw_offset, nullptr); + for (size_t i = 0; i < data.size(); ++i) { + EXPECT_EQ(0, + std::memcmp(raw_data + raw_offset[i], data[i].c_str(), data[i].length())); + } + } + + if (!not_null.empty()) { + auto not_null_data = batch->getNulls(); + EXPECT_NE(not_null_data, nullptr); + for (size_t i = 0; i < not_null.size(); ++i) { + EXPECT_EQ(CiderBitUtils::isBitSetAt(not_null_data, i), not_null[i]); + } + for (size_t i = 0; i < not_null.size(); ++i) { + EXPECT_EQ(CiderBitUtils::isBitSetAt(not_null_data, i + not_null.size()), false); + } + } +} + +TEST_F(CiderArrowBatchTest, VarcharBatchNoNullTest) { + SQLTypeInfo type_info(kVARCHAR, true); + ArrowSchema* schema = convertCiderTypeInfoToArrowSchema(type_info); + + auto batch = VarcharBatch::Create(schema, ciderAllocator); + EXPECT_TRUE(batch->isRootOwner()); + + std::vector data{"a", "b", "c", "d"}; + std::vector offset{0, 1, 2, 3, 4}; + runVarcharBatchTest(batch.get(), data, offset); +} + +TEST_F(CiderArrowBatchTest, VarcharBatchNullTest) { + SQLTypeInfo type_info(kVARCHAR, true); + ArrowSchema* schema = convertCiderTypeInfoToArrowSchema(type_info); + + auto batch = VarcharBatch::Create(schema, ciderAllocator); + EXPECT_TRUE(batch->isRootOwner()); + + std::vector data{"a", "b", "c", "d"}; + std::vector offset{0, 1, 2, 3, 4}; + std::vector null{true, false, true, false}; + runVarcharBatchTest(batch.get(), data, offset, null); +} + TEST_F(CiderArrowBatchTest, StructBatchTest) { { SQLTypeInfo type( diff --git a/cider/tests/functionality/CiderStringTest.cpp b/cider/tests/functionality/CiderStringTest.cpp index c69a10624..4e783ca9e 100644 --- a/cider/tests/functionality/CiderStringTest.cpp +++ b/cider/tests/functionality/CiderStringTest.cpp @@ -26,7 +26,8 @@ class CiderStringTest : public CiderTestBase { public: CiderStringTest() { table_name_ = "test"; - create_ddl_ = R"(CREATE TABLE test(col_1 INTEGER, col_2 VARCHAR(10));)"; + create_ddl_ = + R"(CREATE TABLE test(col_1 INTEGER NOT NULL, col_2 VARCHAR(10) NOT NULL);)"; input_ = {std::make_shared(QueryDataGenerator::generateBatchByTypes( 10, {"col_1", "col_2"}, @@ -247,6 +248,28 @@ LIKE_STRING_TEST_UNIT(CiderNullableStringTest, likeNullableStringTest) ESCAPE_STRING_TEST_UNIT(CiderNullableStringTest, escapeNullableStringTest) IN_STRING_TEST_UNIT(CiderNullableStringTest, inNullableStringTest) +class CiderStringTestArrow : public CiderTestBase { + public: + CiderStringTestArrow() { + table_name_ = "test"; + create_ddl_ = + R"(CREATE TABLE test(col_1 INTEGER NOT NULL, col_2 VARCHAR(10) NOT NULL);)"; + + input_ = {std::make_shared(QueryDataGenerator::generateBatchByTypes( + 10, + {"col_1", "col_2"}, + {CREATE_SUBSTRAIT_TYPE(I32), CREATE_SUBSTRAIT_TYPE(Varchar)}))}; + } +}; + +TEST_F(CiderStringTestArrow, ArrowBasicStringTest) { + prepareArrowBatch(); + assertQueryArrowTemp("SELECT col_1 FROM test "); + assertQueryArrowTemp("SELECT col_2 FROM test "); + assertQueryArrowTemp("SELECT col_1, col_2 FROM test "); + assertQueryArrowTemp("SELECT col_2 FROM test where col_2 = '0000000000'"); +} + class CiderConstantStringTest : public CiderTestBase { public: CiderConstantStringTest() { diff --git a/cider/tests/utils/CiderTestBase.cpp b/cider/tests/utils/CiderTestBase.cpp index 109d0e71b..ce143a846 100644 --- a/cider/tests/utils/CiderTestBase.cpp +++ b/cider/tests/utils/CiderTestBase.cpp @@ -59,6 +59,17 @@ void CiderTestBase::assertQueryArrow(const std::string& sql, EXPECT_TRUE(CiderBatchChecker::checkArrowEq(duck_res_batch, cider_res_batch, false)); } +void CiderTestBase::assertQueryArrowTemp(const std::string& sql, + const std::string& json_file) { + auto duck_res = duckDbQueryRunner_.runSql(sql); + auto duck_res_batch = DuckDbResultConvertor::fetchDataToCiderBatch(duck_res); + + auto cider_input = json_file.size() ? json_file : sql; + auto cider_res_batch = std::make_shared( + ciderQueryRunner_.runQueryOneBatch(cider_input, input_[0], true)); + EXPECT_TRUE(CiderBatchChecker::checkArrowEqTemp(duck_res_batch, cider_res_batch)); +} + void CiderTestBase::assertQueryArrowIgnoreOrder(const std::string& sql, const std::string& json_file) { auto duck_res = duckDbQueryRunner_.runSql(sql); diff --git a/cider/tests/utils/CiderTestBase.h b/cider/tests/utils/CiderTestBase.h index 8710985bb..f5ca1603b 100644 --- a/cider/tests/utils/CiderTestBase.h +++ b/cider/tests/utils/CiderTestBase.h @@ -55,6 +55,9 @@ class CiderTestBase : public testing::Test { void assertQueryArrow(const std::string& sql, const std::string& json_file = ""); void assertQueryArrowIgnoreOrder(const std::string& sql, const std::string& json_file = ""); + // To be deprecated. for string and other unsupported types. + void assertQueryArrowTemp(const std::string& sql, const std::string& json_file = ""); + // a method for test count distinct with multi batch case void assertQueryForCountDistinct( const std::string& sql,