From 1f58185e78ee5dc5d5a7cbbb9c1c6a93ba18a9f8 Mon Sep 17 00:00:00 2001
From: marty1885
Date: Wed, 26 Jun 2024 14:24:22 +0000
Subject: [PATCH] baseline implementation for CPY

---
 ggml-metalium.cpp | 71 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 25 deletions(-)

diff --git a/ggml-metalium.cpp b/ggml-metalium.cpp
index 34bd4fae28688..7c13d3a828b16 100644
--- a/ggml-metalium.cpp
+++ b/ggml-metalium.cpp
@@ -1,15 +1,18 @@
 #include "common/bfloat16.hpp"
+#include "common/constants.hpp"
 #include "device/tt_arch_types.h"
 #include "ggml-backend-impl.h"
 #include "ggml.h"
 #include "ggml-metalium.h"
 #include "host_api.hpp"
+#include "hostdevcommon/kernel_structs.h"
 #include "impl/dispatch/command_queue.hpp"
 #include "tensor/host_buffer/functions.hpp"
 #include "tensor/host_buffer/types.hpp"
 #include "tensor/types.hpp"
 #include "tt_dnn/op_library/auto_format.hpp"
+#include "tt_dnn/op_library/composite/composite_ops.hpp"
 #include "tt_dnn/op_library/tilize/tilize_op.hpp"
 #include "tt_dnn/op_library/untilize/untilize_op.hpp"
 #include
@@ -84,6 +87,8 @@ static void ggml_backend_metalium_mul_mat(ggml_backend_metalium_context * ctx, s
     GGML_ASSERT(am != NULL);
     GGML_ASSERT(bm != NULL);
     GGML_ASSERT(cm != NULL);
+    GGML_ASSERT(am->tensor != NULL);
+    GGML_ASSERT(bm->tensor != NULL);
 
     tt::tt_metal::Tensor& a = *am->tensor;
     tt::tt_metal::Tensor& b = *bm->tensor;
@@ -94,6 +99,7 @@ static void ggml_backend_metalium_mul_mat(ggml_backend_metalium_context * ctx, s
     auto aT = tt::tt_metal::transpose(a, -2, -1);
 #if !defined(NDEBUG) || 1
     // TODO: Remove this in the future. TTNN has buggy transpose implementation
+    std::cout << "a.shape: " << a.shape() << " aT.shape: " << aT.shape() << std::endl;
     GGML_ASSERT(aT.shape()[0] == a.shape()[0]);
     GGML_ASSERT(aT.shape()[1] == a.shape()[1]);
     GGML_ASSERT(aT.shape()[3] == a.shape()[2]);
@@ -101,11 +107,27 @@ static void ggml_backend_metalium_mul_mat(ggml_backend_metalium_context * ctx, s
 #endif
 
     // TODO: Ask TT to support multiplication of pre-transposed tensors. Calling transpose here is inefficient
+    // https://github.com/tenstorrent/tt-metal/issues/9709
     cm->tensor = std::make_shared<tt::tt_metal::Tensor>(tt::tt_metal::fully_connected(b, aT));
-    auto at = tt::tt_metal::transpose(a, 2, 3);
     GGML_UNUSED(ctx);
 }
 
+static void ggml_backend_metalium_cpy(ggml_backend_metalium_context * ctx, struct ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+    const struct ggml_tensor * src0 = dst->src[0];
+    TensorWithMetadata* meta = (TensorWithMetadata*)src0->extra;
+    GGML_ASSERT(meta != NULL);
+    GGML_ASSERT(meta->tensor != NULL);
+
+    TensorWithMetadata* dst_meta = (TensorWithMetadata*)dst->extra;
+    GGML_ASSERT(dst_meta != NULL);
+
+    tt::tt_metal::Tensor ret = tt::tt_metal::zeros_like(*meta->tensor);
+    ret.deepcopy(*meta->tensor);
+    GGML_ASSERT(ret.storage_type() == tt::tt_metal::StorageType::DEVICE || ret.storage_type() == tt::tt_metal::StorageType::MULTI_DEVICE);
+    dst_meta->tensor = std::make_shared<tt::tt_metal::Tensor>(std::move(ret));
+}
+
 static void ggml_backend_metalium_out_prod(ggml_backend_metalium_context * ctx, struct ggml_tensor * dst) {
     GGML_UNUSED(ctx);
     GGML_UNUSED(dst);
@@ -207,6 +229,7 @@ static void ggml_backend_metalium_buffer_set_tensor(ggml_backend_buffer_t buffer
     } else if (ggtype == GGML_TYPE_F32) {
         // For now we cast F32 to BF16.
        // Need a scalable way to handle this as WORMHOLE_B0 has native support for F32
+        // TODO: Might want to consider disabling F32 support for Grayskull in the future
         std::vector<bfloat16> bfloat16_data(size / sizeof(float));
         const float* f32_data = (const float*)data;
         for(size_t i = 0; i < size / sizeof(float); i++) {
@@ -232,9 +255,11 @@ static void ggml_backend_metalium_buffer_set_tensor(ggml_backend_buffer_t buffer
     // I think we can allow this.. right?
     // GGML_ASSERT(!bufctx->tensors.contains(offset));
 
+    // TODO: Make sure this is the correct tilize we want to use
+    t = tt::tt_metal::tilize_with_zero_padding(t.to(bufctx->device));
     *meta = TensorWithMetadata {
-        .tensor = std::make_shared<tt::tt_metal::Tensor>(tt::tt_metal::tilize_with_zero_padding(t.to(bufctx->device))),
+        .tensor = std::make_shared<tt::tt_metal::Tensor>(std::move(t)),
         .ggtype = ggtype,
     };
 }
@@ -274,17 +299,12 @@ static void ggml_backend_metalium_buffer_get_tensor(ggml_backend_buffer_t buffer
     tt::tt_metal::memcpy(queue, buf.data(), row_major_tensor);
     tt::tt_metal::Finish(queue);
 
-    tt::tt_metal::Shape tt_underlying_shape = row_major_tensor.shape().value();
-    if(tt_underlying_shape[2] % 32 != 0) {
-        tt_underlying_shape[2] = (tt_underlying_shape[2] / 32 + 1) * 32;
-    }
-    if (tt_underlying_shape[3] % 32 != 0) {
-        tt_underlying_shape[3] = (tt_underlying_shape[3] / 32 + 1) * 32;
-    }
+    ttnn::Shape tt_underlying_shape = row_major_tensor.shape().with_tile_padding();
 
     std::array stride = {1, tt_underlying_shape[3], tt_underlying_shape[3] * tt_underlying_shape[2], tt_underlying_shape[3] * tt_underlying_shape[2] * tt_underlying_shape[1]};
     // Untilize to ROW_MAJOR doesn't mean the tensor is contiguous. It still has the underlying 32x32 tiles
     // we need to view into the tensor to get the contiguous data
+    // TODO: Make sure this is correct. As of now not tested for large (>32x32) tensors
     size_t idx = 0;
     for(size_t w = 0; w < shape[0]; w++) {
         for(size_t z = 0; z < shape[1]; z++) {
@@ -309,7 +329,7 @@ static void ggml_backend_metalium_buffer_get_tensor(ggml_backend_buffer_t buffer
 static void * ggml_backend_metalium_buffer_get_base(ggml_backend_buffer_t buffer) {
     // Not using this.
    // Metalium's allocation model is not compatible with GGML's allocator
     GGML_UNUSED(buffer);
-    return (void*)0x10000;
+    return (void*)0xdeadbeef;
 }
 
 GGML_CALL static void
@@ -346,7 +366,7 @@ ggml_backend_metalium_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     ctx->ggml_buffer_size_bytes = size;
     ctx->name = ctx->name;
     ctx->device = buft_ctx->device;
-    return ggml_backend_buffer_init(buft, ggml_backend_metalium_buffer_interface, ctx, size);
+    return ggml_backend_buffer_init(buft, ggml_backend_metalium_buffer_interface, ctx, size + 4096 * 1024);
 }
 
 static ggml_backend_buffer_type_i ggml_backend_metalium_buffer_type_interface = {
@@ -398,6 +418,10 @@ GGML_CALL static enum ggml_status ggml_backend_metalium_graph_compute(ggml_backe
             case GGML_OP_OUT_PROD:
                 ggml_backend_metalium_out_prod(ctx, node);
                 break;
+
+            case GGML_OP_CPY:
+                ggml_backend_metalium_cpy(ctx, node);
+                break;
 
             case GGML_OP_NONE:
             case GGML_OP_RESHAPE:
@@ -420,21 +444,18 @@ GGML_CALL static enum ggml_status ggml_backend_metalium_graph_compute(ggml_backe
 GGML_CALL static bool ggml_backend_metalium_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     const struct ggml_tensor * src0 = op->src[0];
     const struct ggml_tensor * src1 = op->src[1];
-    GGML_UNUSED(src0);
-    GGML_UNUSED(src1);
-
-    if(op->op == GGML_OP_NONE) {
-        return true;
+    GGML_ASSERT(op != NULL);
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+            return true;
+        case GGML_OP_MUL_MAT:
+            return op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
+        case GGML_OP_CPY:
+            return op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
+        default:
+            return false;
     }
-    return op->op == GGML_OP_MUL_MAT && op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
-
-    /*return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) ||
-           (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 &&
-            op->src[1]->type == GGML_TYPE_F32 &&
-            ggml_is_matrix(src0) &&
-            ggml_is_matrix(src1) &&
-            ggml_is_contiguous(src0) &&
-            (ggml_is_contiguous(src1) || ggml_is_transposed(src1)));*/
 
     GGML_UNUSED(backend);
 }
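
Note on the strided readback in ggml_backend_metalium_buffer_get_tensor: even after untilizing to ROW_MAJOR, the buffer keeps its last two dimensions padded up to multiples of 32 (the tile size), so logical rows are not contiguous and the copy loop has to walk the padded buffer with explicit strides. The following is a minimal, self-contained sketch of that indexing, not part of the patch; the helper names (pad32, copy_from_tile_padded), the float element type, and the [dim3, dim2, rows, cols] shape convention are assumptions for illustration.

// Sketch only: gather a logically-shaped tensor out of a row-major buffer
// whose last two dimensions are padded to multiples of 32.
#include <cstddef>

static size_t pad32(size_t v) { return (v + 31) / 32 * 32; }

// shape = logical [dim3, dim2, rows, cols]; src is the tile-padded row-major buffer
static void copy_from_tile_padded(float * dst, const float * src, const size_t shape[4]) {
    const size_t pad_rows = pad32(shape[2]);
    const size_t pad_cols = pad32(shape[3]);
    // strides over the padded buffer, innermost dimension first
    const size_t stride[4] = {1, pad_cols, pad_cols * pad_rows, pad_cols * pad_rows * shape[1]};
    size_t idx = 0;
    for (size_t w = 0; w < shape[0]; w++)
        for (size_t z = 0; z < shape[1]; z++)
            for (size_t y = 0; y < shape[2]; y++)
                for (size_t x = 0; x < shape[3]; x++)
                    // dst is packed densely; src is read through the padded strides
                    dst[idx++] = src[w * stride[3] + z * stride[2] + y * stride[1] + x * stride[0]];
}

For example, a logical 1x1x2x3 tensor sits in a 32x32-float padded buffer, and only the first 3 elements of each of the first 2 padded rows are copied out.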