diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt
index ec7d564c1..7aa4284d6 100644
--- a/icicle/CMakeLists.txt
+++ b/icicle/CMakeLists.txt
@@ -18,6 +18,11 @@ endif()
 # Print the selected build type
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 
+# Prevent build if both SANITIZE and CUDA_BACKEND are enabled
+if(SANITIZE AND CUDA_BACKEND) 
+  message(FATAL_ERROR "Address sanitizer and Cuda cannot be enabled at the same time.")
+endif()
+
 # Find the ccache program
 find_program(CCACHE_PROGRAM ccache)
 # If ccache is found, use it as the compiler launcher
@@ -132,5 +137,4 @@ endif()
 
 if (BUILD_TESTS)
   add_subdirectory(tests)
-endif()
-
+endif()
\ No newline at end of file
diff --git a/icicle/backend/cpu/src/hash/cpu_merkle_tree.cpp b/icicle/backend/cpu/src/hash/cpu_merkle_tree.cpp
index eaad30eeb..e5def4040 100644
--- a/icicle/backend/cpu/src/hash/cpu_merkle_tree.cpp
+++ b/icicle/backend/cpu/src/hash/cpu_merkle_tree.cpp
@@ -40,18 +40,16 @@ namespace icicle {
 
         if (0 < layer_idx) {
           // update nof_hashes to the next layer (below)
-          const uint64_t next_layer_total_size = layer_hashes[layer_idx - 1].output_size();
-          nof_hashes = nof_hashes * cur_layer.m_hash.default_input_chunk_size() / next_layer_total_size;
+          const uint64_t cur_layer_input_size = layer_hashes[layer_idx].default_input_chunk_size();
+          nof_hashes = nof_hashes * cur_layer_input_size / layer_hashes[layer_idx - 1].output_size();
 
           // Calculate path_size
-          m_pruned_path_size +=
-            layer_hashes[layer_idx].default_input_chunk_size() - layer_hashes[layer_idx - 1].output_size();
-          m_full_path_size += layer_hashes[layer_idx].default_input_chunk_size();
+          m_pruned_path_size += cur_layer_input_size - layer_hashes[layer_idx - 1].output_size();
+          m_full_path_size += cur_layer_input_size;
         }
       }
     }
 
-    // TODO: handle size
     eIcicleError build(const std::byte* leaves, uint64_t leaves_size, const MerkleTreeConfig& config) override
     {
       TasksManager<HashTask> task_manager(get_nof_workers(config)); // Run workers.
@@ -59,23 +57,17 @@ namespace icicle {
         ICICLE_LOG_ERROR << "Tree cannot be built more than one time";
         return eIcicleError::INVALID_ARGUMENT;
       }
-      const uint64_t expected_input_size = m_layers[0].m_nof_hashes * m_layers[0].m_hash.default_input_chunk_size();
-      if (leaves_size < expected_input_size) {
-        ICICLE_LOG_ERROR << "CPU Merkle tree: Expecting " << expected_input_size << " bytes in input, got "
-                         << leaves_size << ". Note: Padding is currently not supported but will be soon";
-        return eIcicleError::INVALID_ARGUMENT;
-      }
-      if (leaves_size > expected_input_size) {
-        ICICLE_LOG_ERROR << "CPU Merkle tree: Expecting " << expected_input_size << " bytes in input, got "
-                         << leaves_size << ". Leaves size cannot exceeds tree size.";
+      // buffer for the last segment of leaves when it needs padding (left empty when no padding is required).
+      std::vector<std::byte> padded_leaves;
+      if (!init_layers_db(config, leaves_size) || !init_padded_leaves(padded_leaves, leaves, leaves_size, config)) {
         return eIcicleError::INVALID_ARGUMENT;
       }
+      const int nof_layers = m_layers.size();
       m_tree_already_built = true; // Set the tree status as built
       uint64_t l0_segment_idx = 0; // Index for the segment of hashes from layer 0 to send
-      const int nof_layers = m_layers.size();
-      init_layers_db(config);
       const uint64_t nof_segments_at_l0 =
-        (m_layers[0].m_nof_hashes + NOF_OPERATIONS_PER_TASK - 1) / NOF_OPERATIONS_PER_TASK;
+        ((m_layers[0].m_nof_hashes_2_execute - m_layers[0].m_last_hash_config.batch) / NOF_OPERATIONS_PER_TASK) + 1;
+      bool padding_required = !padded_leaves.empty();
 
       // run until the root is processed
       while (1) {
@@ -95,7 +87,8 @@ namespace icicle {
 
           // delete completed_segment_id from the map
           const uint64_t completed_segment_id = completed_segment_idx ^ (completed_layer_idx << 56);
-          m_map_segment_id_2_inputs.erase(completed_segment_id);
+          auto segment = m_map_segment_id_2_inputs.find(completed_segment_id);
+          if (segment != m_map_segment_id_2_inputs.end()) { m_map_segment_id_2_inputs.erase(completed_segment_id); }
 
           // Calculate Current-Segment-ID. The completed task generated inputs for Current-Segment
           const uint64_t cur_layer_idx = completed_layer_idx + 1;
@@ -104,28 +97,35 @@ namespace icicle {
 
           // update m_map_segment_id_2_inputs with the data that is ready for process
           auto cur_segment_iter = m_map_segment_id_2_inputs.find(cur_segment_id);
-          cur_segment_iter->second->m_nof_inputs_ready +=
-            m_layers[completed_layer_idx].m_hash.output_size() * NOF_OPERATIONS_PER_TASK;
+          cur_segment_iter->second->increment_ready(
+            m_layers[completed_layer_idx].m_hash.output_size() * task->m_hash_config->batch);
 
           // check if cur segment is ready to be executed
           const Hash cur_hash = m_layers[cur_layer_idx].m_hash;
           const uint64_t nof_hashes_in_seg = std::min(
-            m_layers[cur_layer_idx].m_nof_hashes - cur_segment_idx * NOF_OPERATIONS_PER_TASK,
+            m_layers[cur_layer_idx].m_nof_hashes_2_execute - cur_segment_idx * NOF_OPERATIONS_PER_TASK,
             uint64_t(NOF_OPERATIONS_PER_TASK));
-          //          ICICLE_ASSERT(
-          //             > 0) << "Edge case negative number of hashes"; // Koren what is that for?
-          if (cur_segment_iter->second->m_nof_inputs_ready >= cur_hash.default_input_chunk_size() * nof_hashes_in_seg) {
-            const std::byte* task_input = (completed_layer_idx < m_output_store_min_layer)
-                                            ? cur_segment_iter->second->m_input_data
-                                            : &(m_layers[completed_layer_idx].m_results[0]);
-            dispatch_task(task, cur_layer_idx, cur_segment_idx, task_input);
+          if (cur_segment_iter->second->is_ready()) {
+            const auto task_input = (completed_layer_idx < m_output_store_min_layer)
+                                      ? cur_segment_iter->second->m_input_data.get()
+                                      : m_layers[completed_layer_idx].m_results.data();
+            dispatch_task(task, cur_layer_idx, cur_segment_idx, task_input, cur_layer_idx > m_output_store_min_layer);
             continue;
           }
         }
 
-        if (l0_segment_idx < nof_segments_at_l0) {
-          // send a task from layer 0
-          dispatch_task(task, 0, l0_segment_idx, leaves);
+        // send task from layer 0:
+        // If leaves data is available, send a task based on leaves
+        if (l0_segment_idx + padding_required < nof_segments_at_l0) {
+          dispatch_task(task, 0, l0_segment_idx, leaves, true);
+          l0_segment_idx++;
+          continue;
+        }
+
+        // If padding is required
+        if (padding_required) {
+          dispatch_task(task, 0, l0_segment_idx, padded_leaves.data(), false);
+          padding_required = false;
           l0_segment_idx++;
           continue;
         }
@@ -155,19 +155,38 @@ namespace icicle {
       const uint64_t element_offset = leaf_idx * m_leaf_element_size;
       const int l0_total_input_size = m_layers[0].m_hash.default_input_chunk_size();
       if (leaf_idx > m_layers[0].m_nof_hashes * l0_total_input_size) {
-        ICICLE_LOG_ERROR << "Element index out of range. Should be smaller than "
+        ICICLE_LOG_ERROR << "Leaf index (" << leaf_idx << ") out of range. Should be smaller than "
                          << m_layers[0].m_nof_hashes * l0_total_input_size / m_leaf_element_size;
       }
-      uint64_t input_chunk_offset = (element_offset / l0_total_input_size) * l0_total_input_size;
 
-      // allocate merkle_proof memory
       const auto [root, root_size] = get_merkle_root();
-      const auto input_chunk_size = m_layers[0].m_hash.default_input_chunk_size();
-      merkle_proof.allocate(is_pruned, leaf_idx, &leaves[input_chunk_offset], input_chunk_size, root, root_size);
+      // location of the leaves to copy to the proof
+      uint64_t proof_leaves_offset = (element_offset / l0_total_input_size) * l0_total_input_size;
+      // leaf size at the proof
+      const auto proof_leaves_size = m_layers[0].m_hash.default_input_chunk_size();
+      // calc the amount of leaves to copy to the proof
+      uint64_t copy_leaves_size = (proof_leaves_offset + proof_leaves_size <= leaves_size) ? proof_leaves_size
+                                                                                           : // all leaves available
+                                    std::min(proof_leaves_size, leaves_size - proof_leaves_offset);
+      // generate a vector with the proof leaves
+      std::vector<std::byte> proof_leaves(proof_leaves_size, std::byte(0));
+      std::memcpy(proof_leaves.data(), &leaves[proof_leaves_offset], copy_leaves_size);
+
+      // if PaddingPolicy::LastValue pad the vector with the last value
+      if (config.padding_policy == PaddingPolicy::LastValue) {
+        const std::byte* last_element = &leaves[leaves_size - m_leaf_element_size];
+        while (copy_leaves_size < proof_leaves_size) {
+          std::memcpy(proof_leaves.data() + copy_leaves_size, last_element, m_leaf_element_size);
+          copy_leaves_size += m_leaf_element_size;
+        }
+      }
+
+      // allocate merkle_proof memory
+      merkle_proof.allocate(is_pruned, leaf_idx, proof_leaves.data(), proof_leaves_size, root, root_size);
 
       std::byte* path = merkle_proof.allocate_path_and_get_ptr(is_pruned ? m_pruned_path_size : m_full_path_size);
 
-      // if not all leaves are stored
+      // if not all results are stored
       if (m_output_store_min_layer != 0) {
         // Define a new tree tree to retrieve the forgotten hash results
         const std::vector<Hash> sub_tree_layer_hashes(
@@ -182,11 +201,12 @@ namespace icicle {
         sub_tree.build(sub_tree_leaves, sub_tree_leaves_size, config);
 
         // retrieve from the sub tree the path and increment path
-        const uint64_t sub_tree_input_chunk_offset = element_offset % sub_tree_leaves_size;
-        path = sub_tree.copy_to_path_from_store_min_layer(sub_tree_input_chunk_offset, is_pruned, path);
+        const uint64_t sub_tree_proof_leaves_offset = element_offset % sub_tree_leaves_size;
+        path = sub_tree.copy_to_path_from_store_min_layer(sub_tree_proof_leaves_offset, is_pruned, path);
       }
 
-      path = copy_to_path_from_store_min_layer(input_chunk_offset, is_pruned, path);
+      path = copy_to_path_from_store_min_layer(proof_leaves_offset, is_pruned, path);
+      // print_proof(merkle_proof);
       return eIcicleError::SUCCESS;
     }
 
@@ -200,7 +220,7 @@ namespace icicle {
         std::cout << std::dec << "Layer " << layer_idx << ": " << m_layers[layer_idx].m_hash.default_input_chunk_size()
                   << " -> " << m_layers[layer_idx].m_hash.output_size() << std::endl;
         print_bytes(
-          m_layers[layer_idx].m_results.data(), m_layers[layer_idx].m_nof_hashes,
+          m_layers[layer_idx].m_results.data(), m_layers[layer_idx].m_nof_hashes_2_execute,
           m_layers[layer_idx].m_hash.output_size());
       }
       return eIcicleError::SUCCESS;
@@ -250,31 +270,31 @@ namespace icicle {
     struct LayerDB {
       LayerDB() : m_hash(nullptr) {}
 
-      Hash m_hash;                      // the hash function
-      int64_t m_nof_hashes;             // number of hash functions.
+      Hash m_hash;                     // the hash function
+      uint64_t m_nof_hashes;           // number of hash functions per layer. Maybe can change to m_input_layer_size
+      uint64_t m_nof_hashes_2_execute; // number of hash functions that need to be calculated
+
       std::vector<std::byte> m_results; // vector of hash results. This vector might not be fully allocated if layer is
                                         // not in range m_output_store_min/max_layer
       HashConfig m_hash_config;         // config when calling a hash function not last in layer
       HashConfig m_last_hash_config;    // config when calling last in layer hash function
-      std::vector<std::byte> m_zero_padded_input; // contains the last input in case padding is required
-      std::vector<std::byte> m_zero_input;        // zero vector for padded inputs
     };
 
     // the result of each hash segments
     class SegmentDB
     {
     public:
-      SegmentDB(int size_to_allocate) : m_nof_inputs_ready(0)
+      SegmentDB(int input_size, bool allocate_space) : m_nof_inputs_ready(0), m_input_size(input_size)
       {
-        m_input_data = size_to_allocate ? new std::byte[size_to_allocate] : nullptr;
-      }
-      ~SegmentDB()
-      {
-        if (m_input_data) { delete[] m_input_data; }
+        m_input_data.reset(allocate_space ? new std::byte[input_size] : nullptr);
       }
 
+      inline void increment_ready(int nof_inputs_ready) { m_nof_inputs_ready += nof_inputs_ready; }
+
+      inline bool is_ready() const { return (m_nof_inputs_ready >= m_input_size); }
       // members
-      std::byte* m_input_data;
+      std::shared_ptr<std::byte[]> m_input_data;
+      int m_input_size;
       int m_nof_inputs_ready;
     };
 
@@ -285,17 +305,31 @@ namespace icicle {
       HashTask() : TaskBase(), m_hash(nullptr) {}
 
       // The worker execute this function based on the member operands
-      virtual void execute() { m_hash.hash(m_input, m_hash.default_input_chunk_size(), *m_hash_config, m_output); }
+      virtual void execute()
+      {
+        // run the hash function
+        m_hash.hash(m_input, m_hash.default_input_chunk_size(), *m_hash_config, m_output);
+
+        // pad the hash result if necessary
+        for (int padd_idx = 0; padd_idx < m_padd_output; padd_idx++) {
+          const uint64_t padd_offset = m_hash_config->batch * m_hash.output_size();
+          memcpy(
+            m_output + padd_offset + padd_idx * m_hash.output_size(), // dest: start from padd_offset
+            m_output + padd_offset - m_hash.output_size(),            // source: last calculated hash result
+            m_hash.output_size());                                    // size: hash result size
+        }
+      }
 
       Hash m_hash;
       const std::byte* m_input;
       std::byte* m_output;
       HashConfig* m_hash_config;
 
-      // used by the manager
+      // task definition: set by the manager
       uint m_layer_idx;
       int64_t m_segment_idx;
       uint64_t m_next_segment_idx;
+      uint m_padd_output;
     };
 
     // private members
@@ -308,7 +342,7 @@ namespace icicle {
 
     // Map from in hash-segment-id to the data size available for process
     // If this segment is not stored in the tree then SegmentDB also contains the input data for that segment
-    std::unordered_map<uint64_t, SegmentDB*> m_map_segment_id_2_inputs;
+    std::unordered_map<uint64_t, std::unique_ptr<SegmentDB>> m_map_segment_id_2_inputs;
 
     // get the number of workers to launch at the task manager
     int get_nof_workers(const MerkleTreeConfig& config)
@@ -321,37 +355,112 @@ namespace icicle {
       return ((hw_threads > 1) ? hw_threads - 1 : 1); // reduce 1 for the main
     }
 
-    // Allocate tree results memory and update m_layers with the required data
-    void init_layers_db(const MerkleTreeConfig& merkle_config)
+    // Update m_layers when calling to build based on leaves_size and config
+    bool init_layers_db(const MerkleTreeConfig& merkle_config, uint64_t leaves_size)
     {
       const uint nof_layers = m_layers.size();
 
-      // run over all hashes from top layer until bottom layer
+      // Check leaves size
+      if (leaves_size > m_layers[0].m_nof_hashes * m_layers[0].m_hash.default_input_chunk_size()) {
+        ICICLE_LOG_ERROR << "Leaves size (" << leaves_size << ") exceeds the size of the tree ("
+                         << m_layers[0].m_nof_hashes * m_layers[0].m_hash.default_input_chunk_size() << ")\n";
+        return false;
+      }
+      if (
+        leaves_size < m_layers[0].m_nof_hashes * m_layers[0].m_hash.default_input_chunk_size() &&
+        merkle_config.padding_policy == PaddingPolicy::None) {
+        ICICLE_LOG_ERROR << "Leaves size (" << leaves_size << ") is smaller than tree size ("
+                         << m_layers[0].m_nof_hashes * m_layers[0].m_hash.default_input_chunk_size()
+                         << ") while Padding policy is None\n";
+        return false;
+      }
+
+      // run over all hashes from bottom layer to root
       for (int layer_idx = 0; layer_idx < nof_layers; ++layer_idx) {
         auto& cur_layer = m_layers[layer_idx];
 
-        // config when calling a hash function not last in layer
+        // calculate the actual number of hashes to execute based on leaves_size
+        const uint64_t hash_input_size = cur_layer.m_hash.default_input_chunk_size();
+        const uint64_t hash_output_size = cur_layer.m_hash.output_size();
+        // round up the number of hashes and add 1 more for the last hash that is fully padded
+        const uint64_t nof_hashes_2_execute = (leaves_size + hash_input_size - 1) / hash_input_size + 1;
+        // make sure you don't exceed m_nof_hashes
+        cur_layer.m_nof_hashes_2_execute = std::min(cur_layer.m_nof_hashes, nof_hashes_2_execute);
+
+        // config when calling not last in layer hash function
         cur_layer.m_hash_config.batch = NOF_OPERATIONS_PER_TASK;
         cur_layer.m_hash_config.is_async = merkle_config.is_async;
 
-        // config when calling last in layer hash function
-        cur_layer.m_last_hash_config.batch = (cur_layer.m_nof_hashes % NOF_OPERATIONS_PER_TASK)
-                                               ? (cur_layer.m_nof_hashes % NOF_OPERATIONS_PER_TASK)
-                                               : NOF_OPERATIONS_PER_TASK;
+        // config when calling last hash function in layer 2-17 hashes
+        const uint64_t last_batch_size = cur_layer.m_nof_hashes_2_execute < NOF_OPERATIONS_PER_TASK
+                                           ? cur_layer.m_nof_hashes_2_execute
+                                           : (cur_layer.m_nof_hashes_2_execute - 2) % NOF_OPERATIONS_PER_TASK + 2;
+        cur_layer.m_last_hash_config.batch = std::min(cur_layer.m_nof_hashes_2_execute, last_batch_size);
         cur_layer.m_last_hash_config.is_async = merkle_config.is_async;
 
-        // If the current layer is within the range of stored layers (starting from m_output_store_min_layer),
-        // allocate memory based on the number of hashes in the current layer.
-        if (m_output_store_min_layer <= layer_idx) {
-          const uint64_t nof_bytes_to_allocate = cur_layer.m_nof_hashes * cur_layer.m_hash.output_size();
-          cur_layer.m_results.reserve(nof_bytes_to_allocate);
-          cur_layer.m_results.resize(nof_bytes_to_allocate);
+        // update leaves_size for the next layer
+        leaves_size = (nof_hashes_2_execute - 1) * hash_output_size;
+      }
+
+      // allocate the results vectors based on nof_hashes_2_execute of the next layer. part of it might be padded
+      for (int layer_idx = 0; layer_idx < nof_layers; ++layer_idx) {
+        const uint64_t nof_bytes_to_allocate = (layer_idx == nof_layers - 1)
+                                                 ? m_layers[layer_idx].m_hash.output_size()
+                                                 : m_layers[layer_idx + 1].m_nof_hashes_2_execute *
+                                                     m_layers[layer_idx + 1].m_hash.default_input_chunk_size();
+        m_layers[layer_idx].m_results.reserve(nof_bytes_to_allocate);
+        m_layers[layer_idx].m_results.resize(nof_bytes_to_allocate);
+      }
+      return true;
+    }
+
+    // If padding is required resize padded_leaves and populate it with the required data.
+    bool init_padded_leaves(
+      std::vector<std::byte>& padded_leaves,
+      const std::byte* leaves,
+      uint64_t leaves_size,
+      const MerkleTreeConfig& config)
+    {
+      const uint64_t l0_input_size = m_layers[0].m_hash.default_input_chunk_size();
+      if (m_layers[0].m_nof_hashes * l0_input_size == leaves_size) {
+        // No padding is required
+        return true;
+      }
+
+      const uint64_t padded_leaves_size = m_layers[0].m_last_hash_config.batch * l0_input_size;
+      padded_leaves.resize(padded_leaves_size, std::byte(0)); // pad the vector with 0
+
+      // The size of the leaves to copy to padded_leaves
+      const uint64_t last_segment_tail_size = (leaves_size - 1) % (NOF_OPERATIONS_PER_TASK * l0_input_size) + 1;
+      const uint64_t last_segment_offset = leaves_size - last_segment_tail_size;
+      memcpy(padded_leaves.data(), leaves + last_segment_offset, last_segment_tail_size);
+
+      // pad with the last element
+      if (config.padding_policy == PaddingPolicy::LastValue) {
+        if (leaves_size % m_leaf_element_size != 0) {
+          ICICLE_LOG_ERROR << "Leaves size (" << leaves_size << ") must divide leaf_element_size ("
+                           << m_leaf_element_size << ") when Padding policy is LastValue\n";
+          return false;
+        }
+        // pad with the last element
+        for (uint64_t padded_leaves_offset = last_segment_tail_size; padded_leaves_offset < padded_leaves.size();
+             padded_leaves_offset += m_leaf_element_size) {
+          memcpy(
+            padded_leaves.data() + padded_leaves_offset, // dest: pad vector
+            leaves + leaves_size - m_leaf_element_size,  // src: last element
+            m_leaf_element_size);                        // size 1 element size
         }
       }
+      return true;
     }
 
     // build task and dispatch it to task manager
-    void dispatch_task(HashTask* task, int cur_layer_idx, const uint64_t cur_segment_idx, const std::byte* input_bytes)
+    void dispatch_task(
+      HashTask* task,
+      int cur_layer_idx,
+      const uint64_t cur_segment_idx,
+      const std::byte* input_bytes,
+      bool calc_input_offset)
     {
       // Calculate Next-Segment-ID. The current task generates inputs for Next-Segment
       LayerDB& cur_layer = m_layers[cur_layer_idx];
@@ -359,34 +468,45 @@ namespace icicle {
 
       // Set HashTask input
       const uint64_t input_offset =
-        (cur_layer_idx != 0) && (cur_layer_idx - 1 < m_output_store_min_layer)
-          ? 0
-          : cur_segment_idx * NOF_OPERATIONS_PER_TASK * cur_layer.m_hash.default_input_chunk_size();
+        calc_input_offset ? cur_segment_idx * NOF_OPERATIONS_PER_TASK * cur_layer.m_hash.default_input_chunk_size() : 0;
       task->m_input = &(input_bytes[input_offset]);
 
       task->m_hash = cur_layer.m_hash;
-      task->m_hash_config = &cur_layer.m_last_hash_config;
       task->m_layer_idx = cur_layer_idx;
       task->m_segment_idx = cur_segment_idx;
+      task->m_hash_config = &cur_layer.m_last_hash_config;
+      task->m_padd_output = 0;
 
-      // If this is the last layer
+      // If this is the last layer (root)
       if (next_layer_idx == m_layers.size()) {
         task->m_output = cur_layer.m_results.data();
         task->dispatch();
         return;
       }
 
-      // This is not the last layer
-      const uint64_t next_input_size = m_layers[next_layer_idx].m_hash.default_input_chunk_size();
-      const uint64_t next_segment_idx = cur_segment_idx * cur_layer.m_hash.output_size() / next_input_size;
+      // This is not the last layer (root)
+      LayerDB& next_layer = m_layers[next_layer_idx];
+      const uint64_t next_input_size = next_layer.m_hash.default_input_chunk_size();
+      // Ensure next segment does not overflow due to a <NOF_OPERATIONS_PER_TASK+1> sized batch by comparing it to the
+      // max possible segment index (And taking the smaller one)
+      const uint64_t max_segment_idx =
+        (next_layer.m_nof_hashes_2_execute - next_layer.m_last_hash_config.batch) / NOF_OPERATIONS_PER_TASK;
+      const uint64_t next_segment_idx =
+        std::min(cur_segment_idx * cur_layer.m_hash.output_size() / next_input_size, max_segment_idx);
       const uint64_t next_segment_id = next_segment_idx ^ (next_layer_idx << 56);
 
-      // If next_segment does not appear m_map_segment_id_2_inputs, add it
+      // If next_segment does not appear in m_map_segment_id_2_inputs, then add it
       auto next_segment_it = m_map_segment_id_2_inputs.find(next_segment_id);
       if (next_segment_it == m_map_segment_id_2_inputs.end()) {
-        const int size_to_allocate =
-          (cur_layer_idx < m_output_store_min_layer) ? NOF_OPERATIONS_PER_TASK * next_input_size : 0;
-        const auto result = m_map_segment_id_2_inputs.emplace(next_segment_id, new SegmentDB(size_to_allocate));
+        bool is_next_segment_last = next_segment_idx * NOF_OPERATIONS_PER_TASK + next_layer.m_last_hash_config.batch ==
+                                    next_layer.m_nof_hashes_2_execute;
+        const int next_seg_size_to_allocate =
+          is_next_segment_last ? next_layer.m_last_hash_config.batch * next_input_size
+                               :                       // last segment - allocate according to batch size
+            NOF_OPERATIONS_PER_TASK * next_input_size; // middle segment - allocate max
+        const auto result = m_map_segment_id_2_inputs.emplace(
+          next_segment_id,
+          std::make_unique<SegmentDB>(SegmentDB(next_seg_size_to_allocate, cur_layer_idx < m_output_store_min_layer)));
         next_segment_it = result.first;
       }
 
@@ -395,31 +515,50 @@ namespace icicle {
       task->m_output =
         (cur_layer_idx < m_output_store_min_layer)
           ? &(next_segment_it->second->m_input_data[task_output_offset % (NOF_OPERATIONS_PER_TASK * next_input_size)])
-          : &(cur_layer.m_results[task_output_offset]);
-
-      // If this is not the last hash, update hash config
-      if ((cur_segment_idx + 1) * NOF_OPERATIONS_PER_TASK < cur_layer.m_nof_hashes)
+          :                                           // input in SegmentDB
+          &(cur_layer.m_results[task_output_offset]); // next layer result vector
+
+      // If this is the last segment, pad the result
+      bool is_cur_segment_last = cur_segment_idx * NOF_OPERATIONS_PER_TASK + cur_layer.m_last_hash_config.batch ==
+                                 cur_layer.m_nof_hashes_2_execute;
+      if (is_cur_segment_last) {
+        // total size of the next layer inputs
+        const uint64_t result_total_size = next_layer.m_nof_hashes_2_execute * next_input_size;
+        // idx for the last hash at the current segment
+        const uint64_t last_result_idx = cur_segment_idx * NOF_OPERATIONS_PER_TASK + cur_layer.m_last_hash_config.batch;
+        // location of the hash result at the next layer inputs
+        const uint64_t last_result_location = last_result_idx * task->m_hash.output_size();
+        const uint64_t padd_size_in_bytes = result_total_size - last_result_location;
+        task->m_padd_output = padd_size_in_bytes / task->m_hash.output_size();
+        next_segment_it->second->increment_ready(padd_size_in_bytes);
+      } else {
         task->m_hash_config = &cur_layer.m_hash_config;
+      }
 
       // Set task next segment to handle return data
       task->m_next_segment_idx = next_segment_idx;
 
-      // dispatch task
+      // send task to the worker for execution
       task->dispatch();
     }
 
     // restore the proof path from the tree and return the new path pointer
     std::byte*
-    copy_to_path_from_store_min_layer(const uint64_t input_chunk_offset, bool is_pruned, std::byte* path) const
+    copy_to_path_from_store_min_layer(const uint64_t proof_leaves_offset, bool is_pruned, std::byte* path) const
     {
       const uint64_t total_input_size = m_layers[0].m_nof_hashes * m_layers[0].m_hash.default_input_chunk_size();
       for (int layer_idx = m_output_store_min_layer; layer_idx < m_layers.size() - 1; layer_idx++) {
+        const auto& cur_layer_result = m_layers[layer_idx].m_results;
         const uint64_t copy_range_size = m_layers[layer_idx + 1].m_hash.default_input_chunk_size();
         const uint64_t one_element_size = m_layers[layer_idx].m_hash.output_size();
-        const uint64_t element_start =
-          input_chunk_offset * m_layers[layer_idx].m_nof_hashes / total_input_size * one_element_size;
+        uint64_t element_start =
+          proof_leaves_offset * m_layers[layer_idx].m_nof_hashes / total_input_size * one_element_size;
+
+        // if the element falls in the padded area, clamp it to the padded hash
+        if (element_start >= cur_layer_result.size()) {
+          element_start = cur_layer_result.size() - copy_range_size + element_start % copy_range_size;
+        }
         const uint64_t copy_chunk_start = (element_start / copy_range_size) * copy_range_size;
-        auto& cur_layer_result = m_layers[layer_idx].m_results;
 
         for (uint64_t byte_idx = copy_chunk_start; byte_idx < copy_chunk_start + copy_range_size; byte_idx++) {
           if (
diff --git a/icicle/include/icicle/hash/poseidon.h b/icicle/include/icicle/hash/poseidon.h
index fcb374905..f0a4275d7 100644
--- a/icicle/include/icicle/hash/poseidon.h
+++ b/icicle/include/icicle/hash/poseidon.h
@@ -10,7 +10,8 @@ namespace icicle {
    * contexts and use cases. The width parameter (`t`) determines the number of elements in the state,
    * influencing the security level and output structure of the hash. The optional `domain_tag` pointer parameter
    * enables domain separation, allowing isolation of hash outputs across different contexts or applications.
-   * (See here for a detailed explanation: https://hackmd.io/@7dpNYqjKQGeYC7wMlPxHtQ/ByIbpfX9c#SAFE-Sponge-API-for-Field-Elements-–-A-Toolbox-for-ZK-Hash-Applications)
+   * (See here for a detailed explanation:
+   * https://hackmd.io/@7dpNYqjKQGeYC7wMlPxHtQ/ByIbpfX9c#SAFE-Sponge-API-for-Field-Elements-–-A-Toolbox-for-ZK-Hash-Applications)
    *
    * @param S Represents the type of the field element used by the hash (e.g., a field element class).
    *
diff --git a/icicle/include/icicle/utils/utils.h b/icicle/include/icicle/utils/utils.h
index 1b7057fbd..acf71e532 100644
--- a/icicle/include/icicle/utils/utils.h
+++ b/icicle/include/icicle/utils/utils.h
@@ -29,6 +29,6 @@ static void print_bytes(const std::byte* data, const uint nof_elements, const ui
       std::cout << std::hex << std::setw(2) << std::setfill('0')
                 << static_cast<int>(data[element_idx * element_size + byte_idx]);
     }
-    std::cout << ",\n";
+    std::cout << std::dec << ",\n";
   }
 }
\ No newline at end of file
diff --git a/icicle/tests/test_hash_api.cpp b/icicle/tests/test_hash_api.cpp
index d2e5b9d56..d53082be3 100644
--- a/icicle/tests/test_hash_api.cpp
+++ b/icicle/tests/test_hash_api.cpp
@@ -12,6 +12,7 @@
 #include <string>
 #include <sstream>
 #include <iomanip>
+#include <cmath>
 
 using namespace icicle;
 
@@ -57,7 +58,7 @@ class HashApiTest : public ::testing::Test
   void TearDown() override {}
 
   template <typename T>
-  void randomize(T* arr, uint64_t size)
+  static void randomize(T* arr, uint64_t size)
   {
     // Create a random number generator
     std::random_device rd;                                       // Non-deterministic random number generator
@@ -71,7 +72,7 @@ class HashApiTest : public ::testing::Test
     }
   }
 
-  std::string voidPtrToHexString(const std::byte* byteData, size_t size)
+  static std::string voidPtrToHexString(const std::byte* byteData, size_t size)
   {
     std::ostringstream hexStream;
     for (size_t i = 0; i < size; ++i) {
@@ -196,6 +197,9 @@ TEST_F(HashApiTest, KeccakLarge)
   END_TIMER(cuda_timer_device_mem, "CUDA Keccak large time (on device memory)", true);
   ICICLE_CHECK(icicle_copy(output_main_case_2.get(), d_output, output_size * config.batch));
   ASSERT_EQ(0, memcmp(output_main_case_2.get(), output_ref.get(), output_size * config.batch));
+
+  ICICLE_CHECK(icicle_free(d_input));
+  ICICLE_CHECK(icicle_free(d_output));
 }
 
 TEST_F(HashApiTest, Blake2sLarge)
@@ -241,6 +245,9 @@ TEST_F(HashApiTest, Blake2sLarge)
   END_TIMER(cuda_timer_device_mem, "CUDA blake2s large time (on device memory)", true);
   ICICLE_CHECK(icicle_copy(output_main_case_2.get(), d_output, output_size * config.batch));
   ASSERT_EQ(0, memcmp(output_main_case_2.get(), output_ref.get(), output_size * config.batch));
+
+  ICICLE_CHECK(icicle_free(d_input));
+  ICICLE_CHECK(icicle_free(d_output));
 }
 
 TEST_F(HashApiTest, sha3)
@@ -315,27 +322,75 @@ class HashSumBackend : public HashBackend
   }
 };
 
-void assert_valid_tree(
+/**
+ * @brief Builds tree in a straight-forward single-threaded manner and compares the result with Icicle's calculation.
+ * @param tree - Merkle tree to test (Already built).
+ * @param input_size - Size of input in bytes.
+ * @param leaf_size - Size of each leaf in the input below.
+ * @param inputs - Input as a byte array.
+ * @param hashes - Vector of hashes of each layer in the tree above.
+ * @param config - Configuration of the merkle tree given above, to be used when building the reference.
+ * @return True if the tree's calculations (icicle and test) match.
+ */
+bool is_valid_tree(
   const MerkleTree& tree,
   int input_size,
+  int leaf_size,
   const std::byte* inputs,
   const std::vector<Hash>& hashes,
   const MerkleTreeConfig& config)
 {
-  ICICLE_ASSERT(!hashes.empty());
+  std::vector<std::byte> input_vec(input_size);
+  memcpy(input_vec.data(), inputs, input_size);
+
+  int nof_hashes = 1;
+  for (int i = hashes.size() - 2; i >= 0; i--) {
+    nof_hashes *= hashes[i + 1].default_input_chunk_size() / hashes[i].output_size();
+  }
+  int tree_input_size = nof_hashes * hashes[0].default_input_chunk_size();
+
+  ICICLE_ASSERT((config.padding_policy != PaddingPolicy::None) || (input_size == tree_input_size))
+    << "Leaves size (" << (input_size / leaf_size) << ") is smaller than tree size (" << (tree_input_size / leaf_size)
+    << ") while Padding policy is None\n";
+
+  if (tree_input_size > input_size) {
+    input_vec.resize(tree_input_size);
+    if (config.padding_policy == PaddingPolicy::LastValue) {
+      ICICLE_ASSERT(tree_input_size % leaf_size == 0)
+        << "Leaf size (" << leaf_size << ") must divide tree size (" << tree_input_size << ")";
+      std::vector<std::byte> last_leaf(leaf_size);
+      memcpy(last_leaf.data(), inputs + input_size - leaf_size, leaf_size);
+      int nof_leaves_in_padding = (tree_input_size - input_size) / leaf_size;
+      for (int i = 0; i < nof_leaves_in_padding; i++) {
+        memcpy(input_vec.data() + input_size + i * leaf_size, last_leaf.data(), leaf_size);
+      }
+    }
+  }
+
+  int max_layer_size_bytes = input_vec.size();
+  int input_size_temp = input_vec.size();
+  int output_size_temp = 1;
+
+  for (auto& layer_hash : hashes) {
+    output_size_temp = input_size_temp * layer_hash.output_size() / layer_hash.default_input_chunk_size();
+    if (output_size_temp > max_layer_size_bytes) { max_layer_size_bytes = output_size_temp; }
+
+    input_size_temp = output_size_temp;
+  }
+
+  input_size_temp = input_vec.size();
   int output_size = input_size * hashes[0].output_size() / hashes[0].default_input_chunk_size();
-  auto layer_in =
-    std::make_unique<std::byte[]>(input_size); // Going layer by layer - having the input layer as the largest
-  auto layer_out =
-    std::make_unique<std::byte[]>(output_size); // ensures these are the maximum sizes required for the arrays
+  auto layer_in = std::make_unique<std::byte[]>(max_layer_size_bytes);
+  auto layer_out = std::make_unique<std::byte[]>(max_layer_size_bytes);
   // NOTE there is an assumption here that output number is less or equal to input number for all layers
 
-  memcpy(layer_in.get(), inputs, input_size);
+  memcpy(layer_in.get(), input_vec.data(), input_size_temp);
 
   int side_inputs_offset = 0;
+  int lidx = 0;
   for (auto& layer_hash : hashes) {
-    output_size = input_size * layer_hash.output_size() / layer_hash.default_input_chunk_size();
-    const int nof_hashes = input_size / layer_hash.default_input_chunk_size();
+    output_size = input_size_temp * layer_hash.output_size() / layer_hash.default_input_chunk_size();
+    const int nof_hashes = input_size_temp / layer_hash.default_input_chunk_size();
 
     auto config = default_hash_config();
     config.batch = nof_hashes;
@@ -343,26 +398,169 @@ void assert_valid_tree(
 
     // copy output outputs to inputs before moving to the next layer
     memcpy(layer_in.get(), layer_out.get(), output_size);
-    input_size = output_size;
+    input_size_temp = output_size;
   }
 
   // Compare computed root with the tree's root
   auto [root, root_size] = tree.get_merkle_root();
-
   for (int i = 0; i < root_size; i++) {
-    ASSERT_EQ(root[i], layer_out[i]) << "Mismatch in root[" << i << "]";
+    if (root[i] != layer_out[i]) { return false; }
   }
+  return true;
 }
 
+/**
+ * @brief Wrapper to the non-template version of is_valid_tree above, allowing to insert different types of arrays as
+ * inputs. Builds tree in a straight-forward single-threaded manner and compares the result with Icicle's calculation.
+ * @param tree - Merkle tree to test  (Already built).
+ * @param input_size - Size of input in bytes.
+ * @param inputs - Input as a byte array.
+ * @param hashes - Vector of hashes of each layer in the tree above.
+ * @param config - Configuration of the merkle tree given above, to be used when building the reference.
+ * @return True if the tree's calculations (icicle and test) match.
+ */
 template <typename T>
-void assert_valid_tree(
+bool is_valid_tree(
   const MerkleTree& tree,
   int nof_inputs,
   const T* inputs,
   const std::vector<Hash>& hashes,
   const MerkleTreeConfig& config)
 {
-  return assert_valid_tree(tree, nof_inputs * sizeof(T), reinterpret_cast<const std::byte*>(inputs), hashes, config);
+  return is_valid_tree(
+    tree, nof_inputs * sizeof(T), sizeof(T), reinterpret_cast<const std::byte*>(inputs), hashes, config);
+}
+
+/**
+ * @brief Function used by the HashApiTest to test the various Merkle trees defined in the tests below. Checks
+ * validity of the tree construction, and correctness/incorrectness of valid/invalid proofs generated by the tree.
+ * @param hashes - Vector of hashes, one per layer of the tree under test.
+ * @param config - Merkle tree config (Mostly irrelevant for cpu tests).
+ * @param output_store_min_layer - Store layer parameter for the Merkle tree builder.
+ * @param nof_leaves - Number of leaves in the leaves array (a leaf count, not a byte count).
+ * @param leaves - Host array holding the leaves (copied to the device when config.is_leaves_on_device is set).
+ * @param explict_leaf_size_in_bytes - Optional. Size of each leaf element in case that leaves is given as a byte array.
+ * NOTE test will fail if this value isn't default (1) and T != std::byte
+ */
+template <typename T>
+void test_merkle_tree(
+  const std::vector<Hash>& hashes,
+  const MerkleTreeConfig& config,
+  const int output_store_min_layer,
+  int nof_leaves,
+  const T* leaves,
+  unsigned explict_leaf_size_in_bytes = 1)
+{
+  ASSERT_TRUE((explict_leaf_size_in_bytes == 1 || std::is_same<T, std::byte>::value))
+    << "Explicit leaf size should only be given when the given leaves array is a bytes array.";
+
+  const unsigned leaf_size = explict_leaf_size_in_bytes > 1 ? explict_leaf_size_in_bytes : sizeof(T);
+  const unsigned leaves_size = nof_leaves * leaf_size;
+
+  T* device_leaves = nullptr;
+  if (config.is_leaves_on_device) {
+    ICICLE_CHECK(icicle_malloc((void**)&device_leaves, leaves_size));
+    ICICLE_CHECK(icicle_copy(device_leaves, leaves, leaves_size));
+  }
+  const T* leaves4tree = config.is_leaves_on_device ? device_leaves : leaves;
+
+  auto prover_tree = MerkleTree::create(hashes, leaf_size, output_store_min_layer);
+  // The verifier is only used to verify proofs, so it is created without the store-min-layer argument
+  auto verifier_tree = MerkleTree::create(hashes, leaf_size);
+
+  // Without padding, a build with any size other than the exact one must be rejected
+  if (config.padding_policy == PaddingPolicy::None) {
+    ASSERT_NE(
+      prover_tree.build(leaves4tree, nof_leaves * explict_leaf_size_in_bytes - 1, config), eIcicleError::SUCCESS);
+    ASSERT_NE(
+      prover_tree.build(leaves4tree, nof_leaves * explict_leaf_size_in_bytes + 1, config), eIcicleError::SUCCESS);
+  }
+  // build tree
+  START_TIMER(MerkleTree_build)
+  ICICLE_CHECK(prover_tree.build(leaves4tree, nof_leaves * explict_leaf_size_in_bytes, config));
+  END_TIMER(MerkleTree_build, "Merkle Tree build time", true)
+
+  ASSERT_TRUE(is_valid_tree<T>(prover_tree, nof_leaves * explict_leaf_size_in_bytes, leaves, hashes, config))
+    << "Tree wasn't built correctly.";
+
+  // Create wrong input leaves by taking the original input and swapping some leaves by random values
+  auto wrong_leaves = std::make_unique<T[]>(nof_leaves * explict_leaf_size_in_bytes);
+  memcpy(wrong_leaves.get(), leaves, leaves_size); // memcpy counts bytes, so copy the full byte size of the input
+  const uint64_t nof_indices_modified = 5;
+  unsigned int wrong_indices[nof_indices_modified];
+  HashApiTest::randomize(wrong_indices, nof_indices_modified);
+  for (int i = 0; i < nof_indices_modified; i++) {
+    int wrong_byte_index = wrong_indices[i] % (nof_leaves * leaf_size);
+
+    uint8_t* wrong_leaves_byte_ptr = reinterpret_cast<uint8_t*>(wrong_leaves.get());
+
+    uint8_t new_wrong_val;
+    do {
+      new_wrong_val = rand();
+    } while (new_wrong_val == wrong_leaves_byte_ptr[wrong_byte_index]);
+
+    wrong_leaves_byte_ptr[wrong_byte_index] = new_wrong_val;
+
+    int wrong_leaf_idx = wrong_byte_index / leaf_size;
+    ICICLE_LOG_DEBUG << "Wrong input is modified at leaf " << wrong_leaf_idx << " (modified at byte "
+                     << wrong_byte_index % leaf_size << ")";
+  }
+
+  T* wrong_device_leaves = nullptr;
+  if (config.is_leaves_on_device) {
+    ICICLE_CHECK(icicle_malloc((void**)&wrong_device_leaves, leaves_size));
+    ICICLE_CHECK(icicle_copy(wrong_device_leaves, wrong_leaves.get(), leaves_size));
+  }
+  const T* wrong_leaves4tree = config.is_leaves_on_device ? wrong_device_leaves : wrong_leaves.get();
+
+  // Test the paths at the random indices (Both that the original input is valid and the modified input isn't)
+  for (int i = 0; i < nof_indices_modified; i++) {
+    // Map the corrupted byte offset back to the index of the leaf that contains it
+    int leaf_idx = (wrong_indices[i] % (nof_leaves * leaf_size)) / leaf_size;
+    ICICLE_LOG_DEBUG << "Checking proof of index " << leaf_idx << " (Byte idx "
+                     << (wrong_indices[i] % (nof_leaves * leaf_size)) << ")";
+
+    // get root and merkle-path for a leaf
+    auto [root, root_size] = prover_tree.get_merkle_root();
+    MerkleProof merkle_proof{};
+    ICICLE_CHECK(prover_tree.get_merkle_proof(
+      leaves4tree, nof_leaves * explict_leaf_size_in_bytes, leaf_idx, false, config, merkle_proof));
+
+    // Test valid proof
+    bool verification_valid = false;
+    ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
+    ASSERT_TRUE(verification_valid) << "Proof of valid inputs at index " << leaf_idx
+                                    << " is invalid (And should be valid).";
+
+    // Test invalid proof (By modifying random data in the leaves)
+    verification_valid = true;
+    ICICLE_CHECK(prover_tree.get_merkle_proof(
+      wrong_leaves4tree, nof_leaves * explict_leaf_size_in_bytes, leaf_idx, false, config, merkle_proof));
+    ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
+    ASSERT_FALSE(verification_valid) << "Proof of invalid inputs at index " << leaf_idx
+                                     << " is valid (And should be invalid).";
+
+    // Same for pruned proof
+    verification_valid = false;
+    ICICLE_CHECK(prover_tree.get_merkle_proof(
+      leaves4tree, nof_leaves * explict_leaf_size_in_bytes, leaf_idx, true, config, merkle_proof));
+    ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
+    ASSERT_TRUE(verification_valid) << "Pruned proof of valid inputs at index " << leaf_idx
+                                    << " is invalid (And should be valid).";
+
+    // Test invalid proof (By modifying random data in the leaves)
+    verification_valid = true;
+    ICICLE_CHECK(prover_tree.get_merkle_proof(
+      wrong_leaves4tree, nof_leaves * explict_leaf_size_in_bytes, leaf_idx, true, config, merkle_proof));
+    ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
+    ASSERT_FALSE(verification_valid) << "Pruned proof of invalid inputs at index " << leaf_idx
+                                     << " is valid (And should be invalid).";
+  }
+
+  if (config.is_leaves_on_device) {
+    ICICLE_CHECK(icicle_free(device_leaves));
+    ICICLE_CHECK(icicle_free(wrong_device_leaves));
+  }
 }
 
 TEST_F(HashApiTest, MerkleTreeBasic)
@@ -391,48 +589,205 @@ TEST_F(HashApiTest, MerkleTreeBasic)
   output_store_min_layer = output_store_min_layer & 3; // Ensure index is in a valid 0-3 range
   ICICLE_LOG_DEBUG << "Min store layer:\t" << output_store_min_layer;
 
-  auto prover_tree = MerkleTree::create(hashes, sizeof(uint32_t), output_store_min_layer);
-  auto verifier_tree = MerkleTree::create(hashes, sizeof(uint32_t), output_store_min_layer);
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves, leaves);
+}
+
+TEST_F(HashApiTest, MerkleTreeZeroPadding)
+{
+  // TODO add loop on devices (and change hash to one supported on gpu)
+  const int leaf_size = sizeof(uint32_t);
+  const int nof_leaves = 100;
+  uint32_t leaves[nof_leaves];
+  randomize(leaves, nof_leaves);
+  ICICLE_CHECK(icicle_set_device(s_reference_target));
+
+  // define the merkle tree
+  auto layer0_hash = HashSumBackend::create(5 * leaf_size, 2 * leaf_size); // in 5 leaves, out 2 leaves 400B -> 160B
+  auto layer1_hash = HashSumBackend::create(4 * leaf_size, leaf_size);     // in 4 leaves, out 1 leaf   160B ->  40B
+  auto layer2_hash = HashSumBackend::create(leaf_size, leaf_size);         // in 1 leaf, out 1 leaf     40B  ->  40B
+  auto layer3_hash = HashSumBackend::create(10 * leaf_size, leaf_size);    // in 10 leaves, out 1 leaf     40B  ->   4B
+
+  int total_nof_input_hashes = nof_leaves * leaf_size / layer0_hash.default_input_chunk_size();
+  std::vector<Hash> hashes = {layer0_hash, layer1_hash, layer2_hash, layer3_hash};
+  int output_store_min_layer = 0;
+
+  auto config = default_merkle_tree_config();
+  // Test zero padding
+  config.padding_policy = PaddingPolicy::ZeroPadding;
+
+  ICICLE_LOG_DEBUG << "Full tree";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves, leaves);
+
+  ICICLE_LOG_DEBUG << "Last hash isn't full";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves - 1, leaves);
+
+  const unsigned nof_leaves_in_hash = layer0_hash.default_input_chunk_size() / leaf_size;
+
+  ICICLE_LOG_DEBUG << "19 hashes (Total hashes in layer 0 - 1) - full";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves - nof_leaves_in_hash, leaves);
+  ICICLE_LOG_DEBUG << "19 hashes (Total hashes in layer 0 - 1) - last hash not full";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves - nof_leaves_in_hash - 1, leaves);
+
+  ICICLE_LOG_DEBUG << "16 hashes (Batch size) - full";
+  test_merkle_tree(hashes, config, output_store_min_layer, 16 * nof_leaves_in_hash, leaves);
+  ICICLE_LOG_DEBUG << "16 hashes (Batch size) - last hash not full";
+  test_merkle_tree(hashes, config, output_store_min_layer, 16 * nof_leaves_in_hash - 1, leaves);
+  ICICLE_LOG_DEBUG << "17 hashes (Batch size + 1) - full";
+  test_merkle_tree(hashes, config, output_store_min_layer, 17 * nof_leaves_in_hash, leaves);
+  ICICLE_LOG_DEBUG << "17 hashes (Batch size + 1) - last hash not full";
+  test_merkle_tree(hashes, config, output_store_min_layer, 17 * nof_leaves_in_hash - 1, leaves);
+
+  ICICLE_LOG_DEBUG << "1 hash - full";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves_in_hash, leaves);
+  ICICLE_LOG_DEBUG << "1 leaf in tree";
+  test_merkle_tree(hashes, config, output_store_min_layer, 1, leaves);
+
+  ICICLE_LOG_DEBUG << "A whole number of hashes is missing";
+  int nof_hashes = ((rand() % (total_nof_input_hashes - 2)) + 1);
+  ICICLE_LOG_DEBUG << "Number of used hashes: " << nof_hashes << " / " << total_nof_input_hashes;
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_hashes * nof_leaves_in_hash, leaves);
+
+  ICICLE_LOG_DEBUG << "Random amount of leaves";
+  int nof_partial_leaves = ((rand() % nof_leaves) + 1);
+  ICICLE_LOG_DEBUG << "Random amount of leaves: " << nof_partial_leaves << " / " << nof_leaves;
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_partial_leaves, leaves);
+
+  ICICLE_LOG_DEBUG << "Last leaf isn't fully occupied";
+  auto byte_leaves = reinterpret_cast<const std::byte*>(leaves);
+  int byte_size;
+  do {
+    byte_size = rand() % (nof_leaves * leaf_size);
+  } while (byte_size % leaf_size == 0);
+  byte_size = 327;
+  ICICLE_LOG_DEBUG << "Size of input in bytes: " << byte_size << "\t(" << float(byte_size) / leaf_size << " / "
+                   << nof_leaves << " leaves)";
+
+  auto prover_tree = MerkleTree::create(hashes, leaf_size, output_store_min_layer);
+  auto verifier_tree = MerkleTree::create(hashes, leaf_size, output_store_min_layer);
 
   // build tree
-  ICICLE_CHECK(prover_tree.build(leaves, nof_leaves, config));
-  assert_valid_tree<uint32_t>(prover_tree, nof_leaves, leaves, hashes, config);
+  START_TIMER(MerkleTree_build)
+  ICICLE_CHECK(prover_tree.build(byte_leaves, byte_size, config));
+  END_TIMER(MerkleTree_build, "Merkle Tree CPU", true)
 
-  // get root and merkle-path for a leaf
-  const int nof_leaves_to_test = 5;
-  uint64_t leaf_indices[nof_leaves_to_test];
-  randomize(leaf_indices, nof_leaves_to_test);
+  ASSERT_TRUE(is_valid_tree(prover_tree, byte_size, byte_leaves, hashes, config)) << "Tree wasn't built correctly.";
 
-  for (int i = 0; i < nof_leaves_to_test; i++) {
-    int leaf_idx = leaf_indices[i] % nof_leaves;
+  auto wrong_bytes = std::make_unique<std::byte[]>(byte_size);
+  memcpy(wrong_bytes.get(), byte_leaves, byte_size);
+  // Modify only the last byte: this test differs from the previous ones only in proving the partial last leaf
+  wrong_bytes[byte_size - 1] = static_cast<std::byte>(rand());
 
-    auto [root, root_size] = prover_tree.get_merkle_root();
-    MerkleProof merkle_proof{};
-    ICICLE_CHECK(prover_tree.get_merkle_proof(leaves, nof_leaves, leaf_idx, false, config, merkle_proof));
+  int leaf_idx = byte_size / leaf_size;
+  ICICLE_LOG_DEBUG << "Checking proof of index " << leaf_idx << " (Byte idx " << leaf_idx * leaf_size << ")";
 
-    // Test valid proof
-    bool verification_valid = false;
-    ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-    ASSERT_TRUE(verification_valid);
+  // get root and merkle-path for a leaf
+  auto [root, root_size] = prover_tree.get_merkle_root();
+  MerkleProof merkle_proof{};
+  ICICLE_CHECK(prover_tree.get_merkle_proof(byte_leaves, byte_size, leaf_idx, false, config, merkle_proof));
+
+  // Test valid proof
+  bool verification_valid = false;
+  ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
+  ASSERT_TRUE(verification_valid) << "Proof of valid inputs at index " << leaf_idx
+                                  << " is invalid (And should be valid).";
+
+  // Test invalid proof (By modifying random data in the leaves)
+  verification_valid = true;
+  ICICLE_CHECK(prover_tree.get_merkle_proof(wrong_bytes.get(), byte_size, leaf_idx, false, config, merkle_proof));
+  ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
+  ASSERT_FALSE(verification_valid) << "Proof of invalid inputs at index " << leaf_idx
+                                   << " is valid (And should be invalid).";
+
+  // Same for pruned proof
+  verification_valid = false;
+  ICICLE_CHECK(prover_tree.get_merkle_proof(byte_leaves, byte_size, leaf_idx, true, config, merkle_proof));
+  ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
+  ASSERT_TRUE(verification_valid) << "Pruned proof of valid inputs at index " << leaf_idx
+                                  << " is invalid (And should be valid).";
+
+  // Test invalid proof (By modifying random data in the leaves)
+  verification_valid = true;
+  ICICLE_CHECK(prover_tree.get_merkle_proof(wrong_bytes.get(), byte_size, leaf_idx, true, config, merkle_proof));
+  ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
+  ASSERT_FALSE(verification_valid) << "Pruned proof of invalid inputs at index " << leaf_idx
+                                   << " is valid (And should be invalid).";
+}
 
-    // Test invalid proof (By modifying random data in the leaves)
-    verification_valid = true;
-    ICICLE_CHECK(prover_tree.get_merkle_proof(leaves_alternative, nof_leaves, leaf_idx, false, config, merkle_proof));
-    ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-    ASSERT_FALSE(verification_valid);
+TEST_F(HashApiTest, MerkleTreeLastValuePadding)
+{
+  // TODO add loop on devices (and change hash to one supported on gpu)
+  const int leaf_size = sizeof(uint32_t);
+  const int nof_leaves = 100;
+  uint32_t leaves[nof_leaves];
+  randomize(leaves, nof_leaves);
+  ICICLE_CHECK(icicle_set_device(s_reference_target));
 
-    // Same for pruned proof
-    verification_valid = false;
-    ICICLE_CHECK(prover_tree.get_merkle_proof(leaves, nof_leaves, leaf_idx, true, config, merkle_proof));
-    ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-    ASSERT_TRUE(verification_valid);
+  // define the merkle tree
+  auto layer0_hash = HashSumBackend::create(5 * leaf_size, 2 * leaf_size); // in 5 leaves, out 2 leaves 400B -> 160B
+  auto layer1_hash = HashSumBackend::create(4 * leaf_size, leaf_size);     // in 4 leaves, out 1 leaf   160B ->  40B
+  auto layer2_hash = HashSumBackend::create(leaf_size, leaf_size);         // in 1 leaf, out 1 leaf     40B  ->  40B
+  auto layer3_hash = HashSumBackend::create(10 * leaf_size, leaf_size);    // in 10 leaves, out 1 leaf     40B  ->   4B
 
-    // Test invalid proof (By adding random data to the proof)
-    verification_valid = true;
-    ICICLE_CHECK(prover_tree.get_merkle_proof(leaves_alternative, nof_leaves, leaf_idx, true, config, merkle_proof));
-    ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-    ASSERT_FALSE(verification_valid);
-  }
+  int total_nof_input_hashes = nof_leaves * leaf_size / layer0_hash.default_input_chunk_size();
+  std::vector<Hash> hashes = {layer0_hash, layer1_hash, layer2_hash, layer3_hash};
+  int output_store_min_layer = 0;
+
+  auto config = default_merkle_tree_config();
+  // Test last-value padding
+  config.padding_policy = PaddingPolicy::LastValue;
+
+  ICICLE_LOG_DEBUG << "Full tree";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves, leaves);
+
+  ICICLE_LOG_DEBUG << "Last hash isn't full";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves - 1, leaves);
+
+  const unsigned nof_leaves_in_hash = layer0_hash.default_input_chunk_size() / leaf_size;
+
+  ICICLE_LOG_DEBUG << "19 hashes (Total hashes in layer 0 - 1) - full";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves - nof_leaves_in_hash, leaves);
+  ICICLE_LOG_DEBUG << "19 hashes (Total hashes in layer 0 - 1) - last hash not full";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves - nof_leaves_in_hash - 1, leaves);
+
+  ICICLE_LOG_DEBUG << "16 hashes (Batch size) - full";
+  test_merkle_tree(hashes, config, output_store_min_layer, 16 * nof_leaves_in_hash, leaves);
+  ICICLE_LOG_DEBUG << "16 hashes (Batch size) - last hash not full";
+  test_merkle_tree(hashes, config, output_store_min_layer, 16 * nof_leaves_in_hash - 1, leaves);
+  ICICLE_LOG_DEBUG << "17 hashes (Batch size + 1) - full";
+  test_merkle_tree(hashes, config, output_store_min_layer, 17 * nof_leaves_in_hash, leaves);
+  ICICLE_LOG_DEBUG << "17 hashes (Batch size + 1) - last hash not full";
+  test_merkle_tree(hashes, config, output_store_min_layer, 17 * nof_leaves_in_hash - 1, leaves);
+
+  ICICLE_LOG_DEBUG << "1 hash - full";
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves_in_hash, leaves);
+  ICICLE_LOG_DEBUG << "1 leaf in tree";
+  test_merkle_tree(hashes, config, output_store_min_layer, 1, leaves);
+
+  ICICLE_LOG_DEBUG << "A whole number of hashes is missing";
+  int nof_hashes = ((rand() % (total_nof_input_hashes - 2)) + 1);
+  ICICLE_LOG_DEBUG << "Number of used hashes: " << nof_hashes << " / " << total_nof_input_hashes;
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_hashes * nof_leaves_in_hash, leaves);
+
+  ICICLE_LOG_DEBUG << "Random amount of leaves";
+  int nof_partial_leaves = ((rand() % nof_leaves) + 1);
+  ICICLE_LOG_DEBUG << "Random amount of leaves: " << nof_partial_leaves << " / " << nof_leaves;
+  test_merkle_tree(hashes, config, output_store_min_layer, nof_partial_leaves, leaves);
+
+  ICICLE_LOG_DEBUG << "Last leaf isn't fully occupied - check that build should fail";
+  auto byte_leaves = reinterpret_cast<const std::byte*>(leaves);
+  int byte_size;
+  do {
+    byte_size = rand() % (nof_leaves * leaf_size);
+  } while (byte_size % leaf_size == 0);
+  byte_size = 327;
+  ICICLE_LOG_DEBUG << "Size of input in bytes: " << byte_size << "\t(" << float(byte_size) / leaf_size << " / "
+                   << nof_leaves << " leaves)";
+
+  auto prover_tree = MerkleTree::create(hashes, leaf_size, output_store_min_layer);
+  auto verifier_tree = MerkleTree::create(hashes, leaf_size, output_store_min_layer);
+
+  // build should fail when byte size isn't a whole amount of leaves and padding policy is LastValue
+  ASSERT_EQ(prover_tree.build(byte_leaves, byte_size, config), eIcicleError::INVALID_ARGUMENT);
 }
 
 TEST_F(HashApiTest, MerkleTreeMixMediumSize)
@@ -463,52 +818,7 @@ TEST_F(HashApiTest, MerkleTreeMixMediumSize)
     const int output_store_min_layer = rand() % hashes.size();
     ICICLE_LOG_DEBUG << "Min store layer:\t" << output_store_min_layer;
 
-    auto prover_tree = MerkleTree::create(hashes, leaf_size, output_store_min_layer);
-    auto verifier_tree = MerkleTree::create(hashes, leaf_size, output_store_min_layer);
-
-    // assert that incorrect size fails
-    ASSERT_NE(prover_tree.build(leaves.get(), nof_leaves - 1, config), eIcicleError::SUCCESS);
-    ASSERT_NE(prover_tree.build(leaves.get(), nof_leaves + 1, config), eIcicleError::SUCCESS);
-    // build tree
-    START_TIMER(MerkleTree_build)
-    ICICLE_CHECK(prover_tree.build(leaves.get(), nof_leaves, config));
-    END_TIMER(MerkleTree_build, "Merkle Tree large", true)
-    assert_valid_tree<uint32_t>(prover_tree, nof_leaves, leaves.get(), hashes, config);
-
-    // get root and merkle-path to an element
-    for (int test_leaf_idx = 0; test_leaf_idx < 5; test_leaf_idx++) {
-      const int leaf_idx = rand() % nof_leaves;
-
-      auto [root, root_size] = prover_tree.get_merkle_root();
-      MerkleProof merkle_proof{};
-      ICICLE_CHECK(prover_tree.get_merkle_proof(leaves.get(), nof_leaves, leaf_idx, false, config, merkle_proof));
-
-      // Test valid proof
-      bool verification_valid = false;
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_TRUE(verification_valid);
-
-      // Test invalid proof (By modifying random data in the leaves)
-      verification_valid = true;
-      ICICLE_CHECK(prover_tree.get_merkle_proof(
-        leaves_alternative.get(), nof_leaves, leaf_idx, false /*=pruned*/, config, merkle_proof));
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_FALSE(verification_valid);
-
-      // Same for pruned proof
-      verification_valid = false;
-      ICICLE_CHECK(
-        prover_tree.get_merkle_proof(leaves.get(), nof_leaves, leaf_idx, true /*=pruned*/, config, merkle_proof));
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_TRUE(verification_valid);
-
-      // Test invalid proof (By adding random data to the proof)
-      verification_valid = true;
-      ICICLE_CHECK(prover_tree.get_merkle_proof(
-        leaves_alternative.get(), nof_leaves, leaf_idx, true /*=pruned*/, config, merkle_proof));
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_FALSE(verification_valid);
-    }
+    test_merkle_tree(hashes, config, output_store_min_layer, nof_leaves, leaves.get());
   }
 }
 
@@ -531,41 +841,17 @@ TEST_F(HashApiTest, MerkleTreeDevicePartialTree)
     std::vector<Hash> hashes(tree_height, layer_hash);
 
     auto config = default_merkle_tree_config();
-    auto full_tree = MerkleTree::create(hashes, leaf_size);
-    auto prover_tree = MerkleTree::create(hashes, leaf_size, 4);
-    auto verifier_tree = MerkleTree::create(hashes, leaf_size, 4);
-
-    // build tree
-    ICICLE_CHECK(prover_tree.build(leaves.get(), total_input_size, config));
-    ICICLE_CHECK(full_tree.build(leaves.get(), total_input_size, config));
-
-    auto full_root = full_tree.get_merkle_root();
-    auto partial_root = prover_tree.get_merkle_root();
-    for (int i = 0; i < full_root.second; i++) {
-      ASSERT_TRUE(full_root.first[i] == partial_root.first[i]);
-    }
 
-    // proof leaves and verify
-    for (int test_leaf_idx = 0; test_leaf_idx < 5; test_leaf_idx++) {
-      const int leaf_idx = rand() % nof_leaves;
-
-      auto [root, root_size] = prover_tree.get_merkle_root();
-
-      // test non-pruned path
-      MerkleProof merkle_proof{};
-      bool verification_valid = false;
-      ICICLE_CHECK(
-        prover_tree.get_merkle_proof(leaves.get(), nof_leaves, leaf_idx, false /*=pruned*/, config, merkle_proof));
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_TRUE(verification_valid);
-
-      // test pruned path
-      verification_valid = false;
-      ICICLE_CHECK(
-        prover_tree.get_merkle_proof(leaves.get(), nof_leaves, leaf_idx, true /*=pruned*/, config, merkle_proof));
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_TRUE(verification_valid);
-    }
+    // Test with different values of output_store_min_layer
+    test_merkle_tree<std::byte>(
+      hashes, config, /*output_store_min_layer=*/0, nof_leaves, leaves.get(),
+      /*explicit_leaf_size_in_bytes=*/leaf_size);
+    test_merkle_tree<std::byte>(
+      hashes, config, /*output_store_min_layer=*/2, nof_leaves, leaves.get(),
+      /*explicit_leaf_size_in_bytes=*/leaf_size);
+    test_merkle_tree<std::byte>(
+      hashes, config, /*output_store_min_layer=*/4, nof_leaves, leaves.get(),
+      /*explicit_leaf_size_in_bytes=*/leaf_size);
   }
 }
 
@@ -587,44 +873,12 @@ TEST_F(HashApiTest, MerkleTreeLeavesOnDeviceTreeOnHost)
     // Create a vector of `Hash` objects, all initialized with the same `layer_hash`
     std::vector<Hash> hashes(tree_height, layer_hash);
 
-    // copy leaves to device
-    std::byte* device_leaves = nullptr;
-    ICICLE_CHECK(icicle_malloc((void**)&device_leaves, total_input_size));
-    ICICLE_CHECK(icicle_copy(device_leaves, leaves.get(), total_input_size));
-
+    // Specify the config for the test function below
     auto config = default_merkle_tree_config();
     config.is_tree_on_device = false;
     config.is_leaves_on_device = true;
-    auto prover_tree = MerkleTree::create(hashes, leaf_size);
-    auto verifier_tree = MerkleTree::create(hashes, leaf_size);
 
-    // build tree
-    START_TIMER(MerkleTree_build)
-    ICICLE_CHECK(prover_tree.build(device_leaves, total_input_size, config));
-    END_TIMER(MerkleTree_build, "Merkle Tree GPU", true)
-
-    // proof leaves and verify
-    for (int test_leaf_idx = 0; test_leaf_idx < 5; test_leaf_idx++) {
-      const int leaf_idx = rand() % nof_leaves;
-
-      auto [root, root_size] = prover_tree.get_merkle_root();
-
-      // test non-pruned path
-      MerkleProof merkle_proof{};
-      bool verification_valid = false;
-      ICICLE_CHECK(
-        prover_tree.get_merkle_proof(device_leaves, nof_leaves, leaf_idx, false /*=pruned*/, config, merkle_proof));
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_TRUE(verification_valid);
-
-      // test pruned path
-      verification_valid = false;
-      ICICLE_CHECK(
-        prover_tree.get_merkle_proof(device_leaves, nof_leaves, leaf_idx, true /*=pruned*/, config, merkle_proof));
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_TRUE(verification_valid);
-    }
-    ICICLE_CHECK(icicle_free(device_leaves));
+    test_merkle_tree<std::byte>(hashes, config, 0, nof_leaves, leaves.get(), /*explicit_leaf_size_in_bytes=*/leaf_size);
   }
 }
 
@@ -634,9 +888,9 @@ TEST_F(HashApiTest, MerkleTreeLarge)
   const uint64_t total_input_size = (1 << 28);
   const uint64_t nof_leaves = total_input_size / leaf_size;
   auto leaves = std::make_unique<std::byte[]>(total_input_size);
-  randomize(leaves.get(), nof_leaves);
+  randomize(leaves.get(), total_input_size);
 
-  for (const auto& device : s_registered_devices) {
+  for (auto&& device : s_registered_devices) {
     ICICLE_LOG_INFO << "MerkleTreeDeviceBig on device=" << device;
     ICICLE_CHECK(icicle_set_device(device));
 
@@ -647,43 +901,13 @@ TEST_F(HashApiTest, MerkleTreeLarge)
     // Create a vector of `Hash` objects, all initialized with the same `layer_hash`
     std::vector<Hash> hashes(tree_height, layer_hash);
 
-    // copy leaves to device
-    std::byte* device_leaves = nullptr;
-    ICICLE_CHECK(icicle_malloc((void**)&device_leaves, total_input_size));
-    ICICLE_CHECK(icicle_copy(device_leaves, leaves.get(), total_input_size));
-
+    // Specify the config for the test function below
     auto config = default_merkle_tree_config();
     config.is_leaves_on_device = true;
     auto prover_tree = MerkleTree::create(hashes, leaf_size);
     auto verifier_tree = MerkleTree::create(hashes, leaf_size);
 
-    // build tree
-    START_TIMER(MerkleTree_build)
-    ICICLE_CHECK(prover_tree.build(device_leaves, total_input_size, config));
-    END_TIMER(MerkleTree_build, "Merkle Tree large", true)
-
-    // proof leaves and verify
-    for (int test_leaf_idx = 0; test_leaf_idx < 5; test_leaf_idx++) {
-      const int leaf_idx = rand() % nof_leaves;
-
-      auto [root, root_size] = prover_tree.get_merkle_root();
-
-      // test non-pruned path
-      MerkleProof merkle_proof{};
-      bool verification_valid = false;
-      ICICLE_CHECK(
-        prover_tree.get_merkle_proof(device_leaves, nof_leaves, leaf_idx, false /*=pruned*/, config, merkle_proof));
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_TRUE(verification_valid);
-
-      // test pruned path
-      verification_valid = false;
-      ICICLE_CHECK(
-        prover_tree.get_merkle_proof(device_leaves, nof_leaves, leaf_idx, true /*=pruned*/, config, merkle_proof));
-      ICICLE_CHECK(verifier_tree.verify(merkle_proof, verification_valid));
-      ASSERT_TRUE(verification_valid);
-    }
-    ICICLE_CHECK(icicle_free(device_leaves));
+    test_merkle_tree<std::byte>(hashes, config, 0, nof_leaves, leaves.get(), /*explicit_leaf_size_in_bytes=*/leaf_size);
   }
 }
 
@@ -841,4 +1065,38 @@ TEST_F(HashApiTest, poseidon3_batch)
   ASSERT_EQ(0, memcmp(output_cpu.get(), output_cuda.get(), config.batch * sizeof(scalar_t)));
 }
 
-#endif // POSEIDON
\ No newline at end of file
+// Merkle tree test in which every layer uses the same Poseidon hash, created over scalar_t with parameter t.
+TEST_F(HashApiTest, poseidon_tree)
+{
+  const uint64_t t = 9;
+  const uint64_t nof_layers = 4;
+  // nof_leaves = t^nof_layers
+  uint64_t nof_leaves = 1;
+  for (uint64_t i = 0; i < nof_layers; i++) {
+    nof_leaves *= t;
+  }
+  auto leaves = std::make_unique<scalar_t[]>(nof_leaves);
+  scalar_t::rand_host_many(leaves.get(), nof_leaves);
+
+  for (const auto& device : s_registered_devices) {
+    ICICLE_LOG_INFO << "poseidon_tree on device=" << device;
+    ICICLE_CHECK(icicle_set_device(device));
+
+    // Create relevant hash to compose the tree
+    auto layer_hash = Poseidon::create<scalar_t>(t);
+    // Create a vector of `Hash` objects, all initialized with the same `layer_hash`
+    std::vector<Hash> hashes(nof_layers, layer_hash);
+
+    // Specify the config for the test function below
+    auto config = default_merkle_tree_config();
+    config.is_leaves_on_device = true;
+
+    // Leaves are handed over as scalar_t (no cast to bytes), so the helper's template type is
+    // deduced from the pointer and no explicit leaf size in bytes is passed.
+    // NOTE(review): the helper presumably builds its own trees and derives the leaf size from
+    // the element type - confirm against test_merkle_tree.
+    test_merkle_tree(hashes, config, /*output_store_min_layer=*/0, nof_leaves, leaves.get());
+  }
+}
+
+#endif // POSEIDON