#16339: setup DispatchMemMap with custom settings

- Previously DispatchMemMap was hardcoded to use only the default settings - DispatchMemMap initialized using a DispatchSettings object with a core_type & num_hw_cqs combo - Singleton access to DispatchSettings. Buffer sizes may be customized during device initialization
tenstorrent · Feb 3, 2025 · 83b3325 · 83b3325
1 parent 7073093
commit 83b3325
Show file tree

Hide file tree

Showing 6 changed files with 235 additions and 131 deletions.
diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp
@@ -15,8 +15,8 @@ using namespace tt::tt_metal;
 
 // Loop through test_func for WORKER, ETH X 1, 2 CQs
 void ForEachCoreTypeXHWCQs(const std::function<void(const CoreType& core_type, const uint32_t num_hw_cqs)>& test_func) {
-    static constexpr auto core_types_to_test = std::array<CoreType, 2>{CoreType::WORKER, CoreType::ETH};
-    static constexpr auto num_hw_cqs_to_test = std::array<uint32_t, 2>{1, 2};
+    const auto core_types_to_test = std::array<CoreType, 2>{CoreType::WORKER, CoreType::ETH};
+    const auto num_hw_cqs_to_test = std::array<uint32_t, 2>{1, 2};
 
     for (const auto& core_type : core_types_to_test) {
         if (core_type == CoreType::ETH &&
@@ -33,7 +33,7 @@ void ForEachCoreTypeXHWCQs(const std::function<void(const CoreType& core_type, c
 
 TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsDefaultUnsupportedCoreType) {
     const auto unsupported_core = CoreType::ARC;
-    EXPECT_THROW(DispatchSettings::defaults(unsupported_core, tt::Cluster::instance(), 1), std::runtime_error);
+    EXPECT_THROW(DispatchSettings::get(unsupported_core, 1), std::runtime_error);
 }
 
 TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsMissingArgs) {
@@ -42,60 +42,121 @@ TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsMissingArgs) {
 }
 
 TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsEq) {
-    static constexpr uint32_t hw_cqs = 2;
-    auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
+    const uint32_t hw_cqs = 2;
+    auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
     auto settings_2 = settings; // Copy
     EXPECT_EQ(settings, settings_2);
     settings_2.dispatch_size_ += 1;
     EXPECT_NE(settings, settings_2);
 }
 
 TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetPrefetchDBuffer) {
-    static constexpr uint32_t hw_cqs = 2;
-    static constexpr uint32_t expected_buffer_bytes = 0xcafe;
-    static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
-    auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
+    const uint32_t hw_cqs = 2;
+    const uint32_t expected_buffer_bytes = 0xcafe;
+    const uint32_t expected_page_count =
+        expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
+    auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
     settings.prefetch_d_buffer_size(expected_buffer_bytes);
     EXPECT_EQ(settings.prefetch_d_buffer_size_, expected_buffer_bytes);
     EXPECT_EQ(settings.prefetch_d_pages_, expected_page_count);
 }
 
 TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetPrefetchQBuffer) {
-    static constexpr uint32_t hw_cqs = 2;
-    static constexpr uint32_t expected_buffer_entries = 0x1000;
-    static constexpr uint32_t expected_buffer_bytes = expected_buffer_entries * sizeof(DispatchSettings::prefetch_q_entry_type);
-    auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
+    const uint32_t hw_cqs = 2;
+    const uint32_t expected_buffer_entries = 0x1000;
+    const uint32_t expected_buffer_bytes = expected_buffer_entries * sizeof(DispatchSettings::prefetch_q_entry_type);
+    auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
     settings.prefetch_q_entries(expected_buffer_entries);
     EXPECT_EQ(settings.prefetch_q_entries_, expected_buffer_entries);
     EXPECT_EQ(settings.prefetch_q_size_, expected_buffer_bytes);
 }
 
 TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetDispatchBuffer) {
-    static constexpr uint32_t hw_cqs = 2;
-    static constexpr uint32_t expected_buffer_bytes = 0x2000;
-    static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE);
-    auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
+    const uint32_t hw_cqs = 2;
+    const uint32_t expected_buffer_bytes = 0x2000;
+    const uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE);
+    auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
     settings.dispatch_size(expected_buffer_bytes);
     EXPECT_EQ(settings.dispatch_size_, expected_buffer_bytes);
     EXPECT_EQ(settings.dispatch_pages_, expected_page_count);
 }
 
 TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetDispatchSBuffer) {
-    static constexpr uint32_t hw_cqs = 2;
-    static constexpr uint32_t expected_buffer_bytes = 0x2000;
-    static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_S_BUFFER_LOG_PAGE_SIZE);
-    auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
+    const uint32_t hw_cqs = 2;
+    const uint32_t expected_buffer_bytes = 0x2000;
+    const uint32_t expected_page_count =
+        expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_S_BUFFER_LOG_PAGE_SIZE);
+    auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
     settings.dispatch_s_buffer_size(expected_buffer_bytes);
     EXPECT_EQ(settings.dispatch_s_buffer_size_, expected_buffer_bytes);
     EXPECT_EQ(settings.dispatch_s_buffer_pages_, expected_page_count);
 }
 
 TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetTunnelerBuffer) {
-    static constexpr uint32_t hw_cqs = 2;
-    static constexpr uint32_t expected_buffer_bytes = 0x2000;
-    static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
-    auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
+    const uint32_t hw_cqs = 2;
+    const uint32_t expected_buffer_bytes = 0x2000;
+    const uint32_t expected_page_count =
+        expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
+    auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
     settings.tunneling_buffer_size(expected_buffer_bytes);
     EXPECT_EQ(settings.tunneling_buffer_size_, expected_buffer_bytes);
     EXPECT_EQ(settings.tunneling_buffer_pages_, expected_page_count);
 }
+
+TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsMutations) {
+    if (hal.get_programmable_core_type_index(tt::tt_metal::HalProgrammableCoreType::IDLE_ETH) == -1) {
+        // This device does not have the eth core
+        tt::log_info(tt::LogTest, "Test not supported on this device");
+        return;
+    }
+    const auto core_type = CoreType::WORKER;
+    const uint32_t hw_cqs = 1;
+    const uint32_t prefetch_d_size = 0x1000;
+    const uint32_t mux_size = 0x2000;
+    const uint32_t cmddat_size = 0x2000;
+    const uint32_t dispatch_s_size = 32;
+    const uint32_t dispatch_size = 4096;
+    const uint32_t max_cmd_size = 1024;
+    const uint32_t prefetch_q_entries = 512;
+    const uint32_t scratch_db_size = 5120;
+
+    auto& settings = DispatchSettings::get(core_type, hw_cqs);
+    DispatchSettings original_settings = settings;  // Copy the original to be restored later
+
+    EXPECT_EQ(settings.core_type_, core_type);
+
+    // Modify settings
+    settings.prefetch_q_entries(prefetch_q_entries);
+    settings.prefetch_d_buffer_size(prefetch_d_size);
+    settings.tunneling_buffer_size(mux_size);
+    settings.prefetch_cmddat_q_size(cmddat_size);
+    settings.dispatch_s_buffer_size(dispatch_s_size);
+    settings.prefetch_max_cmd_size(max_cmd_size);
+    settings.prefetch_scratch_db_size(scratch_db_size);
+    settings.dispatch_size(dispatch_size);
+
+    // Change instance
+    // Check they are not the same
+    auto& settings_2 = DispatchSettings::get(CoreType::ETH, hw_cqs);
+    EXPECT_NE(settings_2.prefetch_q_entries_, prefetch_q_entries);
+    EXPECT_NE(settings_2.prefetch_d_buffer_size_, prefetch_d_size);
+    EXPECT_NE(settings_2.tunneling_buffer_size_, mux_size);
+    EXPECT_NE(settings_2.prefetch_cmddat_q_size_, cmddat_size);
+    EXPECT_NE(settings_2.dispatch_s_buffer_size_, dispatch_s_size);
+    EXPECT_NE(settings_2.prefetch_max_cmd_size_, max_cmd_size);
+    EXPECT_NE(settings_2.prefetch_scratch_db_size_, scratch_db_size);
+    EXPECT_NE(settings_2.dispatch_size_, dispatch_size);
+
+    // Change back to the instance that we modified
+    auto& settings_3 = DispatchSettings::get(core_type, hw_cqs);
+    EXPECT_EQ(settings_3.prefetch_q_entries_, prefetch_q_entries);
+    EXPECT_EQ(settings_3.prefetch_d_buffer_size_, prefetch_d_size);
+    EXPECT_EQ(settings_3.tunneling_buffer_size_, mux_size);
+    EXPECT_EQ(settings_3.prefetch_cmddat_q_size_, cmddat_size);
+    EXPECT_EQ(settings_3.dispatch_s_buffer_size_, dispatch_s_size);
+    EXPECT_EQ(settings_3.prefetch_max_cmd_size_, max_cmd_size);
+    EXPECT_EQ(settings_3.prefetch_scratch_db_size_, scratch_db_size);
+    EXPECT_EQ(settings_3.dispatch_size_, dispatch_size);
+
+    DispatchSettings::initialize(original_settings);
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
@@ -15,6 +15,7 @@
 #include <tt-metalium/command_queue_interface.hpp>
 #include <tt-metalium/dispatch_settings.hpp>
 #include "common.h"
+#include "tt_cluster.hpp"
 #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
 #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp"
 
@@ -55,6 +56,8 @@ constexpr uint32_t PCIE_TRANSFER_SIZE_DEFAULT = 4096;
 
 constexpr uint32_t host_data_dirty_pattern = 0xbaadf00d;
 
+constexpr CoreType DISPATCH_CORE_TYPE = CoreType::WORKER;
+
 //////////////////////////////////////////////////////////////////////////////////////////
 // Test dispatch program performance
 //
@@ -113,6 +116,8 @@ uint32_t l1_buf_base_g;
 uint32_t test_device_id_g = 0;
 
 void init(int argc, char** argv) {
+    auto default_settings = DispatchSettings::defaults(DISPATCH_CORE_TYPE, tt::Cluster::instance(), 1);
+
     std::vector<std::string> input_args(argv, argv + argc);
 
     if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) {
@@ -138,10 +143,7 @@ void init(int argc, char** argv) {
         log_info(LogTest, "  -hp: host huge page issue buffer size (default {})", DEFAULT_HUGEPAGE_ISSUE_BUFFER_SIZE);
         log_info(LogTest, "  -pq: prefetch queue entries (default {})", DEFAULT_PREFETCH_Q_ENTRIES);
         log_info(LogTest, "  -cs: cmddat q size (default {})", DEFAULT_CMDDAT_Q_SIZE);
-        log_info(
-            LogTest,
-            "-pdcs: prefetch_d cmddat cb size (default {})",
-            DispatchMemMap::get(CoreType::WORKER, 1).prefetch_d_buffer_size());
+        log_info(LogTest, "-pdcs: prefetch_d cmddat cb size (default {})", default_settings.prefetch_d_buffer_size_);
         log_info(LogTest, "  -ss: scratch cb size (default {})", DEFAULT_SCRATCH_DB_SIZE);
         log_info(
             LogTest,
@@ -174,8 +176,8 @@ void init(int argc, char** argv) {
     pcie_transfer_size_g = test_args::get_command_option_uint32(input_args, "-pcies", PCIE_TRANSFER_SIZE_DEFAULT);
     dram_page_size_g = test_args::get_command_option_uint32(input_args, "-dpgs", DRAM_PAGE_SIZE_DEFAULT);
     dram_pages_to_read_g = test_args::get_command_option_uint32(input_args, "-dpgr", DRAM_PAGES_TO_READ_DEFAULT);
-    prefetch_d_buffer_size_g = test_args::get_command_option_uint32(
-        input_args, "-pdcs", DispatchMemMap::get(CoreType::WORKER, 1).prefetch_d_buffer_size());
+    prefetch_d_buffer_size_g =
+        test_args::get_command_option_uint32(input_args, "-pdcs", default_settings.prefetch_d_buffer_size_);
 
     test_type_g = test_args::get_command_option_uint32(input_args, "-t", DEFAULT_TEST_TYPE);
     all_workers_g.end_coord.x = test_args::get_command_option_uint32(input_args, "-wx", all_workers_g.end_coord.x);
@@ -849,7 +851,7 @@ void gen_rnd_linear_cmd(
     size &= ~(sizeof(uint32_t) - 1);
     uint32_t offset = std::rand() % dispatch_buffer_page_size_g;
     offset = (offset >> 2) << 2;
-    device_data.relevel(CoreType::WORKER);  // XXXXX shouldn't be needed
+    device_data.relevel(DISPATCH_CORE_TYPE);  // XXXXX shouldn't be needed
     if (device_data.size_at(worker_core, 0) * sizeof(uint32_t) < max_linear_cmd_read_size + offset) {
         // Not enough data yet, just bail on this cmd
         return;
@@ -1699,8 +1701,7 @@ void configure_for_single_chip(
     uint32_t& packetized_path_test_results_addr,
     uint32_t packetized_path_test_results_size,
     uint32_t dev_hugepage_base_g) {
-    const CoreType dispatch_core_type = CoreType::WORKER;
-    uint32_t dispatch_buffer_pages = DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() *
+    uint32_t dispatch_buffer_pages = DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages() *
                                      DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS;
     uint32_t num_compute_cores =
         device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y;
@@ -1766,9 +1767,9 @@ void configure_for_single_chip(
     uint32_t* host_hugepage_completion_buffer = (uint32_t*)host_hugepage_completion_buffer_base_g;
     vector<uint32_t> tmp = {dev_hugepage_completion_buffer_base >> 4};
     CoreCoord phys_dispatch_host_core = split_dispatcher_g ? phys_dispatch_h_core : phys_dispatch_core;
-    uint32_t completion_q_wr_ptr = DispatchMemMap::get(dispatch_core_type)
+    uint32_t completion_q_wr_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE)
                                        .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
-    uint32_t completion_q_rd_ptr = DispatchMemMap::get(dispatch_core_type)
+    uint32_t completion_q_rd_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE)
                                        .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);
     tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_wr_ptr);
     tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_rd_ptr);
@@ -2105,19 +2106,19 @@ void configure_for_single_chip(
     }
 
     uint32_t host_completion_queue_wr_ptr =
-        DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
+        DispatchMemMap::get(DISPATCH_CORE_TYPE).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
     uint32_t dev_completion_queue_wr_ptr =
-        DispatchMemMap::get(CoreType::WORKER)
+        DispatchMemMap::get(DISPATCH_CORE_TYPE)
             .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
     uint32_t dev_completion_queue_rd_ptr =
-        DispatchMemMap::get(CoreType::WORKER)
+        DispatchMemMap::get(DISPATCH_CORE_TYPE)
             .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);
 
     std::vector<uint32_t> dispatch_compile_args = {
         dispatch_buffer_base,
         DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE,
         DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS *
-            DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(),
+            DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages(),
         dispatch_cb_sem,  // overridden below for h
         split_prefetcher_g ? prefetch_d_downstream_cb_sem
                            : prefetch_downstream_cb_sem,  // overridden below for dispatch_h
@@ -2395,8 +2396,7 @@ void configure_for_multi_chip(
     uint32_t& packetized_path_test_results_addr,
     uint32_t packetized_path_test_results_size,
     uint32_t dev_hugepage_base_g) {
-    const CoreType dispatch_core_type = CoreType::WORKER;
-    uint32_t dispatch_buffer_pages = DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() *
+    uint32_t dispatch_buffer_pages = DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages() *
                                      DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS;
     uint32_t num_compute_cores =
         device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y;
@@ -2478,9 +2478,9 @@ void configure_for_multi_chip(
     uint32_t* host_hugepage_completion_buffer = (uint32_t*)host_hugepage_completion_buffer_base_g;
     vector<uint32_t> tmp = {dev_hugepage_completion_buffer_base >> 4};
     CoreCoord phys_dispatch_host_core = split_dispatcher_g ? phys_dispatch_h_core : phys_dispatch_core;
-    uint32_t completion_q_wr_ptr = DispatchMemMap::get(dispatch_core_type)
+    uint32_t completion_q_wr_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE)
                                        .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
-    uint32_t completion_q_rd_ptr = DispatchMemMap::get(dispatch_core_type)
+    uint32_t completion_q_rd_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE)
                                        .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);
     tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_wr_ptr);
     tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_rd_ptr);
@@ -2972,18 +2972,18 @@ void configure_for_multi_chip(
     }
 
     uint32_t host_completion_queue_wr_ptr =
-        DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
+        DispatchMemMap::get(DISPATCH_CORE_TYPE).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
     uint32_t dev_completion_queue_wr_ptr =
-        DispatchMemMap::get(CoreType::WORKER)
+        DispatchMemMap::get(DISPATCH_CORE_TYPE)
             .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
     uint32_t dev_completion_queue_rd_ptr =
-        DispatchMemMap::get(CoreType::WORKER)
+        DispatchMemMap::get(DISPATCH_CORE_TYPE)
             .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);
     std::vector<uint32_t> dispatch_compile_args = {
         dispatch_buffer_base,
         DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE,
         DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS *
-            DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(),
+            DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages(),
         dispatch_cb_sem,  // overridden below for h
         split_prefetcher_g ? prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem,
         DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS,
@@ -3321,7 +3321,7 @@ int main(int argc, char** argv) {
         CoreCoord phys_dispatch_relay_demux_core;
         uint32_t packetized_path_test_results_addr;
         uint32_t cq_start =
-            DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);
+            DispatchMemMap::get(DISPATCH_CORE_TYPE).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);
         uint32_t dev_hugepage_base_g = 2 * (cq_start * sizeof(uint32_t));  // HOST_CQ uses some at the start address
 
         if (test_device_id_g == 0) {