diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp index cb31344a3498..94889e855843 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp @@ -15,8 +15,8 @@ using namespace tt::tt_metal; // Loop through test_func for WORKER, ETH X 1, 2 CQs void ForEachCoreTypeXHWCQs(const std::function& test_func) { - static constexpr auto core_types_to_test = std::array{CoreType::WORKER, CoreType::ETH}; - static constexpr auto num_hw_cqs_to_test = std::array{1, 2}; + const auto core_types_to_test = std::array{CoreType::WORKER, CoreType::ETH}; + const auto num_hw_cqs_to_test = std::array{1, 2}; for (const auto& core_type : core_types_to_test) { if (core_type == CoreType::ETH && @@ -33,7 +33,7 @@ void ForEachCoreTypeXHWCQs(const std::function #include #include "common.h" +#include "tt_cluster.hpp" #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp" @@ -55,6 +56,8 @@ constexpr uint32_t PCIE_TRANSFER_SIZE_DEFAULT = 4096; constexpr uint32_t host_data_dirty_pattern = 0xbaadf00d; +constexpr CoreType DISPATCH_CORE_TYPE = CoreType::WORKER; + ////////////////////////////////////////////////////////////////////////////////////////// // Test dispatch program performance // @@ -113,6 +116,8 @@ uint32_t l1_buf_base_g; uint32_t test_device_id_g = 0; void init(int argc, char** argv) { + auto default_settings = DispatchSettings::defaults(DISPATCH_CORE_TYPE, tt::Cluster::instance(), 1); + std::vector input_args(argv, argv + argc); if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { @@ -138,10 +143,7 @@ void init(int argc, char** argv) { log_info(LogTest, " -hp: host huge page issue buffer size (default {})", DEFAULT_HUGEPAGE_ISSUE_BUFFER_SIZE); log_info(LogTest, " -pq: prefetch queue entries (default {})", DEFAULT_PREFETCH_Q_ENTRIES); log_info(LogTest, " -cs: cmddat q size (default {})", DEFAULT_CMDDAT_Q_SIZE); - log_info( - LogTest, - "-pdcs: prefetch_d cmddat cb size (default {})", - DispatchMemMap::get(CoreType::WORKER, 1).prefetch_d_buffer_size()); + log_info(LogTest, "-pdcs: prefetch_d cmddat cb size (default {})", default_settings.prefetch_d_buffer_size_); log_info(LogTest, " -ss: scratch cb size (default {})", DEFAULT_SCRATCH_DB_SIZE); log_info( LogTest, @@ -174,8 +176,8 @@ void init(int argc, char** argv) { pcie_transfer_size_g = test_args::get_command_option_uint32(input_args, "-pcies", PCIE_TRANSFER_SIZE_DEFAULT); dram_page_size_g = test_args::get_command_option_uint32(input_args, "-dpgs", DRAM_PAGE_SIZE_DEFAULT); dram_pages_to_read_g = test_args::get_command_option_uint32(input_args, "-dpgr", DRAM_PAGES_TO_READ_DEFAULT); - prefetch_d_buffer_size_g = test_args::get_command_option_uint32( - input_args, "-pdcs", DispatchMemMap::get(CoreType::WORKER, 1).prefetch_d_buffer_size()); + prefetch_d_buffer_size_g = + test_args::get_command_option_uint32(input_args, "-pdcs", default_settings.prefetch_d_buffer_size_); test_type_g = test_args::get_command_option_uint32(input_args, "-t", DEFAULT_TEST_TYPE); all_workers_g.end_coord.x = test_args::get_command_option_uint32(input_args, "-wx", all_workers_g.end_coord.x); @@ -849,7 +851,7 @@ void gen_rnd_linear_cmd( size &= ~(sizeof(uint32_t) - 1); uint32_t offset = std::rand() % dispatch_buffer_page_size_g; offset = (offset >> 2) << 2; - device_data.relevel(CoreType::WORKER); // XXXXX shouldn't be needed + device_data.relevel(DISPATCH_CORE_TYPE); // XXXXX shouldn't be needed if (device_data.size_at(worker_core, 0) * sizeof(uint32_t) < max_linear_cmd_read_size + offset) { // Not enough data yet, just bail on this cmd return; @@ -1699,8 +1701,7 @@ void configure_for_single_chip( uint32_t& packetized_path_test_results_addr, uint32_t packetized_path_test_results_size, uint32_t dev_hugepage_base_g) { - const CoreType dispatch_core_type = CoreType::WORKER; - uint32_t dispatch_buffer_pages = DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() * + uint32_t dispatch_buffer_pages = DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages() * DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS; uint32_t num_compute_cores = device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y; @@ -1766,9 +1767,9 @@ void configure_for_single_chip( uint32_t* host_hugepage_completion_buffer = (uint32_t*)host_hugepage_completion_buffer_base_g; vector tmp = {dev_hugepage_completion_buffer_base >> 4}; CoreCoord phys_dispatch_host_core = split_dispatcher_g ? phys_dispatch_h_core : phys_dispatch_core; - uint32_t completion_q_wr_ptr = DispatchMemMap::get(dispatch_core_type) + uint32_t completion_q_wr_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE) .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); - uint32_t completion_q_rd_ptr = DispatchMemMap::get(dispatch_core_type) + uint32_t completion_q_rd_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE) .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_wr_ptr); tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_rd_ptr); @@ -2105,19 +2106,19 @@ void configure_for_single_chip( } uint32_t host_completion_queue_wr_ptr = - DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); + DispatchMemMap::get(DISPATCH_CORE_TYPE).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_wr_ptr = - DispatchMemMap::get(CoreType::WORKER) + DispatchMemMap::get(DISPATCH_CORE_TYPE) .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_rd_ptr = - DispatchMemMap::get(CoreType::WORKER) + DispatchMemMap::get(DISPATCH_CORE_TYPE) .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); std::vector dispatch_compile_args = { dispatch_buffer_base, DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE, DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS * - DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(), + DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages(), dispatch_cb_sem, // overridden below for h split_prefetcher_g ? prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem, // overridden below for dispatch_h @@ -2395,8 +2396,7 @@ void configure_for_multi_chip( uint32_t& packetized_path_test_results_addr, uint32_t packetized_path_test_results_size, uint32_t dev_hugepage_base_g) { - const CoreType dispatch_core_type = CoreType::WORKER; - uint32_t dispatch_buffer_pages = DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() * + uint32_t dispatch_buffer_pages = DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages() * DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS; uint32_t num_compute_cores = device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y; @@ -2478,9 +2478,9 @@ void configure_for_multi_chip( uint32_t* host_hugepage_completion_buffer = (uint32_t*)host_hugepage_completion_buffer_base_g; vector tmp = {dev_hugepage_completion_buffer_base >> 4}; CoreCoord phys_dispatch_host_core = split_dispatcher_g ? phys_dispatch_h_core : phys_dispatch_core; - uint32_t completion_q_wr_ptr = DispatchMemMap::get(dispatch_core_type) + uint32_t completion_q_wr_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE) .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); - uint32_t completion_q_rd_ptr = DispatchMemMap::get(dispatch_core_type) + uint32_t completion_q_rd_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE) .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_wr_ptr); tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_rd_ptr); @@ -2972,18 +2972,18 @@ void configure_for_multi_chip( } uint32_t host_completion_queue_wr_ptr = - DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); + DispatchMemMap::get(DISPATCH_CORE_TYPE).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_wr_ptr = - DispatchMemMap::get(CoreType::WORKER) + DispatchMemMap::get(DISPATCH_CORE_TYPE) .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_rd_ptr = - DispatchMemMap::get(CoreType::WORKER) + DispatchMemMap::get(DISPATCH_CORE_TYPE) .get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); std::vector dispatch_compile_args = { dispatch_buffer_base, DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE, DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS * - DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(), + DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages(), dispatch_cb_sem, // overridden below for h split_prefetcher_g ? prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem, DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS, @@ -3321,7 +3321,7 @@ int main(int argc, char** argv) { CoreCoord phys_dispatch_relay_demux_core; uint32_t packetized_path_test_results_addr; uint32_t cq_start = - DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED); + DispatchMemMap::get(DISPATCH_CORE_TYPE).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED); uint32_t dev_hugepage_base_g = 2 * (cq_start * sizeof(uint32_t)); // HOST_CQ uses some at the start address if (test_device_id_g == 0) { diff --git a/tt_metal/api/tt-metalium/command_queue_interface.hpp b/tt_metal/api/tt-metalium/command_queue_interface.hpp index 4f7876aa9f24..8320c1cf40a4 100644 --- a/tt_metal/api/tt-metalium/command_queue_interface.hpp +++ b/tt_metal/api/tt-metalium/command_queue_interface.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "cq_commands.hpp" #include "dispatch_core_manager.hpp" @@ -16,6 +17,7 @@ #include "dispatch_settings.hpp" #include "helpers.hpp" #include "buffer.hpp" +#include "umd/device/tt_core_coordinates.h" // FIXME: Don't do this in header files using namespace tt::tt_metal; @@ -58,51 +60,61 @@ class DispatchMemMap { DispatchMemMap(const DispatchMemMap&) = delete; DispatchMemMap(DispatchMemMap&& other) noexcept = delete; - // Returns an instance of the memory map for the provided core_type. If the number of HW CQs is not provided then - // the previous one will be used. - static const DispatchMemMap& get(const CoreType& core_type, const uint32_t num_hw_cqs = 0) { - static DispatchMemMap instance; - - if (num_hw_cqs > 0 && (core_type != instance.last_core_type || num_hw_cqs != instance.hw_cqs)) { + // + // Returns an instance. The instance is reset if the core_type and/or num_hw_cqs changed from + // the last call. The memory region sizes can be configured using DispatchSettings. + // + // If the settings changed, then force_reinit_with_settings will recreate the instance with + // the settings for the given core_type / num_hw_cqs. + // + static const DispatchMemMap& get( + const CoreType& core_type, const uint32_t num_hw_cqs = 0, const bool force_reinit_with_settings = false) { + auto& instance = get_instance(); + + if (num_hw_cqs > 0 && (core_type != instance.last_core_type || num_hw_cqs != instance.hw_cqs) || + force_reinit_with_settings) { instance.reset(core_type, num_hw_cqs); } - TT_FATAL(instance.hw_cqs > 0, "Command Queue is not initialized. Call DispatchMemMap::get with non zero num_hw_cqs."); + TT_FATAL( + instance.hw_cqs > 0, + "Command Queue is not initialized. Call DispatchMemMap::get with non zero num_hw_cqs."); return instance; } - uint32_t prefetch_q_entries() const { return prefetch_q_entries_; } + uint32_t prefetch_q_entries() const { return settings.prefetch_q_entries_; } - uint32_t prefetch_q_size() const { return prefetch_q_size_; } + uint32_t prefetch_q_size() const { return settings.prefetch_q_size_; } - uint32_t max_prefetch_command_size() const { return max_prefetch_command_size_; } + uint32_t max_prefetch_command_size() const { return settings.prefetch_max_cmd_size_; } uint32_t cmddat_q_base() const { return cmddat_q_base_; } - uint32_t cmddat_q_size() const { return cmddat_q_size_; } + uint32_t cmddat_q_size() const { return settings.prefetch_cmddat_q_size_; } uint32_t scratch_db_base() const { return scratch_db_base_; } - uint32_t scratch_db_size() const { return scratch_db_size_; } + uint32_t scratch_db_size() const { return settings.prefetch_scratch_db_size_; } uint32_t dispatch_buffer_block_size_pages() const { return dispatch_buffer_block_size_pages_; } uint32_t dispatch_buffer_base() const { return dispatch_buffer_base_; } - uint32_t dispatch_buffer_pages() const { return dispatch_buffer_pages_; } + uint32_t dispatch_buffer_pages() const { return settings.dispatch_pages_; } - uint32_t prefetch_d_buffer_size() const { return prefetch_d_buffer_size_; } + uint32_t prefetch_d_buffer_size() const { return settings.prefetch_d_buffer_size_; } - uint32_t prefetch_d_buffer_pages() const { return prefetch_d_buffer_pages_; } + uint32_t prefetch_d_buffer_pages() const { return settings.prefetch_d_pages_; } - uint32_t mux_buffer_size(uint8_t num_hw_cqs = 1) const { return prefetch_d_buffer_size_ / num_hw_cqs; } + uint32_t mux_buffer_size(uint8_t num_hw_cqs = 1) const { return settings.tunneling_buffer_size_ / num_hw_cqs; } - uint32_t mux_buffer_pages(uint8_t num_hw_cqs = 1) const { return prefetch_d_buffer_pages_ / num_hw_cqs; } + uint32_t mux_buffer_pages(uint8_t num_hw_cqs = 1) const { return settings.tunneling_buffer_pages_ / num_hw_cqs; } - uint32_t dispatch_s_buffer_size() const { return dispatch_s_buffer_size_; } + uint32_t dispatch_s_buffer_size() const { return settings.dispatch_s_buffer_size_; } uint32_t dispatch_s_buffer_pages() const { - return dispatch_s_buffer_size_ / (1 << tt::tt_metal::DispatchSettings::DISPATCH_S_BUFFER_LOG_PAGE_SIZE); + return settings.dispatch_s_buffer_size_ / + (1 << tt::tt_metal::DispatchSettings::DISPATCH_S_BUFFER_LOG_PAGE_SIZE); } uint32_t get_device_command_queue_addr(const CommandQueueDeviceAddrType& device_addr_type) const { @@ -125,30 +137,25 @@ class DispatchMemMap { private: DispatchMemMap() = default; - // Reset the instance using the default settings for the core_type and num_hw_cqs. - void reset(const CoreType& core_type, const uint32_t num_hw_cqs) { - const auto settings = DispatchSettings::defaults(core_type, tt::Cluster::instance(), num_hw_cqs); - reset(settings); + static DispatchMemMap& get_instance() { + static DispatchMemMap instance; + return instance; } - // Reset the instance using the provided settings - void reset(const DispatchSettings& settings) { + // Reset the instance using the settings for the core_type and num_hw_cqs. + void reset(const CoreType& core_type, const uint32_t num_hw_cqs) { + const auto dispatch_settings = DispatchSettings::get(core_type, num_hw_cqs); + this->settings = dispatch_settings; last_core_type = settings.core_type_; hw_cqs = settings.num_hw_cqs_; - prefetch_q_entries_ = settings.prefetch_q_entries_; - max_prefetch_command_size_ = settings.prefetch_max_cmd_size_; - cmddat_q_size_ = settings.prefetch_cmddat_q_size_; - scratch_db_size_ = settings.prefetch_scratch_db_size_; - prefetch_d_buffer_size_ = settings.prefetch_d_buffer_size_; - dispatch_s_buffer_size_ = settings.dispatch_s_buffer_size_; const auto dispatch_buffer_block_size = settings.dispatch_size_; const auto [l1_base, l1_size] = get_device_l1_info(settings.core_type_); const auto pcie_alignment = tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST); const auto l1_alignment = tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::L1); - TT_ASSERT(cmddat_q_size_ >= 2 * max_prefetch_command_size_); - TT_ASSERT(scratch_db_size_ % 2 == 0); + TT_ASSERT(settings.prefetch_cmddat_q_size_ >= 2 * settings.prefetch_max_cmd_size_); + TT_ASSERT(settings.prefetch_scratch_db_size_ % 2 == 0); TT_ASSERT((dispatch_buffer_block_size & (dispatch_buffer_block_size - 1)) == 0); TT_ASSERT( DispatchSettings::DISPATCH_MESSAGE_ENTRIES <= DispatchSettings::DISPATCH_MESSAGES_MAX_OFFSET / l1_alignment + 1, @@ -183,20 +190,17 @@ class DispatchMemMap { } } - prefetch_q_size_ = prefetch_q_entries_ * sizeof(DispatchSettings::prefetch_q_entry_type); uint32_t prefetch_dispatch_unreserved_base = device_cq_addrs_[tt::utils::underlying_type( CommandQueueDeviceAddrType::UNRESERVED)]; - cmddat_q_base_ = prefetch_dispatch_unreserved_base + round_size(prefetch_q_size_, pcie_alignment); - scratch_db_base_ = cmddat_q_base_ + round_size(cmddat_q_size_, pcie_alignment); - - TT_ASSERT(scratch_db_base_ + scratch_db_size_ < l1_size); + cmddat_q_base_ = prefetch_dispatch_unreserved_base + round_size(settings.prefetch_q_size_, pcie_alignment); + scratch_db_base_ = cmddat_q_base_ + round_size(settings.prefetch_cmddat_q_size_, pcie_alignment); dispatch_buffer_base_ = align(prefetch_dispatch_unreserved_base, 1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE); - dispatch_buffer_pages_ = dispatch_buffer_block_size / (1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE); - dispatch_buffer_block_size_pages_ = dispatch_buffer_pages_ / DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS; + dispatch_buffer_block_size_pages_ = settings.dispatch_pages_ / DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS; const uint32_t dispatch_cb_end = dispatch_buffer_base_ + settings.dispatch_size_; + + TT_ASSERT(scratch_db_base_ + settings.prefetch_scratch_db_size_ < l1_size); TT_ASSERT(dispatch_cb_end < l1_size); - prefetch_d_buffer_pages_ = settings.prefetch_d_pages_; } std::pair get_device_l1_info(const CoreType& core_type) const { @@ -219,21 +223,15 @@ class DispatchMemMap { return {l1_base, l1_size}; } - uint32_t prefetch_q_entries_; - uint32_t prefetch_q_size_; - uint32_t max_prefetch_command_size_; uint32_t cmddat_q_base_; - uint32_t cmddat_q_size_; uint32_t scratch_db_base_; - uint32_t scratch_db_size_; uint32_t dispatch_buffer_base_; + uint32_t dispatch_buffer_block_size_pages_; - uint32_t dispatch_buffer_pages_; - uint32_t prefetch_d_buffer_size_; - uint32_t prefetch_d_buffer_pages_; - uint32_t dispatch_s_buffer_size_; std::vector device_cq_addrs_; + DispatchSettings settings; + uint32_t hw_cqs{0}; // 0 means uninitialized CoreType last_core_type{CoreType::WORKER}; }; diff --git a/tt_metal/api/tt-metalium/dispatch_settings.hpp b/tt_metal/api/tt-metalium/dispatch_settings.hpp index 65242a4f2592..357e5220d16a 100644 --- a/tt_metal/api/tt-metalium/dispatch_settings.hpp +++ b/tt_metal/api/tt-metalium/dispatch_settings.hpp @@ -5,10 +5,12 @@ #pragma once #include +#include #include #include "hal.hpp" #include "tt_cluster.hpp" #include +#include #include "umd/device/tt_core_coordinates.h" namespace tt::tt_metal { @@ -16,7 +18,76 @@ namespace tt::tt_metal { // // Dispatch Kernel Settings // -struct DispatchSettings { +class DispatchSettings { +private: + struct DispatchSettingsContainerKey { + CoreType core_type; + uint32_t num_hw_cqs; + + bool operator==(const DispatchSettingsContainerKey& other) const { + return core_type == other.core_type && num_hw_cqs == other.num_hw_cqs; + } + }; + + struct DispatchSettingsContainerKeyHasher { + size_t operator()(const DispatchSettingsContainerKey& k) const { + const auto h1 = std::hash{}(static_cast(k.core_type)); + const auto h2 = std::hash{}(k.num_hw_cqs); + return h1 ^ (h2 << 1); + } + }; + + using DispatchSettingsContainer = + std::unordered_map; + + static DispatchSettingsContainer& get_store() { + static DispatchSettingsContainer store; + return store; + } + +public: + // Returns the default settings for WORKER cores + static DispatchSettings worker_defaults(const tt::Cluster& cluster, const uint32_t num_hw_cqs); + + // Returns the default settings for ETH cores + static DispatchSettings eth_defaults(const tt::Cluster& cluster, const uint32_t num_hw_cqs); + + // Returns the default settings + static DispatchSettings defaults(const CoreType& core_type, const tt::Cluster& cluster, const uint32_t num_hw_cqs); + + // Returns the settings for a core type and number hw cqs. The values can be modified, but customization must occur + // before command queue kernels are created. + static DispatchSettings& get(const CoreType& core_type, const uint32_t num_hw_cqs) { + DispatchSettingsContainerKey k{core_type, num_hw_cqs}; + auto& store = get_store(); + if (!store.contains(k)) { + TT_THROW( + "DispatchSettings is not initialized for CoreType {}, {} CQs", + magic_enum::enum_name(core_type), + num_hw_cqs); + } + return store[k]; + } + + // Reset the settings + static void initialize(const tt::Cluster& cluster) { + static constexpr std::array k_SupportedCoreTypes{CoreType::ETH, CoreType::WORKER}; + auto& store = get_store(); + for (const auto& core_type : k_SupportedCoreTypes) { + for (int hw_cqs = 1; hw_cqs <= MAX_NUM_HW_CQS; ++hw_cqs) { + DispatchSettingsContainerKey k{core_type, hw_cqs}; + store[k] = DispatchSettings::defaults(core_type, cluster, hw_cqs); + } + } + } + + // Reset the settings for a core type and number hw cqs to the provided settings + static void initialize(const DispatchSettings& other) { + auto& store = get_store(); + DispatchSettingsContainerKey k{other.core_type_, other.num_hw_cqs_}; + store[k] = other; + } + // // Non Configurable Settings // @@ -128,15 +199,6 @@ struct DispatchSettings { return !(*this == other); } - // Returns the default settings for WORKER cores - static DispatchSettings worker_defaults(const tt::Cluster& cluster, const uint32_t num_hw_cqs); - - // Returns the default settings for ETH cores - static DispatchSettings eth_defaults(const tt::Cluster& cluster, const uint32_t num_hw_cqs); - - // Returns the default settings - static DispatchSettings defaults(const CoreType& core_type, const tt::Cluster& cluster, const uint32_t num_hw_cqs); - DispatchSettings& core_type(const CoreType& val) { this->core_type_ = val; return *this; @@ -223,26 +285,4 @@ struct DispatchSettings { DispatchSettings& build(); }; -struct DispatchSettingsContainerKey { - CoreType core_type; - uint32_t num_hw_cqs; - - bool operator==(const DispatchSettingsContainerKey& other) const { - return core_type == other.core_type && num_hw_cqs == other.num_hw_cqs; - } -}; - -using DispatchSettingsContainer = std::unordered_map; - } // namespace tt::tt_metal - -namespace std { -template <> -struct hash { - size_t operator()(const tt::tt_metal::DispatchSettingsContainerKey& k) const { - const auto h1 = std::hash{}(static_cast(k.core_type)); - const auto h2 = std::hash{}(k.num_hw_cqs); - return h1 ^ (h2 << 1); - } -}; -} // namespace std diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index 391c8a26d437..c8a870c8e4cb 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -12,6 +12,8 @@ #include #include +#include "dispatch_core_manager.hpp" +#include "dispatch_settings.hpp" #include "dprint_server.hpp" #include "host_api.hpp" #include @@ -203,6 +205,8 @@ void DevicePool::initialize( tt::tt_metal::dispatch_core_manager::initialize(dispatch_core_config, num_hw_cqs); // Initialize the dispatch query layer, used by runtime command generation tt_metal::DispatchQueryManager::initialize(num_hw_cqs); + // Initialize DispatchSettings with defaults + tt_metal::DispatchSettings::initialize(tt::Cluster::instance()); if (_inst == nullptr) { static DevicePool device_pool{}; diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index cff7a0d113ac..2ccb761ed090 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -921,6 +921,7 @@ IDevice* CreateDeviceMinimal( ZoneScoped; tt::tt_metal::dispatch_core_manager::initialize(dispatch_core_config, num_hw_cqs); tt_metal::DispatchQueryManager::initialize(num_hw_cqs); + tt_metal::DispatchSettings::initialize(tt::Cluster::instance()); auto dev = new Device(device_id, num_hw_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, {}, true); tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(true); return dev;