Skip to content

Commit

Permalink
#16339: setup DispatchMemMap with custom settings
Browse files Browse the repository at this point in the history
- Previously DispatchMemMap was hardcoded to use only
  the default settings
- DispatchMemMap initialized using a DispatchSettings
  object with a core_type & num_hw_cqs combo
- Singleton access to DispatchSettings. Buffer sizes
  may be customized during device initialization
  • Loading branch information
nhuang-tt committed Feb 3, 2025
1 parent 7073093 commit 83b3325
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 131 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ using namespace tt::tt_metal;

// Loop through test_func for WORKER, ETH X 1, 2 CQs
void ForEachCoreTypeXHWCQs(const std::function<void(const CoreType& core_type, const uint32_t num_hw_cqs)>& test_func) {
static constexpr auto core_types_to_test = std::array<CoreType, 2>{CoreType::WORKER, CoreType::ETH};
static constexpr auto num_hw_cqs_to_test = std::array<uint32_t, 2>{1, 2};
const auto core_types_to_test = std::array<CoreType, 2>{CoreType::WORKER, CoreType::ETH};
const auto num_hw_cqs_to_test = std::array<uint32_t, 2>{1, 2};

for (const auto& core_type : core_types_to_test) {
if (core_type == CoreType::ETH &&
Expand All @@ -33,7 +33,7 @@ void ForEachCoreTypeXHWCQs(const std::function<void(const CoreType& core_type, c

TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsDefaultUnsupportedCoreType) {
const auto unsupported_core = CoreType::ARC;
EXPECT_THROW(DispatchSettings::defaults(unsupported_core, tt::Cluster::instance(), 1), std::runtime_error);
EXPECT_THROW(DispatchSettings::get(unsupported_core, 1), std::runtime_error);
}

TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsMissingArgs) {
Expand All @@ -42,60 +42,121 @@ TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsMissingArgs) {
}

TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsEq) {
static constexpr uint32_t hw_cqs = 2;
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
const uint32_t hw_cqs = 2;
auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
auto settings_2 = settings; // Copy
EXPECT_EQ(settings, settings_2);
settings_2.dispatch_size_ += 1;
EXPECT_NE(settings, settings_2);
}

TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetPrefetchDBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_bytes = 0xcafe;
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
const uint32_t hw_cqs = 2;
const uint32_t expected_buffer_bytes = 0xcafe;
const uint32_t expected_page_count =
expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
settings.prefetch_d_buffer_size(expected_buffer_bytes);
EXPECT_EQ(settings.prefetch_d_buffer_size_, expected_buffer_bytes);
EXPECT_EQ(settings.prefetch_d_pages_, expected_page_count);
}

TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetPrefetchQBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_entries = 0x1000;
static constexpr uint32_t expected_buffer_bytes = expected_buffer_entries * sizeof(DispatchSettings::prefetch_q_entry_type);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
const uint32_t hw_cqs = 2;
const uint32_t expected_buffer_entries = 0x1000;
const uint32_t expected_buffer_bytes = expected_buffer_entries * sizeof(DispatchSettings::prefetch_q_entry_type);
auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
settings.prefetch_q_entries(expected_buffer_entries);
EXPECT_EQ(settings.prefetch_q_entries_, expected_buffer_entries);
EXPECT_EQ(settings.prefetch_q_size_, expected_buffer_bytes);
}

TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetDispatchBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_bytes = 0x2000;
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
const uint32_t hw_cqs = 2;
const uint32_t expected_buffer_bytes = 0x2000;
const uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
settings.dispatch_size(expected_buffer_bytes);
EXPECT_EQ(settings.dispatch_size_, expected_buffer_bytes);
EXPECT_EQ(settings.dispatch_pages_, expected_page_count);
}

TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetDispatchSBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_bytes = 0x2000;
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_S_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
const uint32_t hw_cqs = 2;
const uint32_t expected_buffer_bytes = 0x2000;
const uint32_t expected_page_count =
expected_buffer_bytes / (1 << DispatchSettings::DISPATCH_S_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
settings.dispatch_s_buffer_size(expected_buffer_bytes);
EXPECT_EQ(settings.dispatch_s_buffer_size_, expected_buffer_bytes);
EXPECT_EQ(settings.dispatch_s_buffer_pages_, expected_page_count);
}

TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsSetTunnelerBuffer) {
static constexpr uint32_t hw_cqs = 2;
static constexpr uint32_t expected_buffer_bytes = 0x2000;
static constexpr uint32_t expected_page_count = expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::worker_defaults(tt::Cluster::instance(), hw_cqs);
const uint32_t hw_cqs = 2;
const uint32_t expected_buffer_bytes = 0x2000;
const uint32_t expected_page_count =
expected_buffer_bytes / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
auto settings = DispatchSettings::get(CoreType::WORKER, hw_cqs);
settings.tunneling_buffer_size(expected_buffer_bytes);
EXPECT_EQ(settings.tunneling_buffer_size_, expected_buffer_bytes);
EXPECT_EQ(settings.tunneling_buffer_pages_, expected_page_count);
}

TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsMutations) {
if (hal.get_programmable_core_type_index(tt::tt_metal::HalProgrammableCoreType::IDLE_ETH) == -1) {
// This device does not have the eth core
tt::log_info(tt::LogTest, "Test not supported on this device");
return;
}
const auto core_type = CoreType::WORKER;
const uint32_t hw_cqs = 1;
const uint32_t prefetch_d_size = 0x1000;
const uint32_t mux_size = 0x2000;
const uint32_t cmddat_size = 0x2000;
const uint32_t dispatch_s_size = 32;
const uint32_t dispatch_size = 4096;
const uint32_t max_cmd_size = 1024;
const uint32_t prefetch_q_entries = 512;
const uint32_t scratch_db_size = 5120;

auto& settings = DispatchSettings::get(core_type, hw_cqs);
DispatchSettings original_settings = settings; // Copy the original to be restored later

EXPECT_EQ(settings.core_type_, core_type);

// Modify settings
settings.prefetch_q_entries(prefetch_q_entries);
settings.prefetch_d_buffer_size(prefetch_d_size);
settings.tunneling_buffer_size(mux_size);
settings.prefetch_cmddat_q_size(cmddat_size);
settings.dispatch_s_buffer_size(dispatch_s_size);
settings.prefetch_max_cmd_size(max_cmd_size);
settings.prefetch_scratch_db_size(scratch_db_size);
settings.dispatch_size(dispatch_size);

// Change instance
// Check they are not the same
auto& settings_2 = DispatchSettings::get(CoreType::ETH, hw_cqs);
EXPECT_NE(settings_2.prefetch_q_entries_, prefetch_q_entries);
EXPECT_NE(settings_2.prefetch_d_buffer_size_, prefetch_d_size);
EXPECT_NE(settings_2.tunneling_buffer_size_, mux_size);
EXPECT_NE(settings_2.prefetch_cmddat_q_size_, cmddat_size);
EXPECT_NE(settings_2.dispatch_s_buffer_size_, dispatch_s_size);
EXPECT_NE(settings_2.prefetch_max_cmd_size_, max_cmd_size);
EXPECT_NE(settings_2.prefetch_scratch_db_size_, scratch_db_size);
EXPECT_NE(settings_2.dispatch_size_, dispatch_size);

// Change back to the instance that we modified
auto& settings_3 = DispatchSettings::get(core_type, hw_cqs);
EXPECT_EQ(settings_3.prefetch_q_entries_, prefetch_q_entries);
EXPECT_EQ(settings_3.prefetch_d_buffer_size_, prefetch_d_size);
EXPECT_EQ(settings_3.tunneling_buffer_size_, mux_size);
EXPECT_EQ(settings_3.prefetch_cmddat_q_size_, cmddat_size);
EXPECT_EQ(settings_3.dispatch_s_buffer_size_, dispatch_s_size);
EXPECT_EQ(settings_3.prefetch_max_cmd_size_, max_cmd_size);
EXPECT_EQ(settings_3.prefetch_scratch_db_size_, scratch_db_size);
EXPECT_EQ(settings_3.dispatch_size_, dispatch_size);

DispatchSettings::initialize(original_settings);
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <tt-metalium/command_queue_interface.hpp>
#include <tt-metalium/dispatch_settings.hpp>
#include "common.h"
#include "tt_cluster.hpp"
#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp"
#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp"

Expand Down Expand Up @@ -55,6 +56,8 @@ constexpr uint32_t PCIE_TRANSFER_SIZE_DEFAULT = 4096;

constexpr uint32_t host_data_dirty_pattern = 0xbaadf00d;

constexpr CoreType DISPATCH_CORE_TYPE = CoreType::WORKER;

//////////////////////////////////////////////////////////////////////////////////////////
// Test dispatch program performance
//
Expand Down Expand Up @@ -113,6 +116,8 @@ uint32_t l1_buf_base_g;
uint32_t test_device_id_g = 0;

void init(int argc, char** argv) {
auto default_settings = DispatchSettings::defaults(DISPATCH_CORE_TYPE, tt::Cluster::instance(), 1);

std::vector<std::string> input_args(argv, argv + argc);

if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) {
Expand All @@ -138,10 +143,7 @@ void init(int argc, char** argv) {
log_info(LogTest, " -hp: host huge page issue buffer size (default {})", DEFAULT_HUGEPAGE_ISSUE_BUFFER_SIZE);
log_info(LogTest, " -pq: prefetch queue entries (default {})", DEFAULT_PREFETCH_Q_ENTRIES);
log_info(LogTest, " -cs: cmddat q size (default {})", DEFAULT_CMDDAT_Q_SIZE);
log_info(
LogTest,
"-pdcs: prefetch_d cmddat cb size (default {})",
DispatchMemMap::get(CoreType::WORKER, 1).prefetch_d_buffer_size());
log_info(LogTest, "-pdcs: prefetch_d cmddat cb size (default {})", default_settings.prefetch_d_buffer_size_);
log_info(LogTest, " -ss: scratch cb size (default {})", DEFAULT_SCRATCH_DB_SIZE);
log_info(
LogTest,
Expand Down Expand Up @@ -174,8 +176,8 @@ void init(int argc, char** argv) {
pcie_transfer_size_g = test_args::get_command_option_uint32(input_args, "-pcies", PCIE_TRANSFER_SIZE_DEFAULT);
dram_page_size_g = test_args::get_command_option_uint32(input_args, "-dpgs", DRAM_PAGE_SIZE_DEFAULT);
dram_pages_to_read_g = test_args::get_command_option_uint32(input_args, "-dpgr", DRAM_PAGES_TO_READ_DEFAULT);
prefetch_d_buffer_size_g = test_args::get_command_option_uint32(
input_args, "-pdcs", DispatchMemMap::get(CoreType::WORKER, 1).prefetch_d_buffer_size());
prefetch_d_buffer_size_g =
test_args::get_command_option_uint32(input_args, "-pdcs", default_settings.prefetch_d_buffer_size_);

test_type_g = test_args::get_command_option_uint32(input_args, "-t", DEFAULT_TEST_TYPE);
all_workers_g.end_coord.x = test_args::get_command_option_uint32(input_args, "-wx", all_workers_g.end_coord.x);
Expand Down Expand Up @@ -849,7 +851,7 @@ void gen_rnd_linear_cmd(
size &= ~(sizeof(uint32_t) - 1);
uint32_t offset = std::rand() % dispatch_buffer_page_size_g;
offset = (offset >> 2) << 2;
device_data.relevel(CoreType::WORKER); // XXXXX shouldn't be needed
device_data.relevel(DISPATCH_CORE_TYPE); // XXXXX shouldn't be needed
if (device_data.size_at(worker_core, 0) * sizeof(uint32_t) < max_linear_cmd_read_size + offset) {
// Not enough data yet, just bail on this cmd
return;
Expand Down Expand Up @@ -1699,8 +1701,7 @@ void configure_for_single_chip(
uint32_t& packetized_path_test_results_addr,
uint32_t packetized_path_test_results_size,
uint32_t dev_hugepage_base_g) {
const CoreType dispatch_core_type = CoreType::WORKER;
uint32_t dispatch_buffer_pages = DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() *
uint32_t dispatch_buffer_pages = DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages() *
DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS;
uint32_t num_compute_cores =
device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y;
Expand Down Expand Up @@ -1766,9 +1767,9 @@ void configure_for_single_chip(
uint32_t* host_hugepage_completion_buffer = (uint32_t*)host_hugepage_completion_buffer_base_g;
vector<uint32_t> tmp = {dev_hugepage_completion_buffer_base >> 4};
CoreCoord phys_dispatch_host_core = split_dispatcher_g ? phys_dispatch_h_core : phys_dispatch_core;
uint32_t completion_q_wr_ptr = DispatchMemMap::get(dispatch_core_type)
uint32_t completion_q_wr_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
uint32_t completion_q_rd_ptr = DispatchMemMap::get(dispatch_core_type)
uint32_t completion_q_rd_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);
tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_wr_ptr);
tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_rd_ptr);
Expand Down Expand Up @@ -2105,19 +2106,19 @@ void configure_for_single_chip(
}

uint32_t host_completion_queue_wr_ptr =
DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
DispatchMemMap::get(DISPATCH_CORE_TYPE).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
uint32_t dev_completion_queue_wr_ptr =
DispatchMemMap::get(CoreType::WORKER)
DispatchMemMap::get(DISPATCH_CORE_TYPE)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
uint32_t dev_completion_queue_rd_ptr =
DispatchMemMap::get(CoreType::WORKER)
DispatchMemMap::get(DISPATCH_CORE_TYPE)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);

std::vector<uint32_t> dispatch_compile_args = {
dispatch_buffer_base,
DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE,
DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS *
DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(),
DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages(),
dispatch_cb_sem, // overridden below for h
split_prefetcher_g ? prefetch_d_downstream_cb_sem
: prefetch_downstream_cb_sem, // overridden below for dispatch_h
Expand Down Expand Up @@ -2395,8 +2396,7 @@ void configure_for_multi_chip(
uint32_t& packetized_path_test_results_addr,
uint32_t packetized_path_test_results_size,
uint32_t dev_hugepage_base_g) {
const CoreType dispatch_core_type = CoreType::WORKER;
uint32_t dispatch_buffer_pages = DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages() *
uint32_t dispatch_buffer_pages = DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages() *
DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS;
uint32_t num_compute_cores =
device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y;
Expand Down Expand Up @@ -2478,9 +2478,9 @@ void configure_for_multi_chip(
uint32_t* host_hugepage_completion_buffer = (uint32_t*)host_hugepage_completion_buffer_base_g;
vector<uint32_t> tmp = {dev_hugepage_completion_buffer_base >> 4};
CoreCoord phys_dispatch_host_core = split_dispatcher_g ? phys_dispatch_h_core : phys_dispatch_core;
uint32_t completion_q_wr_ptr = DispatchMemMap::get(dispatch_core_type)
uint32_t completion_q_wr_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
uint32_t completion_q_rd_ptr = DispatchMemMap::get(dispatch_core_type)
uint32_t completion_q_rd_ptr = DispatchMemMap::get(DISPATCH_CORE_TYPE)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);
tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_wr_ptr);
tt::llrt::write_hex_vec_to_core(device->id(), phys_dispatch_host_core, tmp, completion_q_rd_ptr);
Expand Down Expand Up @@ -2972,18 +2972,18 @@ void configure_for_multi_chip(
}

uint32_t host_completion_queue_wr_ptr =
DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
DispatchMemMap::get(DISPATCH_CORE_TYPE).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
uint32_t dev_completion_queue_wr_ptr =
DispatchMemMap::get(CoreType::WORKER)
DispatchMemMap::get(DISPATCH_CORE_TYPE)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
uint32_t dev_completion_queue_rd_ptr =
DispatchMemMap::get(CoreType::WORKER)
DispatchMemMap::get(DISPATCH_CORE_TYPE)
.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);
std::vector<uint32_t> dispatch_compile_args = {
dispatch_buffer_base,
DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE,
DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS *
DispatchMemMap::get(dispatch_core_type, 1).dispatch_buffer_block_size_pages(),
DispatchMemMap::get(DISPATCH_CORE_TYPE, 1).dispatch_buffer_block_size_pages(),
dispatch_cb_sem, // overridden below for h
split_prefetcher_g ? prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem,
DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS,
Expand Down Expand Up @@ -3321,7 +3321,7 @@ int main(int argc, char** argv) {
CoreCoord phys_dispatch_relay_demux_core;
uint32_t packetized_path_test_results_addr;
uint32_t cq_start =
DispatchMemMap::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);
DispatchMemMap::get(DISPATCH_CORE_TYPE).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);
uint32_t dev_hugepage_base_g = 2 * (cq_start * sizeof(uint32_t)); // HOST_CQ uses some at the start address

if (test_device_id_g == 0) {
Expand Down
Loading

0 comments on commit 83b3325

Please sign in to comment.