diff --git a/include/simulator.h b/include/simulator.h index c26518e0c5..80a267488a 100644 --- a/include/simulator.h +++ b/include/simulator.h @@ -105,7 +105,7 @@ class MachineModel { virtual int get_num_gpus() const = 0; virtual float get_intra_node_gpu_bandwidth() const = 0; virtual float get_inter_node_gpu_bandwidth() const = 0; - virtual std::vector get_comm_path(MemDevice *src_mem, MemDevice *tar_mem) const = 0; + virtual std::vector get_comm_path(MemDevice *src_mem, MemDevice *tar_mem) = 0; virtual std::string to_string() const = 0; int version; }; @@ -120,7 +120,7 @@ class SimpleMachineModel : public MachineModel { int get_num_gpus() const; float get_intra_node_gpu_bandwidth() const; float get_inter_node_gpu_bandwidth() const; - std::vector get_comm_path(MemDevice *src_mem, MemDevice *tar_mem) const; + std::vector get_comm_path(MemDevice *src_mem, MemDevice *tar_mem); std::string to_string() const; private: int num_nodes; @@ -152,10 +152,6 @@ class SimpleMachineModel : public MachineModel { */ class EnhancedMachineModel : public MachineModel { public: - enum NicDistribution { - PER_NODE, - PER_SOCKET, - }; EnhancedMachineModel(std::string file, size_t gpu_fb_mem_capacity); ~EnhancedMachineModel(); int get_version() const; @@ -168,10 +164,12 @@ class EnhancedMachineModel : public MachineModel { MemDevice *get_gpu_fb_mem(int device_id) const; MemDevice *get_gpu_fb_mem(int socket_id, int local_id) const; CommDevice *get_nvlink(MemDevice *src_mem, MemDevice *tar_mem) const; + CommDevice *get_next_nic_in(int socket_id); + CommDevice *get_next_nic_out(int socket_id) const; int get_num_gpus() const; float get_intra_node_gpu_bandwidth() const; float get_inter_node_gpu_bandwidth() const; - std::vector get_comm_path(MemDevice *src_mem, MemDevice *tar_mem) const; + std::vector get_comm_path(MemDevice *src_mem, MemDevice *tar_mem); std::string to_string() const; private: int num_nodes; @@ -188,7 +186,8 @@ class EnhancedMachineModel : public MachineModel { float 
upi_bandwidth; float nic_latency; float nic_bandwidth; - NicDistribution nic_distribution; + int nic_persocket; + int cur_nic_local_id; float pci_latency; float pci_bandwidth; float nvlink_latency; @@ -214,8 +213,8 @@ class EnhancedMachineModel : public MachineModel { std::vector membuses; // socket_id std::vector upi_ins; // socket_id std::vector upi_outs; // socket_id - std::vector nic_ins; // socket_id - std::vector nic_outs; // socket_id + std::vector> nic_ins; // socket_id, local_id + std::vector> nic_outs; // socket_id, local_id std::vector pcis_to_host; // from gpu to main memory, socket_id std::vector pcis_to_device; // from main memory to gpu, socket_id std::vector> nvlinks; // node_id, local_id @@ -226,13 +225,13 @@ class EnhancedMachineModel : public MachineModel { void add_gpus(); void add_membuses(float latency, float bandwidth); void add_upis(float latency, float bandwidth); - void add_nics(float latency, float bandwidth, NicDistribution nic_distribution); + void add_nics(float latency, float bandwidth, int nic_persocket); void add_pcis(float latency, float bandwidth); void add_nvlinks(float latency, float bandwidth); // attach a nvlink communication device to a pair of GPU framebuffer memories void attach_nvlink(MemDevice *src_mem, MemDevice *tar_mem, CommDevice *comm); // return a list of specific communication devices based on the descriptions of a communication path - void add_comm_path(std::vector const &comm_device_list, MemDevice *src_mem, MemDevice *tar_mem, std::vector &ret) const; + void add_comm_path(std::vector const &comm_device_list, MemDevice *src_mem, MemDevice *tar_mem, std::vector &ret); }; class SimTask { diff --git a/machine_config_example b/machine_config_example index 080861b173..11b896805b 100644 --- a/machine_config_example +++ b/machine_config_example @@ -17,10 +17,10 @@ membus_bandwidth = 4.26623 # inter-socket links upi_latency = 0.0004 upi_bandwidth = 10.14039 -# inter-node links, the third argument means the distribution 
of the NICs (O means one NIC per node and 1 means one NIC per socket) +# inter-node links, the third argument means the number of NICs per socket (0 means one NIC per node) nic_latency = 0.000507 nic_bandwidth = 10.9448431 -nic_distribution = 0 +nic_persocket = 0 # pci-e between CPU and GPU pci_latency = 0.001 pci_bandwidth = 12.578468749999999 diff --git a/src/runtime/machine_model.cc b/src/runtime/machine_model.cc index cbfcbd26c9..3d449a3e93 100644 --- a/src/runtime/machine_model.cc +++ b/src/runtime/machine_model.cc @@ -92,7 +92,7 @@ float SimpleMachineModel::get_inter_node_gpu_bandwidth() const } -std::vector SimpleMachineModel::get_comm_path(MemDevice *src_mem, MemDevice *tar_mem) const +std::vector SimpleMachineModel::get_comm_path(MemDevice *src_mem, MemDevice *tar_mem) { std::vector ret; // on the same memory @@ -222,9 +222,9 @@ EnhancedMachineModel::EnhancedMachineModel(std::string file, size_t gpu_fb_mem_c nic_bandwidth = stof(words[2]); printf("nic_bandwidth = %f\n", nic_bandwidth); } - else if (words[0] == "nic_distribution") { - nic_distribution = static_cast(stoi(words[2])); - printf("nic_distribution = %d\n", nic_distribution); + else if (words[0] == "nic_persocket") { + nic_persocket = stoi(words[2]); + printf("nic_persocket = %d\n", nic_persocket); } else if (words[0] == "pci_latency") { pci_latency = stof(words[2]); @@ -345,15 +345,17 @@ EnhancedMachineModel::EnhancedMachineModel(std::string file, size_t gpu_fb_mem_c num_sockets = num_nodes * num_sockets_per_node; num_cpus = num_sockets * num_cpus_per_socket; num_gpus = num_sockets * num_gpus_per_socket; + cur_nic_local_id = 0; num_nvlinks_per_node = 0; mem_to_nvlink.clear(); this->add_cpus(); this->add_gpus(); this->add_membuses(membus_latency, membus_bandwidth * 1024 * 1024); this->add_upis(upi_latency / 2, upi_bandwidth * 2 * 1024 * 1024); - this->add_nics(nic_latency / 2, nic_bandwidth * 2 * 1024 * 1024, nic_distribution); + this->add_nics(nic_latency / 2, nic_bandwidth * 2 * 1024 * 1024, 
nic_persocket); this->add_pcis(pci_latency, pci_bandwidth * 1024 * 1024); this->add_nvlinks(nvlink_latency, nvlink_bandwidth * 1024 * 1024); +// printf("%s", this->to_string().c_str()); } EnhancedMachineModel::~EnhancedMachineModel() @@ -467,9 +469,9 @@ void EnhancedMachineModel::add_upis(float latency, float bandwidth) } } -void EnhancedMachineModel::add_nics(float latency, float bandwidth, NicDistribution nic_distribution) +void EnhancedMachineModel::add_nics(float latency, float bandwidth, int nic_persocket) { - if (nic_distribution == PER_NODE) { + if (nic_persocket == 0) { for (int i = 0; i < num_nodes; i++) { int node_id = i; for (int j = 0; j < num_sockets_per_node; j++) { @@ -480,36 +482,41 @@ void EnhancedMachineModel::add_nics(float latency, float bandwidth, NicDistribut if (j == 0) { std::string nic_in_name = "NIC_IN " + std::to_string(device_id); nic_in = new CommDevice(nic_in_name, CommDevice::NIC_IN_COMM, node_id, socket_id, device_id, latency, bandwidth); - nic_ins.push_back(nic_in); + nic_ins.push_back({}); + nic_ins[socket_id].push_back(nic_in); std::string nic_out_name = "NIC_OUT " + std::to_string(device_id); nic_out = new CommDevice(nic_out_name, CommDevice::NIC_OUT_COMM, node_id, socket_id, device_id, latency, bandwidth); - nic_outs.push_back(nic_out); + nic_outs.push_back({}); + nic_outs[socket_id].push_back(nic_out); } else { - nic_ins.push_back(nic_in); - nic_outs.push_back(nic_out); + nic_ins.push_back({}); + nic_ins[socket_id].push_back(nic_in); + nic_outs.push_back({}); + nic_outs[socket_id].push_back(nic_out); } } } } - else if (nic_distribution == PER_SOCKET) { + else { for (int i = 0; i < num_nodes; i++) { int node_id = i; for (int j = 0; j < num_sockets_per_node; j++) { int socket_id = i * num_sockets_per_node + j; - int device_id = socket_id; - std::string nic_in_name = "NIC_IN " + std::to_string(device_id); - CommDevice *nic_in = new CommDevice(nic_in_name, CommDevice::NIC_IN_COMM, node_id, socket_id, device_id, latency, bandwidth); 
- nic_ins.push_back(nic_in); - std::string nic_out_name = "NIC_OUT " + std::to_string(device_id); - CommDevice *nic_out = new CommDevice(nic_out_name, CommDevice::NIC_OUT_COMM, node_id, socket_id, device_id, latency, bandwidth); - nic_outs.push_back(nic_out); + nic_ins.push_back({}); + nic_outs.push_back({}); + for (int k = 0; k < nic_persocket; k++) { + int device_id = socket_id * nic_persocket + k; + std::string nic_in_name = "NIC_IN " + std::to_string(device_id); + CommDevice *nic_in = new CommDevice(nic_in_name, CommDevice::NIC_IN_COMM, node_id, socket_id, device_id, latency, bandwidth); + nic_ins[socket_id].push_back(nic_in); + std::string nic_out_name = "NIC_OUT " + std::to_string(device_id); + CommDevice *nic_out = new CommDevice(nic_out_name, CommDevice::NIC_OUT_COMM, node_id, socket_id, device_id, latency, bandwidth); + nic_outs[socket_id].push_back(nic_out); + } } } } - else { - assert(false && "Unknown nic distribution type"); - } } void EnhancedMachineModel::add_pcis(float latency, float bandwidth) @@ -560,7 +567,7 @@ void EnhancedMachineModel::add_nvlinks(float latency, float bandwidth) local_nvlink_id--; } attach_nvlink(src_gpu_fb_mem, tar_gpu_fb_mem, nvlinks[i][local_nvlink_id]); - printf("add nvlink: gdb_fb_mem %d , gou_fb_mem %d, nvlink %d %d\n", src_gpu_fb_mem->device_id, tar_gpu_fb_mem->device_id, node_id, local_nvlink_id); + printf("add nvlink: gpu_fb_mem %d , gpu_fb_mem %d, nvlink %d %d\n", src_gpu_fb_mem->device_id, tar_gpu_fb_mem->device_id, node_id, local_nvlink_id); } } } @@ -648,13 +655,42 @@ CommDevice *EnhancedMachineModel::get_nvlink(MemDevice *src_mem, MemDevice *tar_ } } +CommDevice *EnhancedMachineModel::get_next_nic_in(int socket_id) +{ + if (nic_persocket == 0) { + return nic_ins[socket_id][0]; + } + if (socket_id < num_sockets) { + CommDevice *ret = nic_ins[socket_id][cur_nic_local_id]; + cur_nic_local_id = (cur_nic_local_id + 1) % nic_persocket; + return ret; + } + else { + printf("MachineModel: get_next_nic_in - cannot find next 
nic_in socket_id %d cur_nic_local_id %d\n", socket_id, cur_nic_local_id); + assert(false); + } +} + +CommDevice *EnhancedMachineModel::get_next_nic_out(int socket_id) const +{ + if (nic_persocket == 0) { + return nic_outs[socket_id][0]; + } + if (socket_id < num_sockets) { + return nic_outs[socket_id][cur_nic_local_id]; + } + else { + printf("MachineModel: get_next_nic_out - cannot find next nic_out socket_id %d cur_nic_local_id %d\n", socket_id, cur_nic_local_id); + assert(false); + } +} int EnhancedMachineModel::get_num_gpus() const { return num_gpus; } void EnhancedMachineModel::add_comm_path(std::vector const &comm_device_list, MemDevice *src_mem, - MemDevice *tar_mem, std::vector &ret) const + MemDevice *tar_mem, std::vector &ret) { MemDevice *cur_mem = src_mem; for (size_t i = 0; i < comm_device_list.size(); i++) { @@ -672,10 +708,10 @@ void EnhancedMachineModel::add_comm_path(std::vector co break; case CommDevice::NIC_IN_COMM: cur_mem = tar_mem; - ret.emplace_back(nic_ins[cur_mem->socket_id]); + ret.emplace_back(get_next_nic_in(cur_mem->socket_id)); break; case CommDevice::NIC_OUT_COMM: - ret.emplace_back(nic_outs[cur_mem->socket_id]); + ret.emplace_back(get_next_nic_out(cur_mem->socket_id)); break; case CommDevice::PCI_TO_HOST_COMM: ret.emplace_back(pcis_to_host[cur_mem->socket_id]); @@ -692,7 +728,7 @@ void EnhancedMachineModel::add_comm_path(std::vector co } } -std::vector EnhancedMachineModel::get_comm_path(MemDevice *src_mem, MemDevice *tar_mem) const +std::vector EnhancedMachineModel::get_comm_path(MemDevice *src_mem, MemDevice *tar_mem) { std::vector ret; if (src_mem->device_id == tar_mem->device_id) { @@ -790,10 +826,12 @@ std::string EnhancedMachineModel::to_string() const s += membuses[socket_id]->name + '\n'; s += upi_ins[socket_id]->name + '\n'; s += upi_outs[socket_id]->name + '\n'; - s += nic_ins[socket_id]->name + '\n'; - s += nic_outs[socket_id]->name + '\n'; s += pcis_to_host[socket_id]->name + '\n'; s += pcis_to_device[socket_id]->name + 
'\n'; + for (int k = 0; k < nic_persocket; k++) { + s += nic_ins[socket_id][k]->name + '\n'; + s += nic_outs[socket_id][k]->name + '\n'; + } } s += "------------------------------------------\n"; for (int j = 0; j < num_nvlinks_per_node * 2; j++) { diff --git a/src/runtime/simulator.cc b/src/runtime/simulator.cc index 0502047de1..d1d88622f3 100644 --- a/src/runtime/simulator.cc +++ b/src/runtime/simulator.cc @@ -221,7 +221,7 @@ void Simulator::add_task_dependencies_with_xfer(SimTask* src_task, { std::vector path = machine->get_comm_path(src_task->mem, dst_task->mem); // print the communication path - // printf("Path from %s to %s is: ", src_task->mem->name.c_str(), dst_task->mem->name.c_str()); + // printf("Message: %zu B\nPath from %s to %s is: ", message_size, src_task->mem->name.c_str(), dst_task->mem->name.c_str()); // for (size_t i = 0; i < path.size(); i++) { // printf("%s ", path[i]->name.c_str()); // } @@ -243,6 +243,12 @@ void Simulator::add_task_dependencies_with_xfer(SimTask* src_task, num_segment = max_num_segments; seg_size = message_size / num_segment; } + // optional optimization: can reduce the simulation time, but could also impact the accuracy of the simulation + // (a communication device can be occupied by a message for a long time without being used by other concurrent communications) + // if (path.size() == 1) { + // num_segment = 1; + // seg_size = message_size; + // } // Create all the comm tasks // Divide messages into segments for (size_t i = 0; i < path.size(); i++) { @@ -277,10 +283,10 @@ void Simulator::add_task_dependencies_with_xfer(SimTask* src_task, // overlap between upi_ins and upi_outs, and between nic_ins and nic_outs. 
if (num_segment > 1 and path.size() >= 2) { for (size_t i = 0; i < path.size(); i++) { - for (int j = 1; j < num_segment; j++) { + for (int j = 0; j < num_segment - 1; j++) { if (((CommDevice *)all_tasks[i][j]->device)->comm_type == CommDevice::NIC_OUT_COMM or ((CommDevice *)all_tasks[i][j]->device)->comm_type == CommDevice::UPI_OUT_COMM) { - all_tasks[i+1][j-1]->add_next_task(all_tasks[i][j]); + all_tasks[i][j]->add_next_task(all_tasks[i-1][j+1]); } } }