From aecdb82d4f54091f8c01cc66ece4def303e82d12 Mon Sep 17 00:00:00 2001 From: Christopher Siefert Date: Wed, 16 Oct 2024 07:44:53 -0700 Subject: [PATCH 1/6] Kernel logger upgrades for better fence control --- debugging/kernel-logger/Makefile | 2 +- debugging/kernel-logger/kp_kernel_logger.cpp | 114 ++++++++++++++++--- 2 files changed, 101 insertions(+), 15 deletions(-) diff --git a/debugging/kernel-logger/Makefile b/debugging/kernel-logger/Makefile index a8e493e4c..ce48b7545 100644 --- a/debugging/kernel-logger/Makefile +++ b/debugging/kernel-logger/Makefile @@ -1,5 +1,5 @@ CXX=g++ -CXXFLAGS=-O3 -std=c++11 -g +CXXFLAGS=-O3 -std=c++11 -g -I../../profiling/all SHARED_CXXFLAGS=-shared -fPIC all: kp_kernel_logger.so diff --git a/debugging/kernel-logger/kp_kernel_logger.cpp b/debugging/kernel-logger/kp_kernel_logger.cpp index dc5b13167..b96d7303f 100644 --- a/debugging/kernel-logger/kp_kernel_logger.cpp +++ b/debugging/kernel-logger/kp_kernel_logger.cpp @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include "impl/Kokkos_Profiling_Interface.hpp" std::vector regions; static uint64_t uniqID; @@ -27,6 +29,62 @@ struct SpaceHandle { char name[64]; }; + + + +// Get a useful label from the deviceId +// NOTE: Relevant code is in: kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +std::string deviceIdToString(const uint32_t deviceId) { + using namespace Kokkos::Tools::Experimental; + std::string device_label("("); + ExecutionSpaceIdentifier eid = identifier_from_devid(deviceId); + if (eid.type == DeviceType::Serial) device_label+="Serial"; + else if (eid.type == DeviceType::OpenMP) device_label+="OpenMP"; + else if (eid.type == DeviceType::Cuda) device_label+="Cuda"; + else if (eid.type == DeviceType::HIP) device_label+="HIP"; + else if (eid.type == DeviceType::OpenMPTarget) device_label+="OpenMPTarget"; + else if (eid.type == DeviceType::HPX) device_label+="HPX"; + else if (eid.type == DeviceType::Threads) device_label+="Threads"; + else if (eid.type == 
DeviceType::SYCL) device_label+="SYCL"; + else if (eid.type == DeviceType::OpenACC) device_label+="OpenACC"; + else if (eid.type == DeviceType::Unknown) device_label+="Unknown"; + else device_label+="Unknown to KokkosTools"; + + if(eid.instance_id == int_for_synchronization_reason(SpecialSynchronizationCases::GlobalDeviceSynchronization)) + device_label += " All Instances)"; + else if(eid.instance_id == int_for_synchronization_reason(SpecialSynchronizationCases::DeepCopyResourceSynchronization)) + device_label += " DeepCopyResource)"; + else + device_label += " Instance " + std::to_string(eid.instance_id) + ")"; + + return device_label; +} + + + + +bool suppressCounts() { + static bool value=false; + static bool initialized=false; + + if(initialized) + return value; + else { + const char* varVal = std::getenv("KOKKOS_PROFILE_SUPPRESS_COUNTS"); + if(varVal) { + std::string v = std::string(varVal); + // default to false + if (v=="1" || v=="ON" || v=="on" || v=="TRUE" || v=="true" || v=="YES" || v=="yes") + value=true; + else + value=false; + } + initialized=true; + return value; + } +} + + void kokkosp_print_region_stack_indent(const int level) { printf("KokkosP: "); @@ -67,11 +125,15 @@ extern "C" void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, uint64_t* kID) { *kID = uniqID++; - + int output=*kID; + if(suppressCounts()) + output=0; + + printf( - "KokkosP: Executing parallel-for kernel on device %d with unique " + "KokkosP: Executing parallel-for kernel on device %s with unique " "execution identifier %llu\n", - devID, (unsigned long long)(*kID)); + deviceIdToString(devID).c_str(), (unsigned long long)(output)); int level = kokkosp_print_region_stack(); kokkosp_print_region_stack_indent(level); @@ -80,19 +142,25 @@ extern "C" void kokkosp_begin_parallel_for(const char* name, } extern "C" void kokkosp_end_parallel_for(const uint64_t kID) { + int output=kID; + if(suppressCounts()) + output=0; printf("KokkosP: Execution of kernel %llu is 
completed.\n", - (unsigned long long)(kID)); + (unsigned long long)output); } extern "C" void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { *kID = uniqID++; - + int output=*kID; + if(suppressCounts()) + output=0; + printf( - "KokkosP: Executing parallel-scan kernel on device %d with unique " + "KokkosP: Executing parallel-scan kernel on device %s with unique " "execution identifier %llu\n", - devID, (unsigned long long)(*kID)); + deviceIdToString(devID).c_str(), (unsigned long long)(output)); int level = kokkosp_print_region_stack(); kokkosp_print_region_stack_indent(level); @@ -101,19 +169,25 @@ extern "C" void kokkosp_begin_parallel_scan(const char* name, } extern "C" void kokkosp_end_parallel_scan(const uint64_t kID) { + int output=kID; + if(suppressCounts()) + output=0; printf("KokkosP: Execution of kernel %llu is completed.\n", - (unsigned long long)(kID)); + (unsigned long long)(output)); } extern "C" void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, uint64_t* kID) { *kID = uniqID++; + int output=*kID; + if(suppressCounts()) + output=0; printf( - "KokkosP: Executing parallel-reduce kernel on device %d with unique " + "KokkosP: Executing parallel-reduce kernel on device %s with unique " "execution identifier %llu\n", - devID, (unsigned long long)(*kID)); + deviceIdToString(devID).c_str(), (unsigned long long)(output)); int level = kokkosp_print_region_stack(); kokkosp_print_region_stack_indent(level); @@ -122,8 +196,12 @@ extern "C" void kokkosp_begin_parallel_reduce(const char* name, } extern "C" void kokkosp_end_parallel_reduce(const uint64_t kID) { + int output=kID; + if(suppressCounts()) + output=0; + printf("KokkosP: Execution of kernel %llu is completed.\n", - (unsigned long long)(kID)); + (unsigned long long)(output)); } extern "C" void kokkosp_begin_fence(const char* name, const uint32_t devID, @@ -139,10 +217,14 @@ extern "C" void kokkosp_begin_fence(const char* name, const uint32_t 
devID, } else { *kID = uniqID++; + int output=*kID; + if(suppressCounts()) + output=0; + printf( - "KokkosP: Executing fence on device %d with unique execution " + "KokkosP: Executing fence on device %s with unique execution " "identifier %llu\n", - devID, (unsigned long long)(*kID)); + deviceIdToString(devID).c_str(), (unsigned long long)(output)); int level = kokkosp_print_region_stack(); kokkosp_print_region_stack_indent(level); @@ -156,8 +238,12 @@ extern "C" void kokkosp_end_fence(const uint64_t kID) { // dealing with the application's fence, which we filtered out in the callback // for fences if (kID != std::numeric_limits::max()) { + int output=kID; + if(suppressCounts()) + output=0; + printf("KokkosP: Execution of fence %llu is completed.\n", - (unsigned long long)(kID)); + (unsigned long long)(output)); } } From 8358c9f2ce9ffbe610e5a5e4d0cf6ee80653b4d8 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Wed, 16 Oct 2024 16:00:55 -0600 Subject: [PATCH 2/6] profiling/all: Updating from Kokkos develop Signed-off-by: Chris Siefert --- profiling/all/impl/Kokkos_Profiling_Interface.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/profiling/all/impl/Kokkos_Profiling_Interface.hpp b/profiling/all/impl/Kokkos_Profiling_Interface.hpp index b66886d9f..ddd6223be 100644 --- a/profiling/all/impl/Kokkos_Profiling_Interface.hpp +++ b/profiling/all/impl/Kokkos_Profiling_Interface.hpp @@ -101,6 +101,15 @@ inline uint32_t device_id(ExecutionSpace const& space) noexcept { << num_instance_bits) + space.impl_instance_id(); } + +inline uint32_t int_for_synchronization_reason( + Kokkos::Tools::Experimental::SpecialSynchronizationCases reason) { + switch (reason) { + case GlobalDeviceSynchronization: return 0; + case DeepCopyResourceSynchronization: return 0x00ffffff; + } + return 0; +} } // namespace Experimental } // namespace Tools } // end namespace Kokkos From 62a39a1faa691a409084a97b77dfc1cfca411790 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Wed, 16 Oct 
2024 16:05:05 -0600 Subject: [PATCH 3/6] All hail clang Signed-off-by: Chris Siefert --- debugging/kernel-logger/kp_kernel_logger.cpp | 131 ++++++++++--------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/debugging/kernel-logger/kp_kernel_logger.cpp b/debugging/kernel-logger/kp_kernel_logger.cpp index b96d7303f..093a34c6c 100644 --- a/debugging/kernel-logger/kp_kernel_logger.cpp +++ b/debugging/kernel-logger/kp_kernel_logger.cpp @@ -29,61 +29,71 @@ struct SpaceHandle { char name[64]; }; - - - // Get a useful label from the deviceId -// NOTE: Relevant code is in: kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +// NOTE: Relevant code is in: +// kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp std::string deviceIdToString(const uint32_t deviceId) { using namespace Kokkos::Tools::Experimental; std::string device_label("("); ExecutionSpaceIdentifier eid = identifier_from_devid(deviceId); - if (eid.type == DeviceType::Serial) device_label+="Serial"; - else if (eid.type == DeviceType::OpenMP) device_label+="OpenMP"; - else if (eid.type == DeviceType::Cuda) device_label+="Cuda"; - else if (eid.type == DeviceType::HIP) device_label+="HIP"; - else if (eid.type == DeviceType::OpenMPTarget) device_label+="OpenMPTarget"; - else if (eid.type == DeviceType::HPX) device_label+="HPX"; - else if (eid.type == DeviceType::Threads) device_label+="Threads"; - else if (eid.type == DeviceType::SYCL) device_label+="SYCL"; - else if (eid.type == DeviceType::OpenACC) device_label+="OpenACC"; - else if (eid.type == DeviceType::Unknown) device_label+="Unknown"; - else device_label+="Unknown to KokkosTools"; - - if(eid.instance_id == int_for_synchronization_reason(SpecialSynchronizationCases::GlobalDeviceSynchronization)) + if (eid.type == DeviceType::Serial) + device_label += "Serial"; + else if (eid.type == DeviceType::OpenMP) + device_label += "OpenMP"; + else if (eid.type == DeviceType::Cuda) + device_label += "Cuda"; + else if (eid.type == DeviceType::HIP) + 
device_label += "HIP"; + else if (eid.type == DeviceType::OpenMPTarget) + device_label += "OpenMPTarget"; + else if (eid.type == DeviceType::HPX) + device_label += "HPX"; + else if (eid.type == DeviceType::Threads) + device_label += "Threads"; + else if (eid.type == DeviceType::SYCL) + device_label += "SYCL"; + else if (eid.type == DeviceType::OpenACC) + device_label += "OpenACC"; + else if (eid.type == DeviceType::Unknown) + device_label += "Unknown"; + else + device_label += "Unknown to KokkosTools"; + + if (eid.instance_id == + int_for_synchronization_reason( + SpecialSynchronizationCases::GlobalDeviceSynchronization)) device_label += " All Instances)"; - else if(eid.instance_id == int_for_synchronization_reason(SpecialSynchronizationCases::DeepCopyResourceSynchronization)) + else if (eid.instance_id == + int_for_synchronization_reason( + SpecialSynchronizationCases::DeepCopyResourceSynchronization)) device_label += " DeepCopyResource)"; else device_label += " Instance " + std::to_string(eid.instance_id) + ")"; - + return device_label; } - - - bool suppressCounts() { - static bool value=false; - static bool initialized=false; - - if(initialized) + static bool value = false; + static bool initialized = false; + + if (initialized) return value; else { const char* varVal = std::getenv("KOKKOS_PROFILE_SUPPRESS_COUNTS"); - if(varVal) { + if (varVal) { std::string v = std::string(varVal); // default to false - if (v=="1" || v=="ON" || v=="on" || v=="TRUE" || v=="true" || v=="YES" || v=="yes") - value=true; + if (v == "1" || v == "ON" || v == "on" || v == "TRUE" || v == "true" || + v == "YES" || v == "yes") + value = true; else - value=false; + value = false; } - initialized=true; + initialized = true; return value; } -} - +} void kokkosp_print_region_stack_indent(const int level) { printf("KokkosP: "); @@ -124,12 +134,10 @@ extern "C" void kokkosp_finalize_library() { extern "C" void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, uint64_t* kID) { - 
*kID = uniqID++; - int output=*kID; - if(suppressCounts()) - output=0; - - + *kID = uniqID++; + int output = *kID; + if (suppressCounts()) output = 0; + printf( "KokkosP: Executing parallel-for kernel on device %s with unique " "execution identifier %llu\n", @@ -142,9 +150,8 @@ extern "C" void kokkosp_begin_parallel_for(const char* name, } extern "C" void kokkosp_end_parallel_for(const uint64_t kID) { - int output=kID; - if(suppressCounts()) - output=0; + int output = kID; + if (suppressCounts()) output = 0; printf("KokkosP: Execution of kernel %llu is completed.\n", (unsigned long long)output); } @@ -152,11 +159,10 @@ extern "C" void kokkosp_end_parallel_for(const uint64_t kID) { extern "C" void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = uniqID++; - int output=*kID; - if(suppressCounts()) - output=0; - + *kID = uniqID++; + int output = *kID; + if (suppressCounts()) output = 0; + printf( "KokkosP: Executing parallel-scan kernel on device %s with unique " "execution identifier %llu\n", @@ -169,9 +175,8 @@ extern "C" void kokkosp_begin_parallel_scan(const char* name, } extern "C" void kokkosp_end_parallel_scan(const uint64_t kID) { - int output=kID; - if(suppressCounts()) - output=0; + int output = kID; + if (suppressCounts()) output = 0; printf("KokkosP: Execution of kernel %llu is completed.\n", (unsigned long long)(output)); } @@ -179,10 +184,9 @@ extern "C" void kokkosp_end_parallel_scan(const uint64_t kID) { extern "C" void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = uniqID++; - int output=*kID; - if(suppressCounts()) - output=0; + *kID = uniqID++; + int output = *kID; + if (suppressCounts()) output = 0; printf( "KokkosP: Executing parallel-reduce kernel on device %s with unique " @@ -196,9 +200,8 @@ extern "C" void kokkosp_begin_parallel_reduce(const char* name, } extern "C" void kokkosp_end_parallel_reduce(const uint64_t kID) { - int output=kID; - 
if(suppressCounts()) - output=0; + int output = kID; + if (suppressCounts()) output = 0; printf("KokkosP: Execution of kernel %llu is completed.\n", (unsigned long long)(output)); @@ -217,10 +220,9 @@ extern "C" void kokkosp_begin_fence(const char* name, const uint32_t devID, } else { *kID = uniqID++; - int output=*kID; - if(suppressCounts()) - output=0; - + int output = *kID; + if (suppressCounts()) output = 0; + printf( "KokkosP: Executing fence on device %s with unique execution " "identifier %llu\n", @@ -238,10 +240,9 @@ extern "C" void kokkosp_end_fence(const uint64_t kID) { // dealing with the application's fence, which we filtered out in the callback // for fences if (kID != std::numeric_limits::max()) { - int output=kID; - if(suppressCounts()) - output=0; - + int output = kID; + if (suppressCounts()) output = 0; + printf("KokkosP: Execution of fence %llu is completed.\n", (unsigned long long)(output)); } From 21d63596d6e24b231234d8c13a30aadb3297fe4a Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Thu, 17 Oct 2024 08:36:43 -0600 Subject: [PATCH 4/6] Update debugging/kernel-logger/kp_kernel_logger.cpp Co-authored-by: Daniel Arndt --- debugging/kernel-logger/kp_kernel_logger.cpp | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/debugging/kernel-logger/kp_kernel_logger.cpp b/debugging/kernel-logger/kp_kernel_logger.cpp index 093a34c6c..92d584f34 100644 --- a/debugging/kernel-logger/kp_kernel_logger.cpp +++ b/debugging/kernel-logger/kp_kernel_logger.cpp @@ -74,25 +74,18 @@ std::string deviceIdToString(const uint32_t deviceId) { } bool suppressCounts() { - static bool value = false; - static bool initialized = false; - - if (initialized) - return value; - else { + static bool value = [](){ const char* varVal = std::getenv("KOKKOS_PROFILE_SUPPRESS_COUNTS"); if (varVal) { std::string v = std::string(varVal); // default to false if (v == "1" || v == "ON" || v == "on" || v == "TRUE" || v == "true" || v == "YES" || v == "yes") - 
value = true; - else - value = false; + return true; } - initialized = true; - return value; - } + return false; + }(); + return value; } void kokkosp_print_region_stack_indent(const int level) { From 5f15867b98a7430d4c6d4896d8f910c2bd955bb8 Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Thu, 17 Oct 2024 08:37:37 -0600 Subject: [PATCH 5/6] Update kp_kernel_logger.cpp --- debugging/kernel-logger/kp_kernel_logger.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debugging/kernel-logger/kp_kernel_logger.cpp b/debugging/kernel-logger/kp_kernel_logger.cpp index 92d584f34..23dfcbe7b 100644 --- a/debugging/kernel-logger/kp_kernel_logger.cpp +++ b/debugging/kernel-logger/kp_kernel_logger.cpp @@ -75,7 +75,7 @@ std::string deviceIdToString(const uint32_t deviceId) { bool suppressCounts() { static bool value = [](){ - const char* varVal = std::getenv("KOKKOS_PROFILE_SUPPRESS_COUNTS"); + const char* varVal = std::getenv("KOKKOS_TOOLS_LOGGER_SUPPRESS_COUNTS"); if (varVal) { std::string v = std::string(varVal); // default to false From 520f74f28a96b9f0eeb0a350f776ce4bdfcf0e5d Mon Sep 17 00:00:00 2001 From: Chris Siefert Date: Tue, 12 Nov 2024 15:35:39 -0700 Subject: [PATCH 6/6] Update debugging/kernel-logger/kp_kernel_logger.cpp That's twisted, but sure, why not? 
Co-authored-by: Tomasetti Romin
---
Note: this hunk edits the format string and the argument list of the
existing printf call in place, so that the device is reported both as its
raw id (%d) and as its decoded label (%s) by a single call. The post-image
blob id on the index line below predates this correction.

 debugging/kernel-logger/kp_kernel_logger.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/debugging/kernel-logger/kp_kernel_logger.cpp b/debugging/kernel-logger/kp_kernel_logger.cpp
index 23dfcbe7b..8711bd494 100644
--- a/debugging/kernel-logger/kp_kernel_logger.cpp
+++ b/debugging/kernel-logger/kp_kernel_logger.cpp
@@ -159,7 +159,7 @@ extern "C" void kokkosp_begin_parallel_scan(const char* name,
   printf(
-      "KokkosP: Executing parallel-scan kernel on device %s with unique "
+      "KokkosP: Executing parallel-scan kernel on device %d (%s) with unique "
       "execution identifier %llu\n",
-      deviceIdToString(devID).c_str(), (unsigned long long)(output));
+      devID, deviceIdToString(devID).c_str(), (unsigned long long)(output));
 
   int level = kokkosp_print_region_stack();
   kokkosp_print_region_stack_indent(level);