diff --git a/third_party/acl/inc/acl/acl_prof.h b/third_party/acl/inc/acl/acl_prof.h index 267bf142e..2eae2ef05 100644 --- a/third_party/acl/inc/acl/acl_prof.h +++ b/third_party/acl/inc/acl/acl_prof.h @@ -49,6 +49,7 @@ typedef enum { ACL_AICORE_RESOURCE_CONFLICT_RATIO = 4, ACL_AICORE_MEMORY_UB = 5, ACL_AICORE_L2_CACHE = 6, + ACL_AICORE_MEMORY_ACCESS = 8, ACL_AICORE_NONE = 0xFF } aclprofAicoreMetrics; diff --git a/torch_npu/csrc/profiler/feature_mgr.cpp b/torch_npu/csrc/profiler/feature_mgr.cpp index 28aa60c49..6ebcd0b3f 100644 --- a/torch_npu/csrc/profiler/feature_mgr.cpp +++ b/torch_npu/csrc/profiler/feature_mgr.cpp @@ -12,11 +12,13 @@ const static char* VERSION = "master\0"; static std::unordered_map NAME_TABLE = { {"ATTR", FeatureType::FEATURE_ATTR}, + {"MemoryAccess", FeatureType::FEATURE_MEMORY_ACCESS} }; // featureName, featureVersion static std::unordered_map FMK_FEATURES = { {FeatureType::FEATURE_ATTR, "1"}, + {FeatureType::FEATURE_MEMORY_ACCESS, "1"} }; } diff --git a/torch_npu/csrc/profiler/feature_mgr.h b/torch_npu/csrc/profiler/feature_mgr.h index b70073ab6..0f318fdb1 100644 --- a/torch_npu/csrc/profiler/feature_mgr.h +++ b/torch_npu/csrc/profiler/feature_mgr.h @@ -17,6 +17,7 @@ namespace profiler { enum class FeatureType { FEATURE_MIN = 0, FEATURE_ATTR, + FEATURE_MEMORY_ACCESS, FEATURE_MAX, }; using torch_npu::toolkit::profiler::Utils; diff --git a/torch_npu/csrc/profiler/profiler_mgr.cpp b/torch_npu/csrc/profiler/profiler_mgr.cpp index 381998052..e2ca47909 100644 --- a/torch_npu/csrc/profiler/profiler_mgr.cpp +++ b/torch_npu/csrc/profiler/profiler_mgr.cpp @@ -23,6 +23,7 @@ std::map ProfilerMgr::npu_metrics_map_ = { {"ACL_AICORE_RESOURCE_CONFLICT_RATIO", ACL_AICORE_RESOURCE_CONFLICT_RATIO}, {"ACL_AICORE_MEMORY_UB", ACL_AICORE_MEMORY_UB}, {"ACL_AICORE_L2_CACHE", ACL_AICORE_L2_CACHE}, + {"ACL_AICORE_MEMORY_ACCESS", ACL_AICORE_MEMORY_ACCESS}, {"ACL_AICORE_NONE", ACL_AICORE_NONE}, }; @@ -34,7 +35,19 @@ std::map ProfilerMgr::trace_level_map_ = { }; constexpr uint32_t capacity_ = 1048576; // 2^20, Experience value for default ringbuffer size for single data -constexpr uint32_t trace_capacity_ = 128; // 2^7, Experience value for python trace data ringbuffer size for batch data +constexpr uint32_t trace_capacity_ = 128; // 2^7, Experience value for python trace data ringbuffer size for batch data + +aclprofAicoreMetrics CheckAicMetricsFeature(aclprofAicoreMetrics aic_metrics, int8_t level) +{ + if (aic_metrics == ACL_AICORE_MEMORY_ACCESS && + !FeatureMgr::GetInstance()->IsSupportFeature(FeatureType::FEATURE_MEMORY_ACCESS)) { + ASCEND_LOGW("AiCMetrics is not supported to set to MemoryAccess."); + printf("[WARN]%s,%s:%u:AiCMetrics is not supported to set to MemoryAccess, reset to default.\n", + __FUNCTION__, __FILENAME__, __LINE__); + return (level >= 1 ? ACL_AICORE_PIPE_UTILIZATION : ACL_AICORE_NONE); + } + return aic_metrics; +} ProfilerMgr::ProfilerMgr() : report_enable_(false), @@ -77,12 +90,14 @@ void ProfilerMgr::Start(const NpuTraceConfig &npu_config, bool cpu_trace) c10_npu::npuSynchronizeDevice(); if (npu_trace_.load() == true) { aclprofAicoreMetrics aic_metrics = ACL_AICORE_NONE; + int8_t level_int = trace_level_to_int_.find(npu_config.trace_level) != trace_level_to_int_.end() ? + trace_level_to_int_[npu_config.trace_level] : -1; auto level_iter = trace_level_map_.find(npu_config.trace_level); uint64_t datatype_config = (level_iter == trace_level_map_.end()) ? Level0 : trace_level_map_[npu_config.trace_level]; auto metrics_iter = npu_metrics_map_.find(npu_config.metrics); if (metrics_iter != npu_metrics_map_.end() && npu_config.metrics.compare("ACL_AICORE_NONE") != 0) { datatype_config |= ACL_PROF_AICORE_METRICS; - aic_metrics = npu_metrics_map_[npu_config.metrics]; + aic_metrics = CheckAicMetricsFeature(npu_metrics_map_[npu_config.metrics], level_int); } if (npu_config.l2_cache) { datatype_config |= ACL_PROF_L2CACHE; @@ -111,8 +126,6 @@ void ProfilerMgr::Start(const NpuTraceConfig &npu_config, bool cpu_trace) const uint32_t deviceNum = 1; uint32_t deviceIdList[deviceNum] = {deviceId}; EnableMsProfiler(deviceIdList, deviceNum, aic_metrics, datatype_config); - int8_t level_int = trace_level_to_int_.find(npu_config.trace_level) != trace_level_to_int_.end() ? - trace_level_to_int_[npu_config.trace_level] : -1; trace_level_.store(level_int); } diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 7a5cd613c..fab9c71ac 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -118,6 +118,7 @@ class Constant(object): AicMemoryUB = "ACL_AICORE_MEMORY_UB" AicResourceConflictRatio = "ACL_AICORE_RESOURCE_CONFLICT_RATIO" AicL2Cache = "ACL_AICORE_L2_CACHE" + AicMemoryAccess = "ACL_AICORE_MEMORY_ACCESS" AicMetricsNone = "ACL_AICORE_NONE" Db = "db" Text = "text" diff --git a/torch_npu/profiler/experimental_config.py b/torch_npu/profiler/experimental_config.py index c2d831c5e..06a3b6d21 100644 --- a/torch_npu/profiler/experimental_config.py +++ b/torch_npu/profiler/experimental_config.py @@ -21,7 +21,7 @@ def supported_profiler_level(): def supported_ai_core_metrics(): return set((AiCMetrics.AiCoreNone, AiCMetrics.PipeUtilization, AiCMetrics.ArithmeticUtilization, AiCMetrics.Memory, AiCMetrics.MemoryL0, AiCMetrics.MemoryUB, - AiCMetrics.ResourceConflictRatio, AiCMetrics.L2Cache)) + AiCMetrics.ResourceConflictRatio, AiCMetrics.L2Cache, AiCMetrics.MemoryAccess)) def supported_export_type(): @@ -43,6 +43,7 @@ class AiCMetrics: MemoryUB = Constant.AicMemoryUB ResourceConflictRatio = Constant.AicResourceConflictRatio L2Cache = Constant.AicL2Cache + MemoryAccess = Constant.AicMemoryAccess AiCoreNone = Constant.AicMetricsNone @@ -120,10 +121,10 @@ def _check_params(self): if self._aic_metrics not in ( AiCMetrics.L2Cache, AiCMetrics.MemoryL0, AiCMetrics.Memory, AiCMetrics.MemoryUB, AiCMetrics.PipeUtilization, AiCMetrics.ArithmeticUtilization, AiCMetrics.ResourceConflictRatio, - Constant.AicMetricsNone): + AiCMetrics.MemoryAccess, AiCMetrics.AiCoreNone): print_warn_msg("Invalid parameter aic_metrics, reset it to default.") if self._profiler_level == ProfilerLevel.Level0: - self._aic_metrics = Constant.AicMetricsNone + self._aic_metrics = AiCMetrics.AiCoreNone else: self._aic_metrics = AiCMetrics.PipeUtilization if not isinstance(self._op_attr, bool):