From 7d0b0d671cb287a13a305752ac6ea03d4ce200c1 Mon Sep 17 00:00:00 2001
From: David Hunt
Date: Wed, 18 Dec 2024 09:25:00 +0000
Subject: [PATCH] add patches to support IPM 24.12 release

Add patch sets for the following DPDK versions, including the capacity metric:
- 20.11.9
- 21.11.8
- 22.11.6
- 23.11.2

Add patch sets for the following VPP versions, including the capacity metric:
- 20.09
- 21.01
- 22.02
- 23.02
- 24.02

Add a python tool to read/write MSRs as an alternative to rdmsr/wrmsr.

Signed-off-by: David Hunt
---
 ...001-eal-add-lcore-busyness-telemetry.patch |  22 +-
 ...l-add-cpuset-lcore-telemetry-entries.patch |   6 +-
 ...apacity-endpoint-to-telemetry-thread.patch | 356 +++++++
 ...001-eal-add-lcore-busyness-telemetry.patch |  48 +-
 ...l-add-cpuset-lcore-telemetry-entries.patch |   6 +-
 ...apacity-endpoint-to-telemetry-thread.patch | 356 +++++++
 ...001-eal-add-lcore-busyness-telemetry.patch |  48 +-
 ...l-add-cpuset-lcore-telemetry-entries.patch |   6 +-
 ...apacity-endpoint-to-telemetry-thread.patch | 356 +++++++
 ...001-eal-add-lcore-busyness-telemetry.patch | 976 ++++++++++++++++++
 ...l-add-cpuset-lcore-telemetry-entries.patch |  79 ++
 ...apacity-endpoint-to-telemetry-thread.patch | 357 +++++++
 ipm/patches/dpdk/README.md                    |   7 +-
 ...PATCH-1-1-stats-Added-capacity-flags.patch | 325 ++++++
 ...PATCH-1-1-stats-Added-capacity-flags.patch | 325 ++++++
 ...4-stats-Added-capacity-flag-in-stats.patch | 365 +++++++
 ...01-vlib-CPU-load-measurement-and-CLI.patch | 114 ++
 ...U-load-and-queue-burst-flag-in-stats.patch | 343 ++++++
 ...-encode-cpu-id-in-utilization-metric.patch |  40 +
 ...4-stats-Added-capacity-flag-in-stats.patch | 365 +++++++
 ...01-vlib-CPU-load-measurement-and-CLI.patch | 114 ++
 ...U-load-and-queue-burst-flag-in-stats.patch | 351 +++++++
 ...-encode-cpu-id-in-utilization-metric.patch |  42 +
 ...4-stats-Added-capacity-flag-in-stats.patch | 365 +++++++
 msrtool/rw_msr_tool.py                        | 166 +++
 25 files changed, 5467 insertions(+), 71 deletions(-)
 create mode 100644 ipm/patches/dpdk/20.11/0003-add-capacity-endpoint-to-telemetry-thread.patch
 create mode 100644 ipm/patches/dpdk/21.11/0003-add-capacity-endpoint-to-telemetry-thread.patch
 create mode 100644 ipm/patches/dpdk/22.11/0003-add-capacity-endpoint-to-telemetry-thread.patch
 create mode 100644 ipm/patches/dpdk/23.11/0001-eal-add-lcore-busyness-telemetry.patch
 create mode 100644 ipm/patches/dpdk/23.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch
 create mode 100644 ipm/patches/dpdk/23.11/0003-add-capacity-endpoint-to-telemetry-thread.patch
 create mode 100644 ipm/patches/vpp/20.09/0004-Subject-PATCH-1-1-stats-Added-capacity-flags.patch
 create mode 100644 ipm/patches/vpp/21.01/0004-Subject-PATCH-1-1-stats-Added-capacity-flags.patch
 create mode 100644 ipm/patches/vpp/22.02/0004-stats-Added-capacity-flag-in-stats.patch
 create mode 100644 ipm/patches/vpp/23.02/0001-vlib-CPU-load-measurement-and-CLI.patch
 create mode 100644 ipm/patches/vpp/23.02/0002-stats-Added-CPU-load-and-queue-burst-flag-in-stats.patch
 create mode 100644 ipm/patches/vpp/23.02/0003-stats-encode-cpu-id-in-utilization-metric.patch
 create mode 100644 ipm/patches/vpp/23.02/0004-stats-Added-capacity-flag-in-stats.patch
 create mode 100644 ipm/patches/vpp/24.02/0001-vlib-CPU-load-measurement-and-CLI.patch
 create mode 100644 ipm/patches/vpp/24.02/0002-stats-Added-CPU-load-and-queue-burst-flag-in-stats.patch
 create mode 100644 ipm/patches/vpp/24.02/0003-stats-encode-cpu-id-in-utilization-metric.patch
 create mode 100644 ipm/patches/vpp/24.02/0004-stats-Added-capacity-flag-in-stats.patch
 create mode 100755 msrtool/rw_msr_tool.py
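Note (not part of any commit message): the capacity endpoint added by the 0003-* DPDK
patches below reports the existing busyness value scaled by the ratio of the core's
current frequency to its P1 (base) frequency, i.e. the same arithmetic performed by
rte_lcore_capacity(). The stand-alone C sketch below only illustrates that calculation;
the helper name and the frequency values are made-up examples, not part of the patches.
In a patched EAL the same value can be queried at runtime through the new
/eal/lcore/capacity_used telemetry endpoint (for example with dpdk-telemetry.py).

	/* Illustrative sketch of the capacity calculation used by the
	 * 0003-* patches (rte_lcore_capacity()): busyness scaled by the
	 * ratio of current frequency to P1 frequency. Frequencies are in
	 * kHz and are example values only.
	 */
	#include <stdio.h>

	static int capacity(int busyness, int cur_freq, int p1_freq)
	{
		if (busyness < 0 || p1_freq <= 0)
			return -1;
		return busyness * cur_freq / p1_freq;
	}

	int main(void)
	{
		/* A core that is 50% busy at P1 (2 GHz) reports 100% busyness
		 * when scaled down to 1 GHz, but its capacity stays at 50%.
		 */
		printf("capacity = %d%%\n", capacity(100, 1000000, 2000000));
		return 0;
	}

diff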
--git a/ipm/patches/dpdk/20.11/0001-eal-add-lcore-busyness-telemetry.patch b/ipm/patches/dpdk/20.11/0001-eal-add-lcore-busyness-telemetry.patch index 1cadea6..7be351e 100644 --- a/ipm/patches/dpdk/20.11/0001-eal-add-lcore-busyness-telemetry.patch +++ b/ipm/patches/dpdk/20.11/0001-eal-add-lcore-busyness-telemetry.patch @@ -1,7 +1,7 @@ -From 6942cd799c2fe3ec8e96d8c7758159456b37f9f4 Mon Sep 17 00:00:00 2001 +From ef227c95dd0ac20017c7190c42b6c6a98dae2ae0 Mon Sep 17 00:00:00 2001 From: Anatoly Burakov Date: Fri, 15 Jul 2022 13:12:53 +0000 -Subject: [PATCH 1/2] eal: add lcore busyness telemetry +Subject: [PATCH 1/3] eal: add lcore busyness telemetry Currently, there is no way to measure lcore busyness in a passive way, without any modifications to the application. This patch adds a new EAL @@ -205,7 +205,7 @@ index c210cf86bd..16806a896a 100644 } diff --git a/lib/librte_distributor/rte_distributor_single.c b/lib/librte_distributor/rte_distributor_single.c -index e8a13ce980..06e4bab89f 100644 +index f4725b1d0b..80460ab5d3 100644 --- a/lib/librte_distributor/rte_distributor_single.c +++ b/lib/librte_distributor/rte_distributor_single.c @@ -34,8 +34,11 @@ rte_distributor_request_pkt_single(struct rte_distributor_single *d, @@ -677,10 +677,10 @@ index a55fd7496d..90c2aa037a 100644 } #endif diff --git a/lib/librte_eal/linux/eal.c b/lib/librte_eal/linux/eal.c -index 5814f9ce69..772cc98143 100644 +index 814572ccbd..ce5701e42d 100644 --- a/lib/librte_eal/linux/eal.c +++ b/lib/librte_eal/linux/eal.c -@@ -1368,6 +1368,7 @@ rte_eal_cleanup(void) +@@ -1364,6 +1364,7 @@ rte_eal_cleanup(void) rte_mp_channel_cleanup(); rte_trace_save(); eal_trace_fini(); @@ -721,10 +721,10 @@ index fe065a41d9..d828a0d791 100644 INTERNAL { diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h -index 5e8331da1c..e800ad05bf 100644 +index 8856648d11..87d30a059a 100644 --- a/lib/librte_ethdev/rte_ethdev.h +++ b/lib/librte_ethdev/rte_ethdev.h -@@ -4890,6 +4890,8 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id, +@@ -4879,6 +4879,8 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id, #endif rte_ethdev_trace_rx_burst(port_id, queue_id, (void **)rx_pkts, nb_rx); @@ -734,7 +734,7 @@ index 5e8331da1c..e800ad05bf 100644 } diff --git a/lib/librte_eventdev/rte_eventdev.h b/lib/librte_eventdev/rte_eventdev.h -index bec8f3c0c9..6cfe2925c8 100644 +index ce1fc2ce0f..820b3ba73d 100644 --- a/lib/librte_eventdev/rte_eventdev.h +++ b/lib/librte_eventdev/rte_eventdev.h @@ -1663,13 +1663,19 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], @@ -786,7 +786,7 @@ index f29164dd15..c37712a3f0 100644 int diff --git a/lib/librte_regexdev/rte_regexdev.h b/lib/librte_regexdev/rte_regexdev.h -index df2312678c..00f0899fa8 100644 +index 0001658925..b3ec648ca9 100644 --- a/lib/librte_regexdev/rte_regexdev.h +++ b/lib/librte_regexdev/rte_regexdev.h @@ -1524,6 +1524,7 @@ rte_regexdev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, @@ -809,7 +809,7 @@ index df2312678c..00f0899fa8 100644 #ifdef __cplusplus diff --git a/lib/librte_ring/rte_ring_elem.h b/lib/librte_ring/rte_ring_elem.h -index 0057da3597..5b1969cba1 100644 +index 7034d29c07..341bdf8dec 100644 --- a/lib/librte_ring/rte_ring_elem.h +++ b/lib/librte_ring/rte_ring_elem.h @@ -475,6 +475,8 @@ __rte_ring_do_dequeue_elem(struct rte_ring *r, void *obj_table, @@ -822,5 +822,5 @@ index 0057da3597..5b1969cba1 100644 } -- -2.31.1 +2.25.1 diff --git a/ipm/patches/dpdk/20.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch 
b/ipm/patches/dpdk/20.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch index 036dc28..0558d2d 100644 --- a/ipm/patches/dpdk/20.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch +++ b/ipm/patches/dpdk/20.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch @@ -1,7 +1,7 @@ -From cb36d5eb09afc6bca1dc8682e7d7b5676203d66f Mon Sep 17 00:00:00 2001 +From 5c902504efd82545ee41bbeca7ef72682f5f8d65 Mon Sep 17 00:00:00 2001 From: Anatoly Burakov Date: Fri, 15 Jul 2022 13:12:54 +0000 -Subject: [PATCH 2/2] eal: add cpuset lcore telemetry entries +Subject: [PATCH 2/3] eal: add cpuset lcore telemetry entries Expose per-lcore cpuset information to telemetry. @@ -84,5 +84,5 @@ index 2e9033bf5a..f01ccd9a65 100644 } -- -2.31.1 +2.25.1 diff --git a/ipm/patches/dpdk/20.11/0003-add-capacity-endpoint-to-telemetry-thread.patch b/ipm/patches/dpdk/20.11/0003-add-capacity-endpoint-to-telemetry-thread.patch new file mode 100644 index 0000000..825efad --- /dev/null +++ b/ipm/patches/dpdk/20.11/0003-add-capacity-endpoint-to-telemetry-thread.patch @@ -0,0 +1,356 @@ +From 81175d27730b2b69d36d00d4083872696db109e4 Mon Sep 17 00:00:00 2001 +From: David Hunt +Date: Mon, 16 Sep 2024 14:59:56 +0100 +Subject: [PATCH 3/3] add capacity endpoint to telemetry thread + +Busyness is calculated on how busy the current core is, ignoring the +current frequency. So a core that's 50% busy at P1 (e.g. 2GHz), shows +as 100% busy at 1GHz. + +This patch adds a new 'capacity' metric that shows a percentage based on +the P1 (base) freqency of the core, so that if the core is 50% busy at +P1, it should show 50% regardless of what the current frequency is. + +Signed-off-by: David Hunt +--- + .../common/eal_common_lcore_telemetry.c | 240 ++++++++++++++++++ + lib/librte_eal/include/rte_lcore.h | 21 ++ + lib/librte_eal/version.map | 1 + + 3 files changed, 262 insertions(+) + +diff --git a/lib/librte_eal/common/eal_common_lcore_telemetry.c b/lib/librte_eal/common/eal_common_lcore_telemetry.c +index f01ccd9a65..18dcc40b1e 100644 +--- a/lib/librte_eal/common/eal_common_lcore_telemetry.c ++++ b/lib/librte_eal/common/eal_common_lcore_telemetry.c +@@ -10,9 +10,18 @@ + #include + #include + #include ++#include ++#include ++#include + + #ifdef RTE_LCORE_BUSYNESS + #include ++#define MSR_PLATFORM_INFO 0xCE ++#define POWER_SYSFS_CUR_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq" ++#define POWER_SYSFS_BASE_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency" ++#define POWER_SYSFS_SCALING_DRIVER_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_driver" ++#define POWER_SYSFS_SCALING_MAX_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq" ++#define POWER_SYSFS_MSR_PATH "/dev/cpu/%u/msr" + #endif + + int __rte_lcore_telemetry_enabled; +@@ -47,6 +56,182 @@ static struct lcore_telemetry *telemetry_data; + #define SMOOTH_COEFF 5 + #define STATE_CHANGE_OPT 32 + ++static int p1_freq[RTE_MAX_LCORE] = {0}; ++ ++static int ++try_read_base_frequency(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ snprintf(path, sizeof(path), POWER_SYSFS_BASE_FREQ_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ ++ p1_freq[lcore_id] = atoi(buffer); ++ return p1_freq[lcore_id]; ++ ++ ++} ++ ++static int ++try_read_scaling_max_freq(unsigned int 
lcore_id) ++{ ++ char path[PATH_MAX]; ++ int freq; ++ int fd; ++ ++ /* ++ * If the driver is acpi_cpufreq, we can read the scaling_max_freq file ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_DRIVER_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ close(fd); ++ ++ if (strncmp(buffer, "acpi-cpufreq", 12) == 0) { ++ /* we can use the scaling_max_freq to get the p1 */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_MAX_FREQ_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ freq = atoi(buffer) / 1000; /* convert to KHz */ ++ ++ /* ++ * If the freq value ends with '1', then, turbo is enabled. ++ * Round it down to the nearest 100. Otherwuse use the value. ++ */ ++ return (freq & ~1) * 1000; /* convert to Hz */ ++ } ++ return -1; ++} ++ ++static int ++try_read_msr(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ int freq; ++ uint64_t data; ++ ++ /* ++ * If the msr driver is present, we can read p1 from MSR_PLATFORM_INFO register ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_MSR_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd < 0) { ++ return -1; ++ } ++ ++ if (pread(fd, &data, sizeof(data), MSR_PLATFORM_INFO) != sizeof(data)) { ++ close(fd); ++ return -1; ++ } ++ ++ close(fd); ++ ++ freq = ((data >> 8) & 0xff) * 100 * 1000; ++ ++ return freq; ++} ++ ++ ++static ++int read_sysfs_p1_freq(unsigned int lcore_id) { ++ int freq; ++ ++ /* We've previously got the p1 frequency. */ ++ if (p1_freq[lcore_id] != 0) ++ return p1_freq[lcore_id]; ++ ++ /* ++ * Check the base_frequency file, if it's there ++ */ ++ freq = try_read_base_frequency(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Check the scaling_max_freq file for the acpi-freq driver ++ */ ++ freq = try_read_scaling_max_freq(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Try reading from the MSR register ++ */ ++ freq = try_read_msr(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ RTE_LOG(ERR, EAL, "Capacity telemetry for lcore %d not supported: no p1 frequency found", ++ lcore_id); ++ ++ return -1; ++} ++ ++ ++int current_fds[RTE_MAX_LCORE] = {0}; ++ ++static ++int read_sysfs_cur_freq(unsigned int lcore_id) { ++ char path[PATH_MAX]; ++ ++ if (current_fds[lcore_id] == 0) { ++ snprintf(path, sizeof(path), POWER_SYSFS_CUR_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ current_fds[lcore_id] = open(path, O_RDONLY); ++ if (current_fds[lcore_id] == -1) { ++ return -1; ++ } ++ } ++ ++ char buffer[16]; ++ ssize_t bytesRead = pread(current_fds[lcore_id], buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ int value = atoi(buffer); ++ return value; ++} ++ + /* Helper function to check if the lcore is enabled. 
+ * Cannot use rte_lcore_is_enabled since it only catches ROLE_RTE threads which + * does not include ROLE_NON_EAL threads which some application threads, for +@@ -102,6 +287,33 @@ int rte_lcore_busyness(unsigned int lcore_id) + return telemetry_data[lcore_id].busyness; + } + ++int rte_lcore_capacity(unsigned int lcore_id) ++{ ++ const uint64_t active_thresh = RTE_LCORE_BUSYNESS_PERIOD * 1000; ++ struct lcore_telemetry *tdata; ++ ++ if (lcore_id >= RTE_MAX_LCORE) ++ return -EINVAL; ++ tdata = &telemetry_data[lcore_id]; ++ ++ /* if the lcore is not active */ ++ if (tdata->interval_ts == 0) ++ return LCORE_BUSYNESS_NOT_SET; ++ /* if the core hasn't been active in a while */ ++ else if ((rte_rdtsc() - tdata->interval_ts) > active_thresh) ++ return LCORE_BUSYNESS_NOT_SET; ++ ++ int cur_freq = read_sysfs_cur_freq(rte_lcore_to_cpu_id(lcore_id)); ++ int busy = telemetry_data[lcore_id].busyness; ++ int p1 = read_sysfs_p1_freq(lcore_id) ; ++ ++ if ((busy == -1) || (p1 <= 0)) { ++ return -1; ++ } else { ++ return busy * cur_freq / p1; ++ } ++} ++ + int rte_lcore_busyness_enabled(void) + { + return __rte_lcore_telemetry_enabled; +@@ -263,6 +475,26 @@ lcore_handle_busyness(const char *cmd __rte_unused, + return 0; + } + ++static int ++lcore_handle_capacity(const char *cmd __rte_unused, ++ const char *params __rte_unused, struct rte_tel_data *d) ++{ ++ char corenum[64]; ++ int i; ++ ++ rte_tel_data_start_dict(d); ++ ++ /* Foreach lcore - can't use macro since it excludes ROLE_NON_EAL */ ++ for (i = 0; i < RTE_MAX_LCORE; i++) { ++ if (!lcore_enabled(i)) ++ continue; ++ snprintf(corenum, sizeof(corenum), "%d", i); ++ rte_tel_data_add_dict_int(d, corenum, rte_lcore_capacity(i)); ++ } ++ ++ return 0; ++} ++ + static int + lcore_handle_cpuset(const char *cmd __rte_unused, + const char *params __rte_unused, +@@ -326,6 +558,9 @@ RTE_INIT(lcore_init_telemetry) + rte_telemetry_register_cmd("/eal/lcore/busyness", lcore_handle_busyness, + "return percentage busyness of cores"); + ++ rte_telemetry_register_cmd("/eal/lcore/capacity_used", lcore_handle_capacity, ++ "return percentage capacity of cores"); ++ + rte_telemetry_register_cmd("/eal/lcore/busyness_enable", lcore_busyness_enable, + "enable lcore busyness measurement"); + +@@ -340,6 +575,11 @@ RTE_INIT(lcore_init_telemetry) + + #else + ++int rte_lcore_capacity(unsigned int lcore_id __rte_unused) ++{ ++ return -ENOTSUP; ++} ++ + int rte_lcore_busyness(unsigned int lcore_id __rte_unused) + { + return -ENOTSUP; +diff --git a/lib/librte_eal/include/rte_lcore.h b/lib/librte_eal/include/rte_lcore.h +index 90c2aa037a..dddc529ccd 100644 +--- a/lib/librte_eal/include/rte_lcore.h ++++ b/lib/librte_eal/include/rte_lcore.h +@@ -487,6 +487,27 @@ __rte_experimental + int + rte_lcore_busyness(unsigned int lcore_id); + ++/** ++ * @warning ++ * @b EXPERIMENTAL: this API may change without prior notice. ++ * ++ * Read capacity value corresponding to an lcore. ++ * This differs from busyness in that it is related to the current usage ++ * of the lcore compared to P1 frequency, not the current frequency. ++ * ++ * @param lcore_id ++ * Lcore to read capacity value for. ++ * @return ++ * - value between 0 and 100 on success ++ * - -1 if lcore is not active ++ * - -EINVAL if lcore is invalid ++ * - -ENOMEM if not enough memory available ++ * - -ENOTSUP if not supported ++ */ ++__rte_experimental ++int ++rte_lcore_capacity(unsigned int lcore_id); ++ + /** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. 
+diff --git a/lib/librte_eal/version.map b/lib/librte_eal/version.map +index d828a0d791..cac187ffdd 100644 +--- a/lib/librte_eal/version.map ++++ b/lib/librte_eal/version.map +@@ -406,6 +406,7 @@ EXPERIMENTAL { + + __rte_lcore_telemetry_timestamp; + __rte_lcore_telemetry_enabled; ++ rte_lcore_capacity; + rte_lcore_busyness; + rte_lcore_busyness_enabled; + rte_lcore_busyness_enabled_set; +-- +2.25.1 + diff --git a/ipm/patches/dpdk/21.11/0001-eal-add-lcore-busyness-telemetry.patch b/ipm/patches/dpdk/21.11/0001-eal-add-lcore-busyness-telemetry.patch index bbe0514..7500cc6 100644 --- a/ipm/patches/dpdk/21.11/0001-eal-add-lcore-busyness-telemetry.patch +++ b/ipm/patches/dpdk/21.11/0001-eal-add-lcore-busyness-telemetry.patch @@ -1,7 +1,7 @@ -From 11d4e3f53f85c76944f86043a552a6b1308c3e32 Mon Sep 17 00:00:00 2001 -From: David Hunt -Date: Fri, 4 Nov 2022 13:09:51 +0000 -Subject: [PATCH 1/2] eal: add lcore busyness telemetry +From 95a7bd751ef9216b0ae9fe7d069af367e74dcf50 Mon Sep 17 00:00:00 2001 +From: Anatoly Burakov +Date: Mon, 11 Nov 2024 08:54:43 +0000 +Subject: [PATCH 1/3] eal: add lcore busyness telemetry Currently, there is no way to measure lcore busyness in a passive way, without any modifications to the application. This patch adds a new EAL @@ -72,12 +72,12 @@ Signed-off-by: Anatoly Burakov create mode 100644 lib/eal/common/eal_common_lcore_telemetry.c diff --git a/config/rte_config.h b/config/rte_config.h -index cab4390a97..f3fdfa5626 100644 +index 2f1a3ffb21..4696d8c97b 100644 --- a/config/rte_config.h +++ b/config/rte_config.h -@@ -39,6 +39,8 @@ +@@ -38,6 +38,8 @@ + #define RTE_MAX_TAILQ 32 #define RTE_LOG_DP_LEVEL RTE_LOG_INFO - #define RTE_BACKTRACE 1 #define RTE_MAX_VFIO_CONTAINERS 64 +#define RTE_LCORE_BUSYNESS 1 +#define RTE_LCORE_BUSYNESS_PERIOD 4000000ULL @@ -154,10 +154,10 @@ index 2e9218af68..5a72e82768 100644 } diff --git a/lib/cryptodev/rte_cryptodev.h b/lib/cryptodev/rte_cryptodev.h -index 59ea5a54df..d81a37f0c7 100644 +index eead3d2bff..e1910da48e 100644 --- a/lib/cryptodev/rte_cryptodev.h +++ b/lib/cryptodev/rte_cryptodev.h -@@ -1867,6 +1867,8 @@ rte_cryptodev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, +@@ -1874,6 +1874,8 @@ rte_cryptodev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, rte_rcu_qsbr_thread_offline(list->qsbr, 0); } #endif @@ -640,17 +640,17 @@ index 917758cc65..a743e66a7d 100644 'eal_common_log.c', 'eal_common_mcfg.c', diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c -index 414aad3dd3..c6d3975b43 100644 +index 66553089fa..2a02c036f0 100644 --- a/lib/eal/freebsd/eal.c +++ b/lib/eal/freebsd/eal.c -@@ -988,6 +988,7 @@ rte_eal_cleanup(void) +@@ -998,6 +998,7 @@ rte_eal_cleanup(void) rte_mp_channel_cleanup(); rte_trace_save(); eal_trace_fini(); + eal_lcore_telemetry_free(); + rte_eal_alarm_cleanup(); /* after this point, any DPDK pointers will become dangling */ rte_eal_memory_detach(); - rte_eal_alarm_cleanup(); diff --git a/lib/eal/include/rte_lcore.h b/lib/eal/include/rte_lcore.h index 258bc49b24..85d6e38f4e 100644 --- a/lib/eal/include/rte_lcore.h @@ -747,17 +747,17 @@ index 258bc49b24..85d6e38f4e 100644 } #endif diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c -index e3d34f7b7c..594a7bc810 100644 +index 6f7e8641d3..520ec01afd 100644 --- a/lib/eal/linux/eal.c +++ b/lib/eal/linux/eal.c -@@ -1370,6 +1370,7 @@ rte_eal_cleanup(void) +@@ -1380,6 +1380,7 @@ rte_eal_cleanup(void) rte_mp_channel_cleanup(); rte_trace_save(); eal_trace_fini(); + eal_lcore_telemetry_free(); + rte_eal_alarm_cleanup(); /* after this point, any DPDK pointers will become 
dangling */ rte_eal_memory_detach(); - rte_eal_alarm_cleanup(); diff --git a/lib/eal/meson.build b/lib/eal/meson.build index 1722924f67..01b51f0105 100644 --- a/lib/eal/meson.build @@ -791,10 +791,10 @@ index ab28c22791..a06a9c2a47 100644 INTERNAL { diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h -index 0be04c5809..a00d1e7f26 100644 +index 083f324a46..35f2539629 100644 --- a/lib/ethdev/rte_ethdev.h +++ b/lib/ethdev/rte_ethdev.h -@@ -5357,6 +5357,8 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id, +@@ -5361,6 +5361,8 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id, #endif rte_ethdev_trace_rx_burst(port_id, queue_id, (void **)rx_pkts, nb_rx); @@ -804,10 +804,10 @@ index 0be04c5809..a00d1e7f26 100644 } diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h -index 476bcbcc21..6da0734515 100644 +index f09ea4a9d1..c3f9e4fcd7 100644 --- a/lib/eventdev/rte_eventdev.h +++ b/lib/eventdev/rte_eventdev.h -@@ -2055,6 +2055,7 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], +@@ -2070,6 +2070,7 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks) { const struct rte_event_fp_ops *fp_ops; @@ -815,7 +815,7 @@ index 476bcbcc21..6da0734515 100644 void *port; fp_ops = &rte_event_fp_ops[dev_id]; -@@ -2077,10 +2078,13 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], +@@ -2092,10 +2093,13 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], * requests nb_events as const one */ if (nb_events == 1) @@ -833,7 +833,7 @@ index 476bcbcc21..6da0734515 100644 #define RTE_EVENT_DEV_MAINT_OP_FLUSH (1 << 0) diff --git a/lib/rawdev/rte_rawdev.c b/lib/rawdev/rte_rawdev.c -index a6134e76ea..e759999553 100644 +index c06ed8b9c7..2c46e6fc64 100644 --- a/lib/rawdev/rte_rawdev.c +++ b/lib/rawdev/rte_rawdev.c @@ -240,12 +240,15 @@ rte_rawdev_dequeue_buffers(uint16_t dev_id, @@ -877,10 +877,10 @@ index 513ce5b67c..de29dc3940 100644 #ifdef __cplusplus diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h -index 275ec55393..4f3ed674ce 100644 +index 99786cca95..9f0250636b 100644 --- a/lib/ring/rte_ring_elem_pvt.h +++ b/lib/ring/rte_ring_elem_pvt.h -@@ -379,6 +379,7 @@ __rte_ring_do_dequeue_elem(struct rte_ring *r, void *obj_table, +@@ -385,6 +385,7 @@ __rte_ring_do_dequeue_elem(struct rte_ring *r, void *obj_table, end: if (available != NULL) *available = entries - n; @@ -889,5 +889,5 @@ index 275ec55393..4f3ed674ce 100644 } -- -2.31.1 +2.25.1 diff --git a/ipm/patches/dpdk/21.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch b/ipm/patches/dpdk/21.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch index 48e8f00..1ea9569 100644 --- a/ipm/patches/dpdk/21.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch +++ b/ipm/patches/dpdk/21.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch @@ -1,7 +1,7 @@ -From ab83a7cbdea4d38de86b7f3c1a5e64a2aff0c06e Mon Sep 17 00:00:00 2001 +From ac2b8db5f2dc2578b99a63b0abaea703c092ab42 Mon Sep 17 00:00:00 2001 From: Anatoly Burakov Date: Fri, 15 Jul 2022 13:12:45 +0000 -Subject: [PATCH 2/2] eal: add cpuset lcore telemetry entries +Subject: [PATCH 2/3] eal: add cpuset lcore telemetry entries Expose per-lcore cpuset information to telemetry. 
@@ -84,5 +84,5 @@ index 2e9033bf5a..f01ccd9a65 100644 } -- -2.31.1 +2.25.1 diff --git a/ipm/patches/dpdk/21.11/0003-add-capacity-endpoint-to-telemetry-thread.patch b/ipm/patches/dpdk/21.11/0003-add-capacity-endpoint-to-telemetry-thread.patch new file mode 100644 index 0000000..8b8b247 --- /dev/null +++ b/ipm/patches/dpdk/21.11/0003-add-capacity-endpoint-to-telemetry-thread.patch @@ -0,0 +1,356 @@ +From 644d8d946ce5e31c9a818da9661f4e0658f57754 Mon Sep 17 00:00:00 2001 +From: David Hunt +Date: Mon, 16 Sep 2024 14:28:18 +0100 +Subject: [PATCH 3/3] add capacity endpoint to telemetry thread + +Busyness is calculated on how busy the current core is, ignoring the +current frequency. So a core that's 50% busy at P1 (e.g. 2GHz), shows +as 100% busy at 1GHz. + +This patch adds a new 'capacity' metric that shows a percentage based on +the P1 (base) freqency of the core, so that if the core is 50% busy at +P1, it should show 50% regardless of what the current frequency is. + +Signed-off-by: David Hunt +--- + lib/eal/common/eal_common_lcore_telemetry.c | 240 ++++++++++++++++++++ + lib/eal/include/rte_lcore.h | 21 ++ + lib/eal/version.map | 1 + + 3 files changed, 262 insertions(+) + +diff --git a/lib/eal/common/eal_common_lcore_telemetry.c b/lib/eal/common/eal_common_lcore_telemetry.c +index f01ccd9a65..18dcc40b1e 100644 +--- a/lib/eal/common/eal_common_lcore_telemetry.c ++++ b/lib/eal/common/eal_common_lcore_telemetry.c +@@ -10,9 +10,18 @@ + #include + #include + #include ++#include ++#include ++#include + + #ifdef RTE_LCORE_BUSYNESS + #include ++#define MSR_PLATFORM_INFO 0xCE ++#define POWER_SYSFS_CUR_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq" ++#define POWER_SYSFS_BASE_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency" ++#define POWER_SYSFS_SCALING_DRIVER_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_driver" ++#define POWER_SYSFS_SCALING_MAX_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq" ++#define POWER_SYSFS_MSR_PATH "/dev/cpu/%u/msr" + #endif + + int __rte_lcore_telemetry_enabled; +@@ -47,6 +56,182 @@ static struct lcore_telemetry *telemetry_data; + #define SMOOTH_COEFF 5 + #define STATE_CHANGE_OPT 32 + ++static int p1_freq[RTE_MAX_LCORE] = {0}; ++ ++static int ++try_read_base_frequency(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ snprintf(path, sizeof(path), POWER_SYSFS_BASE_FREQ_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ ++ p1_freq[lcore_id] = atoi(buffer); ++ return p1_freq[lcore_id]; ++ ++ ++} ++ ++static int ++try_read_scaling_max_freq(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int freq; ++ int fd; ++ ++ /* ++ * If the driver is acpi_cpufreq, we can read the scaling_max_freq file ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_DRIVER_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ close(fd); ++ ++ if (strncmp(buffer, "acpi-cpufreq", 12) == 0) { ++ /* we can use the scaling_max_freq to get the p1 */ ++ snprintf(path, sizeof(path), 
POWER_SYSFS_SCALING_MAX_FREQ_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ freq = atoi(buffer) / 1000; /* convert to KHz */ ++ ++ /* ++ * If the freq value ends with '1', then, turbo is enabled. ++ * Round it down to the nearest 100. Otherwuse use the value. ++ */ ++ return (freq & ~1) * 1000; /* convert to Hz */ ++ } ++ return -1; ++} ++ ++static int ++try_read_msr(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ int freq; ++ uint64_t data; ++ ++ /* ++ * If the msr driver is present, we can read p1 from MSR_PLATFORM_INFO register ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_MSR_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd < 0) { ++ return -1; ++ } ++ ++ if (pread(fd, &data, sizeof(data), MSR_PLATFORM_INFO) != sizeof(data)) { ++ close(fd); ++ return -1; ++ } ++ ++ close(fd); ++ ++ freq = ((data >> 8) & 0xff) * 100 * 1000; ++ ++ return freq; ++} ++ ++ ++static ++int read_sysfs_p1_freq(unsigned int lcore_id) { ++ int freq; ++ ++ /* We've previously got the p1 frequency. */ ++ if (p1_freq[lcore_id] != 0) ++ return p1_freq[lcore_id]; ++ ++ /* ++ * Check the base_frequency file, if it's there ++ */ ++ freq = try_read_base_frequency(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Check the scaling_max_freq file for the acpi-freq driver ++ */ ++ freq = try_read_scaling_max_freq(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Try reading from the MSR register ++ */ ++ freq = try_read_msr(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ RTE_LOG(ERR, EAL, "Capacity telemetry for lcore %d not supported: no p1 frequency found", ++ lcore_id); ++ ++ return -1; ++} ++ ++ ++int current_fds[RTE_MAX_LCORE] = {0}; ++ ++static ++int read_sysfs_cur_freq(unsigned int lcore_id) { ++ char path[PATH_MAX]; ++ ++ if (current_fds[lcore_id] == 0) { ++ snprintf(path, sizeof(path), POWER_SYSFS_CUR_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ current_fds[lcore_id] = open(path, O_RDONLY); ++ if (current_fds[lcore_id] == -1) { ++ return -1; ++ } ++ } ++ ++ char buffer[16]; ++ ssize_t bytesRead = pread(current_fds[lcore_id], buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ int value = atoi(buffer); ++ return value; ++} ++ + /* Helper function to check if the lcore is enabled. 
+ * Cannot use rte_lcore_is_enabled since it only catches ROLE_RTE threads which + * does not include ROLE_NON_EAL threads which some application threads, for +@@ -102,6 +287,33 @@ int rte_lcore_busyness(unsigned int lcore_id) + return telemetry_data[lcore_id].busyness; + } + ++int rte_lcore_capacity(unsigned int lcore_id) ++{ ++ const uint64_t active_thresh = RTE_LCORE_BUSYNESS_PERIOD * 1000; ++ struct lcore_telemetry *tdata; ++ ++ if (lcore_id >= RTE_MAX_LCORE) ++ return -EINVAL; ++ tdata = &telemetry_data[lcore_id]; ++ ++ /* if the lcore is not active */ ++ if (tdata->interval_ts == 0) ++ return LCORE_BUSYNESS_NOT_SET; ++ /* if the core hasn't been active in a while */ ++ else if ((rte_rdtsc() - tdata->interval_ts) > active_thresh) ++ return LCORE_BUSYNESS_NOT_SET; ++ ++ int cur_freq = read_sysfs_cur_freq(rte_lcore_to_cpu_id(lcore_id)); ++ int busy = telemetry_data[lcore_id].busyness; ++ int p1 = read_sysfs_p1_freq(lcore_id) ; ++ ++ if ((busy == -1) || (p1 <= 0)) { ++ return -1; ++ } else { ++ return busy * cur_freq / p1; ++ } ++} ++ + int rte_lcore_busyness_enabled(void) + { + return __rte_lcore_telemetry_enabled; +@@ -263,6 +475,26 @@ lcore_handle_busyness(const char *cmd __rte_unused, + return 0; + } + ++static int ++lcore_handle_capacity(const char *cmd __rte_unused, ++ const char *params __rte_unused, struct rte_tel_data *d) ++{ ++ char corenum[64]; ++ int i; ++ ++ rte_tel_data_start_dict(d); ++ ++ /* Foreach lcore - can't use macro since it excludes ROLE_NON_EAL */ ++ for (i = 0; i < RTE_MAX_LCORE; i++) { ++ if (!lcore_enabled(i)) ++ continue; ++ snprintf(corenum, sizeof(corenum), "%d", i); ++ rte_tel_data_add_dict_int(d, corenum, rte_lcore_capacity(i)); ++ } ++ ++ return 0; ++} ++ + static int + lcore_handle_cpuset(const char *cmd __rte_unused, + const char *params __rte_unused, +@@ -326,6 +558,9 @@ RTE_INIT(lcore_init_telemetry) + rte_telemetry_register_cmd("/eal/lcore/busyness", lcore_handle_busyness, + "return percentage busyness of cores"); + ++ rte_telemetry_register_cmd("/eal/lcore/capacity_used", lcore_handle_capacity, ++ "return percentage capacity of cores"); ++ + rte_telemetry_register_cmd("/eal/lcore/busyness_enable", lcore_busyness_enable, + "enable lcore busyness measurement"); + +@@ -340,6 +575,11 @@ RTE_INIT(lcore_init_telemetry) + + #else + ++int rte_lcore_capacity(unsigned int lcore_id __rte_unused) ++{ ++ return -ENOTSUP; ++} ++ + int rte_lcore_busyness(unsigned int lcore_id __rte_unused) + { + return -ENOTSUP; +diff --git a/lib/eal/include/rte_lcore.h b/lib/eal/include/rte_lcore.h +index 85d6e38f4e..4a631e9645 100644 +--- a/lib/eal/include/rte_lcore.h ++++ b/lib/eal/include/rte_lcore.h +@@ -443,6 +443,27 @@ __rte_experimental + int + rte_lcore_busyness(unsigned int lcore_id); + ++/** ++ * @warning ++ * @b EXPERIMENTAL: this API may change without prior notice. ++ * ++ * Read capacity value corresponding to an lcore. ++ * This differs from busyness in that it is related to the current usage ++ * of the lcore compared to P1 frequency, not the current frequency. ++ * ++ * @param lcore_id ++ * Lcore to read capacity value for. ++ * @return ++ * - value between 0 and 100 on success ++ * - -1 if lcore is not active ++ * - -EINVAL if lcore is invalid ++ * - -ENOMEM if not enough memory available ++ * - -ENOTSUP if not supported ++ */ ++__rte_experimental ++int ++rte_lcore_capacity(unsigned int lcore_id); ++ + /** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. 
+diff --git a/lib/eal/version.map b/lib/eal/version.map +index a06a9c2a47..a405bfb319 100644 +--- a/lib/eal/version.map ++++ b/lib/eal/version.map +@@ -424,6 +424,7 @@ EXPERIMENTAL { + # Telemetry patch set APIs + __rte_lcore_telemetry_timestamp; + __rte_lcore_telemetry_enabled; ++ rte_lcore_capacity; + rte_lcore_busyness; + rte_lcore_busyness_enabled; + rte_lcore_busyness_enabled_set; +-- +2.25.1 + diff --git a/ipm/patches/dpdk/22.11/0001-eal-add-lcore-busyness-telemetry.patch b/ipm/patches/dpdk/22.11/0001-eal-add-lcore-busyness-telemetry.patch index b5ba61d..7e8d5c4 100644 --- a/ipm/patches/dpdk/22.11/0001-eal-add-lcore-busyness-telemetry.patch +++ b/ipm/patches/dpdk/22.11/0001-eal-add-lcore-busyness-telemetry.patch @@ -1,7 +1,7 @@ -From 26e52ef9874d526d0d06a08f1463ee9ec9ba51ba Mon Sep 17 00:00:00 2001 +From 5129fb19501eecce2410efc58548951ffb02c226 Mon Sep 17 00:00:00 2001 From: Anatoly Burakov -Date: Fri, 15 Jul 2022 13:12:44 +0000 -Subject: [PATCH 1/2] eal: add lcore busyness telemetry +Date: Mon, 11 Nov 2024 09:25:29 +0000 +Subject: [PATCH 1/3] eal: add lcore busyness telemetry Currently, there is no way to measure lcore busyness in a passive way, without any modifications to the application. This patch adds a new EAL @@ -72,12 +72,12 @@ Signed-off-by: Anatoly Burakov create mode 100644 lib/eal/common/eal_common_lcore_telemetry.c diff --git a/config/rte_config.h b/config/rte_config.h -index 3c4876d434..864ede39b3 100644 +index 7b8c85e948..1193d936f6 100644 --- a/config/rte_config.h +++ b/config/rte_config.h -@@ -39,6 +39,8 @@ +@@ -38,6 +38,8 @@ + #define RTE_MAX_TAILQ 32 #define RTE_LOG_DP_LEVEL RTE_LOG_INFO - #define RTE_BACKTRACE 1 #define RTE_MAX_VFIO_CONTAINERS 64 +#define RTE_LCORE_BUSYNESS 1 +#define RTE_LCORE_BUSYNESS_PERIOD 4000000ULL @@ -154,10 +154,10 @@ index cf2c20a704..04a854a935 100644 } diff --git a/lib/cryptodev/rte_cryptodev.h b/lib/cryptodev/rte_cryptodev.h -index 86d792e2e7..71d1ccbe9a 100644 +index cef9f2b3cb..82fc21d24d 100644 --- a/lib/cryptodev/rte_cryptodev.h +++ b/lib/cryptodev/rte_cryptodev.h -@@ -1855,6 +1855,8 @@ rte_cryptodev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, +@@ -1874,6 +1874,8 @@ rte_cryptodev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, rte_rcu_qsbr_thread_offline(list->qsbr, 0); } #endif @@ -642,17 +642,17 @@ index 917758cc65..a743e66a7d 100644 'eal_common_log.c', 'eal_common_mcfg.c', diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c -index 607684c1a3..a34b2d3116 100644 +index 122daf6c1f..a7be6ab285 100644 --- a/lib/eal/freebsd/eal.c +++ b/lib/eal/freebsd/eal.c -@@ -896,6 +896,7 @@ rte_eal_cleanup(void) +@@ -906,6 +906,7 @@ rte_eal_cleanup(void) eal_bus_cleanup(); rte_trace_save(); eal_trace_fini(); + eal_lcore_telemetry_free(); + rte_eal_alarm_cleanup(); /* after this point, any DPDK pointers will become dangling */ rte_eal_memory_detach(); - rte_eal_alarm_cleanup(); diff --git a/lib/eal/include/rte_lcore.h b/lib/eal/include/rte_lcore.h index 6938c3fd7b..9f4bd6e22f 100644 --- a/lib/eal/include/rte_lcore.h @@ -749,17 +749,17 @@ index 6938c3fd7b..9f4bd6e22f 100644 } #endif diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c -index 8c118d0d9f..2584213f5f 100644 +index 336698379f..2bff37791e 100644 --- a/lib/eal/linux/eal.c +++ b/lib/eal/linux/eal.c -@@ -1372,6 +1372,7 @@ rte_eal_cleanup(void) +@@ -1382,6 +1382,7 @@ rte_eal_cleanup(void) eal_bus_cleanup(); rte_trace_save(); eal_trace_fini(); + eal_lcore_telemetry_free(); - /* after this point, any DPDK pointers will become dangling */ - rte_eal_memory_detach(); 
eal_mp_dev_hotplug_cleanup(); + rte_eal_alarm_cleanup(); + /* after this point, any DPDK pointers will become dangling */ diff --git a/lib/eal/meson.build b/lib/eal/meson.build index 056beb9461..7199aa03c2 100644 --- a/lib/eal/meson.build @@ -793,10 +793,10 @@ index 7ad12a7dc9..7791f59314 100644 INTERNAL { diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h -index 13fe73d5a3..8c5518c25b 100644 +index e73244822a..17fabfa5b0 100644 --- a/lib/ethdev/rte_ethdev.h +++ b/lib/ethdev/rte_ethdev.h -@@ -5907,6 +5907,8 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id, +@@ -5913,6 +5913,8 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id, #endif rte_ethdev_trace_rx_burst(port_id, queue_id, (void **)rx_pkts, nb_rx); @@ -806,10 +806,10 @@ index 13fe73d5a3..8c5518c25b 100644 } diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h -index a90e23ac8b..6db744db33 100644 +index d0e2463bb8..b4f677c46d 100644 --- a/lib/eventdev/rte_eventdev.h +++ b/lib/eventdev/rte_eventdev.h -@@ -2179,6 +2179,7 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], +@@ -2194,6 +2194,7 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], uint16_t nb_events, uint64_t timeout_ticks) { const struct rte_event_fp_ops *fp_ops; @@ -817,7 +817,7 @@ index a90e23ac8b..6db744db33 100644 void *port; fp_ops = &rte_event_fp_ops[dev_id]; -@@ -2201,10 +2202,13 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], +@@ -2216,10 +2217,13 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], * requests nb_events as const one */ if (nb_events == 1) @@ -835,7 +835,7 @@ index a90e23ac8b..6db744db33 100644 #define RTE_EVENT_DEV_MAINT_OP_FLUSH (1 << 0) diff --git a/lib/rawdev/rte_rawdev.c b/lib/rawdev/rte_rawdev.c -index 5fbdb94229..b541c73b07 100644 +index dcebe4f653..0faae13c9c 100644 --- a/lib/rawdev/rte_rawdev.c +++ b/lib/rawdev/rte_rawdev.c @@ -237,13 +237,16 @@ rte_rawdev_dequeue_buffers(uint16_t dev_id, @@ -880,10 +880,10 @@ index 25476f1f73..31fd293e48 100644 #ifdef __cplusplus diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h -index 83788c56e6..6db09d4291 100644 +index 4b80f58980..e2a72e3ea7 100644 --- a/lib/ring/rte_ring_elem_pvt.h +++ b/lib/ring/rte_ring_elem_pvt.h -@@ -379,6 +379,7 @@ __rte_ring_do_dequeue_elem(struct rte_ring *r, void *obj_table, +@@ -385,6 +385,7 @@ __rte_ring_do_dequeue_elem(struct rte_ring *r, void *obj_table, end: if (available != NULL) *available = entries - n; @@ -892,5 +892,5 @@ index 83788c56e6..6db09d4291 100644 } -- -2.31.1 +2.25.1 diff --git a/ipm/patches/dpdk/22.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch b/ipm/patches/dpdk/22.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch index 383f984..2a02775 100644 --- a/ipm/patches/dpdk/22.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch +++ b/ipm/patches/dpdk/22.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch @@ -1,7 +1,7 @@ -From e7207152706cce06e1aeecee9b8a63d116a20061 Mon Sep 17 00:00:00 2001 +From 810d87bf69d79351cfa3089df920e4b726f269a5 Mon Sep 17 00:00:00 2001 From: Anatoly Burakov Date: Fri, 15 Jul 2022 13:12:45 +0000 -Subject: [PATCH 2/2] eal: add cpuset lcore telemetry entries +Subject: [PATCH 2/3] eal: add cpuset lcore telemetry entries Expose per-lcore cpuset information to telemetry. 
@@ -75,5 +75,5 @@ index 1478e5a48a..f01ccd9a65 100644 } -- -2.31.1 +2.25.1 diff --git a/ipm/patches/dpdk/22.11/0003-add-capacity-endpoint-to-telemetry-thread.patch b/ipm/patches/dpdk/22.11/0003-add-capacity-endpoint-to-telemetry-thread.patch new file mode 100644 index 0000000..df6cbd4 --- /dev/null +++ b/ipm/patches/dpdk/22.11/0003-add-capacity-endpoint-to-telemetry-thread.patch @@ -0,0 +1,356 @@ +From ea2762b20c60cd66378758559af90bb48c9a8ee5 Mon Sep 17 00:00:00 2001 +From: David Hunt +Date: Fri, 23 Aug 2024 09:07:08 +0100 +Subject: [PATCH 3/3] add capacity endpoint to telemetry thread + +Busyness is calculated on how busy the current core is, ignoring the +current frequency. So a core that's 50% busy at P1 (e.g. 2GHz), shows +as 100% busy at 1GHz. + +This patch adds a new 'capacity' metric that shows a percentage based on +the P1 (base) freqency of the core, so that if the core is 50% busy at +P1, it should show 50% regardless of what the current frequency is. + +Signed-off-by: David Hunt +--- + lib/eal/common/eal_common_lcore_telemetry.c | 240 ++++++++++++++++++++ + lib/eal/include/rte_lcore.h | 21 ++ + lib/eal/version.map | 1 + + 3 files changed, 262 insertions(+) + +diff --git a/lib/eal/common/eal_common_lcore_telemetry.c b/lib/eal/common/eal_common_lcore_telemetry.c +index f01ccd9a65..18dcc40b1e 100644 +--- a/lib/eal/common/eal_common_lcore_telemetry.c ++++ b/lib/eal/common/eal_common_lcore_telemetry.c +@@ -10,9 +10,18 @@ + #include + #include + #include ++#include ++#include ++#include + + #ifdef RTE_LCORE_BUSYNESS + #include ++#define MSR_PLATFORM_INFO 0xCE ++#define POWER_SYSFS_CUR_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq" ++#define POWER_SYSFS_BASE_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency" ++#define POWER_SYSFS_SCALING_DRIVER_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_driver" ++#define POWER_SYSFS_SCALING_MAX_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq" ++#define POWER_SYSFS_MSR_PATH "/dev/cpu/%u/msr" + #endif + + int __rte_lcore_telemetry_enabled; +@@ -47,6 +56,182 @@ static struct lcore_telemetry *telemetry_data; + #define SMOOTH_COEFF 5 + #define STATE_CHANGE_OPT 32 + ++static int p1_freq[RTE_MAX_LCORE] = {0}; ++ ++static int ++try_read_base_frequency(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ snprintf(path, sizeof(path), POWER_SYSFS_BASE_FREQ_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ ++ p1_freq[lcore_id] = atoi(buffer); ++ return p1_freq[lcore_id]; ++ ++ ++} ++ ++static int ++try_read_scaling_max_freq(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int freq; ++ int fd; ++ ++ /* ++ * If the driver is acpi_cpufreq, we can read the scaling_max_freq file ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_DRIVER_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ close(fd); ++ ++ if (strncmp(buffer, "acpi-cpufreq", 12) == 0) { ++ /* we can use the scaling_max_freq to get the p1 */ ++ snprintf(path, sizeof(path), 
POWER_SYSFS_SCALING_MAX_FREQ_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ freq = atoi(buffer) / 1000; /* convert to KHz */ ++ ++ /* ++ * If the freq value ends with '1', then, turbo is enabled. ++ * Round it down to the nearest 100. Otherwuse use the value. ++ */ ++ return (freq & ~1) * 1000; /* convert to Hz */ ++ } ++ return -1; ++} ++ ++static int ++try_read_msr(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ int freq; ++ uint64_t data; ++ ++ /* ++ * If the msr driver is present, we can read p1 from MSR_PLATFORM_INFO register ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_MSR_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd < 0) { ++ return -1; ++ } ++ ++ if (pread(fd, &data, sizeof(data), MSR_PLATFORM_INFO) != sizeof(data)) { ++ close(fd); ++ return -1; ++ } ++ ++ close(fd); ++ ++ freq = ((data >> 8) & 0xff) * 100 * 1000; ++ ++ return freq; ++} ++ ++ ++static ++int read_sysfs_p1_freq(unsigned int lcore_id) { ++ int freq; ++ ++ /* We've previously got the p1 frequency. */ ++ if (p1_freq[lcore_id] != 0) ++ return p1_freq[lcore_id]; ++ ++ /* ++ * Check the base_frequency file, if it's there ++ */ ++ freq = try_read_base_frequency(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Check the scaling_max_freq file for the acpi-freq driver ++ */ ++ freq = try_read_scaling_max_freq(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Try reading from the MSR register ++ */ ++ freq = try_read_msr(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ RTE_LOG(ERR, EAL, "Capacity telemetry for lcore %d not supported: no p1 frequency found", ++ lcore_id); ++ ++ return -1; ++} ++ ++ ++int current_fds[RTE_MAX_LCORE] = {0}; ++ ++static ++int read_sysfs_cur_freq(unsigned int lcore_id) { ++ char path[PATH_MAX]; ++ ++ if (current_fds[lcore_id] == 0) { ++ snprintf(path, sizeof(path), POWER_SYSFS_CUR_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ current_fds[lcore_id] = open(path, O_RDONLY); ++ if (current_fds[lcore_id] == -1) { ++ return -1; ++ } ++ } ++ ++ char buffer[16]; ++ ssize_t bytesRead = pread(current_fds[lcore_id], buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ int value = atoi(buffer); ++ return value; ++} ++ + /* Helper function to check if the lcore is enabled. 
+ * Cannot use rte_lcore_is_enabled since it only catches ROLE_RTE threads which + * does not include ROLE_NON_EAL threads which some application threads, for +@@ -102,6 +287,33 @@ int rte_lcore_busyness(unsigned int lcore_id) + return telemetry_data[lcore_id].busyness; + } + ++int rte_lcore_capacity(unsigned int lcore_id) ++{ ++ const uint64_t active_thresh = RTE_LCORE_BUSYNESS_PERIOD * 1000; ++ struct lcore_telemetry *tdata; ++ ++ if (lcore_id >= RTE_MAX_LCORE) ++ return -EINVAL; ++ tdata = &telemetry_data[lcore_id]; ++ ++ /* if the lcore is not active */ ++ if (tdata->interval_ts == 0) ++ return LCORE_BUSYNESS_NOT_SET; ++ /* if the core hasn't been active in a while */ ++ else if ((rte_rdtsc() - tdata->interval_ts) > active_thresh) ++ return LCORE_BUSYNESS_NOT_SET; ++ ++ int cur_freq = read_sysfs_cur_freq(rte_lcore_to_cpu_id(lcore_id)); ++ int busy = telemetry_data[lcore_id].busyness; ++ int p1 = read_sysfs_p1_freq(lcore_id) ; ++ ++ if ((busy == -1) || (p1 <= 0)) { ++ return -1; ++ } else { ++ return busy * cur_freq / p1; ++ } ++} ++ + int rte_lcore_busyness_enabled(void) + { + return __rte_lcore_telemetry_enabled; +@@ -263,6 +475,26 @@ lcore_handle_busyness(const char *cmd __rte_unused, + return 0; + } + ++static int ++lcore_handle_capacity(const char *cmd __rte_unused, ++ const char *params __rte_unused, struct rte_tel_data *d) ++{ ++ char corenum[64]; ++ int i; ++ ++ rte_tel_data_start_dict(d); ++ ++ /* Foreach lcore - can't use macro since it excludes ROLE_NON_EAL */ ++ for (i = 0; i < RTE_MAX_LCORE; i++) { ++ if (!lcore_enabled(i)) ++ continue; ++ snprintf(corenum, sizeof(corenum), "%d", i); ++ rte_tel_data_add_dict_int(d, corenum, rte_lcore_capacity(i)); ++ } ++ ++ return 0; ++} ++ + static int + lcore_handle_cpuset(const char *cmd __rte_unused, + const char *params __rte_unused, +@@ -326,6 +558,9 @@ RTE_INIT(lcore_init_telemetry) + rte_telemetry_register_cmd("/eal/lcore/busyness", lcore_handle_busyness, + "return percentage busyness of cores"); + ++ rte_telemetry_register_cmd("/eal/lcore/capacity_used", lcore_handle_capacity, ++ "return percentage capacity of cores"); ++ + rte_telemetry_register_cmd("/eal/lcore/busyness_enable", lcore_busyness_enable, + "enable lcore busyness measurement"); + +@@ -340,6 +575,11 @@ RTE_INIT(lcore_init_telemetry) + + #else + ++int rte_lcore_capacity(unsigned int lcore_id __rte_unused) ++{ ++ return -ENOTSUP; ++} ++ + int rte_lcore_busyness(unsigned int lcore_id __rte_unused) + { + return -ENOTSUP; +diff --git a/lib/eal/include/rte_lcore.h b/lib/eal/include/rte_lcore.h +index 9f4bd6e22f..132cdb9139 100644 +--- a/lib/eal/include/rte_lcore.h ++++ b/lib/eal/include/rte_lcore.h +@@ -437,6 +437,27 @@ __rte_experimental + int + rte_lcore_busyness(unsigned int lcore_id); + ++/** ++ * @warning ++ * @b EXPERIMENTAL: this API may change without prior notice. ++ * ++ * Read capacity value corresponding to an lcore. ++ * This differs from busyness in that it is related to the current usage ++ * of the lcore compared to P1 frequency, not the current frequency. ++ * ++ * @param lcore_id ++ * Lcore to read capacity value for. ++ * @return ++ * - value between 0 and 100 on success ++ * - -1 if lcore is not active ++ * - -EINVAL if lcore is invalid ++ * - -ENOMEM if not enough memory available ++ * - -ENOTSUP if not supported ++ */ ++__rte_experimental ++int ++rte_lcore_capacity(unsigned int lcore_id); ++ + /** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. 
+diff --git a/lib/eal/version.map b/lib/eal/version.map +index 7791f59314..5bb8429b29 100644 +--- a/lib/eal/version.map ++++ b/lib/eal/version.map +@@ -444,6 +444,7 @@ EXPERIMENTAL { + # Added for busyness telemetry + __rte_lcore_telemetry_timestamp; + __rte_lcore_telemetry_enabled; ++ rte_lcore_capacity; + rte_lcore_busyness; + rte_lcore_busyness_enabled; + rte_lcore_busyness_enabled_set; +-- +2.25.1 + diff --git a/ipm/patches/dpdk/23.11/0001-eal-add-lcore-busyness-telemetry.patch b/ipm/patches/dpdk/23.11/0001-eal-add-lcore-busyness-telemetry.patch new file mode 100644 index 0000000..79354fa --- /dev/null +++ b/ipm/patches/dpdk/23.11/0001-eal-add-lcore-busyness-telemetry.patch @@ -0,0 +1,976 @@ +From 80cd70f677c7747f0f930c6212bbd42bcd58c02c Mon Sep 17 00:00:00 2001 +From: Anatoly Burakov +Date: Fri, 20 Sep 2024 09:08:28 +0100 +Subject: [PATCH 1/3] eal: add lcore busyness telemetry + +Currently, there is no way to measure lcore busyness in a passive way, +without any modifications to the application. This patch adds a new EAL +API that will be able to passively track core busyness. + +The busyness is calculated by relying on the fact that most DPDK API's +will poll for packets. Empty polls can be counted as "idle", while +non-empty polls can be counted as busy. To measure lcore busyness, we +simply call the telemetry timestamping function with the number of polls +a particular code section has processed, and count the number of cycles +we've spent processing empty bursts. The more empty bursts we encounter, +the less cycles we spend in "busy" state, and the less core busyness +will be reported. + +In order for all of the above to work without modifications to the +application, the library code needs to be instrumented with calls to +the lcore telemetry busyness timestamping function. The following parts +of DPDK are instrumented with lcore telemetry calls: + +- All major driver API's: + - ethdev + - cryptodev + - compressdev + - regexdev + - bbdev + - rawdev + - eventdev + - dmadev +- Some additional libraries: + - ring + - distributor + +To avoid performance impact from having lcore telemetry support, a +global variable is exported by EAL, and a call to timestamping function +is wrapped into a macro, so that whenever telemetry is disabled, it only +takes one additional branch and no function calls are performed. It is +also possible to disable it at compile time by commenting out +RTE_LCORE_BUSYNESS from build config. + +This patch also adds a telemetry endpoint to report lcore busyness, as +well as telemetry endpoints to enable/disable lcore telemetry. 
+ +Signed-off-by: Kevin Laatz +Signed-off-by: Conor Walsh +Signed-off-by: David Hunt +Signed-off-by: Anatoly Burakov +--- + config/rte_config.h | 2 + + lib/bbdev/rte_bbdev.h | 17 +- + lib/compressdev/rte_compressdev.c | 2 + + lib/cryptodev/rte_cryptodev.h | 2 + + lib/distributor/rte_distributor.c | 21 +- + lib/distributor/rte_distributor_single.c | 14 +- + lib/dmadev/rte_dmadev.h | 16 +- + lib/eal/common/eal_common_lcore_telemetry.c | 319 ++++++++++++++++++++ + lib/eal/common/meson.build | 1 + + lib/eal/freebsd/eal.c | 1 + + lib/eal/include/rte_lcore.h | 84 ++++++ + lib/eal/linux/eal.c | 1 + + lib/eal/meson.build | 3 + + lib/eal/version.map | 58 ++++ + lib/ethdev/rte_ethdev.h | 2 + + lib/eventdev/rte_eventdev.h | 9 +- + lib/rawdev/rte_rawdev.c | 5 +- + lib/regexdev/rte_regexdev.h | 5 +- + lib/ring/rte_ring.h | 8 +- + lib/ring/rte_ring_elem_pvt.h | 1 + + 20 files changed, 545 insertions(+), 26 deletions(-) + create mode 100644 lib/eal/common/eal_common_lcore_telemetry.c + +diff --git a/config/rte_config.h b/config/rte_config.h +index da265d7dd2..7e0083abf3 100644 +--- a/config/rte_config.h ++++ b/config/rte_config.h +@@ -38,6 +38,8 @@ + #define RTE_MAX_TAILQ 32 + #define RTE_LOG_DP_LEVEL RTE_LOG_INFO + #define RTE_MAX_VFIO_CONTAINERS 64 ++#define RTE_LCORE_BUSYNESS 1 ++#define RTE_LCORE_BUSYNESS_PERIOD 4000000ULL + + /* bsd module defines */ + #define RTE_CONTIGMEM_MAX_NUM_BUFS 64 +diff --git a/lib/bbdev/rte_bbdev.h b/lib/bbdev/rte_bbdev.h +index 0cbfdd1c95..536f3a79d3 100644 +--- a/lib/bbdev/rte_bbdev.h ++++ b/lib/bbdev/rte_bbdev.h +@@ -29,6 +29,7 @@ extern "C" { + + #include + #include ++#include + + #include "rte_bbdev_op.h" + +@@ -748,7 +749,9 @@ rte_bbdev_dequeue_enc_ops(uint16_t dev_id, uint16_t queue_id, + { + struct rte_bbdev *dev = &rte_bbdev_devices[dev_id]; + struct rte_bbdev_queue_data *q_data = &dev->data->queues[queue_id]; +- return dev->dequeue_enc_ops(q_data, ops, num_ops); ++ const uint16_t nb_ops = dev->dequeue_enc_ops(q_data, ops, num_ops); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_ops); ++ return nb_ops; + } + + /** +@@ -780,7 +783,9 @@ rte_bbdev_dequeue_dec_ops(uint16_t dev_id, uint16_t queue_id, + { + struct rte_bbdev *dev = &rte_bbdev_devices[dev_id]; + struct rte_bbdev_queue_data *q_data = &dev->data->queues[queue_id]; +- return dev->dequeue_dec_ops(q_data, ops, num_ops); ++ const uint16_t nb_ops = dev->dequeue_dec_ops(q_data, ops, num_ops); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_ops); ++ return nb_ops; + } + + +@@ -811,7 +816,9 @@ rte_bbdev_dequeue_ldpc_enc_ops(uint16_t dev_id, uint16_t queue_id, + { + struct rte_bbdev *dev = &rte_bbdev_devices[dev_id]; + struct rte_bbdev_queue_data *q_data = &dev->data->queues[queue_id]; +- return dev->dequeue_ldpc_enc_ops(q_data, ops, num_ops); ++ const uint16_t nb_ops = dev->dequeue_ldpc_enc_ops(q_data, ops, num_ops); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_ops); ++ return nb_ops; + } + + /** +@@ -841,7 +848,9 @@ rte_bbdev_dequeue_ldpc_dec_ops(uint16_t dev_id, uint16_t queue_id, + { + struct rte_bbdev *dev = &rte_bbdev_devices[dev_id]; + struct rte_bbdev_queue_data *q_data = &dev->data->queues[queue_id]; +- return dev->dequeue_ldpc_dec_ops(q_data, ops, num_ops); ++ const uint16_t nb_ops = dev->dequeue_ldpc_dec_ops(q_data, ops, num_ops); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_ops); ++ return nb_ops; + } + + /** +diff --git a/lib/compressdev/rte_compressdev.c b/lib/compressdev/rte_compressdev.c +index cf2c20a704..04a854a935 100644 +--- a/lib/compressdev/rte_compressdev.c ++++ b/lib/compressdev/rte_compressdev.c +@@ -587,6 +587,8 @@ 
rte_compressdev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, + nb_ops = (*dev->dequeue_burst) + (dev->data->queue_pairs[qp_id], ops, nb_ops); + ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_ops); ++ + return nb_ops; + } + +diff --git a/lib/cryptodev/rte_cryptodev.h b/lib/cryptodev/rte_cryptodev.h +index a42a4fc04e..12e986c6e4 100644 +--- a/lib/cryptodev/rte_cryptodev.h ++++ b/lib/cryptodev/rte_cryptodev.h +@@ -1933,6 +1933,8 @@ rte_cryptodev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, + rte_rcu_qsbr_thread_offline(list->qsbr, 0); + } + #endif ++ ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_ops); + return nb_ops; + } + +diff --git a/lib/distributor/rte_distributor.c b/lib/distributor/rte_distributor.c +index 2ecb95c3e5..0c8f9a1dea 100644 +--- a/lib/distributor/rte_distributor.c ++++ b/lib/distributor/rte_distributor.c +@@ -57,6 +57,8 @@ rte_distributor_request_pkt(struct rte_distributor *d, + + while (rte_rdtsc() < t) + rte_pause(); ++ /* this was an empty poll */ ++ RTE_LCORE_TELEMETRY_TIMESTAMP(0); + } + + /* +@@ -135,24 +137,29 @@ rte_distributor_get_pkt(struct rte_distributor *d, + + if (unlikely(d->alg_type == RTE_DIST_ALG_SINGLE)) { + if (return_count <= 1) { ++ uint16_t cnt; + pkts[0] = rte_distributor_get_pkt_single(d->d_single, +- worker_id, return_count ? oldpkt[0] : NULL); +- return (pkts[0]) ? 1 : 0; +- } else +- return -EINVAL; ++ worker_id, ++ return_count ? oldpkt[0] : NULL); ++ cnt = (pkts[0] != NULL) ? 1 : 0; ++ RTE_LCORE_TELEMETRY_TIMESTAMP(cnt); ++ return cnt; ++ } ++ return -EINVAL; + } + + rte_distributor_request_pkt(d, worker_id, oldpkt, return_count); + +- count = rte_distributor_poll_pkt(d, worker_id, pkts); +- while (count == -1) { ++ while ((count = rte_distributor_poll_pkt(d, worker_id, pkts)) == -1) { + uint64_t t = rte_rdtsc() + 100; + + while (rte_rdtsc() < t) + rte_pause(); + +- count = rte_distributor_poll_pkt(d, worker_id, pkts); ++ /* this was an empty poll */ ++ RTE_LCORE_TELEMETRY_TIMESTAMP(0); + } ++ RTE_LCORE_TELEMETRY_TIMESTAMP(count); + return count; + } + +diff --git a/lib/distributor/rte_distributor_single.c b/lib/distributor/rte_distributor_single.c +index d4b3e12648..3c18805ecd 100644 +--- a/lib/distributor/rte_distributor_single.c ++++ b/lib/distributor/rte_distributor_single.c +@@ -31,8 +31,13 @@ rte_distributor_request_pkt_single(struct rte_distributor_single *d, + union rte_distributor_buffer_single *buf = &d->bufs[worker_id]; + int64_t req = (((int64_t)(uintptr_t)oldpkt) << RTE_DISTRIB_FLAG_BITS) + | RTE_DISTRIB_GET_BUF; +- RTE_WAIT_UNTIL_MASKED(&buf->bufptr64, RTE_DISTRIB_FLAGS_MASK, +- ==, 0, rte_memory_order_relaxed); ++ ++ while ((__atomic_load_n(&buf->bufptr64, __ATOMIC_RELAXED) ++ & RTE_DISTRIB_FLAGS_MASK) != 0) { ++ rte_pause(); ++ /* this was an empty poll */ ++ RTE_LCORE_TELEMETRY_TIMESTAMP(0); ++ } + + /* Sync with distributor on GET_BUF flag. 
*/ + rte_atomic_store_explicit(&buf->bufptr64, req, rte_memory_order_release); +@@ -59,8 +64,11 @@ rte_distributor_get_pkt_single(struct rte_distributor_single *d, + { + struct rte_mbuf *ret; + rte_distributor_request_pkt_single(d, worker_id, oldpkt); +- while ((ret = rte_distributor_poll_pkt_single(d, worker_id)) == NULL) ++ while ((ret = rte_distributor_poll_pkt_single(d, worker_id)) == NULL) { + rte_pause(); ++ /* this was an empty poll */ ++ RTE_LCORE_TELEMETRY_TIMESTAMP(0); ++ } + return ret; + } + +diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h +index 450b81c307..e32d089a48 100644 +--- a/lib/dmadev/rte_dmadev.h ++++ b/lib/dmadev/rte_dmadev.h +@@ -148,6 +148,8 @@ + + #include + #include ++#include ++#include + + #ifdef __cplusplus + extern "C" { +@@ -995,7 +997,7 @@ rte_dma_completed(int16_t dev_id, uint16_t vchan, const uint16_t nb_cpls, + uint16_t *last_idx, bool *has_error) + { + struct rte_dma_fp_object *obj = &rte_dma_fp_objs[dev_id]; +- uint16_t idx; ++ uint16_t idx, nb_ops; + bool err; + + #ifdef RTE_DMADEV_DEBUG +@@ -1019,8 +1021,10 @@ rte_dma_completed(int16_t dev_id, uint16_t vchan, const uint16_t nb_cpls, + has_error = &err; + + *has_error = false; +- return (*obj->completed)(obj->dev_private, vchan, nb_cpls, last_idx, +- has_error); ++ nb_ops = (*obj->completed)(obj->dev_private, vchan, nb_cpls, last_idx, ++ has_error); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_ops); ++ return nb_ops; + } + + /** +@@ -1055,7 +1059,7 @@ rte_dma_completed_status(int16_t dev_id, uint16_t vchan, + enum rte_dma_status_code *status) + { + struct rte_dma_fp_object *obj = &rte_dma_fp_objs[dev_id]; +- uint16_t idx; ++ uint16_t idx, nb_ops; + + #ifdef RTE_DMADEV_DEBUG + if (!rte_dma_is_valid(dev_id) || nb_cpls == 0 || status == NULL) +@@ -1067,8 +1071,10 @@ rte_dma_completed_status(int16_t dev_id, uint16_t vchan, + if (last_idx == NULL) + last_idx = &idx; + +- return (*obj->completed_status)(obj->dev_private, vchan, nb_cpls, ++ nb_ops = (*obj->completed_status)(obj->dev_private, vchan, nb_cpls, + last_idx, status); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_ops); ++ return nb_ops; + } + + /** +diff --git a/lib/eal/common/eal_common_lcore_telemetry.c b/lib/eal/common/eal_common_lcore_telemetry.c +new file mode 100644 +index 0000000000..1478e5a48a +--- /dev/null ++++ b/lib/eal/common/eal_common_lcore_telemetry.c +@@ -0,0 +1,319 @@ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright(c) 2022 Intel Corporation ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#ifdef RTE_LCORE_BUSYNESS ++#include ++#endif ++ ++int __rte_lcore_telemetry_enabled; ++ ++#ifdef RTE_LCORE_BUSYNESS ++ ++#include "eal_private.h" ++ ++struct lcore_telemetry { ++ int busyness; ++ /**< Calculated busyness (gets set/returned by the API) */ ++ int raw_busyness; ++ /**< Calculated busyness times 100. */ ++ uint64_t interval_ts; ++ /**< when previous telemetry interval started */ ++ uint64_t empty_cycles; ++ /**< empty cycle count since last interval */ ++ uint64_t last_poll_ts; ++ /**< last poll timestamp */ ++ bool last_empty; ++ /**< if last poll was empty */ ++ unsigned int contig_poll_cnt; ++ /**< contiguous (always empty/non empty) poll counter */ ++} __rte_cache_aligned; ++ ++static struct lcore_telemetry *telemetry_data; ++ ++#define LCORE_BUSYNESS_MAX 100 ++#define LCORE_BUSYNESS_NOT_SET -1 ++#define LCORE_BUSYNESS_MIN 0 ++ ++#define SMOOTH_COEFF 5 ++#define STATE_CHANGE_OPT 32 ++ ++/* Helper function to check if the lcore is enabled. 
++ * Cannot use rte_lcore_is_enabled since it only catches ROLE_RTE threads which ++ * does not include ROLE_NON_EAL threads which some application threads, for ++ * example OvS polling threads, are marked as. ++ */ ++static int ++lcore_enabled(unsigned int lcore_id) ++{ ++ enum rte_lcore_role_t role = rte_eal_lcore_role(lcore_id); ++ ++ return role == ROLE_RTE || role == ROLE_NON_EAL; ++} ++ ++static void lcore_config_init(void) ++{ ++ struct lcore_telemetry *td; ++ int lcore_id; ++ ++ /* Foreach lcore - can't use macro since it excludes ROLE_NON_EAL */ ++ for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { ++ if (!lcore_enabled(lcore_id)) ++ continue; ++ ++ td = &telemetry_data[lcore_id]; ++ ++ td->interval_ts = 0; ++ td->last_poll_ts = 0; ++ td->empty_cycles = 0; ++ td->last_empty = true; ++ td->contig_poll_cnt = 0; ++ td->busyness = LCORE_BUSYNESS_NOT_SET; ++ td->raw_busyness = 0; ++ } ++} ++ ++int rte_lcore_busyness(unsigned int lcore_id) ++{ ++ const uint64_t active_thresh = RTE_LCORE_BUSYNESS_PERIOD * 1000; ++ struct lcore_telemetry *tdata; ++ ++ if (lcore_id >= RTE_MAX_LCORE) ++ return -EINVAL; ++ tdata = &telemetry_data[lcore_id]; ++ ++ /* if the lcore is not active */ ++ if (tdata->interval_ts == 0) ++ return LCORE_BUSYNESS_NOT_SET; ++ /* if the core hasn't been active in a while */ ++ else if ((rte_rdtsc() - tdata->interval_ts) > active_thresh) ++ return LCORE_BUSYNESS_NOT_SET; ++ ++ /* this core is active, report its busyness */ ++ return telemetry_data[lcore_id].busyness; ++} ++ ++int rte_lcore_busyness_enabled(void) ++{ ++ return __rte_lcore_telemetry_enabled; ++} ++ ++void rte_lcore_busyness_enabled_set(int enable) ++{ ++ __rte_lcore_telemetry_enabled = !!enable; ++ ++ if (!enable) ++ lcore_config_init(); ++} ++ ++static inline int calc_raw_busyness(const struct lcore_telemetry *tdata, ++ const uint64_t empty, const uint64_t total) ++{ ++ /* ++ * we don't want to use floating point math here, but we want for our ++ * busyness to react smoothly to sudden changes, while still keeping the ++ * accuracy and making sure that over time the average follows busyness ++ * as measured just-in-time. therefore, we will calculate the average ++ * busyness using integer math, but shift the decimal point two places ++ * to the right, so that 100.0 becomes 10000. this allows us to report ++ * integer values (0..100) while still allowing ourselves to follow the ++ * just-in-time measurements when we calculate our averages. 
++ */ ++ const int max_raw_idle = LCORE_BUSYNESS_MAX * 100; ++ ++ const int prev_raw_idle = max_raw_idle - tdata->raw_busyness; ++ ++ /* calculate rate of idle cycles, times 100 */ ++ const int cur_raw_idle = (int)((empty * max_raw_idle) / total); ++ ++ /* smoothen the idleness */ ++ const int smoothened_idle = ++ (cur_raw_idle + prev_raw_idle * (SMOOTH_COEFF - 1)) / SMOOTH_COEFF; ++ ++ /* convert idleness back to busyness */ ++ return max_raw_idle - smoothened_idle; ++} ++ ++void __rte_lcore_telemetry_timestamp(uint16_t nb_rx) ++{ ++ const unsigned int lcore_id = rte_lcore_id(); ++ uint64_t interval_ts, empty_cycles, cur_tsc, last_poll_ts; ++ struct lcore_telemetry *tdata; ++ const bool empty = nb_rx == 0; ++ uint64_t diff_int, diff_last; ++ bool last_empty; ++ ++ /* This telemetry is not supported for unregistered non-EAL threads */ ++ if (lcore_id >= RTE_MAX_LCORE) { ++ RTE_LOG(DEBUG, EAL, ++ "Lcore telemetry not supported on unregistered non-EAL thread %d", ++ lcore_id); ++ return; ++ } ++ ++ tdata = &telemetry_data[lcore_id]; ++ last_empty = tdata->last_empty; ++ ++ /* optimization: don't do anything if status hasn't changed */ ++ if (last_empty == empty && tdata->contig_poll_cnt++ < STATE_CHANGE_OPT) ++ return; ++ /* status changed or we're waiting for too long, reset counter */ ++ tdata->contig_poll_cnt = 0; ++ ++ cur_tsc = rte_rdtsc(); ++ ++ interval_ts = tdata->interval_ts; ++ empty_cycles = tdata->empty_cycles; ++ last_poll_ts = tdata->last_poll_ts; ++ ++ diff_int = cur_tsc - interval_ts; ++ diff_last = cur_tsc - last_poll_ts; ++ ++ /* is this the first time we're here? */ ++ if (interval_ts == 0) { ++ tdata->busyness = LCORE_BUSYNESS_MIN; ++ tdata->raw_busyness = 0; ++ tdata->interval_ts = cur_tsc; ++ tdata->empty_cycles = 0; ++ tdata->contig_poll_cnt = 0; ++ goto end; ++ } ++ ++ /* update the empty counter if we got an empty poll earlier */ ++ if (last_empty) ++ empty_cycles += diff_last; ++ ++ /* have we passed the interval? 
*/ ++ if (diff_int > RTE_LCORE_BUSYNESS_PERIOD) { ++ int raw_busyness; ++ ++ /* get updated busyness value */ ++ raw_busyness = calc_raw_busyness(tdata, empty_cycles, diff_int); ++ ++ /* set a new interval, reset empty counter */ ++ tdata->interval_ts = cur_tsc; ++ tdata->empty_cycles = 0; ++ tdata->raw_busyness = raw_busyness; ++ /* bring busyness back to 0..100 range, biased to round up */ ++ tdata->busyness = (raw_busyness + 50) / 100; ++ } else ++ /* we may have updated empty counter */ ++ tdata->empty_cycles = empty_cycles; ++ ++end: ++ /* update status for next poll */ ++ tdata->last_poll_ts = cur_tsc; ++ tdata->last_empty = empty; ++} ++ ++static int ++lcore_busyness_enable(const char *cmd __rte_unused, ++ const char *params __rte_unused, ++ struct rte_tel_data *d) ++{ ++ rte_lcore_busyness_enabled_set(1); ++ ++ rte_tel_data_start_dict(d); ++ ++ rte_tel_data_add_dict_int(d, "busyness_enabled", 1); ++ ++ return 0; ++} ++ ++static int ++lcore_busyness_disable(const char *cmd __rte_unused, ++ const char *params __rte_unused, ++ struct rte_tel_data *d) ++{ ++ rte_lcore_busyness_enabled_set(0); ++ ++ rte_tel_data_start_dict(d); ++ ++ rte_tel_data_add_dict_int(d, "busyness_enabled", 0); ++ ++ return 0; ++} ++ ++static int ++lcore_handle_busyness(const char *cmd __rte_unused, ++ const char *params __rte_unused, struct rte_tel_data *d) ++{ ++ char corenum[64]; ++ int i; ++ ++ rte_tel_data_start_dict(d); ++ ++ /* Foreach lcore - can't use macro since it excludes ROLE_NON_EAL */ ++ for (i = 0; i < RTE_MAX_LCORE; i++) { ++ if (!lcore_enabled(i)) ++ continue; ++ snprintf(corenum, sizeof(corenum), "%d", i); ++ rte_tel_data_add_dict_int(d, corenum, rte_lcore_busyness(i)); ++ } ++ ++ return 0; ++} ++ ++void ++eal_lcore_telemetry_free(void) ++{ ++ if (telemetry_data != NULL) { ++ free(telemetry_data); ++ telemetry_data = NULL; ++ } ++} ++ ++RTE_INIT(lcore_init_telemetry) ++{ ++ telemetry_data = calloc(RTE_MAX_LCORE, sizeof(telemetry_data[0])); ++ if (telemetry_data == NULL) ++ rte_panic("Could not init lcore telemetry data: Out of memory\n"); ++ ++ lcore_config_init(); ++ ++ rte_telemetry_register_cmd("/eal/lcore/busyness", lcore_handle_busyness, ++ "return percentage busyness of cores"); ++ ++ rte_telemetry_register_cmd("/eal/lcore/busyness_enable", lcore_busyness_enable, ++ "enable lcore busyness measurement"); ++ ++ rte_telemetry_register_cmd("/eal/lcore/busyness_disable", lcore_busyness_disable, ++ "disable lcore busyness measurement"); ++ ++ __rte_lcore_telemetry_enabled = true; ++} ++ ++#else ++ ++int rte_lcore_busyness(unsigned int lcore_id __rte_unused) ++{ ++ return -ENOTSUP; ++} ++ ++int rte_lcore_busyness_enabled(void) ++{ ++ return -ENOTSUP; ++} ++ ++void rte_lcore_busyness_enabled_set(int enable __rte_unused) ++{ ++} ++ ++void __rte_lcore_telemetry_timestamp(uint16_t nb_rx __rte_unused) ++{ ++} ++ ++void eal_lcore_telemetry_free(void) ++{ ++} ++ ++#endif +diff --git a/lib/eal/common/meson.build b/lib/eal/common/meson.build +index 22a626ba6f..1ca6f9a420 100644 +--- a/lib/eal/common/meson.build ++++ b/lib/eal/common/meson.build +@@ -17,6 +17,7 @@ sources += files( + 'eal_common_hexdump.c', + 'eal_common_interrupts.c', + 'eal_common_launch.c', ++ 'eal_common_lcore_telemetry.c', + 'eal_common_lcore.c', + 'eal_common_mcfg.c', + 'eal_common_memalloc.c', +diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c +index 568e06e9ed..1c27688e39 100644 +--- a/lib/eal/freebsd/eal.c ++++ b/lib/eal/freebsd/eal.c +@@ -929,6 +929,7 @@ rte_eal_cleanup(void) + eal_bus_cleanup(); + rte_trace_save(); + 
eal_trace_fini(); ++ eal_lcore_telemetry_free(); + rte_eal_alarm_cleanup(); + /* after this point, any DPDK pointers will become dangling */ + rte_eal_memory_detach(); +diff --git a/lib/eal/include/rte_lcore.h b/lib/eal/include/rte_lcore.h +index 7deae47af3..3c64774bcb 100644 +--- a/lib/eal/include/rte_lcore.h ++++ b/lib/eal/include/rte_lcore.h +@@ -407,6 +407,90 @@ rte_thread_register(void); + void + rte_thread_unregister(void); + ++/** ++ * @warning ++ * @b EXPERIMENTAL: this API may change without prior notice. ++ * ++ * Read busyness value corresponding to an lcore. ++ * ++ * @param lcore_id ++ * Lcore to read busyness value for. ++ * @return ++ * - value between 0 and 100 on success ++ * - -1 if lcore is not active ++ * - -EINVAL if lcore is invalid ++ * - -ENOMEM if not enough memory available ++ * - -ENOTSUP if not supported ++ */ ++__rte_experimental ++int ++rte_lcore_busyness(unsigned int lcore_id); ++ ++/** ++ * @warning ++ * @b EXPERIMENTAL: this API may change without prior notice. ++ * ++ * Check if lcore busyness telemetry is enabled. ++ * ++ * @return ++ * - 1 if lcore telemetry is enabled ++ * - 0 if lcore telemetry is disabled ++ * - -ENOTSUP if not lcore telemetry supported ++ */ ++__rte_experimental ++int ++rte_lcore_busyness_enabled(void); ++ ++/** ++ * @warning ++ * @b EXPERIMENTAL: this API may change without prior notice. ++ * ++ * Enable or disable busyness telemetry. ++ * ++ * @param enable ++ * 1 to enable, 0 to disable ++ */ ++__rte_experimental ++void ++rte_lcore_busyness_enabled_set(int enable); ++ ++/** ++ * @warning ++ * @b EXPERIMENTAL: this API may change without prior notice. ++ * ++ * Lcore telemetry timestamping function. ++ * ++ * @param nb_rx ++ * Number of buffers processed by lcore. ++ */ ++__rte_experimental ++void ++__rte_lcore_telemetry_timestamp(uint16_t nb_rx); ++ ++/** @internal lcore telemetry enabled status */ ++extern int __rte_lcore_telemetry_enabled; ++ ++/** @internal free memory allocated for lcore telemetry */ ++void ++eal_lcore_telemetry_free(void); ++ ++/** ++ * Call lcore telemetry timestamp function. ++ * ++ * @param nb_rx ++ * Number of buffers processed by lcore. 
++ */ ++#ifdef RTE_LCORE_BUSYNESS ++#define RTE_LCORE_TELEMETRY_TIMESTAMP(nb_rx) \ ++ do { \ ++ if (__rte_lcore_telemetry_enabled) \ ++ __rte_lcore_telemetry_timestamp(nb_rx); \ ++ } while (0) ++#else ++#define RTE_LCORE_TELEMETRY_TIMESTAMP(nb_rx) \ ++ while (0) {} ++#endif ++ + #ifdef __cplusplus + } + #endif +diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c +index 57da058cec..7734b4b8a4 100644 +--- a/lib/eal/linux/eal.c ++++ b/lib/eal/linux/eal.c +@@ -1373,6 +1373,7 @@ rte_eal_cleanup(void) + eal_bus_cleanup(); + rte_trace_save(); + eal_trace_fini(); ++ eal_lcore_telemetry_free(); + eal_mp_dev_hotplug_cleanup(); + rte_eal_alarm_cleanup(); + /* after this point, any DPDK pointers will become dangling */ +diff --git a/lib/eal/meson.build b/lib/eal/meson.build +index e1d6c4cf17..833d33138c 100644 +--- a/lib/eal/meson.build ++++ b/lib/eal/meson.build +@@ -17,6 +17,9 @@ subdir(arch_subdir) + deps += ['log', 'kvargs'] + if not is_windows + deps += ['telemetry'] ++else ++ # core busyness telemetry depends on telemetry library ++ dpdk_conf.set('RTE_LCORE_BUSYNESS', false) + endif + if dpdk_conf.has('RTE_USE_LIBBSD') + ext_deps += libbsd +diff --git a/lib/eal/version.map b/lib/eal/version.map +index 5e0cd47c82..a4451d58eb 100644 +--- a/lib/eal/version.map ++++ b/lib/eal/version.map +@@ -385,6 +385,64 @@ EXPERIMENTAL { + # added in 20.11 + __rte_eal_trace_generic_size_t; # WINDOWS_NO_EXPORT + rte_cpu_get_intrinsics_support; # WINDOWS_NO_EXPORT ++ rte_service_lcore_may_be_active; ++ rte_vect_get_max_simd_bitwidth; ++ rte_vect_set_max_simd_bitwidth; ++ ++ # added in 21.02 ++ rte_power_monitor; # WINDOWS_NO_EXPORT ++ rte_power_monitor_wakeup; # WINDOWS_NO_EXPORT ++ rte_power_pause; # WINDOWS_NO_EXPORT ++ ++ # added in 21.05 ++ rte_devargs_reset; ++ rte_intr_callback_unregister_sync; ++ rte_thread_key_create; ++ rte_thread_key_delete; ++ rte_thread_value_get; ++ rte_thread_value_set; ++ rte_version_minor; ++ rte_version_month; ++ rte_version_prefix; ++ rte_version_release; ++ rte_version_suffix; ++ rte_version_year; ++ ++ # added in 21.08 ++ rte_power_monitor_multi; # WINDOWS_NO_EXPORT ++ ++ # added in 21.11 ++ rte_intr_fd_get; ++ rte_intr_fd_set; ++ rte_intr_instance_alloc; ++ rte_intr_instance_free; ++ rte_intr_type_get; ++ rte_intr_type_set; ++ ++ # added in 22.07 ++ rte_drand; ++ rte_thread_get_affinity_by_id; ++ rte_thread_get_priority; ++ rte_thread_self; ++ rte_thread_set_affinity_by_id; ++ rte_thread_set_priority; ++ ++ # added in 22.11 ++ rte_thread_attr_get_affinity; ++ rte_thread_attr_init; ++ rte_thread_attr_set_affinity; ++ rte_thread_attr_set_priority; ++ rte_thread_create; ++ rte_thread_detach; ++ rte_thread_equal; ++ rte_thread_join; ++ ++ # added in 20.11 ++ __rte_lcore_telemetry_timestamp; ++ __rte_lcore_telemetry_enabled; ++ rte_lcore_busyness; ++ rte_lcore_busyness_enabled; ++ rte_lcore_busyness_enabled_set; + + # added in 23.03 + rte_lcore_register_usage_cb; +diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h +index 545799c341..e89eef37ee 100644 +--- a/lib/ethdev/rte_ethdev.h ++++ b/lib/ethdev/rte_ethdev.h +@@ -6099,6 +6099,8 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id, + #endif + + rte_ethdev_trace_rx_burst(port_id, queue_id, (void **)rx_pkts, nb_rx); ++ ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_rx); + return nb_rx; + } + +diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h +index 7fd9016ca7..ce4e57b60d 100644 +--- a/lib/eventdev/rte_eventdev.h ++++ b/lib/eventdev/rte_eventdev.h +@@ -2408,6 +2408,7 @@ rte_event_dequeue_burst(uint8_t 
dev_id, uint8_t port_id, struct rte_event ev[], + uint16_t nb_events, uint64_t timeout_ticks) + { + const struct rte_event_fp_ops *fp_ops; ++ uint16_t nb_evts; + void *port; + + fp_ops = &rte_event_fp_ops[dev_id]; +@@ -2430,10 +2431,12 @@ rte_event_dequeue_burst(uint8_t dev_id, uint8_t port_id, struct rte_event ev[], + * requests nb_events as const one + */ + if (nb_events == 1) +- return (fp_ops->dequeue)(port, ev, timeout_ticks); ++ nb_evts = (fp_ops->dequeue)(port, ev, timeout_ticks); + else +- return (fp_ops->dequeue_burst)(port, ev, nb_events, +- timeout_ticks); ++ nb_evts = (fp_ops->dequeue_burst)(port, ev, nb_events, ++ timeout_ticks); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_evts); ++ return nb_evts; + } + + #define RTE_EVENT_DEV_MAINT_OP_FLUSH (1 << 0) +diff --git a/lib/rawdev/rte_rawdev.c b/lib/rawdev/rte_rawdev.c +index 4f8897b639..2403c78925 100644 +--- a/lib/rawdev/rte_rawdev.c ++++ b/lib/rawdev/rte_rawdev.c +@@ -237,13 +237,16 @@ rte_rawdev_dequeue_buffers(uint16_t dev_id, + rte_rawdev_obj_t context) + { + struct rte_rawdev *dev; ++ int nb_ops; + + RTE_RAWDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL); + dev = &rte_rawdevs[dev_id]; + + if (*dev->dev_ops->dequeue_bufs == NULL) + return -ENOTSUP; +- return (*dev->dev_ops->dequeue_bufs)(dev, buffers, count, context); ++ nb_ops = (*dev->dev_ops->dequeue_bufs)(dev, buffers, count, context); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_ops); ++ return nb_ops; + } + + int +diff --git a/lib/regexdev/rte_regexdev.h b/lib/regexdev/rte_regexdev.h +index d50af775b5..7b243a0866 100644 +--- a/lib/regexdev/rte_regexdev.h ++++ b/lib/regexdev/rte_regexdev.h +@@ -1530,6 +1530,7 @@ rte_regexdev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, + struct rte_regex_ops **ops, uint16_t nb_ops) + { + struct rte_regexdev *dev = &rte_regex_devices[dev_id]; ++ uint16_t deq_ops; + #ifdef RTE_LIBRTE_REGEXDEV_DEBUG + RTE_REGEXDEV_VALID_DEV_ID_OR_ERR_RET(dev_id, -EINVAL); + if (*dev->dequeue == NULL) +@@ -1539,7 +1540,9 @@ rte_regexdev_dequeue_burst(uint8_t dev_id, uint16_t qp_id, + return -EINVAL; + } + #endif +- return (*dev->dequeue)(dev, qp_id, ops, nb_ops); ++ deq_ops = (*dev->dequeue)(dev, qp_id, ops, nb_ops); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(deq_ops); ++ return deq_ops; + } + + #ifdef __cplusplus +diff --git a/lib/ring/rte_ring.h b/lib/ring/rte_ring.h +index c709f30497..057542dcb0 100644 +--- a/lib/ring/rte_ring.h ++++ b/lib/ring/rte_ring.h +@@ -411,8 +411,10 @@ static __rte_always_inline unsigned int + rte_ring_dequeue_bulk(struct rte_ring *r, void **obj_table, unsigned int n, + unsigned int *available) + { +- return rte_ring_dequeue_bulk_elem(r, obj_table, sizeof(void *), ++ uint32_t nb_rx = rte_ring_dequeue_bulk_elem(r, obj_table, sizeof(void *), + n, available); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_rx); ++ return nb_rx; + } + + /** +@@ -811,8 +813,10 @@ static __rte_always_inline unsigned int + rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table, + unsigned int n, unsigned int *available) + { +- return rte_ring_dequeue_burst_elem(r, obj_table, sizeof(void *), ++ uint32_t nb_rx = rte_ring_dequeue_burst_elem(r, obj_table, sizeof(void *), + n, available); ++ RTE_LCORE_TELEMETRY_TIMESTAMP(nb_rx); ++ return nb_rx; + } + + #ifdef __cplusplus +diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h +index 4b80f58980..e2a72e3ea7 100644 +--- a/lib/ring/rte_ring_elem_pvt.h ++++ b/lib/ring/rte_ring_elem_pvt.h +@@ -385,6 +385,7 @@ __rte_ring_do_dequeue_elem(struct rte_ring *r, void *obj_table, + end: + if (available != NULL) + *available = entries - n; 
++ RTE_LCORE_TELEMETRY_TIMESTAMP(n); + return n; + } + +-- +2.25.1 + diff --git a/ipm/patches/dpdk/23.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch b/ipm/patches/dpdk/23.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch new file mode 100644 index 0000000..c2bac0e --- /dev/null +++ b/ipm/patches/dpdk/23.11/0002-eal-add-cpuset-lcore-telemetry-entries.patch @@ -0,0 +1,79 @@ +From f689846f602caddf6a0f6c013c3dbb6f0974dec2 Mon Sep 17 00:00:00 2001 +From: Hoang Nguyen +Date: Thu, 1 Aug 2024 16:11:56 +0000 +Subject: [PATCH 2/3] eal: add cpuset lcore telemetry entries + +Expose per-lcore cpuset information to telemetry. + +Signed-off-by: Anatoly Burakov +--- + lib/eal/common/eal_common_lcore_telemetry.c | 46 +++++++++++++++++++++ + 1 file changed, 46 insertions(+) + +diff --git a/lib/eal/common/eal_common_lcore_telemetry.c b/lib/eal/common/eal_common_lcore_telemetry.c +index 1478e5a48a..f01ccd9a65 100644 +--- a/lib/eal/common/eal_common_lcore_telemetry.c ++++ b/lib/eal/common/eal_common_lcore_telemetry.c +@@ -263,6 +263,49 @@ lcore_handle_busyness(const char *cmd __rte_unused, + return 0; + } + ++static int ++lcore_handle_cpuset(const char *cmd __rte_unused, ++ const char *params __rte_unused, ++ struct rte_tel_data *d) ++{ ++ char corenum[64]; ++ int i; ++ ++ rte_tel_data_start_dict(d); ++ ++ /* Foreach lcore - can't use macro since it excludes ROLE_NON_EAL */ ++ for (i = 0; i < RTE_MAX_LCORE; i++) { ++ const struct lcore_config *cfg = &lcore_config[i]; ++ const rte_cpuset_t *cpuset = &cfg->cpuset; ++ struct rte_tel_data *ld; ++ unsigned int cpu; ++ ++ if (!lcore_enabled(i)) ++ continue; ++ ++ /* create an array of integers */ ++ ld = rte_tel_data_alloc(); ++ if (ld == NULL) ++ return -ENOMEM; ++ rte_tel_data_start_array(ld, RTE_TEL_INT_VAL); ++ ++ /* add cpu ID's from cpuset to the array */ ++ for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { ++ if (!CPU_ISSET(cpu, cpuset)) ++ continue; ++ rte_tel_data_add_array_int(ld, cpu); ++ } ++ ++ /* add array to the per-lcore container */ ++ snprintf(corenum, sizeof(corenum), "%d", i); ++ ++ /* tell telemetry library to free this array automatically */ ++ rte_tel_data_add_dict_container(d, corenum, ld, 0); ++ } ++ ++ return 0; ++} ++ + void + eal_lcore_telemetry_free(void) + { +@@ -289,6 +332,9 @@ RTE_INIT(lcore_init_telemetry) + rte_telemetry_register_cmd("/eal/lcore/busyness_disable", lcore_busyness_disable, + "disable lcore busyness measurement"); + ++ rte_telemetry_register_cmd("/eal/lcore/cpuset", lcore_handle_cpuset, ++ "list physical core affinity for each lcore"); ++ + __rte_lcore_telemetry_enabled = true; + } + +-- +2.25.1 + diff --git a/ipm/patches/dpdk/23.11/0003-add-capacity-endpoint-to-telemetry-thread.patch b/ipm/patches/dpdk/23.11/0003-add-capacity-endpoint-to-telemetry-thread.patch new file mode 100644 index 0000000..6be6861 --- /dev/null +++ b/ipm/patches/dpdk/23.11/0003-add-capacity-endpoint-to-telemetry-thread.patch @@ -0,0 +1,357 @@ +From 2a8e1d477157e299f02fb9e64aa5d197d2caee16 Mon Sep 17 00:00:00 2001 +From: David Hunt +Date: Fri, 20 Sep 2024 09:11:45 +0100 +Subject: [PATCH 3/3] add capacity endpoint to telemetry thread + +Busyness is calculated on how busy the current core is, ignoring the +current frequency. So a core that's 50% busy at P1 (e.g. 2GHz), shows +as 100% busy at 1GHz. + +This patch adds a new 'capacity' metric that shows a percentage based on +the P1 (base) freqency of the core, so that if the core is 50% busy at +P1, it should show 50% regardless of what the current frequency is. 
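To make the scaling concrete: the capacity value is the busyness scaled by the ratio of current frequency to P1, as in rte_lcore_capacity() below. A tiny standalone example of that arithmetic, using made-up sample frequencies rather than values read from a real system:

```c
/* Standalone sketch of the capacity calculation:
 * capacity = busyness * current_frequency / P1_frequency. */
#include <stdio.h>

int
main(void)
{
	int busyness = 100;       /* core reports 100% busy ...          */
	int cur_khz  = 1000000;   /* ... while running at 1.0 GHz        */
	int p1_khz   = 2000000;   /* P1 (base) frequency is 2.0 GHz      */

	int capacity = busyness * cur_khz / p1_khz;
	printf("busyness %d%% at %d kHz (P1 %d kHz) -> capacity %d%%\n",
	       busyness, cur_khz, p1_khz, capacity);   /* prints 50% */
	return 0;
}
```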
+ +Signed-off-by: David Hunt +--- + lib/eal/common/eal_common_lcore_telemetry.c | 241 ++++++++++++++++++++ + lib/eal/include/rte_lcore.h | 21 ++ + lib/eal/version.map | 1 + + 3 files changed, 263 insertions(+) + +diff --git a/lib/eal/common/eal_common_lcore_telemetry.c b/lib/eal/common/eal_common_lcore_telemetry.c +index f01ccd9a65..1c6d085a55 100644 +--- a/lib/eal/common/eal_common_lcore_telemetry.c ++++ b/lib/eal/common/eal_common_lcore_telemetry.c +@@ -10,9 +10,18 @@ + #include + #include + #include ++#include ++#include ++#include + + #ifdef RTE_LCORE_BUSYNESS + #include ++#define MSR_PLATFORM_INFO 0xCE ++#define POWER_SYSFS_CUR_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq" ++#define POWER_SYSFS_BASE_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency" ++#define POWER_SYSFS_SCALING_DRIVER_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_driver" ++#define POWER_SYSFS_SCALING_MAX_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq" ++#define POWER_SYSFS_MSR_PATH "/dev/cpu/%u/msr" + #endif + + int __rte_lcore_telemetry_enabled; +@@ -47,6 +56,183 @@ static struct lcore_telemetry *telemetry_data; + #define SMOOTH_COEFF 5 + #define STATE_CHANGE_OPT 32 + ++static int p1_freq[RTE_MAX_LCORE] = {0}; ++ ++static int ++try_read_base_frequency(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ snprintf(path, sizeof(path), POWER_SYSFS_BASE_FREQ_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ ++ p1_freq[lcore_id] = atoi(buffer); ++ return p1_freq[lcore_id]; ++ ++ ++} ++ ++static int ++try_read_scaling_max_freq(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int freq; ++ int fd; ++ ++ /* ++ * If the driver is acpi_cpufreq, we can read the scaling_max_freq file ++ */ ++ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_DRIVER_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ close(fd); ++ ++ if (strncmp(buffer, "acpi-cpufreq", 12) == 0) { ++ /* we can use the scaling_max_freq to get the p1 */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_MAX_FREQ_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ close(fd); ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ freq = atoi(buffer) / 1000; /* convert to KHz */ ++ ++ /* ++ * If the freq value ends with '1', then, turbo is enabled. ++ * Round it down to the nearest 100. Otherwuse use the value. 
++ */ ++ return (freq & ~1) * 1000; /* convert to Hz */ ++ } ++ return -1; ++} ++ ++static int ++try_read_msr(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ int freq; ++ uint64_t data; ++ ++ /* ++ * If the msr driver is present, we can read p1 from MSR_PLATFORM_INFO register ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_MSR_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ fd = open(path, O_RDONLY); ++ if (fd < 0) { ++ return -1; ++ } ++ ++ if (pread(fd, &data, sizeof(data), MSR_PLATFORM_INFO) != sizeof(data)) { ++ close(fd); ++ return -1; ++ } ++ ++ close(fd); ++ ++ freq = ((data >> 8) & 0xff) * 100 * 1000; ++ ++ return freq; ++} ++ ++ ++static ++int read_sysfs_p1_freq(unsigned int lcore_id) { ++ int freq; ++ ++ /* We've previously got the p1 frequency. */ ++ if (p1_freq[lcore_id] != 0) ++ return p1_freq[lcore_id]; ++ ++ /* ++ * Check the base_frequency file, if it's there ++ */ ++ freq = try_read_base_frequency(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Check the scaling_max_freq file for the acpi-freq driver ++ */ ++ freq = try_read_scaling_max_freq(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Try reading from the MSR register ++ */ ++ freq = try_read_msr(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ RTE_LOG(ERR, EAL, "Capacity telemetry for lcore %d not supported: no p1 frequency found", ++ lcore_id); ++ ++ return -1; ++} ++ ++ ++int current_fds[RTE_MAX_LCORE] = {0}; ++ ++static ++int read_sysfs_cur_freq(unsigned int lcore_id) { ++ char path[PATH_MAX]; ++ ++ if (current_fds[lcore_id] == 0) { ++ snprintf(path, sizeof(path), POWER_SYSFS_CUR_PATH, rte_lcore_to_cpu_id(lcore_id)); ++ current_fds[lcore_id] = open(path, O_RDONLY); ++ if (current_fds[lcore_id] == -1) { ++ return -1; ++ } ++ } ++ ++ char buffer[16]; ++ ssize_t bytesRead = pread(current_fds[lcore_id], buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ int value = atoi(buffer); ++ return value; ++} ++ + /* Helper function to check if the lcore is enabled. 
+ * Cannot use rte_lcore_is_enabled since it only catches ROLE_RTE threads which + * does not include ROLE_NON_EAL threads which some application threads, for +@@ -102,6 +288,33 @@ int rte_lcore_busyness(unsigned int lcore_id) + return telemetry_data[lcore_id].busyness; + } + ++int rte_lcore_capacity(unsigned int lcore_id) ++{ ++ const uint64_t active_thresh = RTE_LCORE_BUSYNESS_PERIOD * 1000; ++ struct lcore_telemetry *tdata; ++ ++ if (lcore_id >= RTE_MAX_LCORE) ++ return -EINVAL; ++ tdata = &telemetry_data[lcore_id]; ++ ++ /* if the lcore is not active */ ++ if (tdata->interval_ts == 0) ++ return LCORE_BUSYNESS_NOT_SET; ++ /* if the core hasn't been active in a while */ ++ else if ((rte_rdtsc() - tdata->interval_ts) > active_thresh) ++ return LCORE_BUSYNESS_NOT_SET; ++ ++ int cur_freq = read_sysfs_cur_freq(rte_lcore_to_cpu_id(lcore_id)); ++ int busy = telemetry_data[lcore_id].busyness; ++ int p1 = read_sysfs_p1_freq(lcore_id) ; ++ ++ if ((busy == -1) || (p1 <= 0)) { ++ return -1; ++ } else { ++ return busy * cur_freq / p1; ++ } ++} ++ + int rte_lcore_busyness_enabled(void) + { + return __rte_lcore_telemetry_enabled; +@@ -263,6 +476,26 @@ lcore_handle_busyness(const char *cmd __rte_unused, + return 0; + } + ++static int ++lcore_handle_capacity(const char *cmd __rte_unused, ++ const char *params __rte_unused, struct rte_tel_data *d) ++{ ++ char corenum[64]; ++ int i; ++ ++ rte_tel_data_start_dict(d); ++ ++ /* Foreach lcore - can't use macro since it excludes ROLE_NON_EAL */ ++ for (i = 0; i < RTE_MAX_LCORE; i++) { ++ if (!lcore_enabled(i)) ++ continue; ++ snprintf(corenum, sizeof(corenum), "%d", i); ++ rte_tel_data_add_dict_int(d, corenum, rte_lcore_capacity(i)); ++ } ++ ++ return 0; ++} ++ + static int + lcore_handle_cpuset(const char *cmd __rte_unused, + const char *params __rte_unused, +@@ -326,6 +559,9 @@ RTE_INIT(lcore_init_telemetry) + rte_telemetry_register_cmd("/eal/lcore/busyness", lcore_handle_busyness, + "return percentage busyness of cores"); + ++ rte_telemetry_register_cmd("/eal/lcore/capacity_used", lcore_handle_capacity, ++ "return percentage capacity of cores"); ++ + rte_telemetry_register_cmd("/eal/lcore/busyness_enable", lcore_busyness_enable, + "enable lcore busyness measurement"); + +@@ -340,6 +576,11 @@ RTE_INIT(lcore_init_telemetry) + + #else + ++int rte_lcore_capacity(unsigned int lcore_id __rte_unused) ++{ ++ return -ENOTSUP; ++} ++ + int rte_lcore_busyness(unsigned int lcore_id __rte_unused) + { + return -ENOTSUP; +diff --git a/lib/eal/include/rte_lcore.h b/lib/eal/include/rte_lcore.h +index 3c64774bcb..dffb7d1ab5 100644 +--- a/lib/eal/include/rte_lcore.h ++++ b/lib/eal/include/rte_lcore.h +@@ -426,6 +426,27 @@ __rte_experimental + int + rte_lcore_busyness(unsigned int lcore_id); + ++/** ++ * @warning ++ * @b EXPERIMENTAL: this API may change without prior notice. ++ * ++ * Read capacity value corresponding to an lcore. ++ * This differs from busyness in that it is related to the current usage ++ * of the lcore compared to P1 frequency, not the current frequency. ++ * ++ * @param lcore_id ++ * Lcore to read capacity value for. ++ * @return ++ * - value between 0 and 100 on success ++ * - -1 if lcore is not active ++ * - -EINVAL if lcore is invalid ++ * - -ENOMEM if not enough memory available ++ * - -ENOTSUP if not supported ++ */ ++__rte_experimental ++int ++rte_lcore_capacity(unsigned int lcore_id); ++ + /** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice. 
+diff --git a/lib/eal/version.map b/lib/eal/version.map +index a4451d58eb..a2a3ba045f 100644 +--- a/lib/eal/version.map ++++ b/lib/eal/version.map +@@ -440,6 +440,7 @@ EXPERIMENTAL { + # added in 20.11 + __rte_lcore_telemetry_timestamp; + __rte_lcore_telemetry_enabled; ++ rte_lcore_capacity; + rte_lcore_busyness; + rte_lcore_busyness_enabled; + rte_lcore_busyness_enabled_set; +-- +2.25.1 + diff --git a/ipm/patches/dpdk/README.md b/ipm/patches/dpdk/README.md index 7b4620a..6876941 100644 --- a/ipm/patches/dpdk/README.md +++ b/ipm/patches/dpdk/README.md @@ -1,6 +1,7 @@ # DPDK Patches Apply the patches using ```git am {patch}.patch```. -1. ```20.11 directory``` are a set of patches that add the busyness telemetry to DPDK 20.11. -2. ```21.11 directory``` are a set of patches that add the busyness telemetry to DPDK 21.11.2. -3. ```22.11 directory``` are a set of patches that add the busyness telemetry to DPDK 22.11. +1. ```20.11 directory``` are a set of patches that add the busyness telemetry to DPDK 20.11.9 +2. ```21.11 directory``` are a set of patches that add the busyness telemetry to DPDK 21.11.8 +3. ```22.11 directory``` are a set of patches that add the busyness telemetry to DPDK 22.11.6 +4. ```23.11 directory``` are a set of patches that add the busyness telemetry to DPDK 23.11.2 diff --git a/ipm/patches/vpp/20.09/0004-Subject-PATCH-1-1-stats-Added-capacity-flags.patch b/ipm/patches/vpp/20.09/0004-Subject-PATCH-1-1-stats-Added-capacity-flags.patch new file mode 100644 index 0000000..ad5ece9 --- /dev/null +++ b/ipm/patches/vpp/20.09/0004-Subject-PATCH-1-1-stats-Added-capacity-flags.patch @@ -0,0 +1,325 @@ +From 5fdd49609bb3ea985196b5fc148f87abcfce7a21 Mon Sep 17 00:00:00 2001 +From: Hoang Nguyen +Date: Tue, 1 Oct 2024 15:33:49 +0000 +Subject: [PATCH 1/1] stats: Added capacity flag in stats + +Busyness is calculated on how busy the current core is, ignoring the +current frequency. So a core that's 50% busy at P1 (e.g. 2GHz), shows +as 100% busy at 1GHz. + +This patch adds a new 'capacity' metric that shows a percentage based on +the P1 (base) freqency of the core, so that if the core is 50% busy at +P1, it should show 50% regardless of what the current frequency is. 
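The per-core P1 frequency is discovered at runtime: the first lookup in this patch reads the cpufreq base_frequency sysfs file, then falls back to scaling_max_freq (for acpi-cpufreq) and finally to the MSR. A minimal standalone sketch of that first lookup, with simplified error handling and a hard-coded sample CPU number:

```c
/* Standalone sketch of the base_frequency sysfs lookup used to find P1. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int
read_base_frequency_khz(unsigned int cpu)
{
	char path[256], buf[16];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency", cpu);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;                /* not exposed by this cpufreq driver */
	n = pread(fd, buf, sizeof(buf) - 1, 0);
	close(fd);
	if (n <= 0)
		return -1;
	buf[n] = '\0';
	return atoi(buf);                 /* kHz, e.g. 2000000 for a 2 GHz P1 */
}

int
main(void)
{
	int khz = read_base_frequency_khz(0);   /* sample CPU number */
	if (khz > 0)
		printf("cpu0 P1 frequency: %d kHz\n", khz);
	else
		printf("base_frequency unavailable; would fall back to scaling_max_freq/MSR\n");
	return 0;
}
```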
+ +--- + src/vlib/cli.c | 31 ++++++ + src/vlib/main.h | 1 + + src/vpp/stats/stat_segment.c | 199 ++++++++++++++++++++++++++++++++++- + src/vpp/stats/stat_segment.h | 3 + + 4 files changed, 233 insertions(+), 1 deletion(-) + +diff --git a/src/vlib/cli.c b/src/vlib/cli.c +index 0267f4e58..113a28fbd 100644 +--- a/src/vlib/cli.c ++++ b/src/vlib/cli.c +@@ -915,6 +915,37 @@ VLIB_CLI_COMMAND (show_cpu_load_command, static) = { + }; + /* *INDENT-ON* */ + ++static clib_error_t * ++show_cpu_capacity (vlib_main_t * vm, unformat_input_t * input, ++ vlib_cli_command_t * cmd) ++{ ++ uword i; ++ ++ vlib_cli_output (vm, "%10s | %10s | %12s", "Thread", "Core", "Load %"); ++ ++ for (i = 0; i < vlib_get_n_threads (); i++) ++ { ++ vlib_main_t *vm_i; ++ ++ vm_i = vlib_get_main_by_index (i); ++ if (!vm_i) ++ continue; ++ ++ vlib_cli_output (vm, "%8u | %8u | %8.2f", i, vm_i->cpu_id, ++ (f64)vm_i->cpu_capacity / 100.0); ++ } ++ ++ return 0; ++} ++ ++/* *INDENT-OFF* */ ++VLIB_CLI_COMMAND (show_cpu_capacity_command, static) = { ++ .path = "show cpu capacity", ++ .short_help = "Show cpu capacity", ++ .function = show_cpu_capacity, ++ .is_mp_safe = 1, ++}; ++/* *INDENT-ON* */ + + static clib_error_t * + show_cpu (vlib_main_t * vm, unformat_input_t * input, +diff --git a/src/vlib/main.h b/src/vlib/main.h +index eba5b0be9..e9ddf1aae 100644 +--- a/src/vlib/main.h ++++ b/src/vlib/main.h +@@ -138,6 +138,7 @@ typedef struct vlib_main_t + u64 cpu_load_clocks; + u32 cpu_load_points; + u32 cpuload_burst; ++ u64 cpu_capacity; + + /* Incremented once for each main loop. */ + volatile u32 main_loop_count; +diff --git a/src/vpp/stats/stat_segment.c b/src/vpp/stats/stat_segment.c +index 04abcc1e3..e3c76f629 100644 +--- a/src/vpp/stats/stat_segment.c ++++ b/src/vpp/stats/stat_segment.c +@@ -23,9 +23,192 @@ + #include + #include + #include ++#include ++#include ++#include ++#include ++ ++#define MSR_PLATFORM_INFO 0xCE ++#define POWER_SYSFS_CUR_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq" ++#define POWER_SYSFS_BASE_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency" ++#define POWER_SYSFS_SCALING_DRIVER_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_driver" ++#define POWER_SYSFS_SCALING_MAX_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq" ++#define POWER_SYSFS_MSR_PATH "/dev/cpu/%u/msr" ++ ++#define MAX_LCORE 1280 ++#define PATH_MAX 4096 ++int current_fds[MAX_LCORE] = {0}; ++static int p1_freq[MAX_LCORE] = {0}; + + stat_segment_main_t stat_segment_main; + ++static int ++try_read_base_frequency(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ snprintf(path, sizeof(path), POWER_SYSFS_BASE_FREQ_PATH, lcore_id); ++ ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ ++ p1_freq[lcore_id] = atoi(buffer); ++ return p1_freq[lcore_id]; ++ ++ ++} ++ ++static int ++try_read_scaling_max_freq(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int freq; ++ int fd; ++ ++ /* ++ * If the driver is acpi_cpufreq, we can read the scaling_max_freq file ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_DRIVER_PATH, lcore_id); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // 
Null-terminate the buffer ++ ++ close(fd); ++ ++ if (strncmp(buffer, "acpi-cpufreq", 12) == 0) { ++ /* we can use the scaling_max_freq to get the p1 */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_MAX_FREQ_PATH, lcore_id); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ freq = atoi(buffer) / 1000; /* convert to KHz */ ++ ++ /* ++ * If the freq value ends with '1', then, turbo is enabled. ++ * Round it down to the nearest 100. Otherwuse use the value. ++ */ ++ return (freq & ~1) * 1000; /* convert to Hz */ ++ } ++ return -1; ++} ++ ++static int ++try_read_msr(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ int freq; ++ uint64_t data; ++ ++ /* ++ * If the msr driver is present, we can read p1 from MSR_PLATFORM_INFO register ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_MSR_PATH, lcore_id); ++ fd = open(path, O_RDONLY); ++ if (fd < 0) { ++ return -1; ++ } ++ ++ if (pread(fd, &data, sizeof(data), MSR_PLATFORM_INFO) != sizeof(data)) { ++ close(fd); ++ return -1; ++ } ++ ++ close(fd); ++ ++ freq = ((data >> 8) & 0xff) * 100 * 1000; ++ ++ return freq; ++} ++ ++ ++static ++int read_sysfs_p1_freq(unsigned int lcore_id) { ++ int freq; ++ ++ /* We've previously got the p1 frequency. */ ++ if (p1_freq[lcore_id] != 0) ++ return p1_freq[lcore_id]; ++ ++ /* ++ * Check the base_frequency file, if it's there ++ */ ++ freq = try_read_base_frequency(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Check the scaling_max_freq file for the acpi-freq driver ++ */ ++ freq = try_read_scaling_max_freq(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Try reading from the MSR register ++ */ ++ freq = try_read_msr(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ return -1; ++} ++ ++static ++int read_sysfs_cur_freq(unsigned int lcore_id) { ++ char path[PATH_MAX]; ++ ++ if (current_fds[lcore_id] == 0) { ++ sprintf(path, POWER_SYSFS_CUR_PATH, lcore_id); ++ current_fds[lcore_id] = open(path, O_RDONLY); ++ if (current_fds[lcore_id] == -1) { ++ perror("Failed to open file"); ++ return -1; ++ } ++ } ++ ++ char buffer[16]; ++ ssize_t bytesRead = pread(current_fds[lcore_id], buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ perror("Failed to read file"); ++ return -1; ++ } ++ ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ int value = atoi(buffer); ++ return value; ++} ++ + /* + * Used only by VPP writers + */ +@@ -609,6 +792,8 @@ do_stat_segment_updates (stat_segment_main_t * sm) + [STAT_COUNTER_CPU_UTIL_PER_WORKER], 0); + stat_validate_counter_vector (&sm->directory_vector + [STAT_COUNTER_QUEUE_BURST_PER_WORKER], 0); ++ stat_validate_counter_vector (&sm->directory_vector ++ [STAT_COUNTER_CPU_CAPACITY], 0); + num_worker_threads_set = 1; + vlib_stat_segment_unlock (); + clib_mem_set_heap (oldheap); +@@ -644,7 +829,19 @@ do_stat_segment_updates (stat_segment_main_t * sm) + stat_set_simple_counter (&sm->directory_vector + [STAT_COUNTER_QUEUE_BURST_PER_WORKER], i, 0, + ((this_vlib_main->cpu_id << 8) | (this_vlib_main->cpuload_burst))); +- ++ /* Calculate capacity */ ++ int core_id = this_vlib_main->cpu_id; ++ int cur_freq = read_sysfs_cur_freq(core_id); ++ int p1 = read_sysfs_p1_freq(core_id) ; ++ if (p1 <= 0) { ++ this_vlib_main->cpu_capacity 
= 0; ++ } else { ++ this_vlib_main->cpu_capacity = (u64)this_vlib_main->cpu_load_points * cur_freq/p1; ++ } ++ /* Set the per-worker capacity */ ++ stat_set_simple_counter (&sm->directory_vector ++ [STAT_COUNTER_CPU_CAPACITY], i, 0, ++ ((this_vlib_main->cpu_id << 8) | (this_vlib_main->cpu_capacity/100))); + } + + /* And set the system average rate */ +diff --git a/src/vpp/stats/stat_segment.h b/src/vpp/stats/stat_segment.h +index a92b87cff..58e977819 100644 +--- a/src/vpp/stats/stat_segment.h ++++ b/src/vpp/stats/stat_segment.h +@@ -27,6 +27,7 @@ typedef enum + STAT_COUNTER_VECTOR_RATE_PER_WORKER, + STAT_COUNTER_CPU_UTIL_PER_WORKER, + STAT_COUNTER_QUEUE_BURST_PER_WORKER, ++ STAT_COUNTER_CPU_CAPACITY, + STAT_COUNTER_INPUT_RATE, + STAT_COUNTER_LAST_UPDATE, + STAT_COUNTER_LAST_STATS_CLEAR, +@@ -50,6 +51,8 @@ typedef enum + cpu_util_per_worker, /sys) \ + _(QUEUE_BURST_PER_WORKER, COUNTER_VECTOR_SIMPLE, \ + queue_burst_per_worker, /sys) \ ++ _(CPU_CAPACITY, COUNTER_VECTOR_SIMPLE, \ ++ capacity_per_worker, /sys) \ + _(NUM_WORKER_THREADS, SCALAR_INDEX, num_worker_threads, /sys) \ + _(INPUT_RATE, SCALAR_INDEX, input_rate, /sys) \ + _(LAST_UPDATE, SCALAR_INDEX, last_update, /sys) \ +-- +2.25.1 + diff --git a/ipm/patches/vpp/21.01/0004-Subject-PATCH-1-1-stats-Added-capacity-flags.patch b/ipm/patches/vpp/21.01/0004-Subject-PATCH-1-1-stats-Added-capacity-flags.patch new file mode 100644 index 0000000..7cdd3bf --- /dev/null +++ b/ipm/patches/vpp/21.01/0004-Subject-PATCH-1-1-stats-Added-capacity-flags.patch @@ -0,0 +1,325 @@ +From 578a1dffb5b57781fa0c6e78736c38b1c87a046f Mon Sep 17 00:00:00 2001 +From: Hoang Nguyen +Date: Tue, 1 Oct 2024 17:10:32 +0000 +Subject: [PATCH 1/1] stats: Added capacity flag in stats + +Busyness is calculated on how busy the current core is, ignoring the +current frequency. So a core that's 50% busy at P1 (e.g. 2GHz), shows +as 100% busy at 1GHz. + +This patch adds a new 'capacity' metric that shows a percentage based on +the P1 (base) freqency of the core, so that if the core is 50% busy at +P1, it should show 50% regardless of what the current frequency is. 
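As with the other per-worker counters, the value exported to the stats segment packs the core id into the upper bits and the capacity percentage into the low 8 bits. A small standalone sketch of how a stats consumer would unpack it (the worker values here are made up):

```c
/* Standalone sketch of the packed per-worker counter layout:
 * value = (cpu_id << 8) | capacity_percent. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t cpu_id = 14, capacity = 37;          /* example worker values */
	uint64_t packed = (cpu_id << 8) | capacity;   /* what the stat segment holds */

	/* consumer side: split the value back out */
	printf("core %u, capacity %u%%\n",
	       (unsigned)(packed >> 8), (unsigned)(packed & 0xff));
	return 0;
}
```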
+ +--- + src/vlib/cli.c | 31 ++++++ + src/vlib/main.h | 1 + + src/vpp/stats/stat_segment.c | 199 ++++++++++++++++++++++++++++++++++- + src/vpp/stats/stat_segment.h | 3 + + 4 files changed, 233 insertions(+), 1 deletion(-) + +diff --git a/src/vlib/cli.c b/src/vlib/cli.c +index ed78b045f..ce2047f25 100644 +--- a/src/vlib/cli.c ++++ b/src/vlib/cli.c +@@ -949,6 +949,37 @@ VLIB_CLI_COMMAND (show_cpu_load_command, static) = { + }; + /* *INDENT-ON* */ + ++static clib_error_t * ++show_cpu_capacity (vlib_main_t * vm, unformat_input_t * input, ++ vlib_cli_command_t * cmd) ++{ ++ uword i; ++ ++ vlib_cli_output (vm, "%10s | %10s | %12s", "Thread", "Core", "Load %"); ++ ++ for (i = 0; i < vlib_get_n_threads (); i++) ++ { ++ vlib_main_t *vm_i; ++ ++ vm_i = vlib_get_main_by_index (i); ++ if (!vm_i) ++ continue; ++ ++ vlib_cli_output (vm, "%8u | %8u | %8.2f", i, vm_i->cpu_id, ++ (f64)vm_i->cpu_capacity / 100.0); ++ } ++ ++ return 0; ++} ++ ++/* *INDENT-OFF* */ ++VLIB_CLI_COMMAND (show_cpu_capacity_command, static) = { ++ .path = "show cpu capacity", ++ .short_help = "Show cpu capacity", ++ .function = show_cpu_capacity, ++ .is_mp_safe = 1, ++}; ++/* *INDENT-ON* */ + + static clib_error_t * + show_cpu (vlib_main_t * vm, unformat_input_t * input, +diff --git a/src/vlib/main.h b/src/vlib/main.h +index c997381bc..8e07b4a86 100644 +--- a/src/vlib/main.h ++++ b/src/vlib/main.h +@@ -137,6 +137,7 @@ typedef struct vlib_main_t + u64 cpu_load_clocks; + u32 cpu_load_points; + u32 cpuload_burst; ++ u64 cpu_capacity; + + /* Incremented once for each main loop. */ + volatile u32 main_loop_count; +diff --git a/src/vpp/stats/stat_segment.c b/src/vpp/stats/stat_segment.c +index ed806d208..4ee7a8c20 100644 +--- a/src/vpp/stats/stat_segment.c ++++ b/src/vpp/stats/stat_segment.c +@@ -22,9 +22,192 @@ + #undef HAVE_MEMFD_CREATE + #include + #include ++#include ++#include ++#include ++#include ++ ++#define MSR_PLATFORM_INFO 0xCE ++#define POWER_SYSFS_CUR_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq" ++#define POWER_SYSFS_BASE_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency" ++#define POWER_SYSFS_SCALING_DRIVER_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_driver" ++#define POWER_SYSFS_SCALING_MAX_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq" ++#define POWER_SYSFS_MSR_PATH "/dev/cpu/%u/msr" ++ ++#define MAX_LCORE 1280 ++#define PATH_MAX 4096 ++int current_fds[MAX_LCORE] = {0}; ++static int p1_freq[MAX_LCORE] = {0}; + + stat_segment_main_t stat_segment_main; + ++static int ++try_read_base_frequency(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ snprintf(path, sizeof(path), POWER_SYSFS_BASE_FREQ_PATH, lcore_id); ++ ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ ++ p1_freq[lcore_id] = atoi(buffer); ++ return p1_freq[lcore_id]; ++ ++ ++} ++ ++static int ++try_read_scaling_max_freq(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int freq; ++ int fd; ++ ++ /* ++ * If the driver is acpi_cpufreq, we can read the scaling_max_freq file ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_DRIVER_PATH, lcore_id); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ buffer[bytesRead] = 
'\0'; // Null-terminate the buffer ++ ++ close(fd); ++ ++ if (strncmp(buffer, "acpi-cpufreq", 12) == 0) { ++ /* we can use the scaling_max_freq to get the p1 */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_MAX_FREQ_PATH, lcore_id); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ freq = atoi(buffer) / 1000; /* convert to KHz */ ++ ++ /* ++ * If the freq value ends with '1', then, turbo is enabled. ++ * Round it down to the nearest 100. Otherwuse use the value. ++ */ ++ return (freq & ~1) * 1000; /* convert to Hz */ ++ } ++ return -1; ++} ++ ++static int ++try_read_msr(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ int freq; ++ uint64_t data; ++ ++ /* ++ * If the msr driver is present, we can read p1 from MSR_PLATFORM_INFO register ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_MSR_PATH, lcore_id); ++ fd = open(path, O_RDONLY); ++ if (fd < 0) { ++ return -1; ++ } ++ ++ if (pread(fd, &data, sizeof(data), MSR_PLATFORM_INFO) != sizeof(data)) { ++ close(fd); ++ return -1; ++ } ++ ++ close(fd); ++ ++ freq = ((data >> 8) & 0xff) * 100 * 1000; ++ ++ return freq; ++} ++ ++ ++static ++int read_sysfs_p1_freq(unsigned int lcore_id) { ++ int freq; ++ ++ /* We've previously got the p1 frequency. */ ++ if (p1_freq[lcore_id] != 0) ++ return p1_freq[lcore_id]; ++ ++ /* ++ * Check the base_frequency file, if it's there ++ */ ++ freq = try_read_base_frequency(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Check the scaling_max_freq file for the acpi-freq driver ++ */ ++ freq = try_read_scaling_max_freq(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Try reading from the MSR register ++ */ ++ freq = try_read_msr(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ return -1; ++} ++ ++static ++int read_sysfs_cur_freq(unsigned int lcore_id) { ++ char path[PATH_MAX]; ++ ++ if (current_fds[lcore_id] == 0) { ++ sprintf(path, POWER_SYSFS_CUR_PATH, lcore_id); ++ current_fds[lcore_id] = open(path, O_RDONLY); ++ if (current_fds[lcore_id] == -1) { ++ perror("Failed to open file"); ++ return -1; ++ } ++ } ++ ++ char buffer[16]; ++ ssize_t bytesRead = pread(current_fds[lcore_id], buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ perror("Failed to read file"); ++ return -1; ++ } ++ ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ int value = atoi(buffer); ++ return value; ++} ++ + /* + * Used only by VPP writers + */ +@@ -613,6 +796,8 @@ do_stat_segment_updates (stat_segment_main_t * sm) + [STAT_COUNTER_CPU_UTIL_PER_WORKER], 0); + stat_validate_counter_vector (&sm->directory_vector + [STAT_COUNTER_QUEUE_BURST_PER_WORKER], 0); ++ stat_validate_counter_vector (&sm->directory_vector ++ [STAT_COUNTER_CPU_CAPACITY], 0); + num_worker_threads_set = 1; + vlib_stat_segment_unlock (); + clib_mem_set_heap (oldheap); +@@ -648,7 +833,19 @@ do_stat_segment_updates (stat_segment_main_t * sm) + stat_set_simple_counter (&sm->directory_vector + [STAT_COUNTER_QUEUE_BURST_PER_WORKER], i, 0, + ((this_vlib_main->cpu_id << 8) | (this_vlib_main->cpuload_burst))); +- ++ /* Calculate capacity */ ++ int core_id = this_vlib_main->cpu_id; ++ int cur_freq = read_sysfs_cur_freq(core_id); ++ int p1 = read_sysfs_p1_freq(core_id) ; ++ if (p1 <= 0) { ++ 
this_vlib_main->cpu_capacity = 0; ++ } else { ++ this_vlib_main->cpu_capacity = (u64)this_vlib_main->cpu_load_points * cur_freq/p1; ++ } ++ /* Set the per-worker capacity */ ++ stat_set_simple_counter (&sm->directory_vector ++ [STAT_COUNTER_CPU_CAPACITY], i, 0, ++ ((this_vlib_main->cpu_id << 8) | (this_vlib_main->cpu_capacity/100))); + } + + /* And set the system average rate */ +diff --git a/src/vpp/stats/stat_segment.h b/src/vpp/stats/stat_segment.h +index fdf9986fb..c059f3c0a 100644 +--- a/src/vpp/stats/stat_segment.h ++++ b/src/vpp/stats/stat_segment.h +@@ -27,6 +27,7 @@ typedef enum + STAT_COUNTER_VECTOR_RATE_PER_WORKER, + STAT_COUNTER_CPU_UTIL_PER_WORKER, + STAT_COUNTER_QUEUE_BURST_PER_WORKER, ++ STAT_COUNTER_CPU_CAPACITY, + STAT_COUNTER_INPUT_RATE, + STAT_COUNTER_LAST_UPDATE, + STAT_COUNTER_LAST_STATS_CLEAR, +@@ -50,6 +51,8 @@ typedef enum + cpu_util_per_worker, /sys) \ + _(QUEUE_BURST_PER_WORKER, COUNTER_VECTOR_SIMPLE, \ + queue_burst_per_worker, /sys) \ ++ _(CPU_CAPACITY, COUNTER_VECTOR_SIMPLE, \ ++ capacity_per_worker, /sys) \ + _(NUM_WORKER_THREADS, SCALAR_INDEX, num_worker_threads, /sys) \ + _(INPUT_RATE, SCALAR_INDEX, input_rate, /sys) \ + _(LAST_UPDATE, SCALAR_INDEX, last_update, /sys) \ +-- +2.25.1 + diff --git a/ipm/patches/vpp/22.02/0004-stats-Added-capacity-flag-in-stats.patch b/ipm/patches/vpp/22.02/0004-stats-Added-capacity-flag-in-stats.patch new file mode 100644 index 0000000..180aa4e --- /dev/null +++ b/ipm/patches/vpp/22.02/0004-stats-Added-capacity-flag-in-stats.patch @@ -0,0 +1,365 @@ +From edbf641d6f1e3386425ed6999be3f52140586a6f Mon Sep 17 00:00:00 2001 +From: Hoang Nguyen +Date: Sun, 29 Sep 2024 17:29:47 +0000 +Subject: [PATCH 1/1] stats: Added capacity flag in stats + +Busyness is calculated on how busy the current core is, ignoring the +current frequency. So a core that's 50% busy at P1 (e.g. 2GHz), shows +as 100% busy at 1GHz. + +This patch adds a new 'capacity' metric that shows a percentage based on +the P1 (base) freqency of the core, so that if the core is 50% busy at +P1, it should show 50% regardless of what the current frequency is. 
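The final P1 fallback reads MSR_PLATFORM_INFO (0xCE), whose bits 15:8 hold the maximum non-turbo ratio in units of 100 MHz; the patch multiplies that ratio by 100 * 1000 to obtain kHz. A standalone sketch of the decode, using a made-up raw value instead of an actual /dev/cpu/N/msr read:

```c
/* Standalone sketch of decoding the non-turbo ratio from MSR_PLATFORM_INFO. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	unsigned ratio_field = 20;                                 /* sample: 20 * 100 MHz = 2.0 GHz */
	uint64_t msr_platform_info = (uint64_t)ratio_field << 8;   /* other MSR fields left zero */

	unsigned ratio = (msr_platform_info >> 8) & 0xff;
	unsigned p1_khz = ratio * 100 * 1000;                      /* same scaling as the patch */

	printf("non-turbo ratio %u -> P1 = %u kHz (%.1f GHz)\n",
	       ratio, p1_khz, p1_khz / 1e6);
	return 0;
}
```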
+ +--- + src/vlib/cli.c | 32 ++++ + src/vlib/main.h | 1 + + src/vpp/stats/stat_segment.c | 1 + + src/vpp/stats/stat_segment.h | 1 + + src/vpp/stats/stat_segment_provider.c | 233 ++++++++++++++++++++++++++ + 5 files changed, 268 insertions(+) + +diff --git a/src/vlib/cli.c b/src/vlib/cli.c +index c1ae5f7c6..1fa6b2fdf 100644 +--- a/src/vlib/cli.c ++++ b/src/vlib/cli.c +@@ -966,6 +966,38 @@ VLIB_CLI_COMMAND (show_cpu_load_command, static) = { + }; + /* *INDENT-ON* */ + ++static clib_error_t * ++show_cpu_capacity (vlib_main_t * vm, unformat_input_t * input, ++ vlib_cli_command_t * cmd) ++{ ++ uword i; ++ ++ vlib_cli_output (vm, "%10s | %10s | %12s", "Thread", "Core", "Capacity %"); ++ ++ for (i = 0; i < vlib_get_n_threads (); i++) ++ { ++ vlib_main_t *vm_i; ++ ++ vm_i = vlib_get_main_by_index (i); ++ if (!vm_i) ++ continue; ++ ++ vlib_cli_output (vm, "%8u | %8u | %8.2f", i, vm_i->cpu_id, ++ (f64)vm_i->cpu_capacity / 100.0); ++ } ++ ++ return 0; ++} ++ ++/* *INDENT-OFF* */ ++VLIB_CLI_COMMAND (show_cpu_capacity_command, static) = { ++ .path = "show cpu capacity", ++ .short_help = "Show cpu capacity", ++ .function = show_cpu_capacity, ++ .is_mp_safe = 1, ++}; ++/* *INDENT-ON* */ ++ + static clib_error_t * + show_cpu (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +diff --git a/src/vlib/main.h b/src/vlib/main.h +index 84c5ca051..25237f939 100644 +--- a/src/vlib/main.h ++++ b/src/vlib/main.h +@@ -120,6 +120,7 @@ typedef struct vlib_main_t + u64 cpu_load_clocks; + u32 cpu_load_points; + u32 cpuload_burst; ++ u64 cpu_capacity; + + /* Incremented once for each main loop. */ + volatile u32 main_loop_count; +diff --git a/src/vpp/stats/stat_segment.c b/src/vpp/stats/stat_segment.c +index fb0d5b8ee..2b15895a2 100644 +--- a/src/vpp/stats/stat_segment.c ++++ b/src/vpp/stats/stat_segment.c +@@ -758,6 +758,7 @@ do_stat_segment_updates (vlib_main_t *vm, stat_segment_main_t *sm) + stat_provider_register_vector_rate (tm->n_vlib_mains - 1); + stat_provider_register_cpu_util (tm->n_vlib_mains - 1); + stat_provider_register_queue_burst (tm->n_vlib_mains - 1); ++ stat_provider_register_capacity (tm->n_vlib_mains - 1); + + sm->directory_vector[STAT_COUNTER_NUM_WORKER_THREADS].value = + tm->n_vlib_mains - 1; +diff --git a/src/vpp/stats/stat_segment.h b/src/vpp/stats/stat_segment.h +index 10e6e6791..2456efe3b 100644 +--- a/src/vpp/stats/stat_segment.h ++++ b/src/vpp/stats/stat_segment.h +@@ -123,6 +123,7 @@ void vlib_stats_register_symlink (void *oldheap, u8 *name, u32 index1, + void stat_provider_register_vector_rate (u32 num_workers); + void stat_provider_register_cpu_util (u32 num_workers); + void stat_provider_register_queue_burst (u32 num_workers); ++void stat_provider_register_capacity (u32 num_workers); + + f64 + vlib_get_stat_segment_cpuload_rate (void); +diff --git a/src/vpp/stats/stat_segment_provider.c b/src/vpp/stats/stat_segment_provider.c +index 941026557..2aff45fec 100644 +--- a/src/vpp/stats/stat_segment_provider.c ++++ b/src/vpp/stats/stat_segment_provider.c +@@ -23,6 +23,23 @@ + #include + #include + #include "stat_segment.h" ++#include ++#include ++#include ++#include ++ ++#define MSR_PLATFORM_INFO 0xCE ++#define POWER_SYSFS_CUR_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq" ++#define POWER_SYSFS_BASE_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency" ++#define POWER_SYSFS_SCALING_DRIVER_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_driver" ++#define POWER_SYSFS_SCALING_MAX_FREQ_PATH "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq" 
++#define POWER_SYSFS_MSR_PATH "/dev/cpu/%u/msr" ++ ++#define MAX_LCORE 1280 ++#define PATH_MAX 4096 ++int current_fds[MAX_LCORE] = {0}; ++static int p1_freq[MAX_LCORE] = {0}; ++ + + clib_mem_heap_t **memory_heaps_vec; + u32 mem_vector_index; +@@ -39,6 +56,173 @@ enum + STAT_MEM_RELEASABLE, + } stat_mem_usage_e; + ++static int ++try_read_base_frequency(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ snprintf(path, sizeof(path), POWER_SYSFS_BASE_FREQ_PATH, lcore_id); ++ ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ ++ p1_freq[lcore_id] = atoi(buffer); ++ return p1_freq[lcore_id]; ++ ++ ++} ++ ++static int ++try_read_scaling_max_freq(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int freq; ++ int fd; ++ ++ /* ++ * If the driver is acpi_cpufreq, we can read the scaling_max_freq file ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_DRIVER_PATH, lcore_id); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ char buffer[16]; ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ close(fd); ++ ++ if (strncmp(buffer, "acpi-cpufreq", 12) == 0) { ++ /* we can use the scaling_max_freq to get the p1 */ ++ snprintf(path, sizeof(path), POWER_SYSFS_SCALING_MAX_FREQ_PATH, lcore_id); ++ fd = open(path, O_RDONLY); ++ if (fd == -1) { ++ return -1; ++ } ++ ssize_t bytesRead = pread(fd, buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ return -1; ++ } ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ close(fd); ++ freq = atoi(buffer) / 1000; /* convert to KHz */ ++ ++ /* ++ * If the freq value ends with '1', then, turbo is enabled. ++ * Round it down to the nearest 100. Otherwuse use the value. ++ */ ++ return (freq & ~1) * 1000; /* convert to Hz */ ++ } ++ return -1; ++} ++ ++static int ++try_read_msr(unsigned int lcore_id) ++{ ++ char path[PATH_MAX]; ++ int fd; ++ int freq; ++ uint64_t data; ++ ++ /* ++ * If the msr driver is present, we can read p1 from MSR_PLATFORM_INFO register ++ */ ++ snprintf(path, sizeof(path), POWER_SYSFS_MSR_PATH, lcore_id); ++ fd = open(path, O_RDONLY); ++ if (fd < 0) { ++ return -1; ++ } ++ ++ if (pread(fd, &data, sizeof(data), MSR_PLATFORM_INFO) != sizeof(data)) { ++ close(fd); ++ return -1; ++ } ++ ++ close(fd); ++ ++ freq = ((data >> 8) & 0xff) * 100 * 1000; ++ ++ return freq; ++} ++ ++ ++static ++int read_sysfs_p1_freq(unsigned int lcore_id) { ++ int freq; ++ ++ /* We've previously got the p1 frequency. 
*/ ++ if (p1_freq[lcore_id] != 0) ++ return p1_freq[lcore_id]; ++ ++ /* ++ * Check the base_frequency file, if it's there ++ */ ++ freq = try_read_base_frequency(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Check the scaling_max_freq file for the acpi-freq driver ++ */ ++ freq = try_read_scaling_max_freq(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ /* ++ * Try reading from the MSR register ++ */ ++ freq = try_read_msr(lcore_id); ++ if (freq != -1) { ++ p1_freq[lcore_id] = freq; ++ return freq; ++ } ++ ++ return -1; ++} ++ ++static ++int read_sysfs_cur_freq(unsigned int lcore_id) { ++ char path[PATH_MAX]; ++ ++ if (current_fds[lcore_id] == 0) { ++ sprintf(path, POWER_SYSFS_CUR_PATH, lcore_id); ++ current_fds[lcore_id] = open(path, O_RDONLY); ++ if (current_fds[lcore_id] == -1) { ++ perror("Failed to open file"); ++ return -1; ++ } ++ } ++ ++ char buffer[16]; ++ ssize_t bytesRead = pread(current_fds[lcore_id], buffer, sizeof(buffer) - 1, 0); ++ if (bytesRead == -1) { ++ perror("Failed to read file"); ++ return -1; ++ } ++ ++ buffer[bytesRead] = '\0'; // Null-terminate the buffer ++ ++ int value = atoi(buffer); ++ return value; ++} ++ + /* + * Called from the stats periodic process to update memory counters. + */ +@@ -211,6 +395,35 @@ stat_provider_queue_burst_per_thread_update_fn ( + } + } + ++static void ++stat_provider_capacity_per_thread_update_fn ( ++ stat_segment_directory_entry_t *e, u32 index) ++{ ++ vlib_main_t *this_vlib_main; ++ int i; ++ ASSERT (e->data); ++ counter_t **counters = e->data; ++ int core_id = 0; ++ ++ for (i = 0; i < vlib_get_n_threads (); i++) ++ { ++ ++ this_vlib_main = vlib_get_main_by_index (i); ++ core_id = this_vlib_main->cpu_id; ++ int cur_freq = read_sysfs_cur_freq(core_id); ++ int p1 = read_sysfs_p1_freq(core_id) ; ++ /* Set the per-worker queue burst */ ++ counter_t *cb = counters[i]; ++ /* Lower 8-bits is burst flag and rest is core id */ ++ if (p1 <= 0) { ++ this_vlib_main->cpu_capacity = 0; ++ } else { ++ this_vlib_main->cpu_capacity = (u64)this_vlib_main->cpu_load_points * cur_freq/p1; ++ } ++ /* Lower 8-bits is capacity flag and rest is core id */ ++ cb[0] = ((this_vlib_main->cpu_id << 8) | (this_vlib_main->cpu_capacity/100)); ++ } ++} + + void + stat_provider_register_vector_rate (u32 num_workers) +@@ -279,3 +492,23 @@ stat_provider_register_queue_burst (u32 num_workers) + ep->data = stat_validate_counter_vector3 (ep->data, num_workers, 0); + vlib_stat_segment_unlock (); + } ++ ++void ++stat_provider_register_capacity (u32 num_workers) ++{ ++ int i; ++ ++ u8 *s = format (0, "/sys/capacity_per_worker%c", 0); ++ i = stat_segment_new_entry (s, STAT_DIR_TYPE_COUNTER_VECTOR_SIMPLE); ++ if (i == ~0) ++ ASSERT (0); ++ vec_free (s); ++ stat_segment_poll_add (i, stat_provider_capacity_per_thread_update_fn, ~0, ++ 10); ++ ++ stat_segment_main_t *sm = &stat_segment_main; ++ vlib_stat_segment_lock (); ++ stat_segment_directory_entry_t *ep = &sm->directory_vector[i]; ++ ep->data = stat_validate_counter_vector3 (ep->data, num_workers, 0); ++ vlib_stat_segment_unlock (); ++} +-- +2.25.1 + diff --git a/ipm/patches/vpp/23.02/0001-vlib-CPU-load-measurement-and-CLI.patch b/ipm/patches/vpp/23.02/0001-vlib-CPU-load-measurement-and-CLI.patch new file mode 100644 index 0000000..92dd444 --- /dev/null +++ b/ipm/patches/vpp/23.02/0001-vlib-CPU-load-measurement-and-CLI.patch @@ -0,0 +1,114 @@ +From 63409ec1173b4f63496cd5c6f404ab204d55690d Mon Sep 17 00:00:00 2001 +From: Hoang Nguyen 
+Date: Sun, 11 Aug 2024 15:44:39 +0000
+Subject: [PATCH 1/3] vlib: CPU load measurement and CLI
+
+The patch calculates CPU load based on the number of ticks elapsed in
+processing packets by the main/worker thread.
+
+New CLI command to query CPU load:
+`show cpu load`
+
+Type: improvement
+---
+ src/vlib/cli.c | 32 ++++++++++++++++++++++++++++++++
+ src/vlib/main.c | 15 +++++++++++++++
+ src/vlib/main.h | 6 ++++++
+ 3 files changed, 53 insertions(+)
+
+diff --git a/src/vlib/cli.c b/src/vlib/cli.c
+index 9c53200f8..3c5b7b533 100644
+--- a/src/vlib/cli.c
++++ b/src/vlib/cli.c
+@@ -986,6 +986,38 @@ VLIB_CLI_COMMAND (show_memory_usage_command, static) = {
+ };
+ /* *INDENT-ON* */
+
++static clib_error_t *
++show_cpu_load (vlib_main_t * vm, unformat_input_t * input,
++ vlib_cli_command_t * cmd)
++{
++ uword i;
++
++ vlib_cli_output (vm, "%10s | %10s | %12s", "Thread", "Core", "Load %");
++
++ for (i = 0; i < vlib_get_n_threads (); i++)
++ {
++ vlib_main_t *vm_i;
++
++ vm_i = vlib_get_main_by_index (i);
++ if (!vm_i)
++ continue;
++
++ vlib_cli_output (vm, "%8u | %8u | %8.2f", i, vm_i->cpu_id,
++ (f64)vm_i->cpu_load_points / 100.0);
++ }
++
++ return 0;
++}
++
++/* *INDENT-OFF* */
++VLIB_CLI_COMMAND (show_cpu_load_command, static) = {
++ .path = "show cpu load",
++ .short_help = "Show cpu load",
++ .function = show_cpu_load,
++ .is_mp_safe = 1,
++};
++/* *INDENT-ON* */
++
+ static clib_error_t *
+ show_cpu (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+diff --git a/src/vlib/main.c b/src/vlib/main.c
+index fc8006447..e567842a0 100644
+--- a/src/vlib/main.c
++++ b/src/vlib/main.c
+@@ -977,6 +977,9 @@ dispatch_node (vlib_main_t * vm,
+ /* n_vectors */ n,
+ /* n_clocks */ t - last_time_stamp);
+
++ if (n)
++ vm->cpu_load_clocks += t - last_time_stamp;
++
+ /* When in adaptive mode and vector rate crosses threshold switch to
+ polling mode and vice versa. */
+ if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_ADAPTIVE_MODE))
+@@ -1679,6 +1682,18 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main)
+ /* Record time stamp in case there are no enabled nodes and above
+ calls do not update time stamp. */
+ cpu_time_now = clib_cpu_time_now ();
++ /* Time to update cpu load? */
++ if (PREDICT_FALSE (cpu_time_now >= vm->cpu_load_interval_end) )
++ {
++ if (vm->cpu_load_interval_start)
++ {
++ vm->cpu_load_points = (vm->cpu_load_clocks * 1e4) /
++ (cpu_time_now - vm->cpu_load_interval_start);
++ }
++ vm->cpu_load_interval_start = cpu_time_now;
++ vm->cpu_load_interval_end = cpu_time_now + 1e9;
++ vm->cpu_load_clocks = 0;
++ }
+ vm->loops_this_reporting_interval++;
+ now = clib_time_now_internal (&vm->clib_time, cpu_time_now);
+ /* Time to update loops_per_second? */
+diff --git a/src/vlib/main.h b/src/vlib/main.h
+index a9cfab4f8..856629ec3 100644
+--- a/src/vlib/main.h
++++ b/src/vlib/main.h
+@@ -115,6 +115,12 @@ typedef struct vlib_main_t
+ /* Time stamp when main loop was entered (time 0). */
+ u64 cpu_time_main_loop_start;
+
++ /* CPU load measurement */
++ u64 cpu_load_interval_start;
++ u64 cpu_load_interval_end;
++ u64 cpu_load_clocks;
++ u32 cpu_load_points;
++
+ /* Incremented once for each main loop. 
*/
+ volatile u32 main_loop_count;
+
+--
+2.25.1
+
diff --git a/ipm/patches/vpp/23.02/0002-stats-Added-CPU-load-and-queue-burst-flag-in-stats.patch b/ipm/patches/vpp/23.02/0002-stats-Added-CPU-load-and-queue-burst-flag-in-stats.patch
new file mode 100644
index 0000000..c2c8e2b
--- /dev/null
+++ b/ipm/patches/vpp/23.02/0002-stats-Added-CPU-load-and-queue-burst-flag-in-stats.patch
@@ -0,0 +1,343 @@
+From 8ed57819abe56ed4e7e81e05b4115bc6621ede45 Mon Sep 17 00:00:00 2001
+From: Hoang Nguyen
+Date: Tue, 13 Aug 2024 17:27:07 +0000
+Subject: [PATCH 2/3] stats: Added CPU load and queue burst flag in stats
+
+This patch adds the following capabilities:
+- A flag to indicate when the number of packets in the DPDK queue
+crosses the configurable queue threshold.
+- A stats config parameter to configure the interval for CPU load
+measurement.
+ `cpuload-interval