From fe33997b249792342e172d059c17780d06184d24 Mon Sep 17 00:00:00 2001
From: Tristan Brindle
Date: Wed, 24 Jan 2024 18:20:09 +0000
Subject: [PATCH 1/2] Update nanobench

Use CMake's FetchContent to download from GitHub rather than bundling
our own version of the header
---
 benchmark/CMakeLists.txt |   17 +-
 benchmark/nanobench.cpp  |    3 -
 benchmark/nanobench.h    | 3432 --------------------------------------
 3 files changed, 12 insertions(+), 3440 deletions(-)
 delete mode 100644 benchmark/nanobench.cpp
 delete mode 100644 benchmark/nanobench.h

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 647fea7c..13a7587d 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -1,12 +1,19 @@
-add_library(nanobench nanobench.cpp)
-target_include_directories(nanobench PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+include(FetchContent)
+
+FetchContent_Declare(
+    nanobench
+    GIT_REPOSITORY https://github.com/martinus/nanobench.git
+    GIT_TAG v4.3.11
+)
+
+FetchContent_MakeAvailable(nanobench)
 
 add_executable(internal-iteration-benchmark internal_iteration_benchmark.cpp)
-target_link_libraries(internal-iteration-benchmark PUBLIC nanobench flux)
+target_link_libraries(internal-iteration-benchmark PUBLIC nanobench::nanobench flux)
 
 add_executable(sort-benchmark sort_benchmark.cpp)
-target_link_libraries(sort-benchmark PUBLIC nanobench flux)
+target_link_libraries(sort-benchmark PUBLIC nanobench::nanobench flux)
 
 add_executable(multidimensional-memset-benchmark multidimensional_memset_benchmark.cpp
                multidimensional_memset_benchmark_kernels.cpp)
-target_link_libraries(multidimensional-memset-benchmark PUBLIC nanobench flux)
+target_link_libraries(multidimensional-memset-benchmark PUBLIC nanobench::nanobench flux)
diff --git a/benchmark/nanobench.cpp b/benchmark/nanobench.cpp
deleted file mode 100644
index ecf6bf42..00000000
--- a/benchmark/nanobench.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-
-#define ANKERL_NANOBENCH_IMPLEMENT
-#include <nanobench.h>
\ No newline at end of file
diff --git a/benchmark/nanobench.h b/benchmark/nanobench.h
deleted file mode 100644
index c03d9ad2..00000000
--- a/benchmark/nanobench.h
+++ /dev/null
@@ -1,3432 +0,0 @@
-//  __   _ _______ __   _  _____  ______  _______ __   _ _______ _     _
-//  | \  | |_____| | \  | |     | |_____] |______ | \  | |       |_____|
-//  |  \_| |     | |  \_| |_____| |_____] |______ |  \_| |_____  |     |
-//
-// Microbenchmark framework for C++11/14/17/20
-// https://github.com/martinus/nanobench
-//
-// Licensed under the MIT License <https://opensource.org/licenses/MIT>.
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2019-2022 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#ifndef ANKERL_NANOBENCH_H_INCLUDED
-#define ANKERL_NANOBENCH_H_INCLUDED
-
-// see https://semver.org/
-#define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
-#define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
-#define ANKERL_NANOBENCH_VERSION_PATCH 9 // backwards-compatible bug fixes
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// public facing api - as minimal as possible
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include <chrono>        // high_resolution_clock
-#include <cstring>       // memcpy
-#include <iosfwd>        // for std::ostream* custom output target in Config
-#include <string>        // all names
-#include <unordered_map> // holds context information of results
-#include <vector>        // holds all results
-
-#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
-
-#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
-#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
-#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
-#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
-#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
-
-#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
-#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
-#else
-#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
-#endif
-
-#if defined(__clang__)
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
-        _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
-#else
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
-#endif
-
-#if defined(__GNUC__)
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
-#else
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
-#endif
-
-#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
-#    include <iostream>
-#    define ANKERL_NANOBENCH_LOG(x)                                                 \
-        do {                                                                        \
-            std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
-        } while (0)
-#else
-#    define ANKERL_NANOBENCH_LOG(x) \
-        do {                        \
-        } while (0)
-#endif
-
-#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
-#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
-#    include <linux/version.h>
-#    if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
-// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
-// PERF_FLAG_FD_CLOEXEC since kernel 3.14
-#        undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
-#        define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
-#    endif
-#endif
-
-#if defined(__clang__)
-#    define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
-#else
-#    define ANKERL_NANOBENCH_NO_SANITIZE(...)
-#endif
-
-#if defined(_MSC_VER)
-#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
-#else
-#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
-#endif
-
-// workaround missing "is_trivially_copyable" in g++ < 5.0
-// See https://stackoverflow.com/a/31798726/48181
-#if defined(__GNUC__) && __GNUC__ < 5
-#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
-#else
-#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
-#endif
-
-// declarations ///////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-
-using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
-                               std::chrono::steady_clock>::type;
-class Bench;
-struct Config;
-class Result;
-class Rng;
-class BigO;
-
-/**
- * @brief Renders output from a mustache-like template and benchmark results.
- *
- * The templating facility here is heavily inspired by [mustache - logic-less templates](https://mustache.github.io/).
- * It adds a few more features that are necessary to get all of the captured data out of nanobench. Please read the
- * excellent [mustache manual](https://mustache.github.io/mustache.5.html) to see what this is all about.
- *
- * nanobench output has two nested layers, *result* and *measurement*. Here is a hierarchy of the allowed tags:
- *
- * * `{{#result}}` Marks the begin of the result layer. Whatever comes after this will be instantiated as often as
- *   a benchmark result is available. Within it, you can use these tags:
- *
- *   * `{{title}}` See Bench::title.
- *
- *   * `{{name}}` Benchmark name, usually directly provided with Bench::run, but can also be set with Bench::name.
- *
- *   * `{{unit}}` Unit, e.g. `byte`. Defaults to `op`, see Bench::unit.
- *
- *   * `{{batch}}` Batch size, see Bench::batch.
- *
- *   * `{{complexityN}}` Value used for asymptotic complexity calculation. See Bench::complexityN.
- *
- *   * `{{epochs}}` Number of epochs, see Bench::epochs.
- *
- *   * `{{clockResolution}}` Accuracy of the clock, i.e. what's the smallest time possible to measure with the clock.
- *     For modern systems, this can be around 20 ns. This value is automatically determined by nanobench at the first
- *     benchmark that is run, and used as a static variable throughout the application's runtime.
- *
- *   * `{{clockResolutionMultiple}}` Configuration multiplier for `clockResolution`. See Bench::clockResolutionMultiple.
- *     This is the target runtime for each measurement (epoch). That means the more accurate your clock is, the faster
- *     the benchmark will be. Basing the measurement's runtime on the clock resolution is the main reason why nanobench is so fast.
- *
- *   * `{{maxEpochTime}}` Configuration for a maximum time each measurement (epoch) is allowed to take. Note that at least
- *     a single iteration will be performed, even when that takes longer than maxEpochTime. See Bench::maxEpochTime.
- *
- *   * `{{minEpochTime}}` Minimum epoch time, defaults to 1ms. See Bench::minEpochTime.
- *
- *   * `{{minEpochIterations}}` See Bench::minEpochIterations.
- *
- *   * `{{epochIterations}}` See Bench::epochIterations.
- *
- *   * `{{warmup}}` Number of iterations used before measuring starts. See Bench::warmup.
- *
- *   * `{{relative}}` True or false, depending on the setting you have used. See Bench::relative.
- *
- *   * `{{context(variableName)}}` See Bench::context.
- *
- * Apart from these tags, it is also possible to use some mathematical operations on the measurement data. The operations
- * are of the form `{{command(name)}}`. Currently `name` can be one of `elapsed`, `iterations`. If performance counters
- * are available (currently only on Linux), you also have `pagefaults`, `cpucycles`,
- * `contextswitches`, `instructions`, `branchinstructions`, and `branchmisses`. All the measures (except `iterations`) are
- * provided for a single iteration (so `elapsed` is the time a single iteration took). The following tags are available:
- *
- * * `{{median(<name>)}}` Calculate median of a measurement data set, e.g. `{{median(elapsed)}}`.
- *
- * * `{{average(<name>)}}` Average (mean) calculation.
- *
- * * `{{medianAbsolutePercentError(<name>)}}` Calculates MdAPE, the Median Absolute Percentage Error. The MdAPE is an excellent
- *   metric for the variation of measurements. It is more robust to outliers than the
- *   [Mean absolute percentage error (M-APE)](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error).
- *   @f[
- *   \mathrm{MdAPE}(e) = \mathrm{med}\{| \frac{e_i - \mathrm{med}\{e\}}{e_i}| \}
- *   @f]
- *   E.g. for *elapsed*: First, @f$ \mathrm{med}\{e\} @f$ calculates the median by sorting and then taking the middle element
- *   of all *elapsed* measurements. This is used to calculate the absolute percentage
- *   error to this median for each measurement, as in @f$ | \frac{e_i - \mathrm{med}\{e\}}{e_i}| @f$. All these results
- *   are sorted, and the middle value is chosen as the median absolute percent error.
- *
- *   This measurement is a bit hard to interpret, but it is very robust against outliers. E.g. a value of 5% means that half of the
- *   measurements deviate less than 5% from the median, and the others deviate more than 5% from the median.
- *
- * * `{{sum(<name>)}}` Sum of all the measurements. E.g. `{{sum(iterations)}}` will give you the total number of iterations
- *   measured in this benchmark.
- *
- * * `{{minimum(<name>)}}` Minimum of all measurements.
- *
- * * `{{maximum(<name>)}}` Maximum of all measurements.
- *
- * * `{{sumProduct(<first>, <second>)}}` Calculates the sum of the products of corresponding measures:
- *   @f[
- *   \mathrm{sumProduct}(a,b) = \sum_{i=1}^{n}a_i\cdot b_i
- *   @f]
- *   E.g. to calculate the total runtime of the benchmark, you multiply iterations with elapsed time for each measurement, and
- *   sum these results up:
- *   `{{sumProduct(iterations, elapsed)}}`.
- *
- * * `{{#measurement}}` To access individual measurement results, open the begin tag for measurements.
- *
- *   * `{{elapsed}}` Average elapsed wall clock time per iteration, in seconds.
- *
- *   * `{{iterations}}` Number of iterations in the measurement. The number of iterations will fluctuate due
- *     to some applied randomness, to enhance accuracy.
- *
- *   * `{{pagefaults}}` Average number of pagefaults per iteration.
- *
- *   * `{{cpucycles}}` Average number of CPU cycles processed per iteration.
- *
- *   * `{{contextswitches}}` Average number of context switches per iteration.
- *
- *   * `{{instructions}}` Average number of retired instructions per iteration.
- *
- *   * `{{branchinstructions}}` Average number of branches executed per iteration.
- *
- *   * `{{branchmisses}}` Average number of branches that were missed per iteration.
- *
- * * `{{/measurement}}` Ends the measurement tag.
- *
- * * `{{/result}}` Marks the end of the result layer. This is the end marker for the template part that will be instantiated
- *   for each benchmark result.
- *
- *
- * For the layer tags *result* and *measurement* you additionally can use these special markers:
- *
- * * ``{{#-first}}`` - Begin marker of a template that will be instantiated *only for the first* entry in the layer. Use is only
- *   allowed between the begin and end marker of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
- *   ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-first}}``.
- *
- * * ``{{^-first}}`` - Begin marker of a template that will be instantiated *for each except the first* entry in the layer.
- *   This is basically the inversion of ``{{#-first}}``. Use is only allowed between the begin and end marker of the layer.
- *   So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
- *
- * * ``{{/-first}}`` - End marker for either ``{{#-first}}`` or ``{{^-first}}``.
- *
- * * ``{{#-last}}`` - Begin marker of a template that will be instantiated *only for the last* entry in the layer. Use is only
- *   allowed between the begin and end marker of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
- *   ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-last}}``.
- *
- * * ``{{^-last}}`` - Begin marker of a template that will be instantiated *for each except the last* entry in the layer.
- *   This is basically the inversion of ``{{#-last}}``. Use is only allowed between the begin and end marker of the layer.
- *   So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
- *
- * * ``{{/-last}}`` - End marker for either ``{{#-last}}`` or ``{{^-last}}``.
- *
- @verbatim embed:rst
-
- For an overview of all the possible data you can get out of nanobench, please see the tutorial at :ref:`tutorial-template-json`.
-
- The templates that ship with nanobench are:
-
- * :cpp:func:`templates::csv() <ankerl::nanobench::templates::csv()>`
- * :cpp:func:`templates::json() <ankerl::nanobench::templates::json()>`
- * :cpp:func:`templates::htmlBoxplot() <ankerl::nanobench::templates::htmlBoxplot()>`
- * :cpp:func:`templates::pyperf() <ankerl::nanobench::templates::pyperf()>`
-
- @endverbatim
- *
- * @param mustacheTemplate The template.
- * @param bench Benchmark, containing all the results.
- * @param out Output for the generated output.
- */
-void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
-void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
-
-/**
- * Same as render(char const* mustacheTemplate, Bench const& bench, std::ostream& out), but for when
- * you only have results available.
- *
- * @param mustacheTemplate The template.
- * @param results All the results to be used for rendering.
- * @param out Output for the generated output.
- */
-void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
-void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
-
-// Contains mustache-like templates
-namespace templates {
-
-/*!
-  @brief CSV data for the benchmark results.
-
-  Generates a comma-separated values dataset. First line is the header, each following line is a summary of each benchmark run.
-
-  @verbatim embed:rst
-  See the tutorial at :ref:`tutorial-template-csv` for an example.
-  @endverbatim
- */
-char const* csv() noexcept;
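-// [Illustrative sketch added by the editor, not part of the original header] How a
-// Bench, a built-in template, and render() fit together, per the documentation above:
-//
-//     ankerl::nanobench::Bench bench;
-//     bench.run("example", [&] { /* code under test */ });
-//     ankerl::nanobench::render(ankerl::nanobench::templates::csv(), bench, std::cout);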
-
-/*!
-  @brief HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an example output.
-
-  The output uses only the elapsed wall clock time, and displays each epoch as a single dot.
-  @verbatim embed:rst
-  See the tutorial at :ref:`tutorial-template-html` for an example.
-  @endverbatim
-
-  @see also ankerl::nanobench::render()
- */
-char const* htmlBoxplot() noexcept;
-
-/*!
-  @brief Output in pyperf compatible JSON format, which can be used for further analysis.
-  @verbatim embed:rst
-  See the tutorial at :ref:`tutorial-template-pyperf` for an example of how to further analyze the output.
-  @endverbatim
- */
-char const* pyperf() noexcept;
-
-/*!
-  @brief Template to generate JSON data.
-
-  The generated JSON data contains *all* data that has been generated. All times are as double values, in seconds. The output can get
-  quite large.
-  @verbatim embed:rst
-  See the tutorial at :ref:`tutorial-template-json` for an example.
-  @endverbatim
- */
-char const* json() noexcept;
-
-} // namespace templates
-
-namespace detail {
-
-template <typename T>
-struct PerfCountSet;
-
-class IterationLogic;
-class PerformanceCounters;
-
-#if ANKERL_NANOBENCH(PERF_COUNTERS)
-class LinuxPerformanceCounters;
-#endif
-
-} // namespace detail
-} // namespace nanobench
-} // namespace ankerl
-
-// definitions ////////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-namespace detail {
-
-template <typename T>
-struct PerfCountSet {
-    T pageFaults{};
-    T cpuCycles{};
-    T contextSwitches{};
-    T instructions{};
-    T branchInstructions{};
-    T branchMisses{};
-};
-
-} // namespace detail
-
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-struct Config {
-    // actual benchmark config
-    std::string mBenchmarkTitle = "benchmark";
-    std::string mBenchmarkName = "noname";
-    std::string mUnit = "op";
-    double mBatch = 1.0;
-    double mComplexityN = -1.0;
-    size_t mNumEpochs = 11;
-    size_t mClockResolutionMultiple = static_cast<size_t>(1000);
-    std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100);
-    std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1);
-    uint64_t mMinEpochIterations{1};
-    uint64_t mEpochIterations{0}; // If not 0, run *exactly* this number of iterations per epoch.
-    uint64_t mWarmup = 0;
-    std::ostream* mOut = nullptr;
-    std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1};
-    std::string mTimeUnitName = "ns";
-    bool mShowPerformanceCounters = true;
-    bool mIsRelative = false;
-    std::unordered_map<std::string, std::string> mContext{};
-
-    Config();
-    ~Config();
-    Config& operator=(Config const&);
-    Config& operator=(Config&&);
-    Config(Config const&);
-    Config(Config&&) noexcept;
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-// Result returned after a benchmark has finished. Can be used as a baseline for relative().
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-class Result {
-public:
-    enum class Measure : size_t {
-        elapsed,
-        iterations,
-        pagefaults,
-        cpucycles,
-        contextswitches,
-        instructions,
-        branchinstructions,
-        branchmisses,
-        _size
-    };
-
-    explicit Result(Config const& benchmarkConfig);
-
-    ~Result();
-    Result& operator=(Result const&);
-    Result& operator=(Result&&);
-    Result(Result const&);
-    Result(Result&&) noexcept;
-
-    // adds new measurement results
-    // all values are scaled by iters (except iters...)
-    void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
-
-    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
-
-    ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
-    ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
-    ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
-    ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const*) const;
-    ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const&) const;
-
-    ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
-    ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
-
-    // Finds string, if not found, returns _size.
-    static Measure fromString(std::string const& str);
-
-private:
-    Config mConfig{};
-    std::vector<std::vector<double>> mNameToMeasurements{};
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-/**
- * An extremely fast random generator. Currently, this implements *RomuDuoJr*, developed by Mark Overton. Source:
- * http://www.romu-random.org/
- *
- * RomuDuoJr is extremely fast and provides reasonably good randomness. Not enough for large jobs, but definitely
- * good enough for a benchmarking framework.
- *
- * * Estimated capacity: @f$ 2^{51} @f$ bytes
- * * Register pressure: 4
- * * State size: 128 bits
- *
- * This random generator is a drop-in replacement for the generators supplied by ``<random>``. It is not
- * cryptographically secure. Its intended purpose is to be very fast so that benchmarks that make use
- * of randomness are not distorted too much by the random generator.
- *
- * Rng also provides a few non-standard helpers, optimized for speed.
- */
-class Rng final {
-public:
-    /**
-     * @brief This RNG provides 64bit randomness.
-     */
-    using result_type = uint64_t;
-
-    static constexpr uint64_t(min)();
-    static constexpr uint64_t(max)();
-
-    /**
-     * As a safety precaution, we don't allow copying. Copying a PRNG would mean you would have two random generators that produce the
-     * same sequence, which is generally not what one wants. Instead create a new rng with the default constructor Rng(), which is
-     * automatically seeded from `std::random_device`. If you really need a copy, use `copy()`.
-     */
-    Rng(Rng const&) = delete;
-
-    /**
-     * Same as Rng(Rng const&), we don't allow assignment. If you need a new Rng create one with the default constructor Rng().
-     */
-    Rng& operator=(Rng const&) = delete;
-
-    // moving is ok
-    Rng(Rng&&) noexcept = default;
-    Rng& operator=(Rng&&) noexcept = default;
-    ~Rng() noexcept = default;
-
-    /**
-     * @brief Creates a new Random generator with random seed.
-     *
-     * Instead of a default seed (as the random generators from the STD), this properly seeds the random generator from
-     * `std::random_device`. It guarantees correct seeding. Note that seeding can be relatively slow, depending on the source of
-     * randomness used. So it is best to create an Rng once and use it for all your randomness purposes.
-     */
-    Rng();
-
-    /*!
-      Creates a new Rng that is seeded with a specific seed. Each Rng created from the same seed will produce the same randomness
-      sequence. This can be useful for deterministic behavior.
-
-      @verbatim embed:rst
-      .. note::
-
-         The random algorithm might change between nanobench releases. Whenever a faster and/or better random
-         generator becomes available, I will switch the implementation.
-      @endverbatim
-
-      As per the Romu paper, this seeds the Rng with splitMix64 algorithm and performs 10 initial rounds for further mixing up of the
-      internal state.
-
-      @param seed The 64bit seed. All values are allowed, even 0.
-     */
-    explicit Rng(uint64_t seed) noexcept;
-    Rng(uint64_t x, uint64_t y) noexcept;
-    Rng(std::vector<uint64_t> const& data);
-
-    /**
-     * Creates a copy of the Rng, thus the copy provides exactly the same random sequence as the original.
-     */
-    ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
-
-    /**
-     * @brief Produces a 64bit random value. This should be very fast, thus it is marked as inline. In my benchmark, this is ~46 times
-     * faster than `std::default_random_engine` for producing 64bit random values. It seems that the fastest std contender is
-     * `std::mt19937_64`. Still, this RNG is 2-3 times as fast.
-     *
-     * @return uint64_t The next 64 bit random value.
-     */
-    inline uint64_t operator()() noexcept;
-
-    /**
-     * Generates a random number between 0 and range (excluding range).
-     *
-     * The algorithm only produces 32bit numbers, and is slightly biased. The effect is quite small unless your range is close to the
-     * maximum value of an integer. It is possible to correct the bias with rejection sampling (see
-     * [here](https://lemire.me/blog/2016/06/30/fast-random-shuffling/)), but this is most likely irrelevant in practice for the
-     * purposes of this Rng.
-     *
-     * See Daniel Lemire's blog post [A fast alternative to the modulo
-     * reduction](https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/)
-     *
-     * @param range Upper exclusive range. E.g. a value of 3 will generate random numbers 0, 1, 2.
-     * @return uint32_t Generated random values in range [0, range(.
-     */
-    inline uint32_t bounded(uint32_t range) noexcept;
-
-    // random double in range [0, 1(
-    // see http://prng.di.unimi.it/
-
-    /**
-     * Provides a random uniform double value between 0 and 1. This uses the method described in [Generating uniform doubles in the
-     * unit interval](http://prng.di.unimi.it/), and is extremely fast.
-     *
-     * @return double Uniformly distributed double value in range [0,1(, excluding 1.
-     */
-    inline double uniform01() noexcept;
-
-    /**
-     * Shuffles all entries in the given container. Although this has a slight bias due to the implementation of bounded(), this is
-     * preferable to `std::shuffle` because it is over 5 times faster. See Daniel Lemire's blog post [Fast random
-     * shuffling](https://lemire.me/blog/2016/06/30/fast-random-shuffling/).
-     *
-     * @param container The whole container will be shuffled.
-     */
-    template <typename Container>
-    void shuffle(Container& container) noexcept;
-
-    /**
-     * Extracts the full state of the generator, e.g. for serialization. For this RNG this is just 2 values, but to stay API compatible
-     * with future implementations that potentially use more state, we use a vector.
-     *
-     * @return Vector containing the full state.
-     */
-    std::vector<uint64_t> state() const;
-
-private:
-    static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
-
-    uint64_t mX;
-    uint64_t mY;
-};
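-
-// [Illustrative sketch added by the editor, not part of the original header] Typical
-// use of the Rng API declared above, e.g. to build reproducible benchmark input:
-//
-//     ankerl::nanobench::Rng rng(12345);    // fixed seed => identical sequence each run
-//     std::vector<uint64_t> data(1000);
-//     for (auto& v : data) { v = rng(); }   // fast 64bit values
-//     rng.shuffle(data);                    // slightly biased, but much faster than std::shuffle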
-
-/**
- * @brief Main entry point to nanobench's benchmarking facility.
- *
- * It holds configuration and results from one or more benchmark runs. Usually it is used in a single line, where the object is
- * constructed, configured, and then a benchmark is run. E.g. like this:
- *
- *     ankerl::nanobench::Bench().unit("byte").batch(1000).run("random fluctuations", [&] {
- *         // here be the benchmark code
- *     });
- *
- * In that example Bench() constructs the benchmark, it is then configured with unit() and batch(), and after configuration a
- * benchmark is executed with run(). Once run() has finished, it prints the result to `std::cout`. It would also store the results
- * in the Bench instance, but in this case the object is immediately destroyed so it's not available any more.
- */
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-class Bench {
-public:
-    /**
-     * @brief Creates a new benchmark for configuration and running of benchmarks.
-     */
-    Bench();
-
-    Bench(Bench&& other);
-    Bench& operator=(Bench&& other);
-    Bench(Bench const& other);
-    Bench& operator=(Bench const& other);
-    ~Bench() noexcept;
-
-    /*!
-      @brief Repeatedly calls `op()` based on the configuration, and performs measurements.
-
-      This call is marked with `noinline` to prevent the compiler from optimizing across different benchmarks. This can have quite a big
-      effect on benchmark accuracy.
-
-      @verbatim embed:rst
-      .. note::
-
-         Each call to your lambda must have a side effect that the compiler can't possibly optimize away. E.g. add a result to an
-         externally defined number (like `x` in the above example), and finally call `doNotOptimizeAway` on the variables the compiler
-         must not remove. You can also use :cpp:func:`ankerl::nanobench::doNotOptimizeAway` directly in the lambda, but be aware that
-         this has a small overhead.
-
-      @endverbatim
-
-      @tparam Op The code to benchmark.
-    */
-    template <typename Op>
-    ANKERL_NANOBENCH(NOINLINE)
-    Bench& run(char const* benchmarkName, Op&& op);
-
-    template <typename Op>
-    ANKERL_NANOBENCH(NOINLINE)
-    Bench& run(std::string const& benchmarkName, Op&& op);
-
-    /**
-     * @brief Same as run(char const* benchmarkName, Op op), but instead uses the previously set name.
-     * @tparam Op The code to benchmark.
-     */
-    template <typename Op>
-    ANKERL_NANOBENCH(NOINLINE)
-    Bench& run(Op&& op);
-
-    /**
-     * @brief Title of the benchmark, will be shown in the table header. Changing the title will start a new markdown table.
-     *
-     * @param benchmarkTitle The title of the benchmark.
-     */
-    Bench& title(char const* benchmarkTitle);
-    Bench& title(std::string const& benchmarkTitle);
-
-    /**
-     * @brief Gets the title of the benchmark
-     */
-    ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
-
-    /// Name of the benchmark, will be shown in the table row.
-    Bench& name(char const* benchmarkName);
-    Bench& name(std::string const& benchmarkName);
-    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
-
-    /**
-     * @brief Set context information.
-     *
-     * The information can be accessed using custom render templates via `{{context(variableName)}}`.
-     * Trying to render a variable that hasn't been set before raises an exception.
-     * Not included in (default) markdown table.
-     *
-     * @see clearContext, render
-     *
-     * @param variableName The name of the context variable.
-     * @param variableValue The value of the context variable.
-     */
-    Bench& context(char const* variableName, char const* variableValue);
-    Bench& context(std::string const& variableName, std::string const& variableValue);
-
-    /**
-     * @brief Reset context information.
-     *
-     * This may improve efficiency when using many context entries,
-     * or improve robustness by removing spurious context entries.
-     *
-     * @see context
-     */
-    Bench& clearContext();
-
-    /**
-     * @brief Sets the batch size.
-     *
-     * E.g. number of processed bytes, or some other metric for the size of the processed data in each iteration. If you benchmark
-     * hashing of a 1000 byte long string and want byte/sec as a result, you can specify 1000 as the batch size.
-     *
-     * @tparam T Any input type is internally cast to `double`.
-     * @param b batch size
-     */
-    template <typename T>
-    Bench& batch(T b) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
-
-    /**
-     * @brief Sets the operation unit.
-     *
-     * Defaults to "op". Could be e.g. "byte" for string processing. This is used for the table header, e.g. to show `ns/byte`. Use
-     * singular (*byte*, not *bytes*). A change clears the currently collected results.
-     *
-     * @param unit The unit name.
-     */
-    Bench& unit(char const* unit);
-    Bench& unit(std::string const& unit);
-    ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
-
-    /**
-     * @brief Sets the time unit to be used for the default output.
-     *
-     * Nanobench defaults to using ns (nanoseconds) as output in the markdown. For some benchmarks this is too coarse, so it is
-     * possible to configure this. E.g. use `timeUnit(1ms, "ms")` to show `ms/op` instead of `ns/op`.
-     *
-     * @param tu Time unit to display the results in, default is 1ns.
-     * @param tuName Name for the time unit, default is "ns"
-     */
-    Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
-    ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
-
-    /**
-     * @brief Set the output stream where the resulting markdown table will be printed to.
-     *
-     * The default is `&std::cout`. You can disable all output by setting `nullptr`.
-     *
-     * @param outstream Pointer to output stream, can be `nullptr`.
-     */
-    Bench& output(std::ostream* outstream) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
-
-    /**
-     * Modern processors have a very accurate clock, being able to measure as low as 20 nanoseconds. This is the main trick that makes
-     * nanobench so fast: we find out how accurate the clock is, then run the benchmark only so often that the clock's accuracy is good
-     * enough for accurate measurements.
-     *
-     * The default is to run one epoch for 1000 times the clock resolution. So for 20ns resolution and 11 epochs, this gives a total
-     * runtime of
-     *
-     * @f[
-     * 20ns * 1000 * 11 \approx 0.2ms
-     * @f]
-     *
-     * To be precise, nanobench adds a 0-20% random noise to each evaluation. This is to prevent any aliasing effects, and further
-     * improves accuracy.
-     *
-     * Total runtime will be higher though: Some initial time is needed to find out the target number of iterations for each epoch, and
-     * there is some overhead involved to start & stop timers and calculate resulting statistics and writing the output.
-     *
-     * @param multiple Target number of times of clock resolution. Usually 1000 is a good compromise between runtime and accuracy.
-     */
-    Bench& clockResolutionMultiple(size_t multiple) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
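-
-    // [Illustrative sketch added by the editor, not part of the original header] The
-    // setters above are designed for chaining, e.g. to report time per processed byte:
-    //
-    //     std::string s(1000, 'x');
-    //     ankerl::nanobench::Bench()
-    //         .unit("byte").batch(s.size())
-    //         .timeUnit(std::chrono::microseconds{1}, "us")
-    //         .run("hash 1000 bytes", [&] {
-    //             ankerl::nanobench::doNotOptimizeAway(std::hash<std::string>{}(s));
-    //         });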
-
-    /**
-     * @brief Controls number of epochs, the number of measurements to perform.
-     *
-     * The reported result will be the median of evaluation of each epoch. The higher you choose this, the more deterministic
-     * the result will be and outliers will be more easily removed. Also the `err%` will be more accurate the higher this
-     * number is. Note that the `err%` will not necessarily decrease when the number of epochs is increased. But it will be a more
-     * accurate representation of the benchmarked code's runtime stability.
-     *
-     * Choose the value wisely. In practice, 11 has been shown to be a reasonable choice between runtime performance and accuracy.
-     * This setting goes hand in hand with minEpochIterations() (or minEpochTime()). If you are more interested in *median* runtime,
-     * you might want to increase epochs(). If you are more interested in *mean* runtime, you might want to increase
-     * minEpochIterations() instead.
-     *
-     * @param numEpochs Number of epochs.
-     */
-    Bench& epochs(size_t numEpochs) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
-
-    /**
-     * @brief Upper limit for the runtime of each epoch.
-     *
-     * As a safety precaution if the clock is not very accurate, we can set an upper limit for the maximum evaluation time per
-     * epoch. Default is 100ms. At least a single evaluation of the benchmark is performed.
-     *
-     * @see minEpochTime, minEpochIterations
-     *
-     * @param t Maximum target runtime for a single epoch.
-     */
-    Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
-
-    /**
-     * @brief Minimum time each epoch should take.
-     *
-     * Default is 1ms; usually relying on clockResolutionMultiple() is exactly what you want. If you see
-     * that the evaluation is unreliable with a high `err%`, you can increase either minEpochTime() or minEpochIterations().
-     *
-     * @see maxEpochTime, minEpochIterations
-     *
-     * @param t Minimum time each epoch should take.
-     */
-    Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
-
-    /**
-     * @brief Sets the minimum number of iterations each epoch should take.
-     *
-     * Default is 1, and we rely on clockResolutionMultiple(). If the `err%` is high and you want a smoother result, you might want
-     * to increase the minimum number of iterations, or increase the minEpochTime().
-     *
-     * @see minEpochTime, maxEpochTime, minEpochIterations
-     *
-     * @param numIters Minimum number of iterations per epoch.
-     */
-    Bench& minEpochIterations(uint64_t numIters) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
-
-    /**
-     * Sets exactly the number of iterations for each epoch. Ignores all other epoch limits. This forces nanobench to use exactly
-     * the given number of iterations for each epoch, not more and not less. Default is 0 (disabled).
-     *
-     * @param numIters Exact number of iterations to use. Set to 0 to disable.
-     */
-    Bench& epochIterations(uint64_t numIters) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
-
-    /**
-     * @brief Sets a number of iterations that are initially performed without any measurements.
-     *
-     * Some benchmarks need a few evaluations to warm up caches / database / whatever access. Normally this should not be needed, since
-     * we show the median result so initial outliers will be filtered away automatically. If the warmup effect is large though, you
-     * might want to set it. Default is 0.
-     *
-     * @param numWarmupIters Number of warmup iterations.
-     */
-    Bench& warmup(uint64_t numWarmupIters) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
-
-    /**
-     * @brief Marks the next run as the baseline.
-     *
-     * Call `relative(true)` to mark the run as the baseline. Successive runs will be compared to this run. It is calculated by
-     *
-     * @f[
-     * 100\% * \frac{baseline}{runtime}
-     * @f]
-     *
-     * * 100% means it is exactly as fast as the baseline
-     * * >100% means it is faster than the baseline. E.g. 200% means the current run is twice as fast as the baseline.
-     * * <100% means it is slower than the baseline. E.g. 50% means it is twice as slow as the baseline.
-     *
-     * See the tutorial section "Comparing Results" for example usage.
-     *
-     * @param isRelativeEnabled True to enable processing
-     */
-    Bench& relative(bool isRelativeEnabled) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
-
-    /**
-     * @brief Enables/disables performance counters.
-     *
-     * On Linux nanobench has a powerful feature to use performance counters. This enables counting of retired instructions, the
-     * number of branches, missed branches, etc. By default this is enabled, but you can disable it if you don't need that feature.
-     *
-     * @param showPerformanceCounters True to enable, false to disable.
-     */
-    Bench& performanceCounters(bool showPerformanceCounters) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
-
-    /**
-     * @brief Retrieves all benchmark results collected by the bench object so far.
-     *
-     * Each call to run() generates a Result that is stored within the Bench instance. This is mostly for advanced users who want to
-     * see all the nitty gritty details.
-     *
-     * @return All results collected so far.
-     */
-    ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
-
-    /*!
-      @verbatim embed:rst
-
-      Convenience shortcut to :cpp:func:`ankerl::nanobench::doNotOptimizeAway`.
-
-      @endverbatim
-    */
-    template <typename Arg>
-    Bench& doNotOptimizeAway(Arg&& arg);
-
-    /*!
-      @verbatim embed:rst
-
-      Sets N for asymptotic complexity calculation, so it becomes possible to calculate `Big O
-      <https://en.wikipedia.org/wiki/Big_O_notation>`_ from multiple benchmark evaluations.
-
-      Use :cpp:func:`ankerl::nanobench::Bench::complexityBigO` when the evaluation has finished. See the tutorial
-      :ref:`asymptotic-complexity` for details.
-
-      @endverbatim
-
-      @tparam T Any type is cast to `double`.
-      @param b Length of N for the next benchmark run, so it is possible to calculate `bigO`.
-    */
-    template <typename T>
-    Bench& complexityN(T b) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
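-
-    // [Illustrative sketch added by the editor, not part of the original header]
-    // complexityN() feeds the Big O fitting documented below; a typical loop over
-    // growing input sizes looks like this:
-    //
-    //     ankerl::nanobench::Bench bench;
-    //     for (size_t n = 10; n <= 100000; n *= 10) {
-    //         std::vector<int> v(n);
-    //         bench.complexityN(n).run("iota", [&] { std::iota(v.begin(), v.end(), 0); });
-    //     }
-    //     std::cout << bench.complexityBigO() << '\n';   // declared below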
-
-    /*!
-      Calculates [Big O](https://en.wikipedia.org/wiki/Big_O_notation) of the results with all preconfigured complexity functions.
-      Currently these complexity functions are fitted into the benchmark results:
-
-      @f$ \mathcal{O}(1) @f$,
-      @f$ \mathcal{O}(n) @f$,
-      @f$ \mathcal{O}(\log{}n) @f$,
-      @f$ \mathcal{O}(n\log{}n) @f$,
-      @f$ \mathcal{O}(n^2) @f$,
-      @f$ \mathcal{O}(n^3) @f$.
-
-      If we e.g. evaluate the complexity of `std::sort`, this is the result of `std::cout << bench.complexityBigO()`:
-
-      ```
-      |   coefficient |   err% | complexity
-      |--------------:|-------:|------------
-      |   5.08935e-09 |   2.6% | O(n log n)
-      |   6.10608e-08 |   8.0% | O(n)
-      |   1.29307e-11 |  47.2% | O(n^2)
-      |   2.48677e-15 |  69.6% | O(n^3)
-      |   9.88133e-06 | 132.3% | O(log n)
-      |   5.98793e-05 | 162.5% | O(1)
-      ```
-
-      So in this case @f$ \mathcal{O}(n\log{}n) @f$ provides the best approximation.
-
-      @verbatim embed:rst
-      See the tutorial :ref:`asymptotic-complexity` for details.
-      @endverbatim
-      @return Evaluation results, which can be printed or otherwise inspected.
-    */
-    std::vector<BigO> complexityBigO() const;
-
-    /**
-     * @brief Calculates bigO for a custom function.
-     *
-     * E.g. to calculate the mean squared error for @f$ \mathcal{O}(\log{}\log{}n) @f$, which is not part of the default set of
-     * complexityBigO(), you can do this:
-     *
-     * ```
-     * auto logLogN = bench.complexityBigO("O(log log n)", [](double n) {
-     *     return std::log2(std::log2(n));
-     * });
-     * ```
-     *
-     * The resulting mean squared error can be printed with `std::cout << logLogN`. E.g. it prints something like this:
-     *
-     * ```text
-     * 2.46985e-05 * O(log log n), rms=1.48121
-     * ```
-     *
-     * @tparam Op Type of mapping operation.
-     * @param name Name for the function, e.g. "O(log log n)"
-     * @param op Op's operator() maps a `double` with the desired complexity function, e.g. `log2(log2(n))`.
-     * @return BigO Error calculation, which is streamable to std::cout.
-     */
-    template <typename Op>
-    BigO complexityBigO(char const* name, Op op) const;
-
-    template <typename Op>
-    BigO complexityBigO(std::string const& name, Op op) const;
-
-    /*!
-      @verbatim embed:rst
-
-      Convenience shortcut to :cpp:func:`ankerl::nanobench::render`.
-
-      @endverbatim
-    */
-    Bench& render(char const* templateContent, std::ostream& os);
-    Bench& render(std::string const& templateContent, std::ostream& os);
-
-    Bench& config(Config const& benchmarkConfig);
-    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
-
-private:
-    Config mConfig{};
-    std::vector<Result> mResults{};
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-/**
- * @brief Makes sure none of the given arguments are optimized away by the compiler.
- *
- * @tparam Arg Type of the argument that shouldn't be optimized away.
- * @param arg The input that we mark as being used, even though we don't do anything with it.
- */
-template <typename Arg>
-void doNotOptimizeAway(Arg&& arg);
-
-namespace detail {
-
-#if defined(_MSC_VER)
-void doNotOptimizeAwaySink(void const*);
-
-template <typename T>
-void doNotOptimizeAway(T const& val);
-
-#else
-
-// This assembly magic is taken directly from what Google Benchmark does. I have previously used what Facebook's folly was doing, but
-// that seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyway.
-// see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
-template <typename T>
-void doNotOptimizeAway(T const& val) {
-    // NOLINTNEXTLINE(hicpp-no-assembler)
-    asm volatile("" : : "r,m"(val) : "memory");
-}
-
-template <typename T>
-void doNotOptimizeAway(T& val) {
-#    if defined(__clang__)
-    // NOLINTNEXTLINE(hicpp-no-assembler)
-    asm volatile("" : "+r,m"(val) : : "memory");
-#    else
-    // NOLINTNEXTLINE(hicpp-no-assembler)
-    asm volatile("" : "+m,r"(val) : : "memory");
-#    endif
-}
-#endif
-
-// internally used, but visible because run() is templated.
-// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
-// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
-ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
-class IterationLogic {
-public:
-    explicit IterationLogic(Bench const& config) noexcept;
-    ~IterationLogic();
-
-    ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
-    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
-    void moveResultTo(std::vector<Result>& results) noexcept;
-
-private:
-    struct Impl;
-    Impl* mPimpl;
-};
-ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
-
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-class PerformanceCounters {
-public:
-    PerformanceCounters(PerformanceCounters const&) = delete;
-    PerformanceCounters& operator=(PerformanceCounters const&) = delete;
-
-    PerformanceCounters();
-    ~PerformanceCounters();
-
-    void beginMeasure();
-    void endMeasure();
-    void updateResults(uint64_t numIters);
-
-    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
-
-private:
-#if ANKERL_NANOBENCH(PERF_COUNTERS)
-    LinuxPerformanceCounters* mPc = nullptr;
-#endif
-    PerfCountSet<uint64_t> mVal{};
-    PerfCountSet<bool> mHas{};
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-// Gets the singleton
-PerformanceCounters& performanceCounters();
-
-} // namespace detail
-
-class BigO {
-public:
-    using RangeMeasure = std::vector<std::pair<double, double>>;
-
-    template <typename Op>
-    static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
-        for (auto& rangeMeasure : data) {
-            rangeMeasure.first = op(rangeMeasure.first);
-        }
-        return data;
-    }
-
-    static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
-
-    template <typename Op>
-    BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
-        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
-
-    template <typename Op>
-    BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
-        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
-
-    BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
-    BigO(std::string const& bigOName, RangeMeasure const& scaledRangeMeasure);
-    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
-
-private:
-    std::string mName{};
-    double mConstant{};
-    double mNormalizedRootMeanSquare{};
-};
-std::ostream& operator<<(std::ostream& os, BigO const& bigO);
-std::ostream& operator<<(std::ostream& os, std::vector<BigO> const& bigOs);
-
-} // namespace nanobench
-} // namespace ankerl
-
-// implementation /////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-
-constexpr uint64_t(Rng::min)() {
-    return 0;
-}
-
-constexpr uint64_t(Rng::max)() {
-    return (std::numeric_limits<uint64_t>::max)();
-}
-
-ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
-uint64_t Rng::operator()() noexcept {
-    auto x = mX;
-
-    mX = UINT64_C(15241094284759029579) * mY;
-    mY = rotl(mY - x, 27);
-
-    return x;
-}
-
-ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
-uint32_t Rng::bounded(uint32_t range) noexcept {
-    uint64_t r32 = static_cast<uint32_t>(operator()());
-    auto multiresult = r32 * range;
-    return static_cast<uint32_t>(multiresult >> 32U);
-}
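-
-// [Illustrative note added by the editor, not part of the original header] Worked
-// example of the multiply-shift reduction in bounded() above, for range = 3:
-//     r32 = 0x00000000 -> (0x00000000 * 3) >> 32 == 0
-//     r32 = 0x7fffffff -> (0x7fffffff * 3) >> 32 == 1
-//     r32 = 0xffffffff -> (0xffffffff * 3) >> 32 == 2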
-
-double Rng::uniform01() noexcept {
-    auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
-    // can't use union in c++ here for type punning, it's undefined behavior.
-    // std::memcpy is optimized anyways.
-    double d;
-    std::memcpy(&d, &i, sizeof(double));
-    return d - 1.0;
-}
-
-template <typename Container>
-void Rng::shuffle(Container& container) noexcept {
-    auto i = container.size();
-    while (i > 1U) {
-        using std::swap;
-        auto n = operator()();
-        // using decltype(i) instead of size_t to be compatible to containers with 32bit index (see #80)
-        auto b1 = static_cast<decltype(i)>((static_cast<uint32_t>(n) * static_cast<uint64_t>(i)) >> 32U);
-        swap(container[--i], container[b1]);
-
-        auto b2 = static_cast<decltype(i)>(((n >> 32U) * static_cast<uint64_t>(i)) >> 32U);
-        swap(container[--i], container[b2]);
-    }
-}
-
-ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
-constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
-    return (x << k) | (x >> (64U - k));
-}
-
-template <typename Op>
-ANKERL_NANOBENCH_NO_SANITIZE("integer")
-Bench& Bench::run(Op&& op) {
-    // It is important that this method is kept short so the compiler can do better optimizations / inlining of op()
-    detail::IterationLogic iterationLogic(*this);
-    auto& pc = detail::performanceCounters();
-
-    while (auto n = iterationLogic.numIters()) {
-        pc.beginMeasure();
-        Clock::time_point before = Clock::now();
-        while (n-- > 0) {
-            op();
-        }
-        Clock::time_point after = Clock::now();
-        pc.endMeasure();
-        pc.updateResults(iterationLogic.numIters());
-        iterationLogic.add(after - before, pc);
-    }
-    iterationLogic.moveResultTo(mResults);
-    return *this;
-}
-
-// Performs all evaluations.
-template <typename Op>
-Bench& Bench::run(char const* benchmarkName, Op&& op) {
-    name(benchmarkName);
-    return run(std::forward<Op>(op));
-}
-
-template <typename Op>
-Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
-    name(benchmarkName);
-    return run(std::forward<Op>(op));
-}
-
-template <typename Op>
-BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
-    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
-}
-
-template <typename Op>
-BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
-    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
-}
-
-// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
-// Any argument is cast to double.
-template <typename T>
-Bench& Bench::batch(T b) noexcept {
-    mConfig.mBatch = static_cast<double>(b);
-    return *this;
-}
-
-// Sets the computation complexity of the next run. Any argument is cast to double.
-template <typename T>
-Bench& Bench::complexityN(T n) noexcept {
-    mConfig.mComplexityN = static_cast<double>(n);
-    return *this;
-}
-
-// Convenience: makes sure none of the given arguments are optimized away by the compiler.
-template <typename Arg>
-Bench& Bench::doNotOptimizeAway(Arg&& arg) {
-    detail::doNotOptimizeAway(std::forward<Arg>(arg));
-    return *this;
-}
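-
-// [Illustrative sketch added by the editor, not part of the original header] The
-// usual pattern for the doNotOptimizeAway helpers defined below, inside a
-// benchmarked lambda:
-//
-//     uint64_t x = 1;
-//     ankerl::nanobench::Bench().run("increment", [&] {
-//         ankerl::nanobench::doNotOptimizeAway(x += 1);
-//     });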
-
-// Makes sure none of the given arguments are optimized away by the compiler.
-template <typename Arg>
-void doNotOptimizeAway(Arg&& arg) {
-    detail::doNotOptimizeAway(std::forward<Arg>(arg));
-}
-
-namespace detail {
-
-#if defined(_MSC_VER)
-template <typename T>
-void doNotOptimizeAway(T const& val) {
-    doNotOptimizeAwaySink(&val);
-}
-
-#endif
-
-} // namespace detail
-} // namespace nanobench
-} // namespace ankerl
-
-#if defined(ANKERL_NANOBENCH_IMPLEMENT)
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// implementation part - only visible in .cpp
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#    include <algorithm> // sort, reverse
-#    include <atomic>    // compare_exchange_strong in loop overhead
-#    include <cstdlib>   // getenv
-#    include <cstring>   // strstr, strncmp
-#    include <fstream>   // ifstream to parse proc files
-#    include <iomanip>   // setw, setprecision
-#    include <iostream>  // cout
-#    include <numeric>   // accumulate
-#    include <random>    // random_device
-#    include <sstream>   // to_s in Number
-#    include <stdexcept> // throw for rendering templates
-#    include <tuple>     // std::tie
-#    if defined(__linux__)
-#        include <unistd.h> //sysconf
-#    endif
-#    if ANKERL_NANOBENCH(PERF_COUNTERS)
-#        include <map> // map
-
-#        include <linux/perf_event.h>
-#        include <sys/ioctl.h>
-#        include <sys/syscall.h>
-#        include <unistd.h>
-#    endif
-
-// declarations ///////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-
-// helper stuff that is only intended to be used internally
-namespace detail {
-
-struct TableInfo;
-
-// formatting utilities
-namespace fmt {
-
-class NumSep;
-class StreamStateRestorer;
-class Number;
-class MarkDownColumn;
-class MarkDownCode;
-
-} // namespace fmt
-} // namespace detail
-} // namespace nanobench
-} // namespace ankerl
-
-// definitions ////////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-
-uint64_t splitMix64(uint64_t& state) noexcept;
-
-namespace detail {
-
-// helpers to get double values
-template <typename T>
-inline double d(T t) noexcept {
-    return static_cast<double>(t);
-}
-inline double d(Clock::duration duration) noexcept {
-    return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
-}
-
-// Calculates clock resolution once, and remembers the result
-inline Clock::duration clockResolution() noexcept;
-
-} // namespace detail
-
-namespace templates {
-
-char const* csv() noexcept {
-    return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
-{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
-{{/result}})DELIM";
-}
-
-char const* htmlBoxplot() noexcept {
-    return R"DELIM(<html>
-
-<head>
-    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
-</head>
-
-<body>
-    <div id="myDiv"></div>
-
-    <script>
-        var data = [
-            {{#result}}{
-                name: '{{name}}',
-                y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/-last}}{{/measurement}}],
-            },
-            {{/result}}
-        ];
-        var title = '{{title}}';
-
-        layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } };
-        Plotly.newPlot('myDiv', data, layout, {responsive: true});
-    </script>
-</body>
-
-</html>)DELIM";
-}
-
-char const* pyperf() noexcept {
-    return R"DELIM({
-    "benchmarks": [
-        {
-            "runs": [
-                {
-                    "values": [
-{{#measurement}}                        {{elapsed}}{{^-last}},
-{{/-last}}{{/measurement}}                    ]
-                }
-            ]
-        }
-    ],
-    "metadata": {
-        "loops": {{sum(iterations)}},
-        "inner_loops": {{batch}},
-        "name": "{{title}}",
-        "unit": "second"
-    },
-    "version": "1.0"
-})DELIM";
-}
-
-char const* json() noexcept {
-    return R"DELIM({
- "results": [
-{{#result}}  {
-   "title": "{{title}}",
-   "name": "{{name}}",
-   "unit": "{{unit}}",
-   "batch": {{batch}},
-   "complexityN": {{complexityN}},
-   "epochs": {{epochs}},
-   "clockResolution": {{clockResolution}},
-   "clockResolutionMultiple": {{clockResolutionMultiple}},
-   "maxEpochTime": {{maxEpochTime}},
-   "minEpochTime": {{minEpochTime}},
-   "minEpochIterations": {{minEpochIterations}},
-   "epochIterations": {{epochIterations}},
-   "warmup": {{warmup}},
-   "relative": {{relative}},
-   "median(elapsed)": {{median(elapsed)}},
-   "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
-   "median(instructions)": {{median(instructions)}},
-   "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
-   "median(cpucycles)": {{median(cpucycles)}},
-   "median(contextswitches)": {{median(contextswitches)}},
-   "median(pagefaults)": {{median(pagefaults)}},
-   "median(branchinstructions)": {{median(branchinstructions)}},
-   "median(branchmisses)": {{median(branchmisses)}},
-   "totalTime": {{sumProduct(iterations, elapsed)}},
-   "measurements": [
-{{#measurement}}    {
-     "iterations": {{iterations}},
-     "elapsed": {{elapsed}},
-     "pagefaults": {{pagefaults}},
-     "cpucycles": {{cpucycles}},
-     "contextswitches": {{contextswitches}},
-     "instructions": {{instructions}},
-     "branchinstructions": {{branchinstructions}},
-     "branchmisses": {{branchmisses}}
-    }{{^-last}},{{/-last}}
-{{/measurement}}   ]
-  }{{^-last}},{{/-last}}
-{{/result}} ]
-})DELIM";
-}
-
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-struct Node {
-    enum class Type { tag, content, section, inverted_section };
-
-    char const* begin;
-    char const* end;
-    std::vector<Node> children;
-    Type type;
-
-    template <size_t N>
-    // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-    bool operator==(char const (&str)[N]) const noexcept {
-        return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
-    }
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-static std::vector<Node> parseMustacheTemplate(char const** tpl) {
-    std::vector<Node> nodes;
-
-    while (true) {
-        auto begin = std::strstr(*tpl, "{{");
-        auto end = begin;
-        if (begin != nullptr) {
-            begin += 2;
-            end = std::strstr(begin, "}}");
-        }
-
-        if (begin == nullptr || end == nullptr) {
-            // nothing found, finish node
-            nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
-            return nodes;
-        }
-
-        nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
-
-        // we found a tag
-        *tpl = end + 2;
-        switch (*begin) {
-        case '/':
-            // finished! bail out
-            return nodes;
-
-        case '#':
-            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
-            break;
-
-        case '^':
-            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
-            break;
-
-        default:
-            nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
-            break;
-        }
-    }
-}
-
-static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
-    ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
-    bool matchFirst = n == "-first";
-    bool matchLast = n == "-last";
-    if (!matchFirst && !matchLast) {
-        return false;
-    }
-
-    bool doWrite = false;
-    if (n.type == Node::Type::section) {
-        doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
-    } else if (n.type == Node::Type::inverted_section) {
-        doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
-    }
-
-    if (doWrite) {
-        for (auto const& child : n.children) {
-            if (child.type == Node::Type::content) {
-                out.write(child.begin, std::distance(child.begin, child.end));
-            }
-        }
-    }
-    return true;
-}
-
-static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
-    matchResult.clear();
-    auto idxOpen = str.find('(');
-    auto idxClose = str.find(')', idxOpen);
-    if (idxClose == std::string::npos) {
-        return false;
-    }
-
-    matchResult.emplace_back(str.substr(0, idxOpen));
-
-    // split by comma
-    matchResult.emplace_back(std::string{});
-    for (size_t i = idxOpen + 1; i != idxClose; ++i) {
-        if (str[i] == ' ' || str[i] == '\t') {
-            // skip whitespace
-            continue;
-        }
-        if (str[i] == ',') {
-            // got a comma => new string
-            matchResult.emplace_back(std::string{});
-            continue;
-        }
-        // no whitespace no comma, append
-        matchResult.back() += str[i];
-    }
-    return true;
-}
-
-static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
-    using detail::d;
-
-    if (n == "title") {
-        out << config.mBenchmarkTitle;
-        return true;
-    } else if (n == "name") {
-        out << config.mBenchmarkName;
-        return true;
-    } else if (n == "unit") {
-        out << config.mUnit;
-        return true;
-    } else if (n == "batch") {
-        out << config.mBatch;
-        return true;
-    } else if (n == "complexityN") {
-        out << config.mComplexityN;
-        return true;
-    } else if (n == "epochs") {
-        out << config.mNumEpochs;
-        return true;
-    } else if (n == "clockResolution") {
-        out << d(detail::clockResolution());
-        return true;
-    } else if (n == "clockResolutionMultiple") {
-        out << config.mClockResolutionMultiple;
-        return true;
-    } else if (n == "maxEpochTime") {
-        out << d(config.mMaxEpochTime);
-        return true;
-    } else if (n == "minEpochTime") {
-        out << d(config.mMinEpochTime);
-        return true;
-    } else if (n == "minEpochIterations") {
-        out << config.mMinEpochIterations;
-        return true;
-    } else if (n == "epochIterations") {
-        out << config.mEpochIterations;
-        return true;
-    } else if (n == "warmup") {
-        out << config.mWarmup;
-        return true;
-    } else if (n == "relative") {
-        out << config.mIsRelative;
-        return true;
-    }
-    return false;
-}
-
-static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
-    if (generateConfigTag(n, r.config(), out)) {
-        return out;
-    }
"median(elapsed)" - // g++ 4.8 doesn't implement std::regex :( - // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$"); - // std::cmatch matchResult; - // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) { - std::vector matchResult; - if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) { - if (matchResult.size() == 2) { - if (matchResult[0] == "context") { - return out << r.context(matchResult[1]); - } - - auto m = Result::fromString(matchResult[1]); - if (m == Result::Measure::_size) { - return out << 0.0; - } - - if (matchResult[0] == "median") { - return out << r.median(m); - } - if (matchResult[0] == "average") { - return out << r.average(m); - } - if (matchResult[0] == "medianAbsolutePercentError") { - return out << r.medianAbsolutePercentError(m); - } - if (matchResult[0] == "sum") { - return out << r.sum(m); - } - if (matchResult[0] == "minimum") { - return out << r.minimum(m); - } - if (matchResult[0] == "maximum") { - return out << r.maximum(m); - } - } else if (matchResult.size() == 3) { - auto m1 = Result::fromString(matchResult[1]); - auto m2 = Result::fromString(matchResult[2]); - if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) { - return out << 0.0; - } - - if (matchResult[0] == "sumProduct") { - return out << r.sumProduct(m1, m2); - } - } - } - - // match e.g. "sumProduct(elapsed, iterations)" - // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$"); - - // nothing matches :( - throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood"); -} - -static void generateResultMeasurement(std::vector const& nodes, size_t idx, Result const& r, std::ostream& out) { - for (auto const& n : nodes) { - if (!generateFirstLast(n, idx, r.size(), out)) { - ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); - switch (n.type) { - case Node::Type::content: - out.write(n.begin, std::distance(n.begin, n.end)); - break; - - case Node::Type::inverted_section: - throw std::runtime_error("got a inverted section inside measurement"); - - case Node::Type::section: - throw std::runtime_error("got a section inside measurement"); - - case Node::Type::tag: { - auto m = Result::fromString(std::string(n.begin, n.end)); - if (m == Result::Measure::_size || !r.has(m)) { - out << 0.0; - } else { - out << r.get(idx, m); - } - break; - } - } - } - } -} - -static void generateResult(std::vector const& nodes, size_t idx, std::vector const& results, std::ostream& out) { - auto const& r = results[idx]; - for (auto const& n : nodes) { - if (!generateFirstLast(n, idx, results.size(), out)) { - ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); - switch (n.type) { - case Node::Type::content: - out.write(n.begin, std::distance(n.begin, n.end)); - break; - - case Node::Type::inverted_section: - throw std::runtime_error("got a inverted section inside result"); - - case Node::Type::section: - if (n == "measurement") { - for (size_t i = 0; i < r.size(); ++i) { - generateResultMeasurement(n.children, i, r, out); - } - } else { - throw std::runtime_error("got a section inside result"); - } - break; - - case Node::Type::tag: - generateResultTag(n, r, out); - break; - } - } - } -} - -} // namespace templates - -// helper stuff that only intended to be used internally -namespace detail { - -char const* getEnv(char const* name); -bool isEndlessRunning(std::string const& name); -bool isWarningsEnabled(); - -template -T parseFile(std::string const& filename); - -void 
gatherStabilityInformation(std::vector& warnings, std::vector& recommendations); -void printStabilityInformationOnce(std::ostream* os); - -// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry. -uint64_t& singletonHeaderHash() noexcept; - -// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference. -Clock::duration calcClockResolution(size_t numEvaluations) noexcept; - -// formatting utilities -namespace fmt { - -// adds thousands separator to numbers -ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) -class NumSep : public std::numpunct { -public: - explicit NumSep(char sep); - char do_thousands_sep() const override; - std::string do_grouping() const override; - -private: - char mSep; -}; -ANKERL_NANOBENCH(IGNORE_PADDED_POP) - -// RAII to save & restore a stream's state -ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) -class StreamStateRestorer { -public: - explicit StreamStateRestorer(std::ostream& s); - ~StreamStateRestorer(); - - // sets back all stream info that we remembered at construction - void restore(); - - // don't allow copying / moving - StreamStateRestorer(StreamStateRestorer const&) = delete; - StreamStateRestorer& operator=(StreamStateRestorer const&) = delete; - StreamStateRestorer(StreamStateRestorer&&) = delete; - StreamStateRestorer& operator=(StreamStateRestorer&&) = delete; - -private: - std::ostream& mStream; - std::locale mLocale; - std::streamsize const mPrecision; - std::streamsize const mWidth; - std::ostream::char_type const mFill; - std::ostream::fmtflags const mFmtFlags; -}; -ANKERL_NANOBENCH(IGNORE_PADDED_POP) - -// Number formatter -class Number { -public: - Number(int width, int precision, double value); - Number(int width, int precision, int64_t value); - std::string to_s() const; - -private: - friend std::ostream& operator<<(std::ostream& os, Number const& n); - std::ostream& write(std::ostream& os) const; - - int mWidth; - int mPrecision; - double mValue; -}; - -// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent -std::string to_s(uint64_t s); - -std::ostream& operator<<(std::ostream& os, Number const& n); - -class MarkDownColumn { -public: - MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val); - std::string title() const; - std::string separator() const; - std::string invalid() const; - std::string value() const; - -private: - int mWidth; - int mPrecision; - std::string mTitle; - std::string mSuffix; - double mValue; -}; - -// Formats any text as markdown code, escaping backticks. 
-class MarkDownCode { -public: - explicit MarkDownCode(std::string const& what); - -private: - friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode); - std::ostream& write(std::ostream& os) const; - - std::string mWhat{}; -}; - -std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode); - -} // namespace fmt -} // namespace detail -} // namespace nanobench -} // namespace ankerl - -// implementation ///////////////////////////////////////////////////////////////////////////////// - -namespace ankerl { -namespace nanobench { - -void render(char const* mustacheTemplate, std::vector const& results, std::ostream& out) { - detail::fmt::StreamStateRestorer restorer(out); - - out.precision(std::numeric_limits::digits10); - auto nodes = templates::parseMustacheTemplate(&mustacheTemplate); - - for (auto const& n : nodes) { - ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); - switch (n.type) { - case templates::Node::Type::content: - out.write(n.begin, std::distance(n.begin, n.end)); - break; - - case templates::Node::Type::inverted_section: - throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'"); - - case templates::Node::Type::section: - if (n == "result") { - const size_t nbResults = results.size(); - for (size_t i = 0; i < nbResults; ++i) { - generateResult(n.children, i, results, out); - } - } else if (n == "measurement") { - if (results.size() != 1) { - throw std::runtime_error( - "render: can only use section 'measurement' here if there is a single result, but there are " + - detail::fmt::to_s(results.size())); - } - // when we only have a single result, we can immediately go into its measurement. - auto const& r = results.front(); - for (size_t i = 0; i < r.size(); ++i) { - generateResultMeasurement(n.children, i, r, out); - } - } else { - throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'"); - } - break; - - case templates::Node::Type::tag: - if (results.size() == 1) { - // result & config are both supported there - generateResultTag(n, results.front(), out); - } else { - // This just uses the last result's config. 
- if (!generateConfigTag(n, results.back().config(), out)) { - throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'"); - } - } - break; - } - } -} - -void render(std::string const& mustacheTemplate, std::vector const& results, std::ostream& out) { - render(mustacheTemplate.c_str(), results, out); -} - -void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) { - render(mustacheTemplate, bench.results(), out); -} - -void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) { - render(mustacheTemplate.c_str(), bench.results(), out); -} - -namespace detail { - -PerformanceCounters& performanceCounters() { -# if defined(__clang__) -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wexit-time-destructors" -# endif - static PerformanceCounters pc; -# if defined(__clang__) -# pragma clang diagnostic pop -# endif - return pc; -} - -// Windows version of doNotOptimizeAway -// see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307 -// see https://github.com/facebook/folly/blob/master/folly/Benchmark.h#L280 -// see https://docs.microsoft.com/en-us/cpp/preprocessor/optimize -# if defined(_MSC_VER) -# pragma optimize("", off) -void doNotOptimizeAwaySink(void const*) {} -# pragma optimize("", on) -# endif - -template -T parseFile(std::string const& filename) { - std::ifstream fin(filename); - T num{}; - fin >> num; - return num; -} - -char const* getEnv(char const* name) { -# if defined(_MSC_VER) -# pragma warning(push) -# pragma warning(disable : 4996) // getenv': This function or variable may be unsafe. -# endif - return std::getenv(name); -# if defined(_MSC_VER) -# pragma warning(pop) -# endif -} - -bool isEndlessRunning(std::string const& name) { - auto endless = getEnv("NANOBENCH_ENDLESS"); - return nullptr != endless && endless == name; -} - -// True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0" -bool isWarningsEnabled() { - auto suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS"); - return nullptr == suppression || suppression == std::string("0"); -} - -void gatherStabilityInformation(std::vector& warnings, std::vector& recommendations) { - warnings.clear(); - recommendations.clear(); - - bool recommendCheckFlags = false; - -# if defined(DEBUG) - warnings.emplace_back("DEBUG defined"); - recommendCheckFlags = true; -# endif - - bool recommendPyPerf = false; -# if defined(__linux__) - auto nprocs = sysconf(_SC_NPROCESSORS_CONF); - if (nprocs <= 0) { - warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible"); - } else { - - // check frequency scaling - for (long id = 0; id < nprocs; ++id) { - auto idStr = detail::fmt::to_s(static_cast(id)); - auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr; - auto minFreq = parseFile(sysCpu + "/cpufreq/scaling_min_freq"); - auto maxFreq = parseFile(sysCpu + "/cpufreq/scaling_max_freq"); - if (minFreq != maxFreq) { - auto minMHz = static_cast(minFreq) / 1000.0; - auto maxMHz = static_cast(maxFreq) / 1000.0; - warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " + - detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() + - " MHz"); - recommendPyPerf = true; - break; - } - } - - auto currentGovernor = parseFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor"); - if ("performance" != currentGovernor) { - warnings.emplace_back("CPU governor is '" + currentGovernor + "' but 
should be 'performance'"); - recommendPyPerf = true; - } - - if (0 == parseFile("/sys/devices/system/cpu/intel_pstate/no_turbo")) { - warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate"); - recommendPyPerf = true; - } - } -# endif - - if (recommendCheckFlags) { - recommendations.emplace_back("Make sure you compile for Release"); - } - if (recommendPyPerf) { - recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf"); - } -} - -void printStabilityInformationOnce(std::ostream* outStream) { - static bool shouldPrint = true; - if (shouldPrint && outStream && isWarningsEnabled()) { - auto& os = *outStream; - shouldPrint = false; - std::vector warnings; - std::vector recommendations; - gatherStabilityInformation(warnings, recommendations); - if (warnings.empty()) { - return; - } - - os << "Warning, results might be unstable:" << std::endl; - for (auto const& w : warnings) { - os << "* " << w << std::endl; - } - - os << std::endl << "Recommendations" << std::endl; - for (auto const& r : recommendations) { - os << "* " << r << std::endl; - } - } -} - -// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry. -uint64_t& singletonHeaderHash() noexcept { - static uint64_t sHeaderHash{}; - return sHeaderHash; -} - -ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") -inline uint64_t hash_combine(uint64_t seed, uint64_t val) { - return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U)); -} - -// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference. -Clock::duration calcClockResolution(size_t numEvaluations) noexcept { - auto bestDuration = Clock::duration::max(); - Clock::time_point tBegin; - Clock::time_point tEnd; - for (size_t i = 0; i < numEvaluations; ++i) { - tBegin = Clock::now(); - do { - tEnd = Clock::now(); - } while (tBegin == tEnd); - bestDuration = (std::min)(bestDuration, tEnd - tBegin); - } - return bestDuration; -} - -// Calculates clock resolution once, and remembers the result -Clock::duration clockResolution() noexcept { - static Clock::duration sResolution = calcClockResolution(20); - return sResolution; -} - -ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) -struct IterationLogic::Impl { - enum class State { warmup, upscaling_runtime, measuring, endless }; - - explicit Impl(Bench const& bench) - : mBench(bench) - , mResult(bench.config()) { - printStabilityInformationOnce(mBench.output()); - - // determine target runtime per epoch - mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple(); - if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) { - mTargetRuntimePerEpoch = mBench.maxEpochTime(); - } - if (mTargetRuntimePerEpoch < mBench.minEpochTime()) { - mTargetRuntimePerEpoch = mBench.minEpochTime(); - } - - if (isEndlessRunning(mBench.name())) { - std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl; - mNumIters = (std::numeric_limits::max)(); - mState = State::endless; - } else if (0 != mBench.warmup()) { - mNumIters = mBench.warmup(); - mState = State::warmup; - } else if (0 != mBench.epochIterations()) { - // exact number of iterations - mNumIters = mBench.epochIterations(); - mState = State::measuring; - } else { - mNumIters = mBench.minEpochIterations(); - mState = State::upscaling_runtime; - } - } - - // directly calculates new iters based on elapsed&iters, and adds a 10% noise. 
Makes sure we don't underflow. - ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept { - auto doubleElapsed = d(elapsed); - auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch); - auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters); - - auto doubleMinEpochIters = d(mBench.minEpochIterations()); - if (doubleNewIters < doubleMinEpochIters) { - doubleNewIters = doubleMinEpochIters; - } - doubleNewIters *= 1.0 + 0.2 * mRng.uniform01(); - - // +0.5 for correct rounding when casting - // NOLINTNEXTLINE(bugprone-incorrect-roundings) - return static_cast(doubleNewIters + 0.5); - } - - ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) { - if (elapsed * 10 < mTargetRuntimePerEpoch) { - // we are far below the target runtime. Multiply iterations by 10 (with overflow check) - if (mNumIters * 10 < mNumIters) { - // overflow :-( - showResult("iterations overflow. Maybe your code got optimized away?"); - mNumIters = 0; - return; - } - mNumIters *= 10; - } else { - mNumIters = calcBestNumIters(elapsed, mNumIters); - } - } - - void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept { -# if defined(ANKERL_NANOBENCH_LOG_ENABLED) - auto oldIters = mNumIters; -# endif - - switch (mState) { - case State::warmup: - if (isCloseEnoughForMeasurements(elapsed)) { - // if elapsed is close enough, we can skip upscaling and go right to measurements - // still, we don't add the result to the measurements. - mState = State::measuring; - mNumIters = calcBestNumIters(elapsed, mNumIters); - } else { - // not close enough: switch to upscaling - mState = State::upscaling_runtime; - upscale(elapsed); - } - break; - - case State::upscaling_runtime: - if (isCloseEnoughForMeasurements(elapsed)) { - // if we are close enough, add measurement and switch to always measuring - mState = State::measuring; - mTotalElapsed += elapsed; - mTotalNumIters += mNumIters; - mResult.add(elapsed, mNumIters, pc); - mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters); - } else { - upscale(elapsed); - } - break; - - case State::measuring: - // just add measurements - no questions asked. Even when runtime is low. But we can't ignore - // that fluctuation, or else we would bias the result - mTotalElapsed += elapsed; - mTotalNumIters += mNumIters; - mResult.add(elapsed, mNumIters, pc); - if (0 != mBench.epochIterations()) { - mNumIters = mBench.epochIterations(); - } else { - mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters); - } - break; - - case State::endless: - mNumIters = (std::numeric_limits::max)(); - break; - } - - if (static_cast(mResult.size()) == mBench.epochs()) { - // we got all the results that we need, finish it - showResult(""); - mNumIters = 0; - } - - ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast(elapsed.count())) << " elapsed, " - << detail::fmt::Number(20, 3, static_cast(mTargetRuntimePerEpoch.count())) - << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters - << ", mState=" << static_cast(mState)); - } - - void showResult(std::string const& errorMessage) const { - ANKERL_NANOBENCH_LOG(errorMessage); - - if (mBench.output() != nullptr) { - // prepare column data /////// - std::vector columns; - - auto rMedian = mResult.median(Result::Measure::elapsed); - - if (mBench.relative()) { - double d = 100.0; - if (!mBench.results().empty()) { - d = rMedian <= 0.0 ? 
0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0; - } - columns.emplace_back(11, 1, "relative", "%", d); - } - - if (mBench.complexityN() > 0) { - columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN()); - } - - columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "", - rMedian / (mBench.timeUnit().count() * mBench.batch())); - columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian); - - double rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed); - columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0); - - double rInsMedian = -1.0; - if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) { - rInsMedian = mResult.median(Result::Measure::instructions); - columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch()); - } - - double rCycMedian = -1.0; - if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) { - rCycMedian = mResult.median(Result::Measure::cpucycles); - columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch()); - } - if (rInsMedian > 0.0 && rCycMedian > 0.0) { - columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian); - } - if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) { - double rBraMedian = mResult.median(Result::Measure::branchinstructions); - columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch()); - if (mResult.has(Result::Measure::branchmisses)) { - double p = 0.0; - if (rBraMedian >= 1e-9) { - p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian; - } - columns.emplace_back(10, 1, "miss%", "%", p); - } - } - - columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed)); - - // write everything - auto& os = *mBench.output(); - - // combine all elements that are relevant for printing the header - uint64_t hash = 0; - hash = hash_combine(std::hash{}(mBench.unit()), hash); - hash = hash_combine(std::hash{}(mBench.title()), hash); - hash = hash_combine(std::hash{}(mBench.timeUnitName()), hash); - hash = hash_combine(std::hash{}(mBench.timeUnit().count()), hash); - hash = hash_combine(std::hash{}(mBench.relative()), hash); - hash = hash_combine(std::hash{}(mBench.performanceCounters()), hash); - - if (hash != singletonHeaderHash()) { - singletonHeaderHash() = hash; - - // no result yet, print header - os << std::endl; - for (auto const& col : columns) { - os << col.title(); - } - os << "| " << mBench.title() << std::endl; - - for (auto const& col : columns) { - os << col.separator(); - } - os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl; - } - - if (!errorMessage.empty()) { - for (auto const& col : columns) { - os << col.invalid(); - } - os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl; - } else { - for (auto const& col : columns) { - os << col.value(); - } - os << "| "; - auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05; - if (showUnstable) { - os << ":wavy_dash: "; - } - os << fmt::MarkDownCode(mBench.name()); - if (showUnstable) { - auto avgIters = static_cast(mTotalNumIters) / static_cast(mBench.epochs()); - // NOLINTNEXTLINE(bugprone-incorrect-roundings) - auto suggestedIters = static_cast(avgIters * 10 + 0.5); - - os << " (Unstable with ~" << detail::fmt::Number(1, 1, 
avgIters) - << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")"; - } - os << std::endl; - } - } - } - - ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept { - return elapsed * 3 >= mTargetRuntimePerEpoch * 2; - } - - uint64_t mNumIters = 1; - Bench const& mBench; - std::chrono::nanoseconds mTargetRuntimePerEpoch{}; - Result mResult; - Rng mRng{123}; - std::chrono::nanoseconds mTotalElapsed{}; - uint64_t mTotalNumIters = 0; - - State mState = State::upscaling_runtime; -}; -ANKERL_NANOBENCH(IGNORE_PADDED_POP) - -IterationLogic::IterationLogic(Bench const& bench) noexcept - : mPimpl(new Impl(bench)) {} - -IterationLogic::~IterationLogic() { - if (mPimpl) { - delete mPimpl; - } -} - -uint64_t IterationLogic::numIters() const noexcept { - ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters); - return mPimpl->mNumIters; -} - -void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept { - mPimpl->add(elapsed, pc); -} - -void IterationLogic::moveResultTo(std::vector& results) noexcept { - results.emplace_back(std::move(mPimpl->mResult)); -} - -# if ANKERL_NANOBENCH(PERF_COUNTERS) - -ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) -class LinuxPerformanceCounters { -public: - struct Target { - Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_) - : targetValue(targetValue_) - , correctMeasuringOverhead(correctMeasuringOverhead_) - , correctLoopOverhead(correctLoopOverhead_) {} - - uint64_t* targetValue{}; - bool correctMeasuringOverhead{}; - bool correctLoopOverhead{}; - }; - - ~LinuxPerformanceCounters(); - - // quick operation - inline void start() {} - - inline void stop() {} - - bool monitor(perf_sw_ids swId, Target target); - bool monitor(perf_hw_id hwId, Target target); - - bool hasError() const noexcept { - return mHasError; - } - - // Just reading data is faster than enable & disabling. - // we subtract data ourselves. 
- inline void beginMeasure() { - if (mHasError) { - return; - } - - // NOLINTNEXTLINE(hicpp-signed-bitwise) - mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); - if (mHasError) { - return; - } - - // NOLINTNEXTLINE(hicpp-signed-bitwise) - mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); - } - - inline void endMeasure() { - if (mHasError) { - return; - } - - // NOLINTNEXTLINE(hicpp-signed-bitwise) - mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP)); - if (mHasError) { - return; - } - - auto const numBytes = sizeof(uint64_t) * mCounters.size(); - auto ret = read(mFd, mCounters.data(), numBytes); - mHasError = ret != static_cast(numBytes); - } - - void updateResults(uint64_t numIters); - - // rounded integer division - template - static inline T divRounded(T a, T divisor) { - return (a + divisor / 2) / divisor; - } - - ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") - static inline uint32_t mix(uint32_t x) noexcept { - x ^= x << 13; - x ^= x >> 17; - x ^= x << 5; - return x; - } - - template - ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") - void calibrate(Op&& op) { - // clear current calibration data, - for (auto& v : mCalibratedOverhead) { - v = UINT64_C(0); - } - - // create new calibration data - auto newCalibration = mCalibratedOverhead; - for (auto& v : newCalibration) { - v = (std::numeric_limits::max)(); - } - for (size_t iter = 0; iter < 100; ++iter) { - beginMeasure(); - op(); - endMeasure(); - if (mHasError) { - return; - } - - for (size_t i = 0; i < newCalibration.size(); ++i) { - auto diff = mCounters[i]; - if (newCalibration[i] > diff) { - newCalibration[i] = diff; - } - } - } - - mCalibratedOverhead = std::move(newCalibration); - - { - // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles. - // marsaglia's xorshift: mov, sal/shr, xor. Times 3. - // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further. - // see https://godbolt.org/z/49RVQ5 - uint64_t const numIters = 100000U + (std::random_device{}() & 3); - uint64_t n = numIters; - uint32_t x = 1234567; - - beginMeasure(); - while (n-- > 0) { - x = mix(x); - } - endMeasure(); - detail::doNotOptimizeAway(x); - auto measure1 = mCounters; - - n = numIters; - beginMeasure(); - while (n-- > 0) { - // we now run *twice* so we can easily calculate the overhead - x = mix(x); - x = mix(x); - } - endMeasure(); - detail::doNotOptimizeAway(x); - auto measure2 = mCounters; - - for (size_t i = 0; i < mCounters.size(); ++i) { - // factor 2 because we have two instructions per loop - auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0; - auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0; - auto overhead = m1 * 2 > m2 ? 
m1 * 2 - m2 : 0; - - mLoopOverhead[i] = divRounded(overhead, numIters); - } - } - } - -private: - bool monitor(uint32_t type, uint64_t eventid, Target target); - - std::map mIdToTarget{}; - - // start with minimum size of 3 for read_format - std::vector mCounters{3}; - std::vector mCalibratedOverhead{3}; - std::vector mLoopOverhead{3}; - - uint64_t mTimeEnabledNanos = 0; - uint64_t mTimeRunningNanos = 0; - int mFd = -1; - bool mHasError = false; -}; -ANKERL_NANOBENCH(IGNORE_PADDED_POP) - -LinuxPerformanceCounters::~LinuxPerformanceCounters() { - if (-1 != mFd) { - close(mFd); - } -} - -bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) { - return monitor(PERF_TYPE_SOFTWARE, swId, target); -} - -bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) { - return monitor(PERF_TYPE_HARDWARE, hwId, target); -} - -// overflow is ok, it's checked -ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") -void LinuxPerformanceCounters::updateResults(uint64_t numIters) { - // clear old data - for (auto& id_value : mIdToTarget) { - *id_value.second.targetValue = UINT64_C(0); - } - - if (mHasError) { - return; - } - - mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1]; - mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2]; - - for (uint64_t i = 0; i < mCounters[0]; ++i) { - auto idx = static_cast(3 + i * 2 + 0); - auto id = mCounters[idx + 1U]; - - auto it = mIdToTarget.find(id); - if (it != mIdToTarget.end()) { - - auto& tgt = it->second; - *tgt.targetValue = mCounters[idx]; - if (tgt.correctMeasuringOverhead) { - if (*tgt.targetValue >= mCalibratedOverhead[idx]) { - *tgt.targetValue -= mCalibratedOverhead[idx]; - } else { - *tgt.targetValue = 0U; - } - } - if (tgt.correctLoopOverhead) { - auto correctionVal = mLoopOverhead[idx] * numIters; - if (*tgt.targetValue >= correctionVal) { - *tgt.targetValue -= correctionVal; - } else { - *tgt.targetValue = 0U; - } - } - } - } -} - -bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) { - *target.targetValue = (std::numeric_limits::max)(); - if (mHasError) { - return false; - } - - auto pea = perf_event_attr(); - std::memset(&pea, 0, sizeof(perf_event_attr)); - pea.type = type; - pea.size = sizeof(perf_event_attr); - pea.config = eventid; - pea.disabled = 1; // start counter as disabled - pea.exclude_kernel = 1; - pea.exclude_hv = 1; - - // NOLINTNEXTLINE(hicpp-signed-bitwise) - pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; - - const int pid = 0; // the current process - const int cpu = -1; // all CPUs -# if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14 - const unsigned long flags = PERF_FLAG_FD_CLOEXEC; -# else - const unsigned long flags = 0; -# endif - - auto fd = static_cast(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags)); - if (-1 == fd) { - return false; - } - if (-1 == mFd) { - // first call: set to fd, and use this from now on - mFd = fd; - } - uint64_t id = 0; - // NOLINTNEXTLINE(hicpp-signed-bitwise) - if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) { - // couldn't get id - return false; - } - - // insert into map, rely on the fact that map's references are constant. 
- mIdToTarget.emplace(id, target); - - // prepare readformat with the correct size (after the insert) - auto size = 3 + 2 * mIdToTarget.size(); - mCounters.resize(size); - mCalibratedOverhead.resize(size); - mLoopOverhead.resize(size); - - return true; -} - -PerformanceCounters::PerformanceCounters() - : mPc(new LinuxPerformanceCounters()) - , mVal() - , mHas() { - - mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false)); - mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false)); - mHas.contextSwitches = - mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false)); - mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true)); - mHas.branchInstructions = - mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false)); - mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false)); - // mHas.branchMisses = false; - - mPc->start(); - mPc->calibrate([] { - auto before = ankerl::nanobench::Clock::now(); - auto after = ankerl::nanobench::Clock::now(); - (void)before; - (void)after; - }); - - if (mPc->hasError()) { - // something failed, don't monitor anything. - mHas = PerfCountSet{}; - } -} - -PerformanceCounters::~PerformanceCounters() { - if (nullptr != mPc) { - delete mPc; - } -} - -void PerformanceCounters::beginMeasure() { - mPc->beginMeasure(); -} - -void PerformanceCounters::endMeasure() { - mPc->endMeasure(); -} - -void PerformanceCounters::updateResults(uint64_t numIters) { - mPc->updateResults(numIters); -} - -# else - -PerformanceCounters::PerformanceCounters() = default; -PerformanceCounters::~PerformanceCounters() = default; -void PerformanceCounters::beginMeasure() {} -void PerformanceCounters::endMeasure() {} -void PerformanceCounters::updateResults(uint64_t) {} - -# endif - -ANKERL_NANOBENCH(NODISCARD) PerfCountSet const& PerformanceCounters::val() const noexcept { - return mVal; -} -ANKERL_NANOBENCH(NODISCARD) PerfCountSet const& PerformanceCounters::has() const noexcept { - return mHas; -} - -// formatting utilities -namespace fmt { - -// adds thousands separator to numbers -NumSep::NumSep(char sep) - : mSep(sep) {} - -char NumSep::do_thousands_sep() const { - return mSep; -} - -std::string NumSep::do_grouping() const { - return "\003"; -} - -// RAII to save & restore a stream's state -StreamStateRestorer::StreamStateRestorer(std::ostream& s) - : mStream(s) - , mLocale(s.getloc()) - , mPrecision(s.precision()) - , mWidth(s.width()) - , mFill(s.fill()) - , mFmtFlags(s.flags()) {} - -StreamStateRestorer::~StreamStateRestorer() { - restore(); -} - -// sets back all stream info that we remembered at construction -void StreamStateRestorer::restore() { - mStream.imbue(mLocale); - mStream.precision(mPrecision); - mStream.width(mWidth); - mStream.fill(mFill); - mStream.flags(mFmtFlags); -} - -Number::Number(int width, int precision, int64_t value) - : mWidth(width) - , mPrecision(precision) - , mValue(static_cast(value)) {} - -Number::Number(int width, int precision, double value) - : mWidth(width) - , mPrecision(precision) - , mValue(value) {} - -std::ostream& Number::write(std::ostream& os) const { - StreamStateRestorer restorer(os); - os.imbue(std::locale(os.getloc(), new NumSep(','))); - os << 
std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue; - return os; -} - -std::string Number::to_s() const { - std::stringstream ss; - write(ss); - return ss.str(); -} - -std::string to_s(uint64_t n) { - std::string str; - do { - str += static_cast('0' + static_cast(n % 10)); - n /= 10; - } while (n != 0); - std::reverse(str.begin(), str.end()); - return str; -} - -std::ostream& operator<<(std::ostream& os, Number const& n) { - return n.write(os); -} - -MarkDownColumn::MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val) - : mWidth(w) - , mPrecision(prec) - , mTitle(tit) - , mSuffix(suff) - , mValue(val) {} - -std::string MarkDownColumn::title() const { - std::stringstream ss; - ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' '; - return ss.str(); -} - -std::string MarkDownColumn::separator() const { - std::string sep(static_cast(mWidth), '-'); - sep.front() = '|'; - sep.back() = ':'; - return sep; -} - -std::string MarkDownColumn::invalid() const { - std::string sep(static_cast(mWidth), ' '); - sep.front() = '|'; - sep[sep.size() - 2] = '-'; - return sep; -} - -std::string MarkDownColumn::value() const { - std::stringstream ss; - auto width = mWidth - 2 - static_cast(mSuffix.size()); - ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' '; - return ss.str(); -} - -// Formats any text as markdown code, escaping backticks. -MarkDownCode::MarkDownCode(std::string const& what) { - mWhat.reserve(what.size() + 2); - mWhat.push_back('`'); - for (char c : what) { - mWhat.push_back(c); - if ('`' == c) { - mWhat.push_back('`'); - } - } - mWhat.push_back('`'); -} - -std::ostream& MarkDownCode::write(std::ostream& os) const { - return os << mWhat; -} - -std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) { - return mdCode.write(os); -} -} // namespace fmt -} // namespace detail - -// provide implementation here so it's only generated once -Config::Config() = default; -Config::~Config() = default; -Config& Config::operator=(Config const&) = default; -Config& Config::operator=(Config&&) = default; -Config::Config(Config const&) = default; -Config::Config(Config&&) noexcept = default; - -// provide implementation here so it's only generated once -Result::~Result() = default; -Result& Result::operator=(Result const&) = default; -Result& Result::operator=(Result&&) = default; -Result::Result(Result const&) = default; -Result::Result(Result&&) noexcept = default; - -namespace detail { -template -inline constexpr typename std::underlying_type::type u(T val) noexcept { - return static_cast::type>(val); -} -} // namespace detail - -// Result returned after a benchmark has finished. Can be used as a baseline for relative(). 
-Result::Result(Config const& benchmarkConfig) - : mConfig(benchmarkConfig) - , mNameToMeasurements{detail::u(Result::Measure::_size)} {} - -void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) { - using detail::d; - using detail::u; - - double dIters = d(iters); - mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters); - - mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters); - if (pc.has().pageFaults) { - mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters); - } - if (pc.has().cpuCycles) { - mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters); - } - if (pc.has().contextSwitches) { - mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters); - } - if (pc.has().instructions) { - mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters); - } - if (pc.has().branchInstructions) { - double branchInstructions = 0.0; - // correcting branches: remove branch introduced by the while (...) loop for each iteration. - if (pc.val().branchInstructions > iters + 1U) { - branchInstructions = d(pc.val().branchInstructions - (iters + 1U)); - } - mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters); - - if (pc.has().branchMisses) { - // correcting branch misses - double branchMisses = d(pc.val().branchMisses); - if (branchMisses > branchInstructions) { - // can't have branch misses when there were branches... - branchMisses = branchInstructions; - } - - // assuming at least one missed branch for the loop - branchMisses -= 1.0; - if (branchMisses < 1.0) { - branchMisses = 1.0; - } - mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters); - } - } -} - -Config const& Result::config() const noexcept { - return mConfig; -} - -inline double calcMedian(std::vector& data) { - if (data.empty()) { - return 0.0; - } - std::sort(data.begin(), data.end()); - - auto midIdx = data.size() / 2U; - if (1U == (data.size() & 1U)) { - return data[midIdx]; - } - return (data[midIdx - 1U] + data[midIdx]) / 2U; -} - -double Result::median(Measure m) const { - // create a copy so we can sort - auto data = mNameToMeasurements[detail::u(m)]; - return calcMedian(data); -} - -double Result::average(Measure m) const { - using detail::d; - auto const& data = mNameToMeasurements[detail::u(m)]; - if (data.empty()) { - return 0.0; - } - - // create a copy so we can sort - return sum(m) / d(data.size()); -} - -double Result::medianAbsolutePercentError(Measure m) const { - // create copy - auto data = mNameToMeasurements[detail::u(m)]; - - // calculates MdAPE which is the median of percentage error - // see https://www.spiderfinancial.com/support/documentation/numxl/reference-manual/forecasting-performance/mdape - auto med = calcMedian(data); - - // transform the data to absolute error - for (auto& x : data) { - x = (x - med) / x; - if (x < 0) { - x = -x; - } - } - return calcMedian(data); -} - -double Result::sum(Measure m) const noexcept { - auto const& data = mNameToMeasurements[detail::u(m)]; - return std::accumulate(data.begin(), data.end(), 0.0); -} - -double Result::sumProduct(Measure m1, Measure m2) const noexcept { - auto const& data1 = mNameToMeasurements[detail::u(m1)]; - auto const& data2 = mNameToMeasurements[detail::u(m2)]; - - if (data1.size() != data2.size()) { - return 0.0; - } - - double 
result = 0.0; - for (size_t i = 0, s = data1.size(); i != s; ++i) { - result += data1[i] * data2[i]; - } - return result; -} - -bool Result::has(Measure m) const noexcept { - return !mNameToMeasurements[detail::u(m)].empty(); -} - -double Result::get(size_t idx, Measure m) const { - auto const& data = mNameToMeasurements[detail::u(m)]; - return data.at(idx); -} - -bool Result::empty() const noexcept { - return 0U == size(); -} - -size_t Result::size() const noexcept { - auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)]; - return data.size(); -} - -double Result::minimum(Measure m) const noexcept { - auto const& data = mNameToMeasurements[detail::u(m)]; - if (data.empty()) { - return 0.0; - } - - // here its save to assume that at least one element is there - return *std::min_element(data.begin(), data.end()); -} - -double Result::maximum(Measure m) const noexcept { - auto const& data = mNameToMeasurements[detail::u(m)]; - if (data.empty()) { - return 0.0; - } - - // here its save to assume that at least one element is there - return *std::max_element(data.begin(), data.end()); -} - -std::string const& Result::context(char const* variableName) const { - return mConfig.mContext.at(variableName); -} - -std::string const& Result::context(std::string const& variableName) const { - return mConfig.mContext.at(variableName); -} - -Result::Measure Result::fromString(std::string const& str) { - if (str == "elapsed") { - return Measure::elapsed; - } else if (str == "iterations") { - return Measure::iterations; - } else if (str == "pagefaults") { - return Measure::pagefaults; - } else if (str == "cpucycles") { - return Measure::cpucycles; - } else if (str == "contextswitches") { - return Measure::contextswitches; - } else if (str == "instructions") { - return Measure::instructions; - } else if (str == "branchinstructions") { - return Measure::branchinstructions; - } else if (str == "branchmisses") { - return Measure::branchmisses; - } else { - // not found, return _size - return Measure::_size; - } -} - -// Configuration of a microbenchmark. -Bench::Bench() { - mConfig.mOut = &std::cout; -} - -Bench::Bench(Bench&&) = default; -Bench& Bench::operator=(Bench&&) = default; -Bench::Bench(Bench const&) = default; -Bench& Bench::operator=(Bench const&) = default; -Bench::~Bench() noexcept = default; - -double Bench::batch() const noexcept { - return mConfig.mBatch; -} - -double Bench::complexityN() const noexcept { - return mConfig.mComplexityN; -} - -// Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100% -// means it is slower than the baseline. -Bench& Bench::relative(bool isRelativeEnabled) noexcept { - mConfig.mIsRelative = isRelativeEnabled; - return *this; -} -bool Bench::relative() const noexcept { - return mConfig.mIsRelative; -} - -Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept { - mConfig.mShowPerformanceCounters = showPerformanceCounters; - return *this; -} -bool Bench::performanceCounters() const noexcept { - return mConfig.mShowPerformanceCounters; -} - -// Operation unit. Defaults to "op", could be e.g. "byte" for string processing. -// If u differs from currently set unit, the stored results will be cleared. -// Use singular (byte, not bytes). 
-Bench& Bench::unit(char const* u) { - if (u != mConfig.mUnit) { - mResults.clear(); - } - mConfig.mUnit = u; - return *this; -} - -Bench& Bench::unit(std::string const& u) { - return unit(u.c_str()); -} - -std::string const& Bench::unit() const noexcept { - return mConfig.mUnit; -} - -Bench& Bench::timeUnit(std::chrono::duration const& tu, std::string const& tuName) { - mConfig.mTimeUnit = tu; - mConfig.mTimeUnitName = tuName; - return *this; -} - -std::string const& Bench::timeUnitName() const noexcept { - return mConfig.mTimeUnitName; -} - -std::chrono::duration const& Bench::timeUnit() const noexcept { - return mConfig.mTimeUnit; -} - -// If benchmarkTitle differs from currently set title, the stored results will be cleared. -Bench& Bench::title(const char* benchmarkTitle) { - if (benchmarkTitle != mConfig.mBenchmarkTitle) { - mResults.clear(); - } - mConfig.mBenchmarkTitle = benchmarkTitle; - return *this; -} -Bench& Bench::title(std::string const& benchmarkTitle) { - if (benchmarkTitle != mConfig.mBenchmarkTitle) { - mResults.clear(); - } - mConfig.mBenchmarkTitle = benchmarkTitle; - return *this; -} - -std::string const& Bench::title() const noexcept { - return mConfig.mBenchmarkTitle; -} - -Bench& Bench::name(const char* benchmarkName) { - mConfig.mBenchmarkName = benchmarkName; - return *this; -} - -Bench& Bench::name(std::string const& benchmarkName) { - mConfig.mBenchmarkName = benchmarkName; - return *this; -} - -std::string const& Bench::name() const noexcept { - return mConfig.mBenchmarkName; -} - -Bench& Bench::context(char const* variableName, char const* variableValue) { - mConfig.mContext[variableName] = variableValue; - return *this; -} - -Bench& Bench::context(std::string const& variableName, std::string const& variableValue) { - mConfig.mContext[variableName] = variableValue; - return *this; -} - -Bench& Bench::clearContext() { - mConfig.mContext.clear(); - return *this; -} - -// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch. -Bench& Bench::epochs(size_t numEpochs) noexcept { - mConfig.mNumEpochs = numEpochs; - return *this; -} -size_t Bench::epochs() const noexcept { - return mConfig.mNumEpochs; -} - -// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision. -Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept { - mConfig.mClockResolutionMultiple = multiple; - return *this; -} -size_t Bench::clockResolutionMultiple() const noexcept { - return mConfig.mClockResolutionMultiple; -} - -// Sets the maximum time each epoch should take. Default is 100ms. -Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept { - mConfig.mMaxEpochTime = t; - return *this; -} -std::chrono::nanoseconds Bench::maxEpochTime() const noexcept { - return mConfig.mMaxEpochTime; -} - -// Sets the maximum time each epoch should take. Default is 100ms. -Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept { - mConfig.mMinEpochTime = t; - return *this; -} -std::chrono::nanoseconds Bench::minEpochTime() const noexcept { - return mConfig.mMinEpochTime; -} - -Bench& Bench::minEpochIterations(uint64_t numIters) noexcept { - mConfig.mMinEpochIterations = (numIters == 0) ? 
1 : numIters; - return *this; -} -uint64_t Bench::minEpochIterations() const noexcept { - return mConfig.mMinEpochIterations; -} - -Bench& Bench::epochIterations(uint64_t numIters) noexcept { - mConfig.mEpochIterations = numIters; - return *this; -} -uint64_t Bench::epochIterations() const noexcept { - return mConfig.mEpochIterations; -} - -Bench& Bench::warmup(uint64_t numWarmupIters) noexcept { - mConfig.mWarmup = numWarmupIters; - return *this; -} -uint64_t Bench::warmup() const noexcept { - return mConfig.mWarmup; -} - -Bench& Bench::config(Config const& benchmarkConfig) { - mConfig = benchmarkConfig; - return *this; -} -Config const& Bench::config() const noexcept { - return mConfig; -} - -Bench& Bench::output(std::ostream* outstream) noexcept { - mConfig.mOut = outstream; - return *this; -} - -ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept { - return mConfig.mOut; -} - -std::vector const& Bench::results() const noexcept { - return mResults; -} - -Bench& Bench::render(char const* templateContent, std::ostream& os) { - ::ankerl::nanobench::render(templateContent, *this, os); - return *this; -} - -Bench& Bench::render(std::string const& templateContent, std::ostream& os) { - ::ankerl::nanobench::render(templateContent, *this, os); - return *this; -} - -std::vector Bench::complexityBigO() const { - std::vector bigOs; - auto rangeMeasure = BigO::collectRangeMeasure(mResults); - bigOs.emplace_back("O(1)", rangeMeasure, [](double) { - return 1.0; - }); - bigOs.emplace_back("O(n)", rangeMeasure, [](double n) { - return n; - }); - bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) { - return std::log2(n); - }); - bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) { - return n * std::log2(n); - }); - bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) { - return n * n; - }); - bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) { - return n * n * n; - }); - std::sort(bigOs.begin(), bigOs.end()); - return bigOs; -} - -Rng::Rng() - : mX(0) - , mY(0) { - std::random_device rd; - std::uniform_int_distribution dist; - do { - mX = dist(rd); - mY = dist(rd); - } while (mX == 0 && mY == 0); -} - -ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") -uint64_t splitMix64(uint64_t& state) noexcept { - uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15)); - z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9); - z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb); - return z ^ (z >> 31U); -} - -// Seeded as described in romu paper (update april 2020) -Rng::Rng(uint64_t seed) noexcept - : mX(splitMix64(seed)) - , mY(splitMix64(seed)) { - for (size_t i = 0; i < 10; ++i) { - operator()(); - } -} - -// only internally used to copy the RNG. 
-Rng::Rng(uint64_t x, uint64_t y) noexcept - : mX(x) - , mY(y) {} - -Rng Rng::copy() const noexcept { - return Rng{mX, mY}; -} - -Rng::Rng(std::vector const& data) - : mX(0) - , mY(0) { - if (data.size() != 2) { - throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " + - detail::fmt::to_s(data.size())); - } - mX = data[0]; - mY = data[1]; -} - -std::vector Rng::state() const { - std::vector data(2); - data[0] = mX; - data[1] = mY; - return data; -} - -BigO::RangeMeasure BigO::collectRangeMeasure(std::vector const& results) { - BigO::RangeMeasure rangeMeasure; - for (auto const& result : results) { - if (result.config().mComplexityN > 0.0) { - rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed)); - } - } - return rangeMeasure; -} - -BigO::BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure) - : mName(bigOName) { - - // estimate the constant factor - double sumRangeMeasure = 0.0; - double sumRangeRange = 0.0; - - for (size_t i = 0; i < rangeMeasure.size(); ++i) { - sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second; - sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first; - } - mConstant = sumRangeMeasure / sumRangeRange; - - // calculate root mean square - double err = 0.0; - double sumMeasure = 0.0; - for (size_t i = 0; i < rangeMeasure.size(); ++i) { - auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second; - err += diff * diff; - - sumMeasure += rangeMeasure[i].second; - } - - auto n = static_cast(rangeMeasure.size()); - auto mean = sumMeasure / n; - mNormalizedRootMeanSquare = std::sqrt(err / n) / mean; -} - -BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure) - : BigO(std::string(bigOName), rangeMeasure) {} - -std::string const& BigO::name() const noexcept { - return mName; -} - -double BigO::constant() const noexcept { - return mConstant; -} - -double BigO::normalizedRootMeanSquare() const noexcept { - return mNormalizedRootMeanSquare; -} - -bool BigO::operator<(BigO const& other) const noexcept { - return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName); -} - -std::ostream& operator<<(std::ostream& os, BigO const& bigO) { - return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare(); -} - -std::ostream& operator<<(std::ostream& os, std::vector const& bigOs) { - detail::fmt::StreamStateRestorer restorer(os); - os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl; - for (auto const& bigO : bigOs) { - os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " "; - os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% "; - os << "| " << bigO.name(); - os << std::endl; - } - return os; -} - -} // namespace nanobench -} // namespace ankerl - -#endif // ANKERL_NANOBENCH_IMPLEMENT -#endif // ANKERL_NANOBENCH_H_INCLUDED From 1f3cb37693b28554e008b48ffde9751038211238 Mon Sep 17 00:00:00 2001 From: Tristan Brindle Date: Thu, 25 Jan 2024 15:40:20 +0000 Subject: [PATCH 2/2] Rename benchmark targets ...for consistency with all of our example-* targets, and so they show up together in CLion --- benchmark/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 13a7587d..a9974eb2 100644 --- a/benchmark/CMakeLists.txt +++ 
b/benchmark/CMakeLists.txt
@@ -9,11 +9,11 @@ FetchContent_Declare(
 FetchContent_MakeAvailable(nanobench)
 
-add_executable(internal-iteration-benchmark internal_iteration_benchmark.cpp)
-target_link_libraries(internal-iteration-benchmark PUBLIC nanobench::nanobench flux)
+add_executable(benchmark-internal-iteration internal_iteration_benchmark.cpp)
+target_link_libraries(benchmark-internal-iteration PUBLIC nanobench::nanobench flux)
 
-add_executable(sort-benchmark sort_benchmark.cpp)
-target_link_libraries(sort-benchmark PUBLIC nanobench::nanobench flux)
+add_executable(benchmark-sort sort_benchmark.cpp)
+target_link_libraries(benchmark-sort PUBLIC nanobench::nanobench flux)
 
-add_executable(multidimensional-memset-benchmark multidimensional_memset_benchmark.cpp multidimensional_memset_benchmark_kernels.cpp)
-target_link_libraries(multidimensional-memset-benchmark PUBLIC nanobench::nanobench flux)
+add_executable(benchmark-multidimensional-memset multidimensional_memset_benchmark.cpp multidimensional_memset_benchmark_kernels.cpp)
+target_link_libraries(benchmark-multidimensional-memset PUBLIC nanobench::nanobench flux)
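
Note: the benchmark sources themselves are untouched by this series; only how they obtain and link nanobench (patch 1) and what the executable targets are called (patch 2) changes. For orientation, a translation unit built against the renamed targets might look like the sketch below. This is illustrative only: the title, iteration count and accumulate workload are invented, not taken from the Flux benchmark sources, and the fetched nanobench::nanobench target is assumed to carry the compiled nanobench implementation, which is why no ANKERL_NANOBENCH_IMPLEMENT definition appears here.

    // Minimal sketch of a benchmark TU under the new FetchContent setup.
    // <nanobench.h> is found via the nanobench::nanobench link dependency.
    #include <nanobench.h>

    #include <numeric>
    #include <vector>

    int main() {
        std::vector<int> v(1000);
        std::iota(v.begin(), v.end(), 0);

        // Illustrative configuration; Bench, minEpochIterations() and run()
        // are the same public API the previously bundled header declared.
        ankerl::nanobench::Bench()
            .title("example")
            .minEpochIterations(100)
            .run("std::accumulate", [&] {
                auto sum = std::accumulate(v.begin(), v.end(), 0);
                ankerl::nanobench::doNotOptimizeAway(sum);
            });
    }

With patch 2 applied, such a file is built via the new target names, e.g. "cmake --build build --target benchmark-sort" rather than the old sort-benchmark.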