From fe33997b249792342e172d059c17780d06184d24 Mon Sep 17 00:00:00 2001
From: Tristan Brindle
Date: Wed, 24 Jan 2024 18:20:09 +0000
Subject: [PATCH 1/2] Update nanobench

Use CMake's FetchContent to download from GitHub rather than bundling
our own version of the header
---
 benchmark/CMakeLists.txt |   17 +-
 benchmark/nanobench.cpp  |    3 -
 benchmark/nanobench.h    | 3432 --------------------------------------
 3 files changed, 12 insertions(+), 3440 deletions(-)
 delete mode 100644 benchmark/nanobench.cpp
 delete mode 100644 benchmark/nanobench.h

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 647fea7c..13a7587d 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -1,12 +1,19 @@
-add_library(nanobench nanobench.cpp)
-target_include_directories(nanobench PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+include(FetchContent)
+
+FetchContent_Declare(
+    nanobench
+    GIT_REPOSITORY https://github.com/martinus/nanobench.git
+    GIT_TAG v4.3.11
+)
+
+FetchContent_MakeAvailable(nanobench)
 
 add_executable(internal-iteration-benchmark internal_iteration_benchmark.cpp)
-target_link_libraries(internal-iteration-benchmark PUBLIC nanobench flux)
+target_link_libraries(internal-iteration-benchmark PUBLIC nanobench::nanobench flux)
 
 add_executable(sort-benchmark sort_benchmark.cpp)
-target_link_libraries(sort-benchmark PUBLIC nanobench flux)
+target_link_libraries(sort-benchmark PUBLIC nanobench::nanobench flux)
 
 add_executable(multidimensional-memset-benchmark multidimensional_memset_benchmark.cpp
                multidimensional_memset_benchmark_kernels.cpp)
-target_link_libraries(multidimensional-memset-benchmark PUBLIC nanobench flux)
+target_link_libraries(multidimensional-memset-benchmark PUBLIC nanobench::nanobench flux)
diff --git a/benchmark/nanobench.cpp b/benchmark/nanobench.cpp
deleted file mode 100644
index ecf6bf42..00000000
--- a/benchmark/nanobench.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-
-#define ANKERL_NANOBENCH_IMPLEMENT
-#include <nanobench.h>
\ No newline at end of file
diff --git a/benchmark/nanobench.h b/benchmark/nanobench.h
deleted file mode 100644
index c03d9ad2..00000000
--- a/benchmark/nanobench.h
+++ /dev/null
@@ -1,3432 +0,0 @@
-//  __   _ _______ __   _  _____  ______  _______ __   _ _______ _     _
-//  | \  | |_____| | \  | |     | |_____] |______ | \  | |       |_____|
-//  |  \_| |     | |  \_| |_____| |_____] |______ |  \_| |_____  |     |
-//
-// Microbenchmark framework for C++11/14/17/20
-// https://github.com/martinus/nanobench
-//
-// Licensed under the MIT License <https://opensource.org/licenses/MIT>.
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2019-2022 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#ifndef ANKERL_NANOBENCH_H_INCLUDED
-#define ANKERL_NANOBENCH_H_INCLUDED
-
-// see https://semver.org/
-#define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
-#define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
-#define ANKERL_NANOBENCH_VERSION_PATCH 9 // backwards-compatible bug fixes
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// public facing api - as minimal as possible
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include <chrono>        // high_resolution_clock
-#include <cstring>       // memcpy
-#include <iosfwd>        // for std::ostream* custom output target in Config
-#include <string>        // all names
-#include <unordered_map> // holds context information of results
-#include <vector>        // holds all results
-
-#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
-
-#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
-#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
-#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
-#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
-#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
-
-#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
-#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
-#else
-#    define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
-#endif
-
-#if defined(__clang__)
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
-        _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
-#else
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
-#endif
-
-#if defined(__GNUC__)
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
-#else
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
-#    define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
-#endif
-
-#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
-#    include <iostream>
-#    define ANKERL_NANOBENCH_LOG(x)                                                 \
-        do {                                                                        \
-            std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
-        } while (0)
-#else
-#    define ANKERL_NANOBENCH_LOG(x) \
-        do {                        \
-        } while (0)
-#endif
-
-#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
-#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
-#    include <linux/version.h>
-#    if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
-// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
-// PERF_FLAG_FD_CLOEXEC since kernel 3.14
-#        undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
-#        define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
-#    endif
-#endif
-
-#if defined(__clang__)
-#    define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
-#else
-#    define ANKERL_NANOBENCH_NO_SANITIZE(...)
-#endif
-
-#if defined(_MSC_VER)
-#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
-#else
-#    define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
-#endif
-
-// workaround missing "is_trivially_copyable" in g++ < 5.0
-// See https://stackoverflow.com/a/31798726/48181
-#if defined(__GNUC__) && __GNUC__ < 5
-#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
-#else
-#    define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
-#endif
-
-// declarations ///////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-
-using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
-                               std::chrono::steady_clock>::type;
-class Bench;
-struct Config;
-class Result;
-class Rng;
-class BigO;
-
-/**
- * @brief Renders output from a mustache-like template and benchmark results.
- *
- * The templating facility here is heavily inspired by [mustache - logic-less templates](https://mustache.github.io/).
- * It adds a few more features that are necessary to get all of the captured data out of nanobench. Please read the
- * excellent [mustache manual](https://mustache.github.io/mustache.5.html) to see what this is all about.
- *
- * nanobench output has two nested layers, *result* and *measurement*. Here is a hierarchy of the allowed tags:
- *
- * * `{{#result}}` Marks the begin of the result layer. Whatever comes after this will be instantiated as often as
- *   a benchmark result is available. Within it, you can use these tags:
- *
- *   * `{{title}}` See Bench::title.
- *
- *   * `{{name}}` Benchmark name, usually directly provided with Bench::run, but can also be set with Bench::name.
- *
- *   * `{{unit}}` Unit, e.g. `byte`. Defaults to `op`, see Bench::unit.
- *
- *   * `{{batch}}` Batch size, see Bench::batch.
- *
- *   * `{{complexityN}}` Value used for asymptotic complexity calculation. See Bench::complexityN.
- *
- *   * `{{epochs}}` Number of epochs, see Bench::epochs.
- *
- *   * `{{clockResolution}}` Accuracy of the clock, i.e. what's the smallest time possible to measure with the clock.
- *     For modern systems, this can be around 20 ns. This value is automatically determined by nanobench at the first
- *     benchmark that is run, and used as a static variable throughout the application's runtime.
- *
- *   * `{{clockResolutionMultiple}}` Configuration multiplier for `clockResolution`. See Bench::clockResolutionMultiple.
- *     This is the target runtime for each measurement (epoch). That means the more accurate your clock is, the faster
- *     the benchmark will be. Basing the measurement's runtime on the clock resolution is the main reason why nanobench is so fast.
- *
- *   * `{{maxEpochTime}}` Configuration for a maximum time each measurement (epoch) is allowed to take. Note that at least
- *     a single iteration will be performed, even when that takes longer than maxEpochTime. See Bench::maxEpochTime.
- *
- *   * `{{minEpochTime}}` Minimum epoch time, defaults to 1ms. See Bench::minEpochTime.
- *
- *   * `{{minEpochIterations}}` See Bench::minEpochIterations.
- *
- *   * `{{epochIterations}}` See Bench::epochIterations.
- *
- *   * `{{warmup}}` Number of iterations used before measuring starts. See Bench::warmup.
- *
- *   * `{{relative}}` True or false, depending on the setting you have used. See Bench::relative.
- *
- *   * `{{context(variableName)}}` See Bench::context.
- *
- * Apart from these tags, it is also possible to use some mathematical operations on the measurement data. The operations
- * are of the form `{{command(name)}}`. Currently `name` can be one of `elapsed`, `iterations`. If performance counters
- * are available (currently only on Linux), you also have `pagefaults`, `cpucycles`,
- * `contextswitches`, `instructions`, `branchinstructions`, and `branchmisses`. All the measures (except `iterations`) are
- * provided for a single iteration (so `elapsed` is the time a single iteration took). The following tags are available:
- *
- * * `{{median(<name>)}}` Calculate median of a measurement data set, e.g. `{{median(elapsed)}}`.
- *
- * * `{{average(<name>)}}` Average (mean) calculation.
- *
- * * `{{medianAbsolutePercentError(<name>)}}` Calculates MdAPE, the Median Absolute Percentage Error. The MdAPE is an excellent
- *   metric for the variation of measurements. It is more robust to outliers than the
- *   [Mean absolute percentage error (M-APE)](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error).
- *   @f[
- *   \mathrm{MdAPE}(e) = \mathrm{med}\{| \frac{e_i - \mathrm{med}\{e\}}{e_i}| \}
- *   @f]
- *   E.g. for *elapsed*: First, @f$ \mathrm{med}\{e\} @f$ calculates the median by sorting and then taking the middle element
- *   of all *elapsed* measurements. This is used to calculate the absolute percentage
- *   error to this median for each measurement, as in @f$ | \frac{e_i - \mathrm{med}\{e\}}{e_i}| @f$. All these results
- *   are sorted, and the middle value is chosen as the median absolute percent error.
- *
- *   This measurement is a bit hard to interpret, but it is very robust against outliers. E.g. a value of 5% means that half of the
- *   measurements deviate less than 5% from the median, and the others deviate more than 5% from the median.
- *
- * * `{{sum(<name>)}}` Sum of all the measurements. E.g. `{{sum(iterations)}}` will give you the total number of iterations
- *   measured in this benchmark.
- *
- * * `{{minimum(<name>)}}` Minimum of all measurements.
- *
- * * `{{maximum(<name>)}}` Maximum of all measurements.
- *
- * * `{{sumProduct(<first>, <second>)}}` Calculates the sum of the products of corresponding measures:
- *   @f[
- *   \mathrm{sumProduct}(a,b) = \sum_{i=1}^{n}a_i\cdot b_i
- *   @f]
- *   E.g. to calculate the total runtime of the benchmark, you multiply iterations with elapsed time for each measurement, and
- *   sum these results up:
- *   `{{sumProduct(iterations, elapsed)}}`.
- *
- * * `{{#measurement}}` To access individual measurement results, open the begin tag for measurements.
- *
- *   * `{{elapsed}}` Average elapsed wall clock time per iteration, in seconds.
- *
- *   * `{{iterations}}` Number of iterations in the measurement. The number of iterations will fluctuate due
- *     to some applied randomness, to enhance accuracy.
- *
- *   * `{{pagefaults}}` Average number of pagefaults per iteration.
- *
- *   * `{{cpucycles}}` Average number of CPU cycles processed per iteration.
- *
- *   * `{{contextswitches}}` Average number of context switches per iteration.
- *
- *   * `{{instructions}}` Average number of retired instructions per iteration.
- *
- *   * `{{branchinstructions}}` Average number of branches executed per iteration.
- *
- *   * `{{branchmisses}}` Average number of branches that were missed per iteration.
- *
- * * `{{/measurement}}` Ends the measurement tag.
- *
- * * `{{/result}}` Marks the end of the result layer. This is the end marker for the template part that will be instantiated
- *   for each benchmark result.
- *
- *
- * For the layer tags *result* and *measurement* you additionally can use these special markers:
- *
- * * ``{{#-first}}`` - Begin marker of a template that will be instantiated *only for the first* entry in the layer. Use is only
- *   allowed between the begin and end marker of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
- *   ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-first}}``.
- *
- * * ``{{^-first}}`` - Begin marker of a template that will be instantiated *for each except the first* entry in the layer.
- *   This is basically the inversion of ``{{#-first}}``. Use is only allowed between the begin and end marker of the layer.
- *   So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
- *
- * * ``{{/-first}}`` - End marker for either ``{{#-first}}`` or ``{{^-first}}``.
- *
- * * ``{{#-last}}`` - Begin marker of a template that will be instantiated *only for the last* entry in the layer. Use is only
- *   allowed between the begin and end marker of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between
- *   ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-last}}``.
- *
- * * ``{{^-last}}`` - Begin marker of a template that will be instantiated *for each except the last* entry in the layer.
- *   This is basically the inversion of ``{{#-last}}``. Use is only allowed between the begin and end marker of the layer.
- *   So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``.
- *
- * * ``{{/-last}}`` - End marker for either ``{{#-last}}`` or ``{{^-last}}``.
- *
- @verbatim embed:rst
-
- For an overview of all the possible data you can get out of nanobench, please see the tutorial at :ref:`tutorial-template-json`.
-
- The templates that ship with nanobench are:
-
- * :cpp:func:`templates::csv() <ankerl::nanobench::templates::csv()>`
- * :cpp:func:`templates::json() <ankerl::nanobench::templates::json()>`
- * :cpp:func:`templates::htmlBoxplot() <ankerl::nanobench::templates::htmlBoxplot()>`
- * :cpp:func:`templates::pyperf() <ankerl::nanobench::templates::pyperf()>`
-
- @endverbatim
- *
- * @param mustacheTemplate The template.
- * @param bench Benchmark, containing all the results.
- * @param out Output for the generated output.
- */
-void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
-void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
-
-/**
- * Same as render(char const* mustacheTemplate, Bench const& bench, std::ostream& out), but for when
- * you only have results available.
- *
- * @param mustacheTemplate The template.
- * @param results All the results to be used for rendering.
- * @param out Output for the generated output.
- */
-void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
-void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
-
-// Contains mustache-like templates
-namespace templates {
-
-/*!
-  @brief CSV data for the benchmark results.
-
-  Generates a comma-separated values dataset. First line is the header, each following line is a summary of each benchmark run.
-
-  @verbatim embed:rst
-  See the tutorial at :ref:`tutorial-template-csv` for an example.
-  @endverbatim
- */
-char const* csv() noexcept;
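-// [Illustrative sketch added by the editor, not part of the original header] How a
-// Bench, a built-in template, and render() fit together, per the documentation above:
-//
-//     ankerl::nanobench::Bench bench;
-//     bench.run("example", [&] { /* code under test */ });
-//     ankerl::nanobench::render(ankerl::nanobench::templates::csv(), bench, std::cout);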
-
-/*!
-  @brief HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an example output.
-
-  The output uses only the elapsed wall clock time, and displays each epoch as a single dot.
-  @verbatim embed:rst
-  See the tutorial at :ref:`tutorial-template-html` for an example.
-  @endverbatim
-
-  @see also ankerl::nanobench::render()
- */
-char const* htmlBoxplot() noexcept;
-
-/*!
-  @brief Output in pyperf compatible JSON format, which can be used for further analysis.
-  @verbatim embed:rst
-  See the tutorial at :ref:`tutorial-template-pyperf` for an example of how to further analyze the output.
-  @endverbatim
- */
-char const* pyperf() noexcept;
-
-/*!
-  @brief Template to generate JSON data.
-
-  The generated JSON data contains *all* data that has been generated. All times are as double values, in seconds. The output can get
-  quite large.
-  @verbatim embed:rst
-  See the tutorial at :ref:`tutorial-template-json` for an example.
-  @endverbatim
- */
-char const* json() noexcept;
-
-} // namespace templates
-
-namespace detail {
-
-template <typename T>
-struct PerfCountSet;
-
-class IterationLogic;
-class PerformanceCounters;
-
-#if ANKERL_NANOBENCH(PERF_COUNTERS)
-class LinuxPerformanceCounters;
-#endif
-
-} // namespace detail
-} // namespace nanobench
-} // namespace ankerl
-
-// definitions ////////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-namespace detail {
-
-template <typename T>
-struct PerfCountSet {
-    T pageFaults{};
-    T cpuCycles{};
-    T contextSwitches{};
-    T instructions{};
-    T branchInstructions{};
-    T branchMisses{};
-};
-
-} // namespace detail
-
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-struct Config {
-    // actual benchmark config
-    std::string mBenchmarkTitle = "benchmark";
-    std::string mBenchmarkName = "noname";
-    std::string mUnit = "op";
-    double mBatch = 1.0;
-    double mComplexityN = -1.0;
-    size_t mNumEpochs = 11;
-    size_t mClockResolutionMultiple = static_cast<size_t>(1000);
-    std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100);
-    std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1);
-    uint64_t mMinEpochIterations{1};
-    uint64_t mEpochIterations{0}; // If not 0, run *exactly* this number of iterations per epoch.
-    uint64_t mWarmup = 0;
-    std::ostream* mOut = nullptr;
-    std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1};
-    std::string mTimeUnitName = "ns";
-    bool mShowPerformanceCounters = true;
-    bool mIsRelative = false;
-    std::unordered_map<std::string, std::string> mContext{};
-
-    Config();
-    ~Config();
-    Config& operator=(Config const&);
-    Config& operator=(Config&&);
-    Config(Config const&);
-    Config(Config&&) noexcept;
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-// Result returned after a benchmark has finished. Can be used as a baseline for relative().
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-class Result {
-public:
-    enum class Measure : size_t {
-        elapsed,
-        iterations,
-        pagefaults,
-        cpucycles,
-        contextswitches,
-        instructions,
-        branchinstructions,
-        branchmisses,
-        _size
-    };
-
-    explicit Result(Config const& benchmarkConfig);
-
-    ~Result();
-    Result& operator=(Result const&);
-    Result& operator=(Result&&);
-    Result(Result const&);
-    Result(Result&&) noexcept;
-
-    // adds new measurement results
-    // all values are scaled by iters (except iters...)
-    void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
-
-    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
-
-    ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
-    ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
-    ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
-    ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const*) const;
-    ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const&) const;
-
-    ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
-    ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
-
-    // Finds string, if not found, returns _size.
-    static Measure fromString(std::string const& str);
-
-private:
-    Config mConfig{};
-    std::vector<std::vector<double>> mNameToMeasurements{};
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-/**
- * An extremely fast random generator. Currently, this implements *RomuDuoJr*, developed by Mark Overton. Source:
- * http://www.romu-random.org/
- *
- * RomuDuoJr is extremely fast and provides reasonably good randomness. Not enough for large jobs, but definitely
- * good enough for a benchmarking framework.
- *
- * * Estimated capacity: @f$ 2^{51} @f$ bytes
- * * Register pressure: 4
- * * State size: 128 bits
- *
- * This random generator is a drop-in replacement for the generators supplied by ``<random>``. It is not
- * cryptographically secure. Its intended purpose is to be very fast so that benchmarks that make use
- * of randomness are not distorted too much by the random generator.
- *
- * Rng also provides a few non-standard helpers, optimized for speed.
- */
-class Rng final {
-public:
-    /**
-     * @brief This RNG provides 64bit randomness.
-     */
-    using result_type = uint64_t;
-
-    static constexpr uint64_t(min)();
-    static constexpr uint64_t(max)();
-
-    /**
-     * As a safety precaution, we don't allow copying. Copying a PRNG would mean you would have two random generators that produce the
-     * same sequence, which is generally not what one wants. Instead create a new rng with the default constructor Rng(), which is
-     * automatically seeded from `std::random_device`. If you really need a copy, use `copy()`.
-     */
-    Rng(Rng const&) = delete;
-
-    /**
-     * Same as Rng(Rng const&), we don't allow assignment. If you need a new Rng create one with the default constructor Rng().
-     */
-    Rng& operator=(Rng const&) = delete;
-
-    // moving is ok
-    Rng(Rng&&) noexcept = default;
-    Rng& operator=(Rng&&) noexcept = default;
-    ~Rng() noexcept = default;
-
-    /**
-     * @brief Creates a new Random generator with random seed.
-     *
-     * Instead of a default seed (as the random generators from the STD), this properly seeds the random generator from
-     * `std::random_device`. It guarantees correct seeding. Note that seeding can be relatively slow, depending on the source of
-     * randomness used. So it is best to create an Rng once and use it for all your randomness purposes.
-     */
-    Rng();
-
-    /*!
-      Creates a new Rng that is seeded with a specific seed. Each Rng created from the same seed will produce the same randomness
-      sequence. This can be useful for deterministic behavior.
-
-      @verbatim embed:rst
-      .. note::
-
-         The random algorithm might change between nanobench releases. Whenever a faster and/or better random
-         generator becomes available, I will switch the implementation.
-      @endverbatim
-
-      As per the Romu paper, this seeds the Rng with splitMix64 algorithm and performs 10 initial rounds for further mixing up of the
-      internal state.
-
-      @param seed The 64bit seed. All values are allowed, even 0.
-     */
-    explicit Rng(uint64_t seed) noexcept;
-    Rng(uint64_t x, uint64_t y) noexcept;
-    Rng(std::vector<uint64_t> const& data);
-
-    /**
-     * Creates a copy of the Rng, thus the copy provides exactly the same random sequence as the original.
-     */
-    ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
-
-    /**
-     * @brief Produces a 64bit random value. This should be very fast, thus it is marked as inline. In my benchmark, this is ~46 times
-     * faster than `std::default_random_engine` for producing 64bit random values. It seems that the fastest std contender is
-     * `std::mt19937_64`. Still, this RNG is 2-3 times as fast.
-     *
-     * @return uint64_t The next 64 bit random value.
-     */
-    inline uint64_t operator()() noexcept;
-
-    /**
-     * Generates a random number between 0 and range (excluding range).
-     *
-     * The algorithm only produces 32bit numbers, and is slightly biased. The effect is quite small unless your range is close to the
-     * maximum value of an integer. It is possible to correct the bias with rejection sampling (see
-     * [here](https://lemire.me/blog/2016/06/30/fast-random-shuffling/)), but this is most likely irrelevant in practice for the
-     * purposes of this Rng.
-     *
-     * See Daniel Lemire's blog post [A fast alternative to the modulo
-     * reduction](https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/)
-     *
-     * @param range Upper exclusive range. E.g. a value of 3 will generate random numbers 0, 1, 2.
-     * @return uint32_t Generated random values in range [0, range(.
-     */
-    inline uint32_t bounded(uint32_t range) noexcept;
-
-    // random double in range [0, 1(
-    // see http://prng.di.unimi.it/
-
-    /**
-     * Provides a random uniform double value between 0 and 1. This uses the method described in [Generating uniform doubles in the
-     * unit interval](http://prng.di.unimi.it/), and is extremely fast.
-     *
-     * @return double Uniformly distributed double value in range [0,1(, excluding 1.
-     */
-    inline double uniform01() noexcept;
-
-    /**
-     * Shuffles all entries in the given container. Although this has a slight bias due to the implementation of bounded(), this is
-     * preferable to `std::shuffle` because it is over 5 times faster. See Daniel Lemire's blog post [Fast random
-     * shuffling](https://lemire.me/blog/2016/06/30/fast-random-shuffling/).
-     *
-     * @param container The whole container will be shuffled.
-     */
-    template <typename Container>
-    void shuffle(Container& container) noexcept;
-
-    /**
-     * Extracts the full state of the generator, e.g. for serialization. For this RNG this is just 2 values, but to stay API compatible
-     * with future implementations that potentially use more state, we use a vector.
-     *
-     * @return Vector containing the full state.
-     */
-    std::vector<uint64_t> state() const;
-
-private:
-    static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
-
-    uint64_t mX;
-    uint64_t mY;
-};
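-
-// [Illustrative sketch added by the editor, not part of the original header] Typical
-// use of the Rng API declared above, e.g. to build reproducible benchmark input:
-//
-//     ankerl::nanobench::Rng rng(12345);    // fixed seed => identical sequence each run
-//     std::vector<uint64_t> data(1000);
-//     for (auto& v : data) { v = rng(); }   // fast 64bit values
-//     rng.shuffle(data);                    // slightly biased, but much faster than std::shuffle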
-
-/**
- * @brief Main entry point to nanobench's benchmarking facility.
- *
- * It holds configuration and results from one or more benchmark runs. Usually it is used in a single line, where the object is
- * constructed, configured, and then a benchmark is run. E.g. like this:
- *
- *     ankerl::nanobench::Bench().unit("byte").batch(1000).run("random fluctuations", [&] {
- *         // here be the benchmark code
- *     });
- *
- * In that example Bench() constructs the benchmark, it is then configured with unit() and batch(), and after configuration a
- * benchmark is executed with run(). Once run() has finished, it prints the result to `std::cout`. It would also store the results
- * in the Bench instance, but in this case the object is immediately destroyed so it's not available any more.
- */
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-class Bench {
-public:
-    /**
-     * @brief Creates a new benchmark for configuration and running of benchmarks.
-     */
-    Bench();
-
-    Bench(Bench&& other);
-    Bench& operator=(Bench&& other);
-    Bench(Bench const& other);
-    Bench& operator=(Bench const& other);
-    ~Bench() noexcept;
-
-    /*!
-      @brief Repeatedly calls `op()` based on the configuration, and performs measurements.
-
-      This call is marked with `noinline` to prevent the compiler from optimizing across different benchmarks. This can have quite a big
-      effect on benchmark accuracy.
-
-      @verbatim embed:rst
-      .. note::
-
-         Each call to your lambda must have a side effect that the compiler can't possibly optimize away. E.g. add a result to an
-         externally defined number (like `x` in the above example), and finally call `doNotOptimizeAway` on the variables the compiler
-         must not remove. You can also use :cpp:func:`ankerl::nanobench::doNotOptimizeAway` directly in the lambda, but be aware that
-         this has a small overhead.
-
-      @endverbatim
-
-      @tparam Op The code to benchmark.
-    */
-    template <typename Op>
-    ANKERL_NANOBENCH(NOINLINE)
-    Bench& run(char const* benchmarkName, Op&& op);
-
-    template <typename Op>
-    ANKERL_NANOBENCH(NOINLINE)
-    Bench& run(std::string const& benchmarkName, Op&& op);
-
-    /**
-     * @brief Same as run(char const* benchmarkName, Op op), but instead uses the previously set name.
-     * @tparam Op The code to benchmark.
-     */
-    template <typename Op>
-    ANKERL_NANOBENCH(NOINLINE)
-    Bench& run(Op&& op);
-
-    /**
-     * @brief Title of the benchmark, will be shown in the table header. Changing the title will start a new markdown table.
-     *
-     * @param benchmarkTitle The title of the benchmark.
-     */
-    Bench& title(char const* benchmarkTitle);
-    Bench& title(std::string const& benchmarkTitle);
-
-    /**
-     * @brief Gets the title of the benchmark
-     */
-    ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
-
-    /// Name of the benchmark, will be shown in the table row.
-    Bench& name(char const* benchmarkName);
-    Bench& name(std::string const& benchmarkName);
-    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
-
-    /**
-     * @brief Set context information.
-     *
-     * The information can be accessed using custom render templates via `{{context(variableName)}}`.
-     * Trying to render a variable that hasn't been set before raises an exception.
-     * Not included in (default) markdown table.
-     *
-     * @see clearContext, render
-     *
-     * @param variableName The name of the context variable.
-     * @param variableValue The value of the context variable.
-     */
-    Bench& context(char const* variableName, char const* variableValue);
-    Bench& context(std::string const& variableName, std::string const& variableValue);
-
-    /**
-     * @brief Reset context information.
-     *
-     * This may improve efficiency when using many context entries,
-     * or improve robustness by removing spurious context entries.
-     *
-     * @see context
-     */
-    Bench& clearContext();
-
-    /**
-     * @brief Sets the batch size.
-     *
-     * E.g. number of processed bytes, or some other metric for the size of the processed data in each iteration. If you benchmark
-     * hashing of a 1000 byte long string and want byte/sec as a result, you can specify 1000 as the batch size.
-     *
-     * @tparam T Any input type is internally cast to `double`.
-     * @param b batch size
-     */
-    template <typename T>
-    Bench& batch(T b) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
-
-    /**
-     * @brief Sets the operation unit.
-     *
-     * Defaults to "op". Could be e.g. "byte" for string processing. This is used for the table header, e.g. to show `ns/byte`. Use
-     * singular (*byte*, not *bytes*). A change clears the currently collected results.
-     *
-     * @param unit The unit name.
-     */
-    Bench& unit(char const* unit);
-    Bench& unit(std::string const& unit);
-    ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
-
-    /**
-     * @brief Sets the time unit to be used for the default output.
-     *
-     * Nanobench defaults to using ns (nanoseconds) as output in the markdown. For some benchmarks this is too coarse, so it is
-     * possible to configure this. E.g. use `timeUnit(1ms, "ms")` to show `ms/op` instead of `ns/op`.
-     *
-     * @param tu Time unit to display the results in, default is 1ns.
-     * @param tuName Name for the time unit, default is "ns"
-     */
-    Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
-    ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
-
-    /**
-     * @brief Set the output stream where the resulting markdown table will be printed to.
-     *
-     * The default is `&std::cout`. You can disable all output by setting `nullptr`.
-     *
-     * @param outstream Pointer to output stream, can be `nullptr`.
-     */
-    Bench& output(std::ostream* outstream) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
-
-    /**
-     * Modern processors have a very accurate clock, being able to measure as low as 20 nanoseconds. This is the main trick that makes
-     * nanobench so fast: we find out how accurate the clock is, then run the benchmark only so often that the clock's accuracy is good
-     * enough for accurate measurements.
-     *
-     * The default is to run one epoch for 1000 times the clock resolution. So for 20ns resolution and 11 epochs, this gives a total
-     * runtime of
-     *
-     * @f[
-     * 20ns * 1000 * 11 \approx 0.2ms
-     * @f]
-     *
-     * To be precise, nanobench adds a 0-20% random noise to each evaluation. This is to prevent any aliasing effects, and further
-     * improves accuracy.
-     *
-     * Total runtime will be higher though: Some initial time is needed to find out the target number of iterations for each epoch, and
-     * there is some overhead involved to start & stop timers and calculate resulting statistics and writing the output.
-     *
-     * @param multiple Target number of times of clock resolution. Usually 1000 is a good compromise between runtime and accuracy.
-     */
-    Bench& clockResolutionMultiple(size_t multiple) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
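-
-    // [Illustrative sketch added by the editor, not part of the original header] The
-    // setters above are designed for chaining, e.g. to report time per processed byte:
-    //
-    //     std::string s(1000, 'x');
-    //     ankerl::nanobench::Bench()
-    //         .unit("byte").batch(s.size())
-    //         .timeUnit(std::chrono::microseconds{1}, "us")
-    //         .run("hash 1000 bytes", [&] {
-    //             ankerl::nanobench::doNotOptimizeAway(std::hash<std::string>{}(s));
-    //         });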
-
-    /**
-     * @brief Controls number of epochs, the number of measurements to perform.
-     *
-     * The reported result will be the median of evaluation of each epoch. The higher you choose this, the more deterministic
-     * the result will be and outliers will be more easily removed. Also the `err%` will be more accurate the higher this
-     * number is. Note that the `err%` will not necessarily decrease when the number of epochs is increased. But it will be a more
-     * accurate representation of the benchmarked code's runtime stability.
-     *
-     * Choose the value wisely. In practice, 11 has been shown to be a reasonable choice between runtime performance and accuracy.
-     * This setting goes hand in hand with minEpochIterations() (or minEpochTime()). If you are more interested in *median* runtime,
-     * you might want to increase epochs(). If you are more interested in *mean* runtime, you might want to increase
-     * minEpochIterations() instead.
-     *
-     * @param numEpochs Number of epochs.
-     */
-    Bench& epochs(size_t numEpochs) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
-
-    /**
-     * @brief Upper limit for the runtime of each epoch.
-     *
-     * As a safety precaution if the clock is not very accurate, we can set an upper limit for the maximum evaluation time per
-     * epoch. Default is 100ms. At least a single evaluation of the benchmark is performed.
-     *
-     * @see minEpochTime, minEpochIterations
-     *
-     * @param t Maximum target runtime for a single epoch.
-     */
-    Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
-
-    /**
-     * @brief Minimum time each epoch should take.
-     *
-     * Default is 1ms; usually relying on clockResolutionMultiple() is exactly what you want. If you see
-     * that the evaluation is unreliable with a high `err%`, you can increase either minEpochTime() or minEpochIterations().
-     *
-     * @see maxEpochTime, minEpochIterations
-     *
-     * @param t Minimum time each epoch should take.
-     */
-    Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
-
-    /**
-     * @brief Sets the minimum number of iterations each epoch should take.
-     *
-     * Default is 1, and we rely on clockResolutionMultiple(). If the `err%` is high and you want a smoother result, you might want
-     * to increase the minimum number of iterations, or increase the minEpochTime().
-     *
-     * @see minEpochTime, maxEpochTime, minEpochIterations
-     *
-     * @param numIters Minimum number of iterations per epoch.
-     */
-    Bench& minEpochIterations(uint64_t numIters) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
-
-    /**
-     * Sets exactly the number of iterations for each epoch. Ignores all other epoch limits. This forces nanobench to use exactly
-     * the given number of iterations for each epoch, not more and not less. Default is 0 (disabled).
-     *
-     * @param numIters Exact number of iterations to use. Set to 0 to disable.
-     */
-    Bench& epochIterations(uint64_t numIters) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
-
-    /**
-     * @brief Sets a number of iterations that are initially performed without any measurements.
-     *
-     * Some benchmarks need a few evaluations to warm up caches / database / whatever access. Normally this should not be needed, since
-     * we show the median result so initial outliers will be filtered away automatically. If the warmup effect is large though, you
-     * might want to set it. Default is 0.
-     *
-     * @param numWarmupIters Number of warmup iterations.
-     */
-    Bench& warmup(uint64_t numWarmupIters) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
-
-    /**
-     * @brief Marks the next run as the baseline.
-     *
-     * Call `relative(true)` to mark the run as the baseline. Successive runs will be compared to this run. It is calculated by
-     *
-     * @f[
-     * 100\% * \frac{baseline}{runtime}
-     * @f]
-     *
-     * * 100% means it is exactly as fast as the baseline
-     * * >100% means it is faster than the baseline. E.g. 200% means the current run is twice as fast as the baseline.
-     * * <100% means it is slower than the baseline. E.g. 50% means it is twice as slow as the baseline.
-     *
-     * See the tutorial section "Comparing Results" for example usage.
-     *
-     * @param isRelativeEnabled True to enable processing
-     */
-    Bench& relative(bool isRelativeEnabled) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
-
-    /**
-     * @brief Enables/disables performance counters.
-     *
-     * On Linux nanobench has a powerful feature to use performance counters. This enables counting of retired instructions, the
-     * number of branches, missed branches, etc. By default this is enabled, but you can disable it if you don't need that feature.
-     *
-     * @param showPerformanceCounters True to enable, false to disable.
-     */
-    Bench& performanceCounters(bool showPerformanceCounters) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
-
-    /**
-     * @brief Retrieves all benchmark results collected by the bench object so far.
-     *
-     * Each call to run() generates a Result that is stored within the Bench instance. This is mostly for advanced users who want to
-     * see all the nitty gritty details.
-     *
-     * @return All results collected so far.
-     */
-    ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
-
-    /*!
-      @verbatim embed:rst
-
-      Convenience shortcut to :cpp:func:`ankerl::nanobench::doNotOptimizeAway`.
-
-      @endverbatim
-    */
-    template <typename Arg>
-    Bench& doNotOptimizeAway(Arg&& arg);
-
-    /*!
-      @verbatim embed:rst
-
-      Sets N for asymptotic complexity calculation, so it becomes possible to calculate `Big O
-      <https://en.wikipedia.org/wiki/Big_O_notation>`_ from multiple benchmark evaluations.
-
-      Use :cpp:func:`ankerl::nanobench::Bench::complexityBigO` when the evaluation has finished. See the tutorial
-      :ref:`asymptotic-complexity` for details.
-
-      @endverbatim
-
-      @tparam T Any type is cast to `double`.
-      @param b Length of N for the next benchmark run, so it is possible to calculate `bigO`.
-    */
-    template <typename T>
-    Bench& complexityN(T b) noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
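-
-    // [Illustrative sketch added by the editor, not part of the original header]
-    // complexityN() feeds the Big O fitting documented below; a typical loop over
-    // growing input sizes looks like this:
-    //
-    //     ankerl::nanobench::Bench bench;
-    //     for (size_t n = 10; n <= 100000; n *= 10) {
-    //         std::vector<int> v(n);
-    //         bench.complexityN(n).run("iota", [&] { std::iota(v.begin(), v.end(), 0); });
-    //     }
-    //     std::cout << bench.complexityBigO() << '\n';   // declared below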
-
-    /*!
-      Calculates [Big O](https://en.wikipedia.org/wiki/Big_O_notation) of the results with all preconfigured complexity functions.
-      Currently these complexity functions are fitted into the benchmark results:
-
-      @f$ \mathcal{O}(1) @f$,
-      @f$ \mathcal{O}(n) @f$,
-      @f$ \mathcal{O}(\log{}n) @f$,
-      @f$ \mathcal{O}(n\log{}n) @f$,
-      @f$ \mathcal{O}(n^2) @f$,
-      @f$ \mathcal{O}(n^3) @f$.
-
-      If we e.g. evaluate the complexity of `std::sort`, this is the result of `std::cout << bench.complexityBigO()`:
-
-      ```
-      |   coefficient |   err% | complexity
-      |--------------:|-------:|------------
-      |   5.08935e-09 |   2.6% | O(n log n)
-      |   6.10608e-08 |   8.0% | O(n)
-      |   1.29307e-11 |  47.2% | O(n^2)
-      |   2.48677e-15 |  69.6% | O(n^3)
-      |   9.88133e-06 | 132.3% | O(log n)
-      |   5.98793e-05 | 162.5% | O(1)
-      ```
-
-      So in this case @f$ \mathcal{O}(n\log{}n) @f$ provides the best approximation.
-
-      @verbatim embed:rst
-      See the tutorial :ref:`asymptotic-complexity` for details.
-      @endverbatim
-      @return Evaluation results, which can be printed or otherwise inspected.
-    */
-    std::vector<BigO> complexityBigO() const;
-
-    /**
-     * @brief Calculates bigO for a custom function.
-     *
-     * E.g. to calculate the mean squared error for @f$ \mathcal{O}(\log{}\log{}n) @f$, which is not part of the default set of
-     * complexityBigO(), you can do this:
-     *
-     * ```
-     * auto logLogN = bench.complexityBigO("O(log log n)", [](double n) {
-     *     return std::log2(std::log2(n));
-     * });
-     * ```
-     *
-     * The resulting mean squared error can be printed with `std::cout << logLogN`. E.g. it prints something like this:
-     *
-     * ```text
-     * 2.46985e-05 * O(log log n), rms=1.48121
-     * ```
-     *
-     * @tparam Op Type of mapping operation.
-     * @param name Name for the function, e.g. "O(log log n)"
-     * @param op Op's operator() maps a `double` with the desired complexity function, e.g. `log2(log2(n))`.
-     * @return BigO Error calculation, which is streamable to std::cout.
-     */
-    template <typename Op>
-    BigO complexityBigO(char const* name, Op op) const;
-
-    template <typename Op>
-    BigO complexityBigO(std::string const& name, Op op) const;
-
-    /*!
-      @verbatim embed:rst
-
-      Convenience shortcut to :cpp:func:`ankerl::nanobench::render`.
-
-      @endverbatim
-    */
-    Bench& render(char const* templateContent, std::ostream& os);
-    Bench& render(std::string const& templateContent, std::ostream& os);
-
-    Bench& config(Config const& benchmarkConfig);
-    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
-
-private:
-    Config mConfig{};
-    std::vector<Result> mResults{};
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-/**
- * @brief Makes sure none of the given arguments are optimized away by the compiler.
- *
- * @tparam Arg Type of the argument that shouldn't be optimized away.
- * @param arg The input that we mark as being used, even though we don't do anything with it.
- */
-template <typename Arg>
-void doNotOptimizeAway(Arg&& arg);
-
-namespace detail {
-
-#if defined(_MSC_VER)
-void doNotOptimizeAwaySink(void const*);
-
-template <typename T>
-void doNotOptimizeAway(T const& val);
-
-#else
-
-// This assembly magic is taken directly from what Google Benchmark does. I have previously used what Facebook's folly was doing, but
-// that seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyway.
-// see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
-template <typename T>
-void doNotOptimizeAway(T const& val) {
-    // NOLINTNEXTLINE(hicpp-no-assembler)
-    asm volatile("" : : "r,m"(val) : "memory");
-}
-
-template <typename T>
-void doNotOptimizeAway(T& val) {
-#    if defined(__clang__)
-    // NOLINTNEXTLINE(hicpp-no-assembler)
-    asm volatile("" : "+r,m"(val) : : "memory");
-#    else
-    // NOLINTNEXTLINE(hicpp-no-assembler)
-    asm volatile("" : "+m,r"(val) : : "memory");
-#    endif
-}
-#endif
-
-// internally used, but visible because run() is templated.
-// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
-// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
-ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
-class IterationLogic {
-public:
-    explicit IterationLogic(Bench const& config) noexcept;
-    ~IterationLogic();
-
-    ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
-    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
-    void moveResultTo(std::vector<Result>& results) noexcept;
-
-private:
-    struct Impl;
-    Impl* mPimpl;
-};
-ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
-
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-class PerformanceCounters {
-public:
-    PerformanceCounters(PerformanceCounters const&) = delete;
-    PerformanceCounters& operator=(PerformanceCounters const&) = delete;
-
-    PerformanceCounters();
-    ~PerformanceCounters();
-
-    void beginMeasure();
-    void endMeasure();
-    void updateResults(uint64_t numIters);
-
-    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
-
-private:
-#if ANKERL_NANOBENCH(PERF_COUNTERS)
-    LinuxPerformanceCounters* mPc = nullptr;
-#endif
-    PerfCountSet<uint64_t> mVal{};
-    PerfCountSet<bool> mHas{};
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-// Gets the singleton
-PerformanceCounters& performanceCounters();
-
-} // namespace detail
-
-class BigO {
-public:
-    using RangeMeasure = std::vector<std::pair<double, double>>;
-
-    template <typename Op>
-    static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
-        for (auto& rangeMeasure : data) {
-            rangeMeasure.first = op(rangeMeasure.first);
-        }
-        return data;
-    }
-
-    static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
-
-    template <typename Op>
-    BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
-        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
-
-    template <typename Op>
-    BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
-        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
-
-    BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
-    BigO(std::string const& bigOName, RangeMeasure const& scaledRangeMeasure);
-    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
-    ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
-
-private:
-    std::string mName{};
-    double mConstant{};
-    double mNormalizedRootMeanSquare{};
-};
-std::ostream& operator<<(std::ostream& os, BigO const& bigO);
-std::ostream& operator<<(std::ostream& os, std::vector<BigO> const& bigOs);
-
-} // namespace nanobench
-} // namespace ankerl
-
-// implementation /////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-
-constexpr uint64_t(Rng::min)() {
-    return 0;
-}
-
-constexpr uint64_t(Rng::max)() {
-    return (std::numeric_limits<uint64_t>::max)();
-}
-
-ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
-uint64_t Rng::operator()() noexcept {
-    auto x = mX;
-
-    mX = UINT64_C(15241094284759029579) * mY;
-    mY = rotl(mY - x, 27);
-
-    return x;
-}
-
-ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
-uint32_t Rng::bounded(uint32_t range) noexcept {
-    uint64_t r32 = static_cast<uint32_t>(operator()());
-    auto multiresult = r32 * range;
-    return static_cast<uint32_t>(multiresult >> 32U);
-}
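-
-// [Illustrative note added by the editor, not part of the original header] Worked
-// example of the multiply-shift reduction in bounded() above, for range = 3:
-//     r32 = 0x00000000 -> (0x00000000 * 3) >> 32 == 0
-//     r32 = 0x7fffffff -> (0x7fffffff * 3) >> 32 == 1
-//     r32 = 0xffffffff -> (0xffffffff * 3) >> 32 == 2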
-
-double Rng::uniform01() noexcept {
-    auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
-    // can't use union in c++ here for type punning, it's undefined behavior.
-    // std::memcpy is optimized anyways.
-    double d;
-    std::memcpy(&d, &i, sizeof(double));
-    return d - 1.0;
-}
-
-template <typename Container>
-void Rng::shuffle(Container& container) noexcept {
-    auto i = container.size();
-    while (i > 1U) {
-        using std::swap;
-        auto n = operator()();
-        // using decltype(i) instead of size_t to be compatible to containers with 32bit index (see #80)
-        auto b1 = static_cast<decltype(i)>((static_cast<uint32_t>(n) * static_cast<uint64_t>(i)) >> 32U);
-        swap(container[--i], container[b1]);
-
-        auto b2 = static_cast<decltype(i)>(((n >> 32U) * static_cast<uint64_t>(i)) >> 32U);
-        swap(container[--i], container[b2]);
-    }
-}
-
-ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
-constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
-    return (x << k) | (x >> (64U - k));
-}
-
-template <typename Op>
-ANKERL_NANOBENCH_NO_SANITIZE("integer")
-Bench& Bench::run(Op&& op) {
-    // It is important that this method is kept short so the compiler can do better optimizations / inlining of op()
-    detail::IterationLogic iterationLogic(*this);
-    auto& pc = detail::performanceCounters();
-
-    while (auto n = iterationLogic.numIters()) {
-        pc.beginMeasure();
-        Clock::time_point before = Clock::now();
-        while (n-- > 0) {
-            op();
-        }
-        Clock::time_point after = Clock::now();
-        pc.endMeasure();
-        pc.updateResults(iterationLogic.numIters());
-        iterationLogic.add(after - before, pc);
-    }
-    iterationLogic.moveResultTo(mResults);
-    return *this;
-}
-
-// Performs all evaluations.
-template <typename Op>
-Bench& Bench::run(char const* benchmarkName, Op&& op) {
-    name(benchmarkName);
-    return run(std::forward<Op>(op));
-}
-
-template <typename Op>
-Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
-    name(benchmarkName);
-    return run(std::forward<Op>(op));
-}
-
-template <typename Op>
-BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
-    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
-}
-
-template <typename Op>
-BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
-    return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
-}
-
-// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
-// Any argument is cast to double.
-template <typename T>
-Bench& Bench::batch(T b) noexcept {
-    mConfig.mBatch = static_cast<double>(b);
-    return *this;
-}
-
-// Sets the computation complexity of the next run. Any argument is cast to double.
-template <typename T>
-Bench& Bench::complexityN(T n) noexcept {
-    mConfig.mComplexityN = static_cast<double>(n);
-    return *this;
-}
-
-// Convenience: makes sure none of the given arguments are optimized away by the compiler.
-template <typename Arg>
-Bench& Bench::doNotOptimizeAway(Arg&& arg) {
-    detail::doNotOptimizeAway(std::forward<Arg>(arg));
-    return *this;
-}
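-
-// [Illustrative sketch added by the editor, not part of the original header] The
-// usual pattern for the doNotOptimizeAway helpers defined below, inside a
-// benchmarked lambda:
-//
-//     uint64_t x = 1;
-//     ankerl::nanobench::Bench().run("increment", [&] {
-//         ankerl::nanobench::doNotOptimizeAway(x += 1);
-//     });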
-
-// Makes sure none of the given arguments are optimized away by the compiler.
-template <typename Arg>
-void doNotOptimizeAway(Arg&& arg) {
-    detail::doNotOptimizeAway(std::forward<Arg>(arg));
-}
-
-namespace detail {
-
-#if defined(_MSC_VER)
-template <typename T>
-void doNotOptimizeAway(T const& val) {
-    doNotOptimizeAwaySink(&val);
-}
-
-#endif
-
-} // namespace detail
-} // namespace nanobench
-} // namespace ankerl
-
-#if defined(ANKERL_NANOBENCH_IMPLEMENT)
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// implementation part - only visible in .cpp
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#    include <algorithm> // sort, reverse
-#    include <atomic>    // compare_exchange_strong in loop overhead
-#    include <cstdlib>   // getenv
-#    include <cstring>   // strstr, strncmp
-#    include <fstream>   // ifstream to parse proc files
-#    include <iomanip>   // setw, setprecision
-#    include <iostream>  // cout
-#    include <numeric>   // accumulate
-#    include <random>    // random_device
-#    include <sstream>   // to_s in Number
-#    include <stdexcept> // throw for rendering templates
-#    include <tuple>     // std::tie
-#    if defined(__linux__)
-#        include <unistd.h> //sysconf
-#    endif
-#    if ANKERL_NANOBENCH(PERF_COUNTERS)
-#        include <map> // map
-
-#        include <linux/perf_event.h>
-#        include <sys/ioctl.h>
-#        include <sys/syscall.h>
-#        include <unistd.h>
-#    endif
-
-// declarations ///////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-
-// helper stuff that is only intended to be used internally
-namespace detail {
-
-struct TableInfo;
-
-// formatting utilities
-namespace fmt {
-
-class NumSep;
-class StreamStateRestorer;
-class Number;
-class MarkDownColumn;
-class MarkDownCode;
-
-} // namespace fmt
-} // namespace detail
-} // namespace nanobench
-} // namespace ankerl
-
-// definitions ////////////////////////////////////////////////////////////////////////////////////
-
-namespace ankerl {
-namespace nanobench {
-
-uint64_t splitMix64(uint64_t& state) noexcept;
-
-namespace detail {
-
-// helpers to get double values
-template <typename T>
-inline double d(T t) noexcept {
-    return static_cast<double>(t);
-}
-inline double d(Clock::duration duration) noexcept {
-    return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
-}
-
-// Calculates clock resolution once, and remembers the result
-inline Clock::duration clockResolution() noexcept;
-
-} // namespace detail
-
-namespace templates {
-
-char const* csv() noexcept {
-    return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
-{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
-{{/result}})DELIM";
-}
-
-char const* htmlBoxplot() noexcept {
-    return R"DELIM(<html>
-
-<head>
-    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
-</head>
-
-<body>
-    <div id="myDiv"></div>
-
-    <script>
-        var data = [
-            {{#result}}{
-                name: '{{name}}',
-                y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/-last}}{{/measurement}}],
-            },
-            {{/result}}
-        ];
-        var title = '{{title}}';
-
-        layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } };
-        Plotly.newPlot('myDiv', data, layout, {responsive: true});
-    </script>
-</body>
-
-</html>)DELIM";
-}
-
-char const* pyperf() noexcept {
-    return R"DELIM({
-    "benchmarks": [
-        {
-            "runs": [
-                {
-                    "values": [
-{{#measurement}}                        {{elapsed}}{{^-last}},
-{{/-last}}{{/measurement}}                    ]
-                }
-            ]
-        }
-    ],
-    "metadata": {
-        "loops": {{sum(iterations)}},
-        "inner_loops": {{batch}},
-        "name": "{{title}}",
-        "unit": "second"
-    },
-    "version": "1.0"
-})DELIM";
-}
-
-char const* json() noexcept {
-    return R"DELIM({
- "results": [
-{{#result}}  {
-   "title": "{{title}}",
-   "name": "{{name}}",
-   "unit": "{{unit}}",
-   "batch": {{batch}},
-   "complexityN": {{complexityN}},
-   "epochs": {{epochs}},
-   "clockResolution": {{clockResolution}},
-   "clockResolutionMultiple": {{clockResolutionMultiple}},
-   "maxEpochTime": {{maxEpochTime}},
-   "minEpochTime": {{minEpochTime}},
-   "minEpochIterations": {{minEpochIterations}},
-   "epochIterations": {{epochIterations}},
-   "warmup": {{warmup}},
-   "relative": {{relative}},
-   "median(elapsed)": {{median(elapsed)}},
-   "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
-   "median(instructions)": {{median(instructions)}},
-   "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
-   "median(cpucycles)": {{median(cpucycles)}},
-   "median(contextswitches)": {{median(contextswitches)}},
-   "median(pagefaults)": {{median(pagefaults)}},
-   "median(branchinstructions)": {{median(branchinstructions)}},
-   "median(branchmisses)": {{median(branchmisses)}},
-   "totalTime": {{sumProduct(iterations, elapsed)}},
-   "measurements": [
-{{#measurement}}    {
-     "iterations": {{iterations}},
-     "elapsed": {{elapsed}},
-     "pagefaults": {{pagefaults}},
-     "cpucycles": {{cpucycles}},
-     "contextswitches": {{contextswitches}},
-     "instructions": {{instructions}},
-     "branchinstructions": {{branchinstructions}},
-     "branchmisses": {{branchmisses}}
-    }{{^-last}},{{/-last}}
-{{/measurement}}   ]
-  }{{^-last}},{{/-last}}
-{{/result}} ]
-})DELIM";
-}
-
-ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
-struct Node {
-    enum class Type { tag, content, section, inverted_section };
-
-    char const* begin;
-    char const* end;
-    std::vector<Node> children;
-    Type type;
-
-    template <size_t N>
-    // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-    bool operator==(char const (&str)[N]) const noexcept {
-        return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
-    }
-};
-ANKERL_NANOBENCH(IGNORE_PADDED_POP)
-
-static std::vector<Node> parseMustacheTemplate(char const** tpl) {
-    std::vector<Node> nodes;
-
-    while (true) {
-        auto begin = std::strstr(*tpl, "{{");
-        auto end = begin;
-        if (begin != nullptr) {
-            begin += 2;
-            end = std::strstr(begin, "}}");
-        }
-
-        if (begin == nullptr || end == nullptr) {
-            // nothing found, finish node
-            nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
-            return nodes;
-        }
-
-        nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
-
-        // we found a tag
-        *tpl = end + 2;
-        switch (*begin) {
-        case '/':
-            // finished! bail out
-            return nodes;
-
-        case '#':
-            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
-            break;
-
-        case '^':
-            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
-            break;
-
-        default:
-            nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
-            break;
-        }
-    }
-}
-
-static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
-    ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
-    bool matchFirst = n == "-first";
-    bool matchLast = n == "-last";
-    if (!matchFirst && !matchLast) {
-        return false;
-    }
-
-    bool doWrite = false;
-    if (n.type == Node::Type::section) {
-        doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
-    } else if (n.type == Node::Type::inverted_section) {
-        doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
-    }
-
-    if (doWrite) {
-        for (auto const& child : n.children) {
-            if (child.type == Node::Type::content) {
-                out.write(child.begin, std::distance(child.begin, child.end));
-            }
-        }
-    }
-    return true;
-}
-
-static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
-    matchResult.clear();
-    auto idxOpen = str.find('(');
-    auto idxClose = str.find(')', idxOpen);
-    if (idxClose == std::string::npos) {
-        return false;
-    }
-
-    matchResult.emplace_back(str.substr(0, idxOpen));
-
-    // split by comma
-    matchResult.emplace_back(std::string{});
-    for (size_t i = idxOpen + 1; i != idxClose; ++i) {
-        if (str[i] == ' ' || str[i] == '\t') {
-            // skip whitespace
-            continue;
-        }
-        if (str[i] == ',') {
-            // got a comma => new string
-            matchResult.emplace_back(std::string{});
-            continue;
-        }
-        // no whitespace no comma, append
-        matchResult.back() += str[i];
-    }
-    return true;
-}
-
-static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
-    using detail::d;
-
-    if (n == "title") {
-        out << config.mBenchmarkTitle;
-        return true;
-    } else if (n == "name") {
-        out << config.mBenchmarkName;
-        return true;
-    } else if (n == "unit") {
-        out << config.mUnit;
-        return true;
-    } else if (n == "batch") {
-        out << config.mBatch;
-        return true;
-    } else if (n == "complexityN") {
-        out << config.mComplexityN;
-        return true;
-    } else if (n == "epochs") {
-        out << config.mNumEpochs;
-        return true;
-    } else if (n == "clockResolution") {
-        out << d(detail::clockResolution());
-        return true;
-    } else if (n == "clockResolutionMultiple") {
-        out << config.mClockResolutionMultiple;
-        return true;
-    } else if (n == "maxEpochTime") {
-        out << d(config.mMaxEpochTime);
-        return true;
-    } else if (n == "minEpochTime") {
-        out << d(config.mMinEpochTime);
-        return true;
-    } else if (n == "minEpochIterations") {
-        out << config.mMinEpochIterations;
-        return true;
-    } else if (n == "epochIterations") {
-        out << config.mEpochIterations;
-        return true;
-    } else if (n == "warmup") {
-        out << config.mWarmup;
-        return true;
-    } else if (n == "relative") {
-        out << config.mIsRelative;
-        return true;
-    }
-    return false;
-}
-
-static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
-    if (generateConfigTag(n, r.config(), out)) {
-        return out;
-    }
"median(elapsed)" - // g++ 4.8 doesn't implement std::regex :( - // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$"); - // std::cmatch matchResult; - // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) { - std::vector matchResult; - if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) { - if (matchResult.size() == 2) { - if (matchResult[0] == "context") { - return out << r.context(matchResult[1]); - } - - auto m = Result::fromString(matchResult[1]); - if (m == Result::Measure::_size) { - return out << 0.0; - } - - if (matchResult[0] == "median") { - return out << r.median(m); - } - if (matchResult[0] == "average") { - return out << r.average(m); - } - if (matchResult[0] == "medianAbsolutePercentError") { - return out << r.medianAbsolutePercentError(m); - } - if (matchResult[0] == "sum") { - return out << r.sum(m); - } - if (matchResult[0] == "minimum") { - return out << r.minimum(m); - } - if (matchResult[0] == "maximum") { - return out << r.maximum(m); - } - } else if (matchResult.size() == 3) { - auto m1 = Result::fromString(matchResult[1]); - auto m2 = Result::fromString(matchResult[2]); - if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) { - return out << 0.0; - } - - if (matchResult[0] == "sumProduct") { - return out << r.sumProduct(m1, m2); - } - } - } - - // match e.g. "sumProduct(elapsed, iterations)" - // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$"); - - // nothing matches :( - throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood"); -} - -static void generateResultMeasurement(std::vector const& nodes, size_t idx, Result const& r, std::ostream& out) { - for (auto const& n : nodes) { - if (!generateFirstLast(n, idx, r.size(), out)) { - ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); - switch (n.type) { - case Node::Type::content: - out.write(n.begin, std::distance(n.begin, n.end)); - break; - - case Node::Type::inverted_section: - throw std::runtime_error("got a inverted section inside measurement"); - - case Node::Type::section: - throw std::runtime_error("got a section inside measurement"); - - case Node::Type::tag: { - auto m = Result::fromString(std::string(n.begin, n.end)); - if (m == Result::Measure::_size || !r.has(m)) { - out << 0.0; - } else { - out << r.get(idx, m); - } - break; - } - } - } - } -} - -static void generateResult(std::vector const& nodes, size_t idx, std::vector const& results, std::ostream& out) { - auto const& r = results[idx]; - for (auto const& n : nodes) { - if (!generateFirstLast(n, idx, results.size(), out)) { - ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); - switch (n.type) { - case Node::Type::content: - out.write(n.begin, std::distance(n.begin, n.end)); - break; - - case Node::Type::inverted_section: - throw std::runtime_error("got a inverted section inside result"); - - case Node::Type::section: - if (n == "measurement") { - for (size_t i = 0; i < r.size(); ++i) { - generateResultMeasurement(n.children, i, r, out); - } - } else { - throw std::runtime_error("got a section inside result"); - } - break; - - case Node::Type::tag: - generateResultTag(n, r, out); - break; - } - } - } -} - -} // namespace templates - -// helper stuff that only intended to be used internally -namespace detail { - -char const* getEnv(char const* name); -bool isEndlessRunning(std::string const& name); -bool isWarningsEnabled(); - -template -T parseFile(std::string const& filename); - -void 
gatherStabilityInformation(std::vector& warnings, std::vector& recommendations); -void printStabilityInformationOnce(std::ostream* os); - -// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry. -uint64_t& singletonHeaderHash() noexcept; - -// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference. -Clock::duration calcClockResolution(size_t numEvaluations) noexcept; - -// formatting utilities -namespace fmt { - -// adds thousands separator to numbers -ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) -class NumSep : public std::numpunct { -public: - explicit NumSep(char sep); - char do_thousands_sep() const override; - std::string do_grouping() const override; - -private: - char mSep; -}; -ANKERL_NANOBENCH(IGNORE_PADDED_POP) - -// RAII to save & restore a stream's state -ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) -class StreamStateRestorer { -public: - explicit StreamStateRestorer(std::ostream& s); - ~StreamStateRestorer(); - - // sets back all stream info that we remembered at construction - void restore(); - - // don't allow copying / moving - StreamStateRestorer(StreamStateRestorer const&) = delete; - StreamStateRestorer& operator=(StreamStateRestorer const&) = delete; - StreamStateRestorer(StreamStateRestorer&&) = delete; - StreamStateRestorer& operator=(StreamStateRestorer&&) = delete; - -private: - std::ostream& mStream; - std::locale mLocale; - std::streamsize const mPrecision; - std::streamsize const mWidth; - std::ostream::char_type const mFill; - std::ostream::fmtflags const mFmtFlags; -}; -ANKERL_NANOBENCH(IGNORE_PADDED_POP) - -// Number formatter -class Number { -public: - Number(int width, int precision, double value); - Number(int width, int precision, int64_t value); - std::string to_s() const; - -private: - friend std::ostream& operator<<(std::ostream& os, Number const& n); - std::ostream& write(std::ostream& os) const; - - int mWidth; - int mPrecision; - double mValue; -}; - -// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent -std::string to_s(uint64_t s); - -std::ostream& operator<<(std::ostream& os, Number const& n); - -class MarkDownColumn { -public: - MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val); - std::string title() const; - std::string separator() const; - std::string invalid() const; - std::string value() const; - -private: - int mWidth; - int mPrecision; - std::string mTitle; - std::string mSuffix; - double mValue; -}; - -// Formats any text as markdown code, escaping backticks. 
-class MarkDownCode { -public: - explicit MarkDownCode(std::string const& what); - -private: - friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode); - std::ostream& write(std::ostream& os) const; - - std::string mWhat{}; -}; - -std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode); - -} // namespace fmt -} // namespace detail -} // namespace nanobench -} // namespace ankerl - -// implementation ///////////////////////////////////////////////////////////////////////////////// - -namespace ankerl { -namespace nanobench { - -void render(char const* mustacheTemplate, std::vector const& results, std::ostream& out) { - detail::fmt::StreamStateRestorer restorer(out); - - out.precision(std::numeric_limits::digits10); - auto nodes = templates::parseMustacheTemplate(&mustacheTemplate); - - for (auto const& n : nodes) { - ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); - switch (n.type) { - case templates::Node::Type::content: - out.write(n.begin, std::distance(n.begin, n.end)); - break; - - case templates::Node::Type::inverted_section: - throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'"); - - case templates::Node::Type::section: - if (n == "result") { - const size_t nbResults = results.size(); - for (size_t i = 0; i < nbResults; ++i) { - generateResult(n.children, i, results, out); - } - } else if (n == "measurement") { - if (results.size() != 1) { - throw std::runtime_error( - "render: can only use section 'measurement' here if there is a single result, but there are " + - detail::fmt::to_s(results.size())); - } - // when we only have a single result, we can immediately go into its measurement. - auto const& r = results.front(); - for (size_t i = 0; i < r.size(); ++i) { - generateResultMeasurement(n.children, i, r, out); - } - } else { - throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'"); - } - break; - - case templates::Node::Type::tag: - if (results.size() == 1) { - // result & config are both supported there - generateResultTag(n, results.front(), out); - } else { - // This just uses the last result's config. 
- if (!generateConfigTag(n, results.back().config(), out)) { - throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'"); - } - } - break; - } - } -} - -void render(std::string const& mustacheTemplate, std::vector const& results, std::ostream& out) { - render(mustacheTemplate.c_str(), results, out); -} - -void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) { - render(mustacheTemplate, bench.results(), out); -} - -void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) { - render(mustacheTemplate.c_str(), bench.results(), out); -} - -namespace detail { - -PerformanceCounters& performanceCounters() { -# if defined(__clang__) -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wexit-time-destructors" -# endif - static PerformanceCounters pc; -# if defined(__clang__) -# pragma clang diagnostic pop -# endif - return pc; -} - -// Windows version of doNotOptimizeAway -// see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307 -// see https://github.com/facebook/folly/blob/master/folly/Benchmark.h#L280 -// see https://docs.microsoft.com/en-us/cpp/preprocessor/optimize -# if defined(_MSC_VER) -# pragma optimize("", off) -void doNotOptimizeAwaySink(void const*) {} -# pragma optimize("", on) -# endif - -template -T parseFile(std::string const& filename) { - std::ifstream fin(filename); - T num{}; - fin >> num; - return num; -} - -char const* getEnv(char const* name) { -# if defined(_MSC_VER) -# pragma warning(push) -# pragma warning(disable : 4996) // getenv': This function or variable may be unsafe. -# endif - return std::getenv(name); -# if defined(_MSC_VER) -# pragma warning(pop) -# endif -} - -bool isEndlessRunning(std::string const& name) { - auto endless = getEnv("NANOBENCH_ENDLESS"); - return nullptr != endless && endless == name; -} - -// True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0" -bool isWarningsEnabled() { - auto suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS"); - return nullptr == suppression || suppression == std::string("0"); -} - -void gatherStabilityInformation(std::vector& warnings, std::vector& recommendations) { - warnings.clear(); - recommendations.clear(); - - bool recommendCheckFlags = false; - -# if defined(DEBUG) - warnings.emplace_back("DEBUG defined"); - recommendCheckFlags = true; -# endif - - bool recommendPyPerf = false; -# if defined(__linux__) - auto nprocs = sysconf(_SC_NPROCESSORS_CONF); - if (nprocs <= 0) { - warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible"); - } else { - - // check frequency scaling - for (long id = 0; id < nprocs; ++id) { - auto idStr = detail::fmt::to_s(static_cast(id)); - auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr; - auto minFreq = parseFile(sysCpu + "/cpufreq/scaling_min_freq"); - auto maxFreq = parseFile(sysCpu + "/cpufreq/scaling_max_freq"); - if (minFreq != maxFreq) { - auto minMHz = static_cast(minFreq) / 1000.0; - auto maxMHz = static_cast(maxFreq) / 1000.0; - warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " + - detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() + - " MHz"); - recommendPyPerf = true; - break; - } - } - - auto currentGovernor = parseFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor"); - if ("performance" != currentGovernor) { - warnings.emplace_back("CPU governor is '" + currentGovernor + "' but 
should be 'performance'"); - recommendPyPerf = true; - } - - if (0 == parseFile("/sys/devices/system/cpu/intel_pstate/no_turbo")) { - warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate"); - recommendPyPerf = true; - } - } -# endif - - if (recommendCheckFlags) { - recommendations.emplace_back("Make sure you compile for Release"); - } - if (recommendPyPerf) { - recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf"); - } -} - -void printStabilityInformationOnce(std::ostream* outStream) { - static bool shouldPrint = true; - if (shouldPrint && outStream && isWarningsEnabled()) { - auto& os = *outStream; - shouldPrint = false; - std::vector warnings; - std::vector recommendations; - gatherStabilityInformation(warnings, recommendations); - if (warnings.empty()) { - return; - } - - os << "Warning, results might be unstable:" << std::endl; - for (auto const& w : warnings) { - os << "* " << w << std::endl; - } - - os << std::endl << "Recommendations" << std::endl; - for (auto const& r : recommendations) { - os << "* " << r << std::endl; - } - } -} - -// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry. -uint64_t& singletonHeaderHash() noexcept { - static uint64_t sHeaderHash{}; - return sHeaderHash; -} - -ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") -inline uint64_t hash_combine(uint64_t seed, uint64_t val) { - return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U)); -} - -// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference. -Clock::duration calcClockResolution(size_t numEvaluations) noexcept { - auto bestDuration = Clock::duration::max(); - Clock::time_point tBegin; - Clock::time_point tEnd; - for (size_t i = 0; i < numEvaluations; ++i) { - tBegin = Clock::now(); - do { - tEnd = Clock::now(); - } while (tBegin == tEnd); - bestDuration = (std::min)(bestDuration, tEnd - tBegin); - } - return bestDuration; -} - -// Calculates clock resolution once, and remembers the result -Clock::duration clockResolution() noexcept { - static Clock::duration sResolution = calcClockResolution(20); - return sResolution; -} - -ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) -struct IterationLogic::Impl { - enum class State { warmup, upscaling_runtime, measuring, endless }; - - explicit Impl(Bench const& bench) - : mBench(bench) - , mResult(bench.config()) { - printStabilityInformationOnce(mBench.output()); - - // determine target runtime per epoch - mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple(); - if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) { - mTargetRuntimePerEpoch = mBench.maxEpochTime(); - } - if (mTargetRuntimePerEpoch < mBench.minEpochTime()) { - mTargetRuntimePerEpoch = mBench.minEpochTime(); - } - - if (isEndlessRunning(mBench.name())) { - std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl; - mNumIters = (std::numeric_limits::max)(); - mState = State::endless; - } else if (0 != mBench.warmup()) { - mNumIters = mBench.warmup(); - mState = State::warmup; - } else if (0 != mBench.epochIterations()) { - // exact number of iterations - mNumIters = mBench.epochIterations(); - mState = State::measuring; - } else { - mNumIters = mBench.minEpochIterations(); - mState = State::upscaling_runtime; - } - } - - // directly calculates new iters based on elapsed&iters, and adds a 10% noise. 
Makes sure we don't underflow. - ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept { - auto doubleElapsed = d(elapsed); - auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch); - auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters); - - auto doubleMinEpochIters = d(mBench.minEpochIterations()); - if (doubleNewIters < doubleMinEpochIters) { - doubleNewIters = doubleMinEpochIters; - } - doubleNewIters *= 1.0 + 0.2 * mRng.uniform01(); - - // +0.5 for correct rounding when casting - // NOLINTNEXTLINE(bugprone-incorrect-roundings) - return static_cast(doubleNewIters + 0.5); - } - - ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) { - if (elapsed * 10 < mTargetRuntimePerEpoch) { - // we are far below the target runtime. Multiply iterations by 10 (with overflow check) - if (mNumIters * 10 < mNumIters) { - // overflow :-( - showResult("iterations overflow. Maybe your code got optimized away?"); - mNumIters = 0; - return; - } - mNumIters *= 10; - } else { - mNumIters = calcBestNumIters(elapsed, mNumIters); - } - } - - void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept { -# if defined(ANKERL_NANOBENCH_LOG_ENABLED) - auto oldIters = mNumIters; -# endif - - switch (mState) { - case State::warmup: - if (isCloseEnoughForMeasurements(elapsed)) { - // if elapsed is close enough, we can skip upscaling and go right to measurements - // still, we don't add the result to the measurements. - mState = State::measuring; - mNumIters = calcBestNumIters(elapsed, mNumIters); - } else { - // not close enough: switch to upscaling - mState = State::upscaling_runtime; - upscale(elapsed); - } - break; - - case State::upscaling_runtime: - if (isCloseEnoughForMeasurements(elapsed)) { - // if we are close enough, add measurement and switch to always measuring - mState = State::measuring; - mTotalElapsed += elapsed; - mTotalNumIters += mNumIters; - mResult.add(elapsed, mNumIters, pc); - mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters); - } else { - upscale(elapsed); - } - break; - - case State::measuring: - // just add measurements - no questions asked. Even when runtime is low. But we can't ignore - // that fluctuation, or else we would bias the result - mTotalElapsed += elapsed; - mTotalNumIters += mNumIters; - mResult.add(elapsed, mNumIters, pc); - if (0 != mBench.epochIterations()) { - mNumIters = mBench.epochIterations(); - } else { - mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters); - } - break; - - case State::endless: - mNumIters = (std::numeric_limits::max)(); - break; - } - - if (static_cast(mResult.size()) == mBench.epochs()) { - // we got all the results that we need, finish it - showResult(""); - mNumIters = 0; - } - - ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast(elapsed.count())) << " elapsed, " - << detail::fmt::Number(20, 3, static_cast(mTargetRuntimePerEpoch.count())) - << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters - << ", mState=" << static_cast(mState)); - } - - void showResult(std::string const& errorMessage) const { - ANKERL_NANOBENCH_LOG(errorMessage); - - if (mBench.output() != nullptr) { - // prepare column data /////// - std::vector columns; - - auto rMedian = mResult.median(Result::Measure::elapsed); - - if (mBench.relative()) { - double d = 100.0; - if (!mBench.results().empty()) { - d = rMedian <= 0.0 ? 
0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0; - } - columns.emplace_back(11, 1, "relative", "%", d); - } - - if (mBench.complexityN() > 0) { - columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN()); - } - - columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "", - rMedian / (mBench.timeUnit().count() * mBench.batch())); - columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian); - - double rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed); - columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0); - - double rInsMedian = -1.0; - if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) { - rInsMedian = mResult.median(Result::Measure::instructions); - columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch()); - } - - double rCycMedian = -1.0; - if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) { - rCycMedian = mResult.median(Result::Measure::cpucycles); - columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch()); - } - if (rInsMedian > 0.0 && rCycMedian > 0.0) { - columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian); - } - if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) { - double rBraMedian = mResult.median(Result::Measure::branchinstructions); - columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch()); - if (mResult.has(Result::Measure::branchmisses)) { - double p = 0.0; - if (rBraMedian >= 1e-9) { - p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian; - } - columns.emplace_back(10, 1, "miss%", "%", p); - } - } - - columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed)); - - // write everything - auto& os = *mBench.output(); - - // combine all elements that are relevant for printing the header - uint64_t hash = 0; - hash = hash_combine(std::hash{}(mBench.unit()), hash); - hash = hash_combine(std::hash{}(mBench.title()), hash); - hash = hash_combine(std::hash{}(mBench.timeUnitName()), hash); - hash = hash_combine(std::hash{}(mBench.timeUnit().count()), hash); - hash = hash_combine(std::hash{}(mBench.relative()), hash); - hash = hash_combine(std::hash{}(mBench.performanceCounters()), hash); - - if (hash != singletonHeaderHash()) { - singletonHeaderHash() = hash; - - // no result yet, print header - os << std::endl; - for (auto const& col : columns) { - os << col.title(); - } - os << "| " << mBench.title() << std::endl; - - for (auto const& col : columns) { - os << col.separator(); - } - os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl; - } - - if (!errorMessage.empty()) { - for (auto const& col : columns) { - os << col.invalid(); - } - os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl; - } else { - for (auto const& col : columns) { - os << col.value(); - } - os << "| "; - auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05; - if (showUnstable) { - os << ":wavy_dash: "; - } - os << fmt::MarkDownCode(mBench.name()); - if (showUnstable) { - auto avgIters = static_cast(mTotalNumIters) / static_cast(mBench.epochs()); - // NOLINTNEXTLINE(bugprone-incorrect-roundings) - auto suggestedIters = static_cast(avgIters * 10 + 0.5); - - os << " (Unstable with ~" << detail::fmt::Number(1, 1, 
avgIters) - << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")"; - } - os << std::endl; - } - } - } - - ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept { - return elapsed * 3 >= mTargetRuntimePerEpoch * 2; - } - - uint64_t mNumIters = 1; - Bench const& mBench; - std::chrono::nanoseconds mTargetRuntimePerEpoch{}; - Result mResult; - Rng mRng{123}; - std::chrono::nanoseconds mTotalElapsed{}; - uint64_t mTotalNumIters = 0; - - State mState = State::upscaling_runtime; -}; -ANKERL_NANOBENCH(IGNORE_PADDED_POP) - -IterationLogic::IterationLogic(Bench const& bench) noexcept - : mPimpl(new Impl(bench)) {} - -IterationLogic::~IterationLogic() { - if (mPimpl) { - delete mPimpl; - } -} - -uint64_t IterationLogic::numIters() const noexcept { - ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters); - return mPimpl->mNumIters; -} - -void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept { - mPimpl->add(elapsed, pc); -} - -void IterationLogic::moveResultTo(std::vector& results) noexcept { - results.emplace_back(std::move(mPimpl->mResult)); -} - -# if ANKERL_NANOBENCH(PERF_COUNTERS) - -ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) -class LinuxPerformanceCounters { -public: - struct Target { - Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_) - : targetValue(targetValue_) - , correctMeasuringOverhead(correctMeasuringOverhead_) - , correctLoopOverhead(correctLoopOverhead_) {} - - uint64_t* targetValue{}; - bool correctMeasuringOverhead{}; - bool correctLoopOverhead{}; - }; - - ~LinuxPerformanceCounters(); - - // quick operation - inline void start() {} - - inline void stop() {} - - bool monitor(perf_sw_ids swId, Target target); - bool monitor(perf_hw_id hwId, Target target); - - bool hasError() const noexcept { - return mHasError; - } - - // Just reading data is faster than enable & disabling. - // we subtract data ourselves. 
- inline void beginMeasure() { - if (mHasError) { - return; - } - - // NOLINTNEXTLINE(hicpp-signed-bitwise) - mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); - if (mHasError) { - return; - } - - // NOLINTNEXTLINE(hicpp-signed-bitwise) - mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); - } - - inline void endMeasure() { - if (mHasError) { - return; - } - - // NOLINTNEXTLINE(hicpp-signed-bitwise) - mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP)); - if (mHasError) { - return; - } - - auto const numBytes = sizeof(uint64_t) * mCounters.size(); - auto ret = read(mFd, mCounters.data(), numBytes); - mHasError = ret != static_cast(numBytes); - } - - void updateResults(uint64_t numIters); - - // rounded integer division - template - static inline T divRounded(T a, T divisor) { - return (a + divisor / 2) / divisor; - } - - ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") - static inline uint32_t mix(uint32_t x) noexcept { - x ^= x << 13; - x ^= x >> 17; - x ^= x << 5; - return x; - } - - template - ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") - void calibrate(Op&& op) { - // clear current calibration data, - for (auto& v : mCalibratedOverhead) { - v = UINT64_C(0); - } - - // create new calibration data - auto newCalibration = mCalibratedOverhead; - for (auto& v : newCalibration) { - v = (std::numeric_limits::max)(); - } - for (size_t iter = 0; iter < 100; ++iter) { - beginMeasure(); - op(); - endMeasure(); - if (mHasError) { - return; - } - - for (size_t i = 0; i < newCalibration.size(); ++i) { - auto diff = mCounters[i]; - if (newCalibration[i] > diff) { - newCalibration[i] = diff; - } - } - } - - mCalibratedOverhead = std::move(newCalibration); - - { - // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles. - // marsaglia's xorshift: mov, sal/shr, xor. Times 3. - // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further. - // see https://godbolt.org/z/49RVQ5 - uint64_t const numIters = 100000U + (std::random_device{}() & 3); - uint64_t n = numIters; - uint32_t x = 1234567; - - beginMeasure(); - while (n-- > 0) { - x = mix(x); - } - endMeasure(); - detail::doNotOptimizeAway(x); - auto measure1 = mCounters; - - n = numIters; - beginMeasure(); - while (n-- > 0) { - // we now run *twice* so we can easily calculate the overhead - x = mix(x); - x = mix(x); - } - endMeasure(); - detail::doNotOptimizeAway(x); - auto measure2 = mCounters; - - for (size_t i = 0; i < mCounters.size(); ++i) { - // factor 2 because we have two instructions per loop - auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0; - auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0; - auto overhead = m1 * 2 > m2 ? 
m1 * 2 - m2 : 0; - - mLoopOverhead[i] = divRounded(overhead, numIters); - } - } - } - -private: - bool monitor(uint32_t type, uint64_t eventid, Target target); - - std::map mIdToTarget{}; - - // start with minimum size of 3 for read_format - std::vector mCounters{3}; - std::vector mCalibratedOverhead{3}; - std::vector mLoopOverhead{3}; - - uint64_t mTimeEnabledNanos = 0; - uint64_t mTimeRunningNanos = 0; - int mFd = -1; - bool mHasError = false; -}; -ANKERL_NANOBENCH(IGNORE_PADDED_POP) - -LinuxPerformanceCounters::~LinuxPerformanceCounters() { - if (-1 != mFd) { - close(mFd); - } -} - -bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) { - return monitor(PERF_TYPE_SOFTWARE, swId, target); -} - -bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) { - return monitor(PERF_TYPE_HARDWARE, hwId, target); -} - -// overflow is ok, it's checked -ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") -void LinuxPerformanceCounters::updateResults(uint64_t numIters) { - // clear old data - for (auto& id_value : mIdToTarget) { - *id_value.second.targetValue = UINT64_C(0); - } - - if (mHasError) { - return; - } - - mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1]; - mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2]; - - for (uint64_t i = 0; i < mCounters[0]; ++i) { - auto idx = static_cast(3 + i * 2 + 0); - auto id = mCounters[idx + 1U]; - - auto it = mIdToTarget.find(id); - if (it != mIdToTarget.end()) { - - auto& tgt = it->second; - *tgt.targetValue = mCounters[idx]; - if (tgt.correctMeasuringOverhead) { - if (*tgt.targetValue >= mCalibratedOverhead[idx]) { - *tgt.targetValue -= mCalibratedOverhead[idx]; - } else { - *tgt.targetValue = 0U; - } - } - if (tgt.correctLoopOverhead) { - auto correctionVal = mLoopOverhead[idx] * numIters; - if (*tgt.targetValue >= correctionVal) { - *tgt.targetValue -= correctionVal; - } else { - *tgt.targetValue = 0U; - } - } - } - } -} - -bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) { - *target.targetValue = (std::numeric_limits::max)(); - if (mHasError) { - return false; - } - - auto pea = perf_event_attr(); - std::memset(&pea, 0, sizeof(perf_event_attr)); - pea.type = type; - pea.size = sizeof(perf_event_attr); - pea.config = eventid; - pea.disabled = 1; // start counter as disabled - pea.exclude_kernel = 1; - pea.exclude_hv = 1; - - // NOLINTNEXTLINE(hicpp-signed-bitwise) - pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; - - const int pid = 0; // the current process - const int cpu = -1; // all CPUs -# if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14 - const unsigned long flags = PERF_FLAG_FD_CLOEXEC; -# else - const unsigned long flags = 0; -# endif - - auto fd = static_cast(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags)); - if (-1 == fd) { - return false; - } - if (-1 == mFd) { - // first call: set to fd, and use this from now on - mFd = fd; - } - uint64_t id = 0; - // NOLINTNEXTLINE(hicpp-signed-bitwise) - if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) { - // couldn't get id - return false; - } - - // insert into map, rely on the fact that map's references are constant. 
- mIdToTarget.emplace(id, target); - - // prepare readformat with the correct size (after the insert) - auto size = 3 + 2 * mIdToTarget.size(); - mCounters.resize(size); - mCalibratedOverhead.resize(size); - mLoopOverhead.resize(size); - - return true; -} - -PerformanceCounters::PerformanceCounters() - : mPc(new LinuxPerformanceCounters()) - , mVal() - , mHas() { - - mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false)); - mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false)); - mHas.contextSwitches = - mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false)); - mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true)); - mHas.branchInstructions = - mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false)); - mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false)); - // mHas.branchMisses = false; - - mPc->start(); - mPc->calibrate([] { - auto before = ankerl::nanobench::Clock::now(); - auto after = ankerl::nanobench::Clock::now(); - (void)before; - (void)after; - }); - - if (mPc->hasError()) { - // something failed, don't monitor anything. - mHas = PerfCountSet{}; - } -} - -PerformanceCounters::~PerformanceCounters() { - if (nullptr != mPc) { - delete mPc; - } -} - -void PerformanceCounters::beginMeasure() { - mPc->beginMeasure(); -} - -void PerformanceCounters::endMeasure() { - mPc->endMeasure(); -} - -void PerformanceCounters::updateResults(uint64_t numIters) { - mPc->updateResults(numIters); -} - -# else - -PerformanceCounters::PerformanceCounters() = default; -PerformanceCounters::~PerformanceCounters() = default; -void PerformanceCounters::beginMeasure() {} -void PerformanceCounters::endMeasure() {} -void PerformanceCounters::updateResults(uint64_t) {} - -# endif - -ANKERL_NANOBENCH(NODISCARD) PerfCountSet const& PerformanceCounters::val() const noexcept { - return mVal; -} -ANKERL_NANOBENCH(NODISCARD) PerfCountSet const& PerformanceCounters::has() const noexcept { - return mHas; -} - -// formatting utilities -namespace fmt { - -// adds thousands separator to numbers -NumSep::NumSep(char sep) - : mSep(sep) {} - -char NumSep::do_thousands_sep() const { - return mSep; -} - -std::string NumSep::do_grouping() const { - return "\003"; -} - -// RAII to save & restore a stream's state -StreamStateRestorer::StreamStateRestorer(std::ostream& s) - : mStream(s) - , mLocale(s.getloc()) - , mPrecision(s.precision()) - , mWidth(s.width()) - , mFill(s.fill()) - , mFmtFlags(s.flags()) {} - -StreamStateRestorer::~StreamStateRestorer() { - restore(); -} - -// sets back all stream info that we remembered at construction -void StreamStateRestorer::restore() { - mStream.imbue(mLocale); - mStream.precision(mPrecision); - mStream.width(mWidth); - mStream.fill(mFill); - mStream.flags(mFmtFlags); -} - -Number::Number(int width, int precision, int64_t value) - : mWidth(width) - , mPrecision(precision) - , mValue(static_cast(value)) {} - -Number::Number(int width, int precision, double value) - : mWidth(width) - , mPrecision(precision) - , mValue(value) {} - -std::ostream& Number::write(std::ostream& os) const { - StreamStateRestorer restorer(os); - os.imbue(std::locale(os.getloc(), new NumSep(','))); - os << 
std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue; - return os; -} - -std::string Number::to_s() const { - std::stringstream ss; - write(ss); - return ss.str(); -} - -std::string to_s(uint64_t n) { - std::string str; - do { - str += static_cast('0' + static_cast(n % 10)); - n /= 10; - } while (n != 0); - std::reverse(str.begin(), str.end()); - return str; -} - -std::ostream& operator<<(std::ostream& os, Number const& n) { - return n.write(os); -} - -MarkDownColumn::MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val) - : mWidth(w) - , mPrecision(prec) - , mTitle(tit) - , mSuffix(suff) - , mValue(val) {} - -std::string MarkDownColumn::title() const { - std::stringstream ss; - ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' '; - return ss.str(); -} - -std::string MarkDownColumn::separator() const { - std::string sep(static_cast(mWidth), '-'); - sep.front() = '|'; - sep.back() = ':'; - return sep; -} - -std::string MarkDownColumn::invalid() const { - std::string sep(static_cast(mWidth), ' '); - sep.front() = '|'; - sep[sep.size() - 2] = '-'; - return sep; -} - -std::string MarkDownColumn::value() const { - std::stringstream ss; - auto width = mWidth - 2 - static_cast(mSuffix.size()); - ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' '; - return ss.str(); -} - -// Formats any text as markdown code, escaping backticks. -MarkDownCode::MarkDownCode(std::string const& what) { - mWhat.reserve(what.size() + 2); - mWhat.push_back('`'); - for (char c : what) { - mWhat.push_back(c); - if ('`' == c) { - mWhat.push_back('`'); - } - } - mWhat.push_back('`'); -} - -std::ostream& MarkDownCode::write(std::ostream& os) const { - return os << mWhat; -} - -std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) { - return mdCode.write(os); -} -} // namespace fmt -} // namespace detail - -// provide implementation here so it's only generated once -Config::Config() = default; -Config::~Config() = default; -Config& Config::operator=(Config const&) = default; -Config& Config::operator=(Config&&) = default; -Config::Config(Config const&) = default; -Config::Config(Config&&) noexcept = default; - -// provide implementation here so it's only generated once -Result::~Result() = default; -Result& Result::operator=(Result const&) = default; -Result& Result::operator=(Result&&) = default; -Result::Result(Result const&) = default; -Result::Result(Result&&) noexcept = default; - -namespace detail { -template -inline constexpr typename std::underlying_type::type u(T val) noexcept { - return static_cast::type>(val); -} -} // namespace detail - -// Result returned after a benchmark has finished. Can be used as a baseline for relative(). 
-Result::Result(Config const& benchmarkConfig) - : mConfig(benchmarkConfig) - , mNameToMeasurements{detail::u(Result::Measure::_size)} {} - -void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) { - using detail::d; - using detail::u; - - double dIters = d(iters); - mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters); - - mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters); - if (pc.has().pageFaults) { - mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters); - } - if (pc.has().cpuCycles) { - mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters); - } - if (pc.has().contextSwitches) { - mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters); - } - if (pc.has().instructions) { - mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters); - } - if (pc.has().branchInstructions) { - double branchInstructions = 0.0; - // correcting branches: remove branch introduced by the while (...) loop for each iteration. - if (pc.val().branchInstructions > iters + 1U) { - branchInstructions = d(pc.val().branchInstructions - (iters + 1U)); - } - mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters); - - if (pc.has().branchMisses) { - // correcting branch misses - double branchMisses = d(pc.val().branchMisses); - if (branchMisses > branchInstructions) { - // can't have branch misses when there were branches... - branchMisses = branchInstructions; - } - - // assuming at least one missed branch for the loop - branchMisses -= 1.0; - if (branchMisses < 1.0) { - branchMisses = 1.0; - } - mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters); - } - } -} - -Config const& Result::config() const noexcept { - return mConfig; -} - -inline double calcMedian(std::vector& data) { - if (data.empty()) { - return 0.0; - } - std::sort(data.begin(), data.end()); - - auto midIdx = data.size() / 2U; - if (1U == (data.size() & 1U)) { - return data[midIdx]; - } - return (data[midIdx - 1U] + data[midIdx]) / 2U; -} - -double Result::median(Measure m) const { - // create a copy so we can sort - auto data = mNameToMeasurements[detail::u(m)]; - return calcMedian(data); -} - -double Result::average(Measure m) const { - using detail::d; - auto const& data = mNameToMeasurements[detail::u(m)]; - if (data.empty()) { - return 0.0; - } - - // create a copy so we can sort - return sum(m) / d(data.size()); -} - -double Result::medianAbsolutePercentError(Measure m) const { - // create copy - auto data = mNameToMeasurements[detail::u(m)]; - - // calculates MdAPE which is the median of percentage error - // see https://www.spiderfinancial.com/support/documentation/numxl/reference-manual/forecasting-performance/mdape - auto med = calcMedian(data); - - // transform the data to absolute error - for (auto& x : data) { - x = (x - med) / x; - if (x < 0) { - x = -x; - } - } - return calcMedian(data); -} - -double Result::sum(Measure m) const noexcept { - auto const& data = mNameToMeasurements[detail::u(m)]; - return std::accumulate(data.begin(), data.end(), 0.0); -} - -double Result::sumProduct(Measure m1, Measure m2) const noexcept { - auto const& data1 = mNameToMeasurements[detail::u(m1)]; - auto const& data2 = mNameToMeasurements[detail::u(m2)]; - - if (data1.size() != data2.size()) { - return 0.0; - } - - double 
result = 0.0; - for (size_t i = 0, s = data1.size(); i != s; ++i) { - result += data1[i] * data2[i]; - } - return result; -} - -bool Result::has(Measure m) const noexcept { - return !mNameToMeasurements[detail::u(m)].empty(); -} - -double Result::get(size_t idx, Measure m) const { - auto const& data = mNameToMeasurements[detail::u(m)]; - return data.at(idx); -} - -bool Result::empty() const noexcept { - return 0U == size(); -} - -size_t Result::size() const noexcept { - auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)]; - return data.size(); -} - -double Result::minimum(Measure m) const noexcept { - auto const& data = mNameToMeasurements[detail::u(m)]; - if (data.empty()) { - return 0.0; - } - - // here its save to assume that at least one element is there - return *std::min_element(data.begin(), data.end()); -} - -double Result::maximum(Measure m) const noexcept { - auto const& data = mNameToMeasurements[detail::u(m)]; - if (data.empty()) { - return 0.0; - } - - // here its save to assume that at least one element is there - return *std::max_element(data.begin(), data.end()); -} - -std::string const& Result::context(char const* variableName) const { - return mConfig.mContext.at(variableName); -} - -std::string const& Result::context(std::string const& variableName) const { - return mConfig.mContext.at(variableName); -} - -Result::Measure Result::fromString(std::string const& str) { - if (str == "elapsed") { - return Measure::elapsed; - } else if (str == "iterations") { - return Measure::iterations; - } else if (str == "pagefaults") { - return Measure::pagefaults; - } else if (str == "cpucycles") { - return Measure::cpucycles; - } else if (str == "contextswitches") { - return Measure::contextswitches; - } else if (str == "instructions") { - return Measure::instructions; - } else if (str == "branchinstructions") { - return Measure::branchinstructions; - } else if (str == "branchmisses") { - return Measure::branchmisses; - } else { - // not found, return _size - return Measure::_size; - } -} - -// Configuration of a microbenchmark. -Bench::Bench() { - mConfig.mOut = &std::cout; -} - -Bench::Bench(Bench&&) = default; -Bench& Bench::operator=(Bench&&) = default; -Bench::Bench(Bench const&) = default; -Bench& Bench::operator=(Bench const&) = default; -Bench::~Bench() noexcept = default; - -double Bench::batch() const noexcept { - return mConfig.mBatch; -} - -double Bench::complexityN() const noexcept { - return mConfig.mComplexityN; -} - -// Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100% -// means it is slower than the baseline. -Bench& Bench::relative(bool isRelativeEnabled) noexcept { - mConfig.mIsRelative = isRelativeEnabled; - return *this; -} -bool Bench::relative() const noexcept { - return mConfig.mIsRelative; -} - -Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept { - mConfig.mShowPerformanceCounters = showPerformanceCounters; - return *this; -} -bool Bench::performanceCounters() const noexcept { - return mConfig.mShowPerformanceCounters; -} - -// Operation unit. Defaults to "op", could be e.g. "byte" for string processing. -// If u differs from currently set unit, the stored results will be cleared. -// Use singular (byte, not bytes). 
-Bench& Bench::unit(char const* u) { - if (u != mConfig.mUnit) { - mResults.clear(); - } - mConfig.mUnit = u; - return *this; -} - -Bench& Bench::unit(std::string const& u) { - return unit(u.c_str()); -} - -std::string const& Bench::unit() const noexcept { - return mConfig.mUnit; -} - -Bench& Bench::timeUnit(std::chrono::duration const& tu, std::string const& tuName) { - mConfig.mTimeUnit = tu; - mConfig.mTimeUnitName = tuName; - return *this; -} - -std::string const& Bench::timeUnitName() const noexcept { - return mConfig.mTimeUnitName; -} - -std::chrono::duration const& Bench::timeUnit() const noexcept { - return mConfig.mTimeUnit; -} - -// If benchmarkTitle differs from currently set title, the stored results will be cleared. -Bench& Bench::title(const char* benchmarkTitle) { - if (benchmarkTitle != mConfig.mBenchmarkTitle) { - mResults.clear(); - } - mConfig.mBenchmarkTitle = benchmarkTitle; - return *this; -} -Bench& Bench::title(std::string const& benchmarkTitle) { - if (benchmarkTitle != mConfig.mBenchmarkTitle) { - mResults.clear(); - } - mConfig.mBenchmarkTitle = benchmarkTitle; - return *this; -} - -std::string const& Bench::title() const noexcept { - return mConfig.mBenchmarkTitle; -} - -Bench& Bench::name(const char* benchmarkName) { - mConfig.mBenchmarkName = benchmarkName; - return *this; -} - -Bench& Bench::name(std::string const& benchmarkName) { - mConfig.mBenchmarkName = benchmarkName; - return *this; -} - -std::string const& Bench::name() const noexcept { - return mConfig.mBenchmarkName; -} - -Bench& Bench::context(char const* variableName, char const* variableValue) { - mConfig.mContext[variableName] = variableValue; - return *this; -} - -Bench& Bench::context(std::string const& variableName, std::string const& variableValue) { - mConfig.mContext[variableName] = variableValue; - return *this; -} - -Bench& Bench::clearContext() { - mConfig.mContext.clear(); - return *this; -} - -// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch. -Bench& Bench::epochs(size_t numEpochs) noexcept { - mConfig.mNumEpochs = numEpochs; - return *this; -} -size_t Bench::epochs() const noexcept { - return mConfig.mNumEpochs; -} - -// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision. -Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept { - mConfig.mClockResolutionMultiple = multiple; - return *this; -} -size_t Bench::clockResolutionMultiple() const noexcept { - return mConfig.mClockResolutionMultiple; -} - -// Sets the maximum time each epoch should take. Default is 100ms. -Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept { - mConfig.mMaxEpochTime = t; - return *this; -} -std::chrono::nanoseconds Bench::maxEpochTime() const noexcept { - return mConfig.mMaxEpochTime; -} - -// Sets the maximum time each epoch should take. Default is 100ms. -Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept { - mConfig.mMinEpochTime = t; - return *this; -} -std::chrono::nanoseconds Bench::minEpochTime() const noexcept { - return mConfig.mMinEpochTime; -} - -Bench& Bench::minEpochIterations(uint64_t numIters) noexcept { - mConfig.mMinEpochIterations = (numIters == 0) ? 
1 : numIters; - return *this; -} -uint64_t Bench::minEpochIterations() const noexcept { - return mConfig.mMinEpochIterations; -} - -Bench& Bench::epochIterations(uint64_t numIters) noexcept { - mConfig.mEpochIterations = numIters; - return *this; -} -uint64_t Bench::epochIterations() const noexcept { - return mConfig.mEpochIterations; -} - -Bench& Bench::warmup(uint64_t numWarmupIters) noexcept { - mConfig.mWarmup = numWarmupIters; - return *this; -} -uint64_t Bench::warmup() const noexcept { - return mConfig.mWarmup; -} - -Bench& Bench::config(Config const& benchmarkConfig) { - mConfig = benchmarkConfig; - return *this; -} -Config const& Bench::config() const noexcept { - return mConfig; -} - -Bench& Bench::output(std::ostream* outstream) noexcept { - mConfig.mOut = outstream; - return *this; -} - -ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept { - return mConfig.mOut; -} - -std::vector const& Bench::results() const noexcept { - return mResults; -} - -Bench& Bench::render(char const* templateContent, std::ostream& os) { - ::ankerl::nanobench::render(templateContent, *this, os); - return *this; -} - -Bench& Bench::render(std::string const& templateContent, std::ostream& os) { - ::ankerl::nanobench::render(templateContent, *this, os); - return *this; -} - -std::vector Bench::complexityBigO() const { - std::vector bigOs; - auto rangeMeasure = BigO::collectRangeMeasure(mResults); - bigOs.emplace_back("O(1)", rangeMeasure, [](double) { - return 1.0; - }); - bigOs.emplace_back("O(n)", rangeMeasure, [](double n) { - return n; - }); - bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) { - return std::log2(n); - }); - bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) { - return n * std::log2(n); - }); - bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) { - return n * n; - }); - bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) { - return n * n * n; - }); - std::sort(bigOs.begin(), bigOs.end()); - return bigOs; -} - -Rng::Rng() - : mX(0) - , mY(0) { - std::random_device rd; - std::uniform_int_distribution dist; - do { - mX = dist(rd); - mY = dist(rd); - } while (mX == 0 && mY == 0); -} - -ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") -uint64_t splitMix64(uint64_t& state) noexcept { - uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15)); - z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9); - z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb); - return z ^ (z >> 31U); -} - -// Seeded as described in romu paper (update april 2020) -Rng::Rng(uint64_t seed) noexcept - : mX(splitMix64(seed)) - , mY(splitMix64(seed)) { - for (size_t i = 0; i < 10; ++i) { - operator()(); - } -} - -// only internally used to copy the RNG. 
-Rng::Rng(uint64_t x, uint64_t y) noexcept - : mX(x) - , mY(y) {} - -Rng Rng::copy() const noexcept { - return Rng{mX, mY}; -} - -Rng::Rng(std::vector const& data) - : mX(0) - , mY(0) { - if (data.size() != 2) { - throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " + - detail::fmt::to_s(data.size())); - } - mX = data[0]; - mY = data[1]; -} - -std::vector Rng::state() const { - std::vector data(2); - data[0] = mX; - data[1] = mY; - return data; -} - -BigO::RangeMeasure BigO::collectRangeMeasure(std::vector const& results) { - BigO::RangeMeasure rangeMeasure; - for (auto const& result : results) { - if (result.config().mComplexityN > 0.0) { - rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed)); - } - } - return rangeMeasure; -} - -BigO::BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure) - : mName(bigOName) { - - // estimate the constant factor - double sumRangeMeasure = 0.0; - double sumRangeRange = 0.0; - - for (size_t i = 0; i < rangeMeasure.size(); ++i) { - sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second; - sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first; - } - mConstant = sumRangeMeasure / sumRangeRange; - - // calculate root mean square - double err = 0.0; - double sumMeasure = 0.0; - for (size_t i = 0; i < rangeMeasure.size(); ++i) { - auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second; - err += diff * diff; - - sumMeasure += rangeMeasure[i].second; - } - - auto n = static_cast(rangeMeasure.size()); - auto mean = sumMeasure / n; - mNormalizedRootMeanSquare = std::sqrt(err / n) / mean; -} - -BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure) - : BigO(std::string(bigOName), rangeMeasure) {} - -std::string const& BigO::name() const noexcept { - return mName; -} - -double BigO::constant() const noexcept { - return mConstant; -} - -double BigO::normalizedRootMeanSquare() const noexcept { - return mNormalizedRootMeanSquare; -} - -bool BigO::operator<(BigO const& other) const noexcept { - return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName); -} - -std::ostream& operator<<(std::ostream& os, BigO const& bigO) { - return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare(); -} - -std::ostream& operator<<(std::ostream& os, std::vector const& bigOs) { - detail::fmt::StreamStateRestorer restorer(os); - os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl; - for (auto const& bigO : bigOs) { - os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " "; - os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% "; - os << "| " << bigO.name(); - os << std::endl; - } - return os; -} - -} // namespace nanobench -} // namespace ankerl - -#endif // ANKERL_NANOBENCH_IMPLEMENT -#endif // ANKERL_NANOBENCH_H_INCLUDED From 1f3cb37693b28554e008b48ffde9751038211238 Mon Sep 17 00:00:00 2001 From: Tristan Brindle Date: Thu, 25 Jan 2024 15:40:20 +0000 Subject: [PATCH 2/2] Rename benchmark targets ...for consistency with all of our example-* targets, and so they show up together in CLion --- benchmark/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 13a7587d..a9974eb2 100644 --- a/benchmark/CMakeLists.txt +++ 
b/benchmark/CMakeLists.txt
@@ -9,11 +9,11 @@ FetchContent_Declare(
 FetchContent_MakeAvailable(nanobench)
 
-add_executable(internal-iteration-benchmark internal_iteration_benchmark.cpp)
-target_link_libraries(internal-iteration-benchmark PUBLIC nanobench::nanobench flux)
+add_executable(benchmark-internal-iteration internal_iteration_benchmark.cpp)
+target_link_libraries(benchmark-internal-iteration PUBLIC nanobench::nanobench flux)
 
-add_executable(sort-benchmark sort_benchmark.cpp)
-target_link_libraries(sort-benchmark PUBLIC nanobench::nanobench flux)
+add_executable(benchmark-sort sort_benchmark.cpp)
+target_link_libraries(benchmark-sort PUBLIC nanobench::nanobench flux)
 
-add_executable(multidimensional-memset-benchmark multidimensional_memset_benchmark.cpp multidimensional_memset_benchmark_kernels.cpp)
-target_link_libraries(multidimensional-memset-benchmark PUBLIC nanobench::nanobench flux)
+add_executable(benchmark-multidimensional-memset multidimensional_memset_benchmark.cpp multidimensional_memset_benchmark_kernels.cpp)
+target_link_libraries(benchmark-multidimensional-memset PUBLIC nanobench::nanobench flux)
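
Note: the benchmark sources themselves are untouched by this series; only how they obtain and link nanobench (patch 1) and what the executable targets are called (patch 2) changes. For orientation, a translation unit built against the renamed targets might look like the sketch below. This is illustrative only: the title, iteration count and accumulate workload are invented, not taken from the Flux benchmark sources, and the fetched nanobench::nanobench target is assumed to carry the compiled nanobench implementation, which is why no ANKERL_NANOBENCH_IMPLEMENT definition appears here.

    // Minimal sketch of a benchmark TU under the new FetchContent setup.
    // <nanobench.h> is found via the nanobench::nanobench link dependency.
    #include <nanobench.h>

    #include <numeric>
    #include <vector>

    int main() {
        std::vector<int> v(1000);
        std::iota(v.begin(), v.end(), 0);

        // Illustrative configuration; Bench, minEpochIterations() and run()
        // are the same public API the previously bundled header declared.
        ankerl::nanobench::Bench()
            .title("example")
            .minEpochIterations(100)
            .run("std::accumulate", [&] {
                auto sum = std::accumulate(v.begin(), v.end(), 0);
                ankerl::nanobench::doNotOptimizeAway(sum);
            });
    }

With patch 2 applied, such a file is built via the new target names, e.g. "cmake --build build --target benchmark-sort" rather than the old sort-benchmark.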