diff --git a/.gitignore b/.gitignore index c37575b..5ab9f94 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ bin __pycache__ dist phlop.egg-info/ -scope_timer.txt + +*scope_timer.txt +tpp diff --git a/inc/phlop/timing/threaded_scope_timer.hpp b/inc/phlop/timing/threaded_scope_timer.hpp index b11fc78..c367327 100644 --- a/inc/phlop/timing/threaded_scope_timer.hpp +++ b/inc/phlop/timing/threaded_scope_timer.hpp @@ -56,6 +56,7 @@ struct ScopeTimerMan } _headers.clear(); thread_storage.clear(); + thread_reports.clear(); active = false; } @@ -132,10 +133,16 @@ struct ScopeTimerMan std::unique_lock lk(work_); thread_storage.emplace_back(std::move(pt.reports), std::move(pt.traces)); } + void move(std::shared_ptr& report) + { + std::unique_lock lk(work_); + thread_reports.emplace_back(std::move(report)); + } std::mutex work_; std::vector, std::vector>> thread_storage; + std::vector> thread_reports; // keep alive }; @@ -159,9 +166,11 @@ struct RunTimerReportSnapshot std::vector childs; }; + struct RunTimerReport { - std::string_view k, f; + std::string const k; // key + std::string const f; // function std::uint32_t l = 0; RunTimerReport(std::string_view const& _k, std::string_view const& _f, std::uint32_t const& _l) @@ -175,6 +184,7 @@ struct RunTimerReport ~RunTimerReport() {} + auto operator()(std::size_t i) { return snapshots[i].get(); } auto size() { return snapshots.size(); } @@ -182,6 +192,12 @@ struct RunTimerReport }; +struct ThreadLifeWatcher +{ + ~ThreadLifeWatcher() { ScopeTimerMan::INSTANCE().move(report); } + + std::shared_ptr report; +}; struct scope_timer @@ -264,7 +280,9 @@ struct BinaryTimerFile template void recurse_traces_for_keys(Trace const& c) { - std::string s{c->self->k}; + assert(c); + assert(c->self); + auto const& s = c->self->k; if (!key_ids.count(s)) { auto [it, b] = key_ids.emplace(s, key_ids.size()); @@ -359,11 +377,13 @@ namespace detail #endif #define PHLOP_SCOPE_TIMER(key) \ - static phlop::threaded::RunTimerReport 
PHLOP_STR_CAT(ridx_, __LINE__){key, __FILE__, \ - __LINE__}; \ + static thread_local auto PHLOP_STR_CAT(ridx_, __LINE__) \ + = std::make_shared(key, __FILE__, __LINE__); \ + static thread_local phlop::threaded::ThreadLifeWatcher PHLOP_STR_CAT(_watcher_, __LINE__){ \ + PHLOP_STR_CAT(ridx_, __LINE__)}; \ phlop::threaded::scope_timer PHLOP_STR_CAT(_scope_timer_, \ - __LINE__){PHLOP_STR_CAT(ridx_, __LINE__)}; \ - phlop::threaded::ScopeTimerMan::local().report_stack_ptr = &PHLOP_STR_CAT(ridx_, __LINE__); + __LINE__){*PHLOP_STR_CAT(ridx_, __LINE__)}; \ + phlop::threaded::ScopeTimerMan::local().report_stack_ptr = PHLOP_STR_CAT(ridx_, __LINE__).get(); #endif /*_PHLOP_TIMING_THREADED_SCOPE_TIMER_HPP_*/ diff --git a/mkn.pfm.yaml b/mkn.pfm.yaml new file mode 100644 index 0000000..abd7ee9 --- /dev/null +++ b/mkn.pfm.yaml @@ -0,0 +1,29 @@ +#! clean build test run -p scope_timer,threaded_scope_timer -Oa "-fPIC -std=c++20" -W 9 + +# run script first: ./sh/setup_pfm.sh + +name: phlop.pfm +parent: base + +profile: +- name: base + inc: inc + +- name: pfm + inc: tpp/pfm/include + +- name: pfm_lib + parent: pfm + mode: static + inc: tpp/pfm/lib, 0 + src: tpp/pfm/lib + +- name: pfm_events + self: pfm_lib + main: tpp/pfm/examples/check_events.c + out: check_events + +- name: pfm_info + self: pfm_lib + main: tpp/pfm/examples/showevtinfo.c + out: showevtinfo diff --git a/phlop/app/__main__.py b/phlop/app/__main__.py index c7cc76d..32d4f1b 100644 --- a/phlop/app/__main__.py +++ b/phlop/app/__main__.py @@ -9,6 +9,8 @@ phlop.app.cmake phlop.app.test_cases phlop.app.git + phlop.app.nvidia + phlop.app.pfm phlop.app.perf""" print(available_modules) diff --git a/phlop/app/nvidia/__init__.py b/phlop/app/nvidia/__init__.py new file mode 100644 index 0000000..b117618 --- /dev/null +++ b/phlop/app/nvidia/__init__.py @@ -0,0 +1,5 @@ +# +# +# +# +# diff --git a/phlop/app/nvidia/__main__.py b/phlop/app/nvidia/__main__.py new file mode 100644 index 0000000..bc16f88 --- /dev/null +++ 
b/phlop/app/nvidia/__main__.py @@ -0,0 +1,12 @@ +# +# +# +# +# + + +available_modules = """Available: + phlop.app.nvidia.csan + phlop.app.nvidia.ncu""" + +print(available_modules) diff --git a/phlop/app/nvidia/csan.py b/phlop/app/nvidia/csan.py new file mode 100644 index 0000000..d0ba3e5 --- /dev/null +++ b/phlop/app/nvidia/csan.py @@ -0,0 +1,63 @@ +# compute sanitizer frontend + +# https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html + +## samples +# compute-sanitizer --tool memcheck [sanitizer_options] app_name [app_options] +# compute-sanitizer --tool racecheck [sanitizer_options] app_name [app_options] +# +# +# + + +from phlop.dict import ValDict +from phlop.proc import run + +metrics = [ + "all", + "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum", # read + "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum", # wrte +] + + +def build_command(cli_args): + cmd_parts = [ + "compute-sanitizer", + f"--tool {cli_args.tool}", + cli_args.extra if cli_args.extra else "", + " ".join(cli_args.remaining) if cli_args.remaining else "", + ] + return " ".join(filter(None, cmd_parts)) + + +def exec(cli_args): + return run(build_command(cli_args), check=True, cwd=cli_args.dir) + + +def cli_args_parser(description="compute-sanitizer tool"): + import argparse + + _help = ValDict( + dir="working directory", + quiet="Redirect output to /dev/null", + logging="0=off, 1=on non zero exit code, 2=always", + outfile="path for saved file if active", + tool="Sanitizer tool to use (memcheck, racecheck, initcheck, synccheck)", + extra="forward string to csan command", + ) + + parser = argparse.ArgumentParser( + description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("remaining", nargs=argparse.REMAINDER) + parser.add_argument("-d", "--dir", default=".", help=_help.dir) + parser.add_argument("-o", "--outfile", default=None, help=_help.outfile) + parser.add_argument("-t", "--tool", default="memcheck", 
help=_help.tool) + parser.add_argument("--logging", type=int, default=1, help=_help.logging) + parser.add_argument("-e", "--extra", type=str, default="", help=_help.extra) + + return parser + + +def verify_cli_args(cli_args): + return cli_args diff --git a/phlop/app/nvidia/ncu.py b/phlop/app/nvidia/ncu.py new file mode 100644 index 0000000..482c57a --- /dev/null +++ b/phlop/app/nvidia/ncu.py @@ -0,0 +1,57 @@ +# Nsight Compute CLI + +# https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html + +## samples +# ncu --help +# ncu --metrics all +# ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum +# ncu --target-processes all -o mpirun [mpi arguments] [app arguments] +# + + +from phlop.dict import ValDict +from phlop.proc import run + +metrics = [ + "all", + "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum", # read + "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum", # write +] + + +def build_command(cli_args): + return " ".join(filter(None, ["ncu", cli_args.extra, *cli_args.remaining])) + + +def exec(cli_args): + return run(build_command(cli_args), check=True, cwd=cli_args.dir) + + +def cli_args_parser(description="ncu tool"): + import argparse + + _help = ValDict( + dir="working directory", + quiet="Redirect output to /dev/null", + logging="0=off, 1=on non zero exit code, 2=always", + outfile="path for saved file if active", + tool="", + extra="forward string to ncu command", + ) + + parser = argparse.ArgumentParser( + description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("remaining", nargs=argparse.REMAINDER) + parser.add_argument("-d", "--dir", default=".", help=_help.dir) + parser.add_argument("-o", "--outfile", default=None, help=_help.outfile) + parser.add_argument("-t", "--tool", default="stat", help=_help.tool) + parser.add_argument("--logging", type=int, default=1, help=_help.logging) + parser.add_argument("-e", "--extra", type=str, default="", help=_help.extra) + + return parser + + +def verify_cli_args(cli_args): + 
return cli_args diff --git a/phlop/app/perf.py b/phlop/app/perf.py index 0a03294..790a176 100644 --- a/phlop/app/perf.py +++ b/phlop/app/perf.py @@ -112,11 +112,10 @@ def cli_args_parser(description="Perf tool"): quiet="Redirect output to /dev/null", cores="Parallism core/thread count", infiles="infiles", - print_only="Print only, no execution", - regex="Filter out non-matching execution strings", logging="0=off, 1=on non zero exit code, 2=always", outfile="path for saved file if active", tool="stat/record/etc", + extra="forward string to perf command", ) parser = argparse.ArgumentParser( @@ -129,10 +128,11 @@ def cli_args_parser(description="Perf tool"): "-p", "--print_only", action="store_true", default=False, help=_help.print_only ) parser.add_argument("-i", "--infiles", default=None, help=_help.infiles) - parser.add_argument("-r", "--regex", default=None, help=_help.regex) parser.add_argument("-o", "--outfile", default=None, help=_help.outfile) parser.add_argument("-t", "--tool", default="stat", help=_help.tool) parser.add_argument("--logging", type=int, default=1, help=_help.logging) + parser.add_argument("-e", "--extra", type=str, default="", help=_help.extra) + return parser diff --git a/phlop/app/pfm/__init__.py b/phlop/app/pfm/__init__.py new file mode 100644 index 0000000..b117618 --- /dev/null +++ b/phlop/app/pfm/__init__.py @@ -0,0 +1,5 @@ +# +# +# +# +# diff --git a/phlop/app/pfm/__main__.py b/phlop/app/pfm/__main__.py new file mode 100644 index 0000000..f9d7228 --- /dev/null +++ b/phlop/app/pfm/__main__.py @@ -0,0 +1,12 @@ +# +# +# +# +# + + +available_modules = """Available: + phlop.app.pfm.check_events + phlop.app.pfm.showevtinfo""" + +print(available_modules) diff --git a/phlop/app/pfm/check_events.py b/phlop/app/pfm/check_events.py new file mode 100644 index 0000000..26fe30c --- /dev/null +++ b/phlop/app/pfm/check_events.py @@ -0,0 +1,48 @@ +# +# +# +# +# + + +import logging +from pathlib import Path + +from phlop.os import pushd +from phlop.proc 
import run +from phlop.string import decode_bytes + +FILE_DIR = Path(__file__).resolve().parent + +logger = logging.getLogger(__name__) +check_events_start = "Total events:" + + +def parse_check_events_output(lines): + return lines[-1].split(":")[1].strip().replace("0x", "r") + + +def run_check_events(code): + with pushd(FILE_DIR.parent.parent.parent): + return decode_bytes( + run(f"./tpp/pfm/examples/check_events {code}").stdout + ).splitlines() + + +def get_evt_perf_code(code): + return parse_check_events_output(run_check_events(code)) + + +if __name__ == "__main__": + from phlop.app.pfm.showevtinfo import get_evt_info + + key, code = "[MULT_FLOPS]", "" + for info in get_evt_info(): + if key in info.umask: + code = f"{info.name}:{info.umask[key].code}" + break + + assert code != "" + + # print("get_evt_perf_code", get_evt_perf_code(code)) + print(run(f"perf stat -e {get_evt_perf_code(code)} sleep 5")) diff --git a/phlop/app/pfm/showevtinfo.py b/phlop/app/pfm/showevtinfo.py new file mode 100644 index 0000000..4e33c15 --- /dev/null +++ b/phlop/app/pfm/showevtinfo.py @@ -0,0 +1,102 @@ +# +# +# +# +# + + +import logging +from dataclasses import asdict, dataclass, field +from pathlib import Path + +from phlop.os import pushd +from phlop.proc import run +from phlop.string import decode_bytes + +FILE_DIR = Path(__file__).resolve().parent + +logger = logging.getLogger(__name__) +EVTINFO_delimiter = "#-----------------------------" + + +@dataclass +class EVTUMask: + id: str + desc: str + code: str + + +@dataclass +class EVTInfo: + idx: str + pmu: str + name: str + umask: dict = field(default_factory=lambda: {}) + etc: dict = field(default_factory=lambda: {}) + + +@dataclass +class EVTInfos: + data: list = field(default_factory=lambda: []) + + def __iter__(self): + return self.data.__iter__() + + def umasks(self): + return EVTInfos(data=[d for d in self.data if d.umask]) + + def umasks_in(self, needle): + return EVTInfos( + data=[d for d in self.data if any(needle in k for 
k in d.umask)] + ) + + def append(self, ev: EVTInfo): + self.data.append(ev) + + +def _parse_evtinfo(bits_list): + assert len(bits_list) >= 7 + info = EVTInfo( + idx=bits_list[0][1].strip(), + pmu=bits_list[1][1].strip(), + name=bits_list[2][1].strip(), + ) + for bits in bits_list[7:]: + if bits[0].strip().startswith("Umask"): + info.umask[bits[3].strip()] = EVTUMask( + id=bits[3].strip(), desc=bits[5].strip(), code=bits[1].strip() + ) + return info + + +def parse_evtinfo_output(lines): + start_idx = 0 + for line in lines: + start_idx += 1 + if line.strip() == EVTINFO_delimiter: + break + + bits_list, results = [], EVTInfos() + for line in lines[start_idx:]: + if line.strip() == EVTINFO_delimiter: + results.append(_parse_evtinfo(bits_list)) + bits_list = [] + continue + bits_list.append(line.strip().split(":")) + + return results + + +def run_evtinfo(): + with pushd(FILE_DIR.parent.parent.parent): + return decode_bytes(run("./tpp/pfm/examples/showevtinfo").stdout).splitlines() + + +def get_evt_info(): + return parse_evtinfo_output(run_evtinfo()) + + +if __name__ == "__main__": + import json + + print(json.dumps(asdict(get_evt_info()), indent=2)) diff --git a/phlop/app/stats_man.py b/phlop/app/stats_man.py index 8ed7edb..945439e 100644 --- a/phlop/app/stats_man.py +++ b/phlop/app/stats_man.py @@ -20,7 +20,6 @@ from phlop.dict import ValDict from phlop.proc import run_raw -logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) _default_interval = 2 diff --git a/phlop/run/mpirun_perf.py b/phlop/run/mpirun_perf.py index f4a7141..fb996d0 100644 --- a/phlop/run/mpirun_perf.py +++ b/phlop/run/mpirun_perf.py @@ -7,7 +7,6 @@ from phlop.app import perf as p -logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) MPI_RANK = os.environ.get("OMPI_COMM_WORLD_RANK") diff --git a/phlop/run/mpirun_stats_man.py b/phlop/run/mpirun_stats_man.py index acbe2af..712649c 100644 --- a/phlop/run/mpirun_stats_man.py +++ b/phlop/run/mpirun_stats_man.py @@ 
-7,7 +7,6 @@ from phlop.app import stats_man as sman -logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) MPI_RANK = os.environ.get("OMPI_COMM_WORLD_RANK") diff --git a/phlop/run/perf.py b/phlop/run/perf.py index 2529fc8..2196939 100644 --- a/phlop/run/perf.py +++ b/phlop/run/perf.py @@ -19,11 +19,11 @@ """ -def perf_stat_cmd(cli_args, path, line): +def perf_stat_cmd(cli_args, path, line, options): file = Path(line.split(" ")[-1]).stem outpath = logpath / path.stem outpath.mkdir(parents=True, exist_ok=True) - return p.stat_cmd(line, p.stat_events, outpath / f"{file}.json") + return p.stat_cmd(line, p.stat_events, outpath / f"{file}.json", options) def get_from_files(cli_args): @@ -50,7 +50,11 @@ def get_remaining(cli_args): test_batches = {} path = Path(cli_args.remaining[-1]).parent test_case = tc.determine_cores_for_test_case( - tc.TestCase(cmd=perf_stat_cmd(cli_args, path, " ".join(cli_args.remaining))) + tc.TestCase( + cmd=perf_stat_cmd( + cli_args, path, " ".join(cli_args.remaining), cli_args.extra + ) + ) ) test_batches[test_case.cores] = [test_case] return [tc.TestBatch(v, k) for k, v in test_batches.items()] diff --git a/phlop/testing/parallel_processor.py b/phlop/testing/parallel_processor.py index de51838..f2f0f91 100644 --- a/phlop/testing/parallel_processor.py +++ b/phlop/testing/parallel_processor.py @@ -16,8 +16,7 @@ logger = getLogger(__name__) -class TestCaseFailure(Exception): - ... +class TestCaseFailure(Exception): ... class LoggingMode(Enum): diff --git a/sh/clean.sh b/sh/clean.sh index c94f4b5..d47de06 100755 --- a/sh/clean.sh +++ b/sh/clean.sh @@ -1,10 +1,9 @@ #!/usr/bin/env bash -set -e -CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -cd "$CWD"/.. +set -ex +CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" && cd "$CWD"/.. 
RM_RF=( - __pycache__ + __pycache__ .ruff_cache phlop.egg-info dist ) for RM in ${RM_RF[@]}; do diff --git a/sh/setup_pfm.sh b/sh/setup_pfm.sh new file mode 100755 index 0000000..2600b73 --- /dev/null +++ b/sh/setup_pfm.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" && cd "$CWD"/.. + +set -ex + +[ -d "tpp/pfm" ] || ( + git clone https://github.com/wcohen/libpfm4 tpp/pfm --depth 4 --shallow-submodules --recursive + cd tpp/pfm + make +) diff --git a/sh/test.sh b/sh/test.sh index 0db08df..84da0fb 100755 --- a/sh/test.sh +++ b/sh/test.sh @@ -1,7 +1,6 @@ #!/usr/bin/env bash -CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -cd "$CWD"/.. +CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" && cd "$CWD"/.. set -ex @@ -19,4 +18,7 @@ py -O tests/timing/test_scope_timer.py test_scope_timer -f scope_timer.txt py -Om phlop.run.valgrind echo yes py -Om phlop.run.valgrind --tool=massif echo yes -py -Om phlop.run.perf echo yes || echo "perf failed, assumed CI" +py -Om phlop.run.perf -e="--all-user" echo yes || echo "perf failed, assumed CI" + +# install via ./sh/setup_pfm.sh +[ -d "tpp/pfm" ] && py -O tests/_phlop/app/pfm/test_pfm.py || echo "pfm missing, skipped" diff --git a/tests/_phlop/app/pfm/test_pfm.py b/tests/_phlop/app/pfm/test_pfm.py new file mode 100644 index 0000000..4d709eb --- /dev/null +++ b/tests/_phlop/app/pfm/test_pfm.py @@ -0,0 +1,27 @@ +# +# +# + + +from phlop.proc import run +from phlop.string import decode_bytes + +if __name__ == "__main__": + from phlop.app.pfm.check_events import get_evt_perf_code + from phlop.app.pfm.showevtinfo import get_evt_info + + code = "" + key0, key1 = "[MULT_FLOPS]", "[ADD_SUB_FLOPS]" + for info in get_evt_info(): + if key0 in info.umask: + for key, umask in info.umask.items(): + code += f"{info.name}:{umask.code} " + break + # if key1 in info.umask: + # code += f"{info.name}:{info.umask[key1].code} " + + code = code.strip() + assert code != "" + + events = " 
".join([f"-e {get_evt_perf_code(ev)}" for ev in code.split(" ")]) + print(decode_bytes(run(f"perf stat {events} sleep 5").stderr))