diff --git a/.gitignore b/.gitignore
index c37575b..5ab9f94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,6 @@ bin
 __pycache__
 dist
 phlop.egg-info/
-scope_timer.txt
+
+*scope_timer.txt
+tpp
diff --git a/inc/phlop/timing/threaded_scope_timer.hpp b/inc/phlop/timing/threaded_scope_timer.hpp
index b11fc78..c367327 100644
--- a/inc/phlop/timing/threaded_scope_timer.hpp
+++ b/inc/phlop/timing/threaded_scope_timer.hpp
@@ -56,6 +56,7 @@ struct ScopeTimerMan
         }
         _headers.clear();
         thread_storage.clear();
+        thread_reports.clear();
         active = false;
     }
 
@@ -132,10 +133,16 @@ struct ScopeTimerMan
         std::unique_lock<std::mutex> lk(work_);
         thread_storage.emplace_back(std::move(pt.reports), std::move(pt.traces));
     }
+    void move(std::shared_ptr<RunTimerReport>& report) // takes over the caller's pointer
+    {
+        std::unique_lock<std::mutex> lk(work_); // hold work_ while appending to thread_reports
+        thread_reports.emplace_back(std::move(report));
+    }
 
     std::mutex work_;
     std::vector<std::pair<std::vector<RunTimerReport*>, std::vector<RunTimerReportSnapshot*>>>
         thread_storage;
+    std::vector<std::shared_ptr<RunTimerReport>> thread_reports; // keep alive
 };
 
 
@@ -159,9 +166,11 @@ struct RunTimerReportSnapshot
     std::vector<RunTimerReportSnapshot*> childs;
 };
 
+
 struct RunTimerReport
 {
-    std::string_view k, f;
+    std::string const k; // key
+    std::string const f; // function
     std::uint32_t l = 0;
 
     RunTimerReport(std::string_view const& _k, std::string_view const& _f, std::uint32_t const& _l)
@@ -175,6 +184,7 @@ struct RunTimerReport
 
     ~RunTimerReport() {}
 
+
     auto operator()(std::size_t i) { return snapshots[i].get(); }
     auto size() { return snapshots.size(); }
 
@@ -182,6 +192,12 @@ struct RunTimerReport
 };
 
 
+struct ThreadLifeWatcher // on destruction, hands `report` to the ScopeTimerMan instance
+{
+    ~ThreadLifeWatcher() { ScopeTimerMan::INSTANCE().move(report); }
+
+    std::shared_ptr<RunTimerReport> report; // passed to ScopeTimerMan::move by the destructor
+};
 
 
 struct scope_timer
@@ -264,7 +280,9 @@ struct BinaryTimerFile
     template<typename Trace>
     void recurse_traces_for_keys(Trace const& c)
     {
-        std::string s{c->self->k};
+        assert(c);
+        assert(c->self);
+        auto const& s = c->self->k;
         if (!key_ids.count(s))
         {
             auto [it, b] = key_ids.emplace(s, key_ids.size());
@@ -359,11 +377,13 @@ namespace detail
 #endif
 
 #define PHLOP_SCOPE_TIMER(key)                                                                     \
-    static phlop::threaded::RunTimerReport PHLOP_STR_CAT(ridx_, __LINE__){key, __FILE__,           \
-                                                                          __LINE__};               \
+    static thread_local auto PHLOP_STR_CAT(ridx_, __LINE__)                                        \
+        = std::make_shared<phlop::threaded::RunTimerReport>(key, __FILE__, __LINE__);              \
+    static thread_local phlop::threaded::ThreadLifeWatcher PHLOP_STR_CAT(_watcher_, __LINE__){     \
+        PHLOP_STR_CAT(ridx_, __LINE__)};                                                           \
     phlop::threaded::scope_timer PHLOP_STR_CAT(_scope_timer_,                                      \
-                                               __LINE__){PHLOP_STR_CAT(ridx_, __LINE__)};          \
-    phlop::threaded::ScopeTimerMan::local().report_stack_ptr = &PHLOP_STR_CAT(ridx_, __LINE__);
+                                               __LINE__){*PHLOP_STR_CAT(ridx_, __LINE__)};         \
+    phlop::threaded::ScopeTimerMan::local().report_stack_ptr = PHLOP_STR_CAT(ridx_, __LINE__).get();
 
 
 #endif /*_PHLOP_TIMING_THREADED_SCOPE_TIMER_HPP_*/
diff --git a/mkn.pfm.yaml b/mkn.pfm.yaml
new file mode 100644
index 0000000..abd7ee9
--- /dev/null
+++ b/mkn.pfm.yaml
@@ -0,0 +1,29 @@
+#! clean build test run -p scope_timer,threaded_scope_timer -Oa "-fPIC -std=c++20" -W 9
+
+# run script first: ./sh/setup_pfm.sh
+
+name: phlop.pfm
+parent: base
+
+profile:
+- name: base
+  inc: inc
+
+- name: pfm
+  inc: tpp/pfm/include
+
+- name: pfm_lib
+  parent: pfm
+  mode: static
+  inc: tpp/pfm/lib, 0
+  src: tpp/pfm/lib
+
+- name: pfm_events
+  self: pfm_lib
+  main: tpp/pfm/examples/check_events.c
+  out: check_events
+
+- name: pfm_info
+  self: pfm_lib
+  main: tpp/pfm/examples/showevtinfo.c
+  out: showevtinfo
diff --git a/phlop/app/__main__.py b/phlop/app/__main__.py
index c7cc76d..32d4f1b 100644
--- a/phlop/app/__main__.py
+++ b/phlop/app/__main__.py
@@ -9,6 +9,8 @@
     phlop.app.cmake
     phlop.app.test_cases
     phlop.app.git
+    phlop.app.nvidia
+    phlop.app.pfm
     phlop.app.perf"""
 
 print(available_modules)
diff --git a/phlop/app/nvidia/__init__.py b/phlop/app/nvidia/__init__.py
new file mode 100644
index 0000000..b117618
--- /dev/null
+++ b/phlop/app/nvidia/__init__.py
@@ -0,0 +1,5 @@
+#
+#
+#
+#
+#
diff --git a/phlop/app/nvidia/__main__.py b/phlop/app/nvidia/__main__.py
new file mode 100644
index 0000000..bc16f88
--- /dev/null
+++ b/phlop/app/nvidia/__main__.py
@@ -0,0 +1,12 @@
+#
+#
+#
+#
+#
+
+
+available_modules = """Available:
+    phlop.app.nvidia.csan
+    phlop.app.nvidia.ncu"""
+
+print(available_modules)
diff --git a/phlop/app/nvidia/csan.py b/phlop/app/nvidia/csan.py
new file mode 100644
index 0000000..d0ba3e5
--- /dev/null
+++ b/phlop/app/nvidia/csan.py
@@ -0,0 +1,63 @@
+# compute sanitizer frontend
+
+# https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html
+
+## samples
+#  compute-sanitizer --tool memcheck [sanitizer_options] app_name [app_options]
+#  compute-sanitizer --tool racecheck [sanitizer_options] app_name [app_options]
+#
+#
+#
+
+
+from phlop.dict import ValDict
+from phlop.proc import run
+
+metrics = [
+    "all",
+    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum",  # read
+    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum",  # wrte
+]
+
+
+def build_command(cli_args):
+    cmd_parts = [
+        "compute-sanitizer",
+        f"--tool {cli_args.tool}",
+        cli_args.extra if cli_args.extra else "",
+        " ".join(cli_args.remaining) if cli_args.remaining else "",
+    ]
+    return " ".join(filter(None, cmd_parts))
+
+
+def exec(cli_args):
+    return run(build_command(cli_args), check=True, cwd=cli_args.dir)
+
+
+def cli_args_parser(description="compute-sanitizer tool"):
+    import argparse
+
+    _help = ValDict(
+        dir="working directory",
+        quiet="Redirect output to /dev/null",
+        logging="0=off, 1=on non zero exit code, 2=always",
+        outfile="path for saved file if active",
+        tool="Sanitizer tool to use (memcheck, racecheck, initcheck, synccheck)",
+        extra="forward string to csan command",
+    )
+
+    parser = argparse.ArgumentParser(
+        description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("remaining", nargs=argparse.REMAINDER)
+    parser.add_argument("-d", "--dir", default=".", help=_help.dir)
+    parser.add_argument("-o", "--outfile", default=None, help=_help.outfile)
+    parser.add_argument("-t", "--tool", default="memcheck", help=_help.tool)
+    parser.add_argument("--logging", type=int, default=1, help=_help.logging)
+    parser.add_argument("-e", "--extra", type=str, default="", help=_help.extra)
+
+    return parser
+
+
+def verify_cli_args(cli_args):
+    return cli_args
diff --git a/phlop/app/nvidia/ncu.py b/phlop/app/nvidia/ncu.py
new file mode 100644
index 0000000..482c57a
--- /dev/null
+++ b/phlop/app/nvidia/ncu.py
@@ -0,0 +1,57 @@
+# Nsight Compute CLI
+
+# https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html
+
+## samples
+#  ncu --help
+#  ncu --metrics all
+#  ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum
+#  ncu --target-processes all -o <report-name> mpirun [mpi arguments] <app> [app arguments]
+#
+
+
+from phlop.dict import ValDict
+from phlop.proc import run
+
+metrics = [
+    "all",
+    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum",  # read
+    "l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum",  # wrte
+]
+
+
+def build_command(cli_args):  # remaining is a list of argv tokens; join, don't repr
+    return f"ncu {' '.join(cli_args.remaining)}"
+
+
+def exec(cli_args):
+    return run(build_command(cli_args), check=True)
+
+
+def cli_args_parser(description="ncu tool"):
+    import argparse
+
+    _help = ValDict(
+        dir="working directory",
+        quiet="Redirect output to /dev/null",
+        logging="0=off, 1=on non zero exit code, 2=always",
+        outfile="path for saved file if active",
+        tool="",
+        extra="forward string to csan command",
+    )
+
+    parser = argparse.ArgumentParser(
+        description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("remaining", nargs=argparse.REMAINDER)
+    parser.add_argument("-d", "--dir", default=".", help=_help.dir)
+    parser.add_argument("-o", "--outfile", default=None, help=_help.outfile)
+    parser.add_argument("-t", "--tool", default="stat", help=_help.tool)
+    parser.add_argument("--logging", type=int, default=1, help=_help.logging)
+    parser.add_argument("-e", "--extra", type=str, default="", help=_help.extra)
+
+    return parser
+
+
+def verify_cli_args(cli_args):
+    return cli_args
diff --git a/phlop/app/perf.py b/phlop/app/perf.py
index 0a03294..790a176 100644
--- a/phlop/app/perf.py
+++ b/phlop/app/perf.py
@@ -112,11 +112,10 @@ def cli_args_parser(description="Perf tool"):
         quiet="Redirect output to /dev/null",
         cores="Parallism core/thread count",
         infiles="infiles",
-        print_only="Print only, no execution",
-        regex="Filter out non-matching execution strings",
         logging="0=off, 1=on non zero exit code, 2=always",
         outfile="path for saved file if active",
         tool="stat/record/etc",
+        extra="forward string to perf command",
     )
 
     parser = argparse.ArgumentParser(
@@ -129,10 +128,11 @@ def cli_args_parser(description="Perf tool"):
         "-p", "--print_only", action="store_true", default=False, help=_help.print_only
     )
     parser.add_argument("-i", "--infiles", default=None, help=_help.infiles)
-    parser.add_argument("-r", "--regex", default=None, help=_help.regex)
     parser.add_argument("-o", "--outfile", default=None, help=_help.outfile)
     parser.add_argument("-t", "--tool", default="stat", help=_help.tool)
     parser.add_argument("--logging", type=int, default=1, help=_help.logging)
+    parser.add_argument("-e", "--extra", type=str, default="", help=_help.extra)
+
     return parser
 
 
diff --git a/phlop/app/pfm/__init__.py b/phlop/app/pfm/__init__.py
new file mode 100644
index 0000000..b117618
--- /dev/null
+++ b/phlop/app/pfm/__init__.py
@@ -0,0 +1,5 @@
+#
+#
+#
+#
+#
diff --git a/phlop/app/pfm/__main__.py b/phlop/app/pfm/__main__.py
new file mode 100644
index 0000000..f9d7228
--- /dev/null
+++ b/phlop/app/pfm/__main__.py
@@ -0,0 +1,12 @@
+#
+#
+#
+#
+#
+
+
+available_modules = """Available:
+    phlop.app.pfm.check_events
+    phlop.app.pfm.showevtinfo"""
+
+print(available_modules)
diff --git a/phlop/app/pfm/check_events.py b/phlop/app/pfm/check_events.py
new file mode 100644
index 0000000..26fe30c
--- /dev/null
+++ b/phlop/app/pfm/check_events.py
@@ -0,0 +1,48 @@
+#
+#
+#
+#
+#
+
+
+import logging
+from pathlib import Path
+
+from phlop.os import pushd
+from phlop.proc import run
+from phlop.string import decode_bytes
+
+FILE_DIR = Path(__file__).resolve().parent
+
+logger = logging.getLogger(__name__)
+check_events_start = "Total events:"
+
+
+def parse_check_events_output(lines):
+    return lines[-1].split(":")[1].strip().replace("0x", "r")
+
+
+def run_check_events(code):
+    with pushd(FILE_DIR.parent.parent.parent):
+        return decode_bytes(
+            run(f"./tpp/pfm/examples/check_events {code}").stdout
+        ).splitlines()
+
+
+def get_evt_perf_code(code):
+    return parse_check_events_output(run_check_events(code))
+
+
+if __name__ == "__main__":
+    from phlop.app.pfm.showevtinfo import get_evt_info
+
+    key, code = "[MULT_FLOPS]", ""
+    for info in get_evt_info():
+        if key in info.umask:
+            code = f"{info.name}:{info.umask[key].code}"
+            break
+
+    assert code != ""
+
+    # print("get_evt_perf_code", get_evt_perf_code(code))
+    print(run(f"perf stat -e {get_evt_perf_code(code)} sleep 5"))
diff --git a/phlop/app/pfm/showevtinfo.py b/phlop/app/pfm/showevtinfo.py
new file mode 100644
index 0000000..4e33c15
--- /dev/null
+++ b/phlop/app/pfm/showevtinfo.py
@@ -0,0 +1,102 @@
+#
+#
+#
+#
+#
+
+
+import logging
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+
+from phlop.os import pushd
+from phlop.proc import run
+from phlop.string import decode_bytes
+
+FILE_DIR = Path(__file__).resolve().parent
+
+logger = logging.getLogger(__name__)
+EVTINFO_delimiter = "#-----------------------------"
+
+
+@dataclass
+class EVTUMask:
+    id: str
+    desc: str
+    code: str
+
+
+@dataclass
+class EVTInfo:
+    idx: str
+    pmu: str
+    name: str
+    umask: dict = field(default_factory=lambda: {})
+    etc: dict = field(default_factory=lambda: {})
+
+
+@dataclass
+class EVTInfos:
+    data: list = field(default_factory=lambda: [])
+
+    def __iter__(self):
+        return self.data.__iter__()
+
+    def umasks(self):
+        return EVTInfos(data=[d for d in self.data if d.umask])
+
+    def umasks_in(self, needle):
+        return EVTInfos(
+            data=[d for d in self.data if any(needle in k for k in d.umask)]
+        )
+
+    def append(self, ev: EVTInfo):
+        self.data.append(ev)
+
+
+def _parse_evtinfo(bits_list):  # bits_list: one record, each row a line split on ":"
+    assert len(bits_list) >= 7
+    info = EVTInfo(
+        idx=bits_list[0][1].strip(),
+        pmu=bits_list[1][1].strip(),
+        name=bits_list[2][1].strip(),
+    )
+    for bits in bits_list[7:]:  # only rows 7+ are scanned for "Umask" entries
+        if bits[0].strip().startswith("Umask"):
+            info.umask[bits[3].strip()] = EVTUMask(  # keyed by field 3, the umask id
+                id=bits[3].strip(), desc=bits[5].strip(), code=bits[1].strip()
+            )
+    return info
+
+
+def parse_evtinfo_output(lines):  # returns EVTInfos, one EVTInfo per closed record
+    start_idx = 0
+    for line in lines:  # skip everything up to and including the first delimiter
+        start_idx += 1
+        if line.strip() == EVTINFO_delimiter:
+            break
+
+    bits_list, results = [], EVTInfos()
+    for line in lines[start_idx:]:
+        if line.strip() == EVTINFO_delimiter:  # strip here too, like the scan above
+            results.append(_parse_evtinfo(bits_list))
+            bits_list = []
+            continue
+        bits_list.append(line.strip().split(":"))
+
+    return results
+
+
+def run_evtinfo():
+    with pushd(FILE_DIR.parent.parent.parent):
+        return decode_bytes(run("./tpp/pfm/examples/showevtinfo").stdout).splitlines()
+
+
+def get_evt_info():
+    return parse_evtinfo_output(run_evtinfo())
+
+
+if __name__ == "__main__":
+    import json
+    # Dump every parsed event as pretty-printed JSON.
+    print(json.dumps(asdict(get_evt_info()), indent=2))
diff --git a/phlop/app/stats_man.py b/phlop/app/stats_man.py
index 8ed7edb..945439e 100644
--- a/phlop/app/stats_man.py
+++ b/phlop/app/stats_man.py
@@ -20,7 +20,6 @@
 from phlop.dict import ValDict
 from phlop.proc import run_raw
 
-logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
 _default_interval = 2
diff --git a/phlop/run/mpirun_perf.py b/phlop/run/mpirun_perf.py
index f4a7141..fb996d0 100644
--- a/phlop/run/mpirun_perf.py
+++ b/phlop/run/mpirun_perf.py
@@ -7,7 +7,6 @@
 
 from phlop.app import perf as p
 
-logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
 MPI_RANK = os.environ.get("OMPI_COMM_WORLD_RANK")
diff --git a/phlop/run/mpirun_stats_man.py b/phlop/run/mpirun_stats_man.py
index acbe2af..712649c 100644
--- a/phlop/run/mpirun_stats_man.py
+++ b/phlop/run/mpirun_stats_man.py
@@ -7,7 +7,6 @@
 
 from phlop.app import stats_man as sman
 
-logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
 MPI_RANK = os.environ.get("OMPI_COMM_WORLD_RANK")
diff --git a/phlop/run/perf.py b/phlop/run/perf.py
index 2529fc8..2196939 100644
--- a/phlop/run/perf.py
+++ b/phlop/run/perf.py
@@ -19,11 +19,11 @@
 """
 
 
-def perf_stat_cmd(cli_args, path, line):
+def perf_stat_cmd(cli_args, path, line, options):
     file = Path(line.split(" ")[-1]).stem
     outpath = logpath / path.stem
     outpath.mkdir(parents=True, exist_ok=True)
-    return p.stat_cmd(line, p.stat_events, outpath / f"{file}.json")
+    return p.stat_cmd(line, p.stat_events, outpath / f"{file}.json", options)
 
 
 def get_from_files(cli_args):
@@ -50,7 +50,11 @@ def get_remaining(cli_args):
     test_batches = {}
     path = Path(cli_args.remaining[-1]).parent
     test_case = tc.determine_cores_for_test_case(
-        tc.TestCase(cmd=perf_stat_cmd(cli_args, path, " ".join(cli_args.remaining)))
+        tc.TestCase(
+            cmd=perf_stat_cmd(
+                cli_args, path, " ".join(cli_args.remaining), cli_args.extra
+            )
+        )
     )
     test_batches[test_case.cores] = [test_case]
     return [tc.TestBatch(v, k) for k, v in test_batches.items()]
diff --git a/phlop/testing/parallel_processor.py b/phlop/testing/parallel_processor.py
index de51838..f2f0f91 100644
--- a/phlop/testing/parallel_processor.py
+++ b/phlop/testing/parallel_processor.py
@@ -16,8 +16,7 @@
 logger = getLogger(__name__)
 
 
-class TestCaseFailure(Exception):
-    ...
+class TestCaseFailure(Exception): ...
 
 
 class LoggingMode(Enum):
diff --git a/sh/clean.sh b/sh/clean.sh
index c94f4b5..d47de06 100755
--- a/sh/clean.sh
+++ b/sh/clean.sh
@@ -1,10 +1,9 @@
 #!/usr/bin/env bash
-set -e
-CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-cd "$CWD"/..
+set -ex
+CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" && cd "$CWD"/..
 
 RM_RF=(
-  __pycache__
+  __pycache__ .ruff_cache phlop.egg-info dist
 )
 
 for RM in ${RM_RF[@]}; do
diff --git a/sh/setup_pfm.sh b/sh/setup_pfm.sh
new file mode 100755
index 0000000..2600b73
--- /dev/null
+++ b/sh/setup_pfm.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" && cd "$CWD"/..
+
+set -ex
+
+[ -d "tpp/pfm" ] || (  # `||` keeps exit status 0 when the checkout already exists
+    git clone https://github.com/wcohen/libpfm4 tpp/pfm --depth 4 --shallow-submodules --recursive
+    cd tpp/pfm
+    make
+)
diff --git a/sh/test.sh b/sh/test.sh
index 0db08df..84da0fb 100755
--- a/sh/test.sh
+++ b/sh/test.sh
@@ -1,7 +1,6 @@
 #!/usr/bin/env bash
 
-CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-cd "$CWD"/..
+CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" && cd "$CWD"/..
 
 set -ex
 
@@ -19,4 +18,7 @@ py -O tests/timing/test_scope_timer.py test_scope_timer -f scope_timer.txt
 py -Om phlop.run.valgrind echo yes
 py -Om phlop.run.valgrind --tool=massif echo yes
 
-py -Om phlop.run.perf echo yes || echo "perf failed, assumed CI"
+py -Om phlop.run.perf -e="--all-user" echo yes || echo "perf failed, assumed CI"
+
+# install via ./sh/setup_pfm.sh
+[ -d "tpp/pfm" ] && py -O tests/_phlop/app/pfm/test_pfm.py || echo "pfm missing, skipped"
diff --git a/tests/_phlop/app/pfm/test_pfm.py b/tests/_phlop/app/pfm/test_pfm.py
new file mode 100644
index 0000000..4d709eb
--- /dev/null
+++ b/tests/_phlop/app/pfm/test_pfm.py
@@ -0,0 +1,27 @@
+#
+#
+#
+
+
+from phlop.proc import run
+from phlop.string import decode_bytes
+
+if __name__ == "__main__":
+    from phlop.app.pfm.check_events import get_evt_perf_code
+    from phlop.app.pfm.showevtinfo import get_evt_info
+
+    code = ""
+    key0, key1 = "[MULT_FLOPS]", "[ADD_SUB_FLOPS]"
+    for info in get_evt_info():
+        if key0 in info.umask:
+            for key, umask in info.umask.items():
+                code += f"{info.name}:{umask.code} "
+            break
+        # if key1 in info.umask:
+        #     code += f"{info.name}:{info.umask[key1].code} "
+
+    code = code.strip()
+    assert code != ""
+
+    events = " ".join([f"-e {get_evt_perf_code(ev)}" for ev in code.split(" ")])
+    print(decode_bytes(run(f"perf stat {events} sleep 5").stderr))