// Overview / Features / Requirements / Showcase / API / FAQ / User Guide / License
> Performance is not a number!

Single header/module performance library that combines the power of `C++23`, `linux/perf`, `llvm/mca`, `gnuplot/sixel`, ... for profiling, tracing, analyzing, and benchmarking.
| namespace | description | API |
| --- | --- | --- |
| `info` | hardware/software info | `compiler, cpu, memory, sys, proc, bin` |
| `[core]` | low-level utilities | `code, compiler, cpu, memory` |
| `[prof]/time` | timing (rdtsc/clock/chrono) | `tsc, cpu, thread, real, monotonic, steady_clock, high_resolution_clock` |
| `[prof]/stat` | counting (linux/perf) | `instructions, cycles, ..., top_down` |
| `[prof]/record` | sampling (linux/perf) | `instructions, cycles, ..., mem_loads, mem_stores, top_down` |
| `[prof]/trace` | tracing (linux/intel_pt) | `instructions, cycles, tsc` |
| `[backend]/mc` | disassembling (llvm) | `assembly, address, encoding, size, uops, latency, rthroughput, may_load, may_store, has_side_effects, ..., source` |
| `[backend]/mca` | analyzing (llvm/mca) | `cycles, instructions, uops, timeline, resource_pressure, bottleneck` |
| `bench` | benchmarking | `baseline, latency, throughput` |
| `[io]` | logging/plotting (gnuplot/sixel) | `log, json, report, annotate, plot (hist, box, bar, line, ecdf)` |

`[]` - inline namespace
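Because the bracketed namespaces are inline, their members are reachable directly under `perf::`. A minimal sketch based on the Usage section below (`[prof]/time` as `perf::time`, `[prof]/stat` as `perf::stat`):

```cpp
import perf; // #include <perf>

int main() {
  perf::time::timer timer{perf::time::cpu};        // [prof]/time
  perf::stat::counter counter{perf::stat::cycles}; // [prof]/stat
}
```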
## Requirements

Optimal (all features):

- (`clang-19+` | `gcc-13+`) / `c++23+`
- `llvm-19+` - `apt-get install llvm-dev`
- `linux-6.x+` with `perf-event-open` - `apt-get install linux-tools-common`
- `intel-12th+` with `PEBS`, `IPT` support
- `libipt` - `apt-get install libipt-dev`
- terminal with `sixel` support
- `gnuplot` - `apt-get install gnuplot`
Auxiliary (enhancements):

- `gh` - `apt-get install gh`
- `prof` - https://github.com/qlibs/prof
- `linux-perf` - `apt-get install linux-tools-common`
- `intel-vtune` - `apt-get install intel-oneapi-vtune`
- `amd-uprof` - https://www.amd.com/en/developer/uprof.html#downloads
- `gperftools` - `apt-get install google-perftools`
- `llvm-xray` - `apt-get install llvm`
- `callgrind` - `apt-get install valgrind`
- `ut` - https://github.com/qlibs/ut
- `uefi` - https://github.com/qlibs/uefi
## Usage
```cpp
/**
 * Info/Core
 */
import perf; // #include <perf>

int main() {
  auto&& spec = perf::info::spec{{
    {"time", std::chrono::system_clock::now()},
    {"perf.version", perf::info::version()},
    {"sys.triple", perf::info::sys::triple()},
    {"cxx.name", perf::info::compiler::name()},
    {"cxx.version", perf::info::compiler::version()},
    {"cpu.name", perf::info::cpu::name()},
    {"cpu.code_name", perf::info::cpu::code_name()},
    {"cpu.version", perf::info::cpu::version()},
    {"cache.dL1", perf::info::memory::dcache()[perf::info::memory::level::L1]},
    // ...
  }};
  perf::log(spec);
}
```

```cpp
/**
 * Profiling/Tracing/Analyzing
 */
import perf; // #include <perf>

int main() {
  auto&& timer = perf::time::timer{perf::time::cpu, perf::time::real};
  auto&& counter = perf::stat::counter{perf::stat::branch_misses, perf::stat::branches};
  auto&& sampler = perf::record::sampler{perf::record::mem_loads};
  auto&& tracer = perf::trace::tracer{perf::trace::instructions};
  auto&& profiler = perf::profiler{sampler, tracer, counter, timer};

  constexpr auto invoke = [](auto& profiler, auto&& fn, auto&&... ts) {
    profiler.start();
    perf::compiler::prevent_elision(fn(ts...));
    profiler.stop();
  };

  invoke(profiler, fizz_buzz, std::rand()); // fizz_buzz as defined below

  perf::log(profiler[]);
  perf::verify(profiler[perf::time::cpu] > 0ns);

  perf::analyzer analyzer{perf::mc::assembly, perf::mca::timeline};
  analyzer << profiler[perf::trace::instructions];

  perf::log(analyzer[]);
  perf::verify(analyzer[perf::mca::timeline][0u].cycle_dispatched);
  perf::verify(analyzer[perf::mc::assembly][0u].contains("add"));
}
```

```cpp
/**
 * Benchmarking
 */
import perf; // #include <perf>

auto fizz_buzz(int n) {
  if (n % 15 == 0) {
    return "FizzBuzz";
  } else if (n % 3 == 0) {
    return "Fizz";
  } else if (n % 5 == 0) {
    return "Buzz";
  } else {
    return "Unknown";
  }
}

int main() {
  perf::runner bench{perf::bench::latency{}};

  bench(fizz_buzz, 15);
  bench(fizz_buzz, 3);
  bench(fizz_buzz, 5);
  bench(fizz_buzz, perf::data::unpredictable<int>{});

  perf::report(bench[perf::time::steady_clock, perf::bench::operations]);
  perf::plot::bar(bench[perf::time::steady_clock]);
  perf::annotate<perf::vsplit>(bench[perf::mc::assembly]);
}
```

## Build & Test
```sh
# module
clang++ -std=c++23 -O3 -I. --precompile perf.cppm # -DNTEST disables compile-time tests
clang++ -std=c++23 -O3 -fprebuilt-module-path=. perf.pcm <source_file> -lLLVM -lipt

# header
$CXX -std=c++23 -O3 -I. <source_file> -lLLVM -lipt # -DNTEST disables compile-time tests

.github/scripts/tune.sh # see #FAQ for more
```
## Export & Share

```sh
./a.out | .github/scripts/export.sh html | gh gist create --public
```

- `scripts/export.sh` - `html, markdown, notebook`
- `gh` - `apt-get install gh`
## Configuration

```cpp
/**
 * PERF version (read-only) # https://semver.org
 */
#define PERF (MAJOR, MINOR, PATCH) // ex. (1, 0, 0)

/**
 * GNU # default: deduced based on `__GNUC__`
 * - 0 not compatible
 * - 1 compatible
 */
#define PERF_GNU 0/1

/**
 * Linux # default: deduced based on `__linux__`
 * - 0 not supported
 * - 1 supported
 */
#define PERF_LINUX 0/1

/**
 * LLVM # default: deduced based on `llvm-dev` headers
 * - 0 not supported
 * - 1 supported
 */
#define PERF_LLVM 0/1

/**
 * Intel Processor Trace # default: deduced based on `intel_pt` headers
 * - 0 not supported
 * - 1 supported
 */
#define PERF_INTEL 0/1

/**
 * I/O support # default: 1
 * - 0 not compiled
 * - 1 supported (`log, json, report, annotate, plot`)
 */
#define PERF_IO 0/1

/**
 * tests # default: not-defined
 * - defined: disables all compile-time, run-time tests
 * - not-defined: compile-time tests executed,
 *   run-time tests available via the `perf::self::test()` API
 */
#define NTEST

/**
 * gnuplot terminal # see `gnuplot -> set terminal` # default: 'sixel'
 * - 'sixel'                 # console image # https://www.arewesixelyet.com
 * - 'wxt'                   # popup window
 * - 'dumb size 150,25 ansi' # console with colors
 * - 'dumb size 80,25'       # console
 */
ENV:PERF_IO_PLOT_TERM

/**
 * style # default: dark
 * - light
 * - dark
 */
ENV:PERF_IO_PLOT_STYLE
```
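These switches can also be tested from user code to guard feature-specific paths. A minimal sketch, assuming only the macros documented above (note: macros are not exported by `import perf;`, hence the include):

```cpp
#include <perf>

int main() {
  #if PERF_LLVM && PERF_IO
  // disassembly and plotting paths exist only when llvm-dev and I/O
  // support were detected/enabled
  perf::analyzer analyzer{perf::mc::assembly};
  #endif
}
```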
## Utility/Info/Core

```cpp
namespace perf::info {
  /**
   * static_assert(version().major == 1);
   * static_assert(version().minor == 0);
   * static_assert(version().patch == 0);
   */
  inline constexpr auto version = [] -> sem_ver;
} // namespace perf::info
```

```cpp
namespace perf::info::compiler {
  /**
   * verify(name() == "clang"s);
   */
  inline constexpr auto name = [] -> std::string_view;

  /**
   * static_assert(version().major == 20);
   * static_assert(version().minor == 0);
   * static_assert(version().patch == 0);
   */
  inline constexpr auto version = [] -> sem_ver;
} // namespace perf::info::compiler
```

```cpp
// perf::info::cpu::name
assert(perf::info::cpu::name() == "12th Gen Intel(R) Core(TM) i7-12650"s);

// perf::info::cpu::code_name
assert(perf::info::cpu::code_name() == "alderlake"s);

// perf::info::cpu::version
assert(perf::info::cpu::version().family == 6);
assert(perf::info::cpu::version().model == 154);
assert(perf::info::cpu::version().stepping == 3);

// perf::info::cpu::dispatch_width
assert(perf::info::cpu::dispatch_width() == 6);

// perf::info::cpu::features
assert(perf::info::cpu::features() == std::vector{"avx", "avx2", "bmi", ...});
```

```cpp
// info::memory::icache
assert(perf::info::memory::icache() ==
  std::map{{level::L1, {.size = 448KiB, .line_size = 64, .assoc = 8}}});

// info::memory::dcache
assert(perf::info::memory::dcache() == std::map{
  {level::L1, {.size = 416KiB, .line_size = 64, .assoc = 8}},
  {level::L2, {.size = 9.5MiB, .line_size = 64, .assoc = 12}},
  {level::L3, {.size = 24MiB,  .line_size = 64, .assoc = 12}},
  // ...
});
```

```cpp
// info::sys::name
assert(perf::info::sys::name() == "linux"s);

// info::sys::triple
assert(perf::info::sys::triple() == "x86_64-pc-linux-gnu"s);

// info::sys::page_size
assert(perf::info::sys::page_size() == 4096b);
```

```cpp
// info::proc::name
assert(perf::info::proc::self::name() == "/full/path/example.out"s);

// info::proc::base_address
assert(perf::info::proc::self::base_address());
```

```cpp
// info::bin::addr_to_fn_name
static auto fn = [] {};
auto&& fn_name = perf::info::bin::addr_to_fn_name(
  perf::info::proc::self::name(),
  std::uint64_t(&fn) - perf::info::proc::self::base_address()
);
assert(fn_name.has_value() and *fn_name == "fn"s);

// info::bin::addr_to_name
static auto var = 0;
auto&& var_name = perf::info::bin::addr_to_name(
  perf::info::proc::self::name(),
  std::uint64_t(&var) - perf::info::proc::self::base_address()
);
assert(var_name.has_value() and *var_name == "var"s);

// info::bin::addr_to_line # requires debug symbols (-g)
label:;
auto&& source = perf::info::bin::addr_to_line(
  perf::info::proc::self::name(),
  std::uint64_t(&&label) - perf::info::proc::self::base_address()
);
assert(source.has_value() and source->contains("label:;"));
```

```cpp
// code::align
perf::code::align<std::align_val_t(64u)>();
for (...) { }

// code::label
perf::code::label<"begin">(); // begin:
perf::code::label<"end">();   // end:
assert(perf::code::labels["begin"] != perf::code::labels["end"]);
```

```cpp
// compiler::prevent_reorder # std::atomic_signal_fence
perf::compiler::prevent_reorder();

// compiler::prevent_elision
int i{};
assert(perf::compiler::prevent_elision(i++));

// compiler::is_elided
assert(perf::compiler::is_elided([] { }));
assert(perf::compiler::is_elided([] { int i{}; i++; }));
assert(not perf::compiler::is_elided([] {
  int i{};
  perf::compiler::prevent_elision(i++);
}));
```

```cpp
// cpu::pipeline::flush
perf::cpu::pipeline::flush();
```

```cpp
// memory::align/memory::is_aligned
auto aligned_addr = perf::memory::align<perf::memory::direction::up>(
  addr, std::align_val_t(64u)
);
assert(perf::memory::is_aligned(aligned_addr, std::align_val_t(64u)));

// memory::synchronize # std::atomic_thread_fence
perf::memory::synchronize();

// memory::prefetch
perf::memory::prefetch<perf::memory::operation::write,
                       perf::memory::locality::high>(addr);

// memory::lock # scoped{mlockall, munlockall}
{ perf::memory::lock _; }

// memory::protect
const std::array add{ // x86-64
  0x89, 0xf8, // mov eax, edi
  0x01, 0xf0, // add eax, esi
  0xc3        // ret
};
perf::memory::protect(
  std::span(add),
  perf::memory::protection::read |
  perf::memory::protection::write |
  perf::memory::protection::exec
);
assert(invoke(add, 1, 2) == 3);
assert(invoke(add, 2, 3) == 5);

// memory::pollute # pollutes memory by making allocations
perf::memory::pollute(1024u);

// memory::pre_fault # touches all pages used by data
perf::memory::pre_fault(std::span(...));

// memory::cache::flush # clears all cache lines used by data
perf::memory::cache::flush(std::span(...));
```

```cpp
// thread::affinity
perf::thread::affinity::set(perf::thread::self, 2u);
assert(2u == perf::thread::affinity::get(perf::thread::self));
```

## Profiling/Tracing/Analyzing
```cpp
// time::timer
perf::time::timer t{perf::time::steady_clock, perf::time::cpu};
t.start();
fn();
t.stop();

assert(t[perf::time::steady_clock] > 0ns);
assert(t[perf::time::cpu] > 0ns);

// `t[]` - returns std::tuple of all timers
assert(std::get<0u>(t[]) > 0ns); // steady_clock
assert(std::get<1u>(t[]) > 0ns); // cpu
```

- `perf::time::steady_clock` - monotonic time
- `perf::time::high_resolution_clock` - highest available resolution clock
- `perf::time::cpu` - user-time + sys-time
- `perf::time::thread` - cpu-time for the current thread
- `perf::time::real` - wall-time
- `perf::time::monotonic` - guaranteed to be always increasing
- `perf::time::tsc` - time-stamp counter
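The practical difference between these clocks shows up under blocking: sleeping accrues wall-time but almost no cpu-time. A minimal sketch using the timer API above (the `10ms` figure is arbitrary):

```cpp
import perf; // #include <perf>
#include <chrono>
#include <thread>

int main() {
  using namespace std::chrono_literals;
  perf::time::timer t{perf::time::cpu, perf::time::real};
  t.start();
  std::this_thread::sleep_for(10ms); // blocks: consumes wall-time, not cpu-time
  t.stop();
  perf::verify(t[perf::time::real] > t[perf::time::cpu]);
}
```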
```cpp
// stat::counter
// metrics/dsl
// top_down

// instructions per cycle (ipc)
ipc = instructions / cycles;

// cycles per instruction (cpi, inverse of ipc)
cpi = cycles / instructions;

// branch miss rate (branch misses per branch instruction)
branch_miss_rate = branch_misses / branches;

// cache miss rate (cache misses per cache reference)
cache_miss_rate = cache_misses / cache_references;

// llc miss rate
llc_miss_rate = llc_misses / cache_references;

// l1 data cache miss rate
l1_dcache_miss_rate = l1_dcache_load_misses / l1_dcache_loads;

// l1 instruction cache miss rate
l1_icache_miss_rate = l1_icache_load_misses / l1_icache_loads;

// dtlb miss rate
dtlb_miss_rate = dtlb_load_misses / dtlb_loads;

// itlb miss rate
itlb_miss_rate = itlb_load_misses / itlb_loads;

// stalled cycles rate (frontend)
frontend_stall_rate = stalled_cycles_frontend / cycles;

// stalled cycles rate (backend)
backend_stall_rate = stalled_cycles_backend / cycles;

// memory stall ratio
memory_stall_ratio = stalled_cycles_backend / cycles;

// overall stall rate
total_stall_rate = (stalled_cycles_backend + stalled_cycles_frontend) / cycles;

// cpu migrations per cycle
cpu_migration_rate = cpu_migrations / cycles;

// context switches per cycle
context_switch_rate = context_switches / cycles;

// page fault rate
page_fault_rate = faults / cycles;

// major fault rate
major_fault_rate = major_faults / cycles;

// minor fault rate
minor_fault_rate = minor_faults / cycles;
```

```cpp
// record::sampler
```
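Neither `stat::counter` nor `record::sampler` has a standalone snippet above; a minimal sketch following the profiler example from the Usage section (the event choices are illustrative):

```cpp
import perf; // #include <perf>

int main() {
  auto fn = [] { /* code under measurement */ };

  // counting (linux/perf) - raw events behind the metrics above (e.g. ipc)
  perf::stat::counter counter{perf::stat::instructions, perf::stat::cycles};
  counter.start();
  fn();
  counter.stop();
  perf::log(counter[]);

  // sampling (linux/perf)
  perf::record::sampler sampler{perf::record::mem_loads};
  sampler.start();
  fn();
  sampler.stop();
  perf::log(sampler[]);
}
```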
```cpp
// https://github.com/qlibs/prof
// prof::callgrind
// prof::...
```

```cpp
// trace::tracer
auto&& instructions = perf::trace::trace(
  [&] { return fizz_buzz(i); }
)[perf::trace::instructions];

perf::analyzer analyzer{perf::mc::assembly};
for (auto&& i : (analyzer << instructions)[perf::mc::assembly]) {
  perf::log("{}", i);
}

// perf::analyzer
```
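The bare `perf::analyzer` marker above has no snippet; a minimal sketch combining the `mc`/`mca` views from the Usage section (`fizz_buzz` as defined there):

```cpp
import perf; // #include <perf>

int main() {
  // disassemble (llvm/mc) and analyze (llvm/mca) the traced instructions
  perf::analyzer analyzer{perf::mc::assembly, perf::mca::timeline};
  analyzer << perf::trace::trace([] { return fizz_buzz(15); })
                [perf::trace::instructions];

  perf::log(analyzer[]);
  perf::verify(analyzer[perf::mca::timeline][0u].cycle_dispatched);
}
```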
## Benchmarking
```cpp
// perf::runner
auto&& runner = [](auto&& fn, auto&&... ts) {
  perf::dataset ds{};
  time::timer timer{perf::time::steady_clock};
  timer.start();
  for (auto i = 0u; i < 1'000; ++i) {
    compiler::prevent_elision(fn(ts...));
  }
  timer.stop();
  ds += timer;
  return ds;
};

// `runner` deduces what to run based on the usage of `bench[...]`
// unless otherwise explicitly specified
perf::runner bench{perf::named("runner", runner)};

static auto fizz_buzz = [](int n) {
  if (n % 15 == 0) {
    return "FizzBuzz";
  } else if (n % 3 == 0) {
    return "Fizz";
  } else if (n % 5 == 0) {
    return "Buzz";
  } else {
    return "Unknown";
  }
};

bench(fizz_buzz, 15);
bench(fizz_buzz, 3);
bench(fizz_buzz, 5);

perf::report(bench[perf::time::steady_clock]);
```

```cpp
// perf::data
bench(fizz_buzz, perf::data::sequence<int>{{3, 5, 15}});
bench(fizz_buzz, perf::data::uniform<int>{.min = 0, .max = 15}); // choice
```

```cpp
// perf::bench::latency
auto add  = [](int a, int b) { return a + b; };
auto sub  = [](int a, int b) { return a - b; };
auto mult = [](int a, int b) { return a * b; };
auto div  = [](int a, int b) { return a / b; };

perf::runner bench{perf::bench::latency{}};

bench(add, 0, 0);
bench(sub, 0, 0);
bench(mult, 0, 0);
bench(div, 0, 1); // non-zero divisor to avoid undefined behavior

using perf::metric::operator/;
auto ops = perf::bench::operations;
perf::report(bench[perf::time::tsc / ops, perf::stat::cycles / ops]);
```
```cpp
// perf::bench::throughput
latency = time / operations;            // time per operation
throughput = operations / time;         // operations per unit of time
inverse_throughput = time / operations;
```

```cpp
// perf::bench::baseline
// perf::bench::debug
```

```cpp
// perf::plot::hist
// perf::plot::bar
// perf::plot::box
// perf::plot::line
// perf::plot::ecdf
// perf::plot::complexity
```
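The plot markers above have no snippets of their own; a minimal sketch reusing a `bench` runner populated as in the latency example (the metric choices are illustrative):

```cpp
// plot the collected distributions (gnuplot/sixel)
perf::plot::hist(bench[perf::time::tsc]);
perf::plot::ecdf(bench[perf::time::steady_clock]);
perf::plot::bar(bench[perf::stat::cycles]);
```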
## Setup Guide

**How to setup `perf` docker?**

```sh
docker build -t perf .
docker run \
  -it \
  --privileged \
  --network=host \
  -e DISPLAY=${DISPLAY} \
  -v ${PWD}:${PWD} \
  -w ${PWD} \
  perf
```

**How to install `perf` dependencies?**

```sh
apt-get install linux-tools-common # linux-perf (perf::stat/perf::record)
apt-get install llvm-dev           # llvm (perf::mc/perf::mca)
apt-get install libipt-dev         # libipt (perf::trace)
apt-get install gnuplot            # gnuplot (perf::plot)
```

**How to setup linux performance counters?**

```sh
.github/scripts/setup.sh --perf # --rdpmc --max-sample-rate 10000
```

```sh
sudo mount -o remount,mode=755 /sys/kernel/debug
sudo mount -o remount,mode=755 /sys/kernel/debug/tracing
sudo chown `whoami` /sys/kernel/debug/tracing/uprobe_events
sudo chmod a+rw /sys/kernel/debug/tracing/uprobe_events
echo 0 | sudo tee /proc/sys/kernel/kptr_restrict
echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid
echo 1000 | sudo tee /proc/sys/kernel/perf_event_max_sample_rate
echo 2 | sudo tee /sys/devices/cpu_core/rdpmc
```

**How to reduce execution variability?**

```sh
.github/scripts/tune.sh
```

- `pyperf` - `pip3 install pyperf`

```sh
sudo pyperf system tune
sudo pyperf system show
sudo pyperf system reset
```

```sh
# Set Process CPU Affinity (apt install util-linux)
taskset -c 0 ./a.out

# Set Process Scheduling Priority (apt install coreutils)
nice -n -20 taskset -c 0 ./a.out # -20..19 (most..least favorable to the process)

# Disable CPU Frequency Scaling (apt install cpufrequtils)
sudo cpupower frequency-set --governor performance
# cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# Disable Address Space Randomization
echo 0 > /proc/sys/kernel/randomize_va_space

# Disable Processor Boosting
echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost

# Disable Turbo Mode
echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo

# Disable Hyperthreading/SMT
echo off | sudo tee /sys/devices/system/cpu/smt/control

# Restrict memory to a single socket
numactl -m 0 -N 0 ./a.out

# Enable Huge Pages
sudo numactl --cpunodebind=1 --membind=1 hugeadm \
  --obey-mempolicy --pool-pages-min=1G:64
sudo hugeadm --create-mounts
```

```sh
# Enable Kernel Mode Task-Isolation (https://lwn.net/Articles/816298)
# cat /sys/devices/system/cpu/isolated
isolcpus=<cpu number>,...,<cpu number>

# Disable P-states and C-states
# cat /sys/devices/system/cpu/intel_pstate/status
idle=poll intel_pstate=disable intel_idle.max_cstate=0 processor.max_cstate=1

# Disable NMI watchdog
# cat /proc/sys/kernel/nmi_watchdog
nmi_watchdog=0
```

## Usage Guide
**How to compile `perf` with modules?**

```sh
clang++ -std=c++23 -O3 -I. --precompile perf.cppm
clang++ -std=c++23 -O3 -fprebuilt-module-path=. perf.pcm <source_file> -lLLVM -lipt
```

```cpp
import perf;
```

**How to change `assembly` syntax?**

```cpp
perf::llvm llvm{
  {.syntax = perf::arch::syntax::att} // default: intel
};
```

**How to `analyze` for a different architecture?**

```cpp
perf::llvm llvm{
  .triple = "x86_64-pc-linux-gnu" // see `llvm-llc` for details
};
```

**Which `terminal` can display images?**

Any terminal with sixel support - https://www.arewesixelyet.com
(Visual Studio Code: enable image support in the terminal via the `Terminal -> Enable images` option.)

**How to plot with popup windows?**

```sh
PERF_IO_PLOT_TERM='wxt' ./a.out
```
**How to plot without sixel?**

```sh
PERF_IO_PLOT_TERM='dumb' ./a.out
PERF_IO_PLOT_TERM='dumb size 80,25' ./a.out
PERF_IO_PLOT_TERM='dumb size 150,25 ansi' ./a.out
```

**How to change plot style?**

```sh
PERF_IO_PLOT_STYLE='dark' ./perf # default
PERF_IO_PLOT_STYLE='light' ./perf
```

**How to save a plot?**

```cpp
perf::plot::gnuplot plt{{.term = "png"}};
plt.send("set output 'output.png'");
perf::plot::bar(plt, ...);
```

**How to export results?**

```sh
./a.out 2>&1 | .github/scripts/export.sh markdown > results.md
./a.out 2>&1 | .github/scripts/export.sh notebook > results.ipynb
./a.out 2>&1 | .github/scripts/export.sh html > results.html
```

**How to share results?**

- `gh` - `apt-get install gh`

```sh
# https://jbt.github.io/markdown-editor
gh gist create --public --web results.md

# https://jupyter.org
gh gist create --public --web results.ipynb

# https://htmlpreview.github.io
gh gist create --public --web results.html
```

**How to write a custom profiler?**

```cpp
struct my_profiler {
  constexpr auto start();
  constexpr auto stop();
  [[nodiscard]] constexpr auto operator[](Ts...) const;
};
static_assert(perf::profiler_like<my_profiler>);
```
```cpp
perf::runner bench{
  [](auto&& fn, auto&&... ts) {
    my_profiler profiler{};
    profiler.start();
    perf::compiler::prevent_elision(fn(ts...));
    profiler.stop();
  }
};
```

**How to integrate with profiling tools?**

`prof` (https://github.com/qlibs/prof) supports the following profilers:

- `linux-perf` - `apt-get install linux-tools-common`
- `intel-vtune` - `apt-get install intel-oneapi-vtune`
- `amd-uprof` - https://www.amd.com/en/developer/uprof.html#downloads
- `gperftools` - `apt-get install google-perftools`
- `llvm-xray` - `apt-get install llvm`
- `callgrind` - `apt-get install valgrind`

```cpp
perf::runner bench{
  [](auto&& fn, auto&&... ts) {
    prof::callgrind profiler{"callgrind"};
    profiler.start();
    perf::compiler::prevent_elision(fn(ts...));
    profiler.stop();
  }
};

bench(fn, ts...);
```

```sh
valgrind --tool=callgrind --instr-atstart=no ./a.out
kcachegrind callgrind.*
```
**How to integrate with unit-testing frameworks?**

```cpp
import perf;
import ut; // https://github.com/qlibs/ut

int main() {
  perf::runner bench{perf::bench::latency{}};

  perf::scoped _ {
    .on_exit = [&] { perf::report(bench[perf::time::cpu]); }
  };

  "benchmark1"_test = [] { bench(fn1, ts1...); };
  "benchmark2"_test = [] { bench(fn2, ts2...); };
}
```

**How do `perf` tests work?**

- `compile-time` tests are executed upon `include/import` (enabled by default)
- `run-time/sanity check` tests can be executed at run-time:

```cpp
int main() {
  perf::self::test({.verbose = true}); // run-time/sanity check tests
}
```

- `-DNTEST` can be used to disable tests (not recommended):

```sh
$CXX -DNTEST ... # tests will NOT be compiled
```

`perf` tests execution model:

```cpp
#ifndef NTEST
"perf"_suite = [] {
  "run-time and compile-time"_test = [] constexpr {
    expect(3 == accumulate({1, 2, 3}, 0));
  };
  "run-time"_test = [] mutable {
    expect(std::rand() >= 0);
  };
  "compile-time"_test = [] consteval {
    expect(sizeof(int) == sizeof(0));
  };
};
#endif
```

**How to integrate with `jupyter`?**

`jupyter` (`apt install jupyter`) can be used for data analysis (`python`):

```cpp
int main() {
  // ... perf code
  perf::json("perf.json");
}
```

```python
# notebook.ipynb
import pandas as pd
df = pd.read_json("perf.json")
print(df.head())
```

```sh
jupyter notebook notebook.ipynb
```

**What is the difference between `latency` and `throughput`?**
- `latency` is the time it takes for a single operation to complete (ns)
- `throughput` is the total number of operations or tasks completed in a given amount of time (op/s)

For example, an operation with a latency of 2ns has a maximum serial throughput of 1 / 2ns = 500M op/s; pipelined or parallel execution can push throughput above 1 / latency.

**What is the top-down microarchitecture analysis method?**

https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/top-down-microarchitecture-analysis-method.html
https://github.com/andikleen/pmu-tools/wiki/toplev-manual
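The `top_down` preset listed in the `[prof]/stat` row of the API table ties into this method. A hedged sketch, assuming `perf::stat::top_down` can be passed to `stat::counter` like the raw events shown earlier:

```cpp
import perf; // #include <perf>

int main() {
  auto fn = [] { /* code under measurement */ };

  // attribute pipeline slots to frontend/backend/retiring/bad-speculation
  perf::stat::counter counter{perf::stat::top_down};
  counter.start();
  fn();
  counter.stop();
  perf::log(counter[]);
}
```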
**What are performance compilation flags?**

```sh
-O1                      # optimizations (O1) [0]
-O2                      # optimizations (O1 + O2) [0]
-O3                      # optimizations (O1 + O2 + O3) [0]
-march=native            # architecture specific [1]
-DNDEBUG                 # disables asserts, etc.
-fno-omit-frame-pointer  # keeps the frame pointer in a register
-ffast-math              # [unsafe] faster but non-conforming math [2]
-fcf-protection=none     # [unsafe] stops emitting `endbr64`
```

[0] https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
[1] https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
[2] https://gcc.gnu.org/wiki/FloatingPointMath
**What are performance compiler attributes?**

- `gnu::target`

```cpp
[[gnu::target("avx2")]]
[[gnu::target("bmi")]]
```

- `gnu::optimize`

```cpp
[[gnu::optimize("O3")]]
[[gnu::optimize("ffast-math")]]
```

https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html
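A minimal sketch applying these attributes to a function (the function itself is illustrative; `gnu::optimize` is honored by gcc, while clang may ignore it):

```cpp
#include <numeric>
#include <span>

// compiled for avx2/O3 regardless of the global -march/-O settings (gcc)
[[gnu::target("avx2")]] [[gnu::optimize("O3")]]
float sum(std::span<const float> v) {
  return std::accumulate(v.begin(), v.end(), 0.f);
}
```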
## MIT/Apache2+LLVM

| license | namespace | guard | description |
| --- | --- | --- | --- |
| MIT | `perf::*` | - | https://opensource.org/license/mit |
| Apache2+LLVM | `perf::mca::*` | `PERF_LLVM == 1` | https://llvm.org/LICENSE.txt |