+
+ long long count = cpucycles();
+ long long persecond = cpucycles_persecond();
+ const char *implementation = cpucycles_implementation();
+ const char *version = cpucycles_version();
+
+Link with `-lcpucycles`. Old systems may also need `-lrt`.
+
+### DESCRIPTION
+
+`cpucycles()` returns an estimate for the number of CPU cycles that have
+occurred since an unspecified time in the past (perhaps system boot,
+perhaps program startup).
+
+Accessing true cycle counters can be difficult on some CPUs and
+operating systems. `cpucycles()` does its best to produce accurate
+results, but selects a low-precision counter if the only other option is
+failure.
+
+`cpucycles_persecond()` returns an estimate for the number of CPU cycles
+per second. This estimate comes from `/etc/cpucyclespersecond` if that
+file exists, otherwise from various OS mechanisms, otherwise from the
+`cpucyclespersecond` environment variable if that is set, otherwise
+2399987654.
+
+`cpucycles_implementation()` returns the name of the counter in use:
+e.g., `"amd64-pmc"`.
+
+`cpucycles_version()` returns the `libcpucycles` version number as a
+string: e.g., `"20230115"`. Results of `cpucycles_implementation()`
+should be interpreted relative to `cpucycles_version()`.
+
+`cpucycles` is actually a function pointer. The first call to
+`cpucycles()` or `cpucycles_persecond()` or `cpucycles_implementation()`
+selects one of the available counters and updates the `cpucycles`
+pointer accordingly. Subsequent calls to `cpucycles()` are thread-safe.
+
+### SEE ALSO
+
+**gettimeofday**(2), **clock_gettime**(2)
diff --git a/cpu-cycles/libcpucycles/doc/counters.md b/cpu-cycles/libcpucycles/doc/counters.md
new file mode 100644
index 0000000000..db87f89cfa
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/counters.md
@@ -0,0 +1,447 @@
+Currently libcpucycles supports the following cycle counters. Some
+cycle counters are actually other forms of counters that libcpucycles
+scales to imitate a cycle counter. There is
+[separate documentation](selection.html)
+for how libcpucycles makes a choice of cycle counter. See also
+[security considerations](security.html) regarding enabling or disabling
+counters and regarding Turbo Boost.
+
+`amd64-pmc`: Requires a 64-bit Intel/AMD platform. Requires the Linux
+perf_event interface. Accesses a cycle counter through RDPMC. Requires
+`/proc/sys/kernel/perf_event_paranoid` to be at most 2 for user-level
+RDPMC access. This counter runs at the clock frequency of the CPU core.
+
+`amd64-tsc`, `amd64-tscasm`: Requires a 64-bit Intel/AMD platform.
+Requires RDTSC to be enabled, which it is by default. Uses RDTSC to
+access the CPU's time-stamp counter. On current CPUs, this is an
+off-core clock rather than a cycle counter, but it is typically a very
+fast off-core clock, making it adequate for seeing cycle counts if
+overclocking and underclocking are disabled. The difference between
+`tsc` and `tscasm` is that `tsc` uses the compiler's `__rdtsc()` while
+`tscasm` uses inline assembly.
+
+`arm32-cortex`: Requires a 32-bit ARMv7-A platform. Uses
+`mrc p15, 0, %0, c9, c13, 0` to read the cycle counter. Requires user
+access to the cycle counter, which is not enabled by default but can be
+enabled under Linux via
+[a kernel module](https://github.com/thoughtpolice/enable_arm_pmu).
+This counter is natively 32 bits, but libcpucycles watches how the
+counter and `gettimeofday` increase to compute a 64-bit extension of the
+counter.
+
+`arm64-pmc`: Requires a 64-bit ARMv8-A platform. Uses
+`mrs %0, PMCCNTR_EL0` to read the cycle counter. Requires user access
+to the cycle counter, which is not enabled by default but can be enabled
+under Linux via
+[a kernel module](https://github.com/rdolbeau/enable_arm_pmu).
+
+`arm64-vct`: Requires a 64-bit ARMv8-A platform. Uses
+`mrs %0, CNTVCT_EL0` to read a "virtual count" timer. This is an
+off-core clock, typically running at 24MHz. Results are scaled by
+libcpucycles.
+
+`mips64-cc`: Requires a 64-bit MIPS platform. (Maybe the same code would
+also work as `mips32-cc`, but this has not been tested yet.) Uses RDHWR
+to read the hardware cycle counter (hardware register 2 times a constant
+scale factor in hardware register 3). This counter is natively 32 bits,
+but libcpucycles watches how the counter and `gettimeofday` increase to
+compute a 64-bit extension of the counter.
+
+`ppc32-mftb`: Requires a 32-bit PowerPC platform. Uses `mftb` and
+`mftbu` to read the "time base". This is an off-core clock, typically
+running at 24MHz. Results are scaled by libcpucycles.
+
+`ppc64-mftb`: Requires a 64-bit PowerPC platform. Uses `mftb` and
+`mftbu` to read the "time base". This is an off-core clock, typically
+running at 24MHz. Results are scaled by libcpucycles.
+
+`riscv32-rdcycle`: Requires a 32-bit RISC-V platform. Uses `rdcycle`
+and `rdcycleh` to read a cycle counter.
+
+`riscv64-rdcycle`: Requires a 64-bit RISC-V platform. Uses `rdcycle`
+to read a cycle counter.
+
+`s390x-stckf`: Requires a 64-bit z/Architecture platform. Uses `stckf`
+to read the TOD clock, which is documented to run at 4096MHz. On the
+z15, this looks like a doubling of an off-core 2048MHz clock. Results
+are scaled by libcpucycles.
+
+`sparc64-rdtick`: Requires a 64-bit SPARC platform. Uses `rd %tick`
+to read a cycle counter.
+
+`x86-tsc`, `x86-tscasm`: Same as `amd64-tsc` and `amd64-tscasm`, but
+for 32-bit Intel/AMD platforms instead of 64-bit Intel/AMD platforms.
+
+`default-gettimeofday`: Reasonably portable. Resolution is limited to 1
+microsecond. Results are scaled by libcpucycles.
+
+`default-mach`: Requires an OS with `mach_absolute_time()`. Typically
+runs at 24MHz. Results are scaled by libcpucycles.
+
+`default-monotonic`: Requires `CLOCK_MONOTONIC`. Reasonably portable,
+although it might fail on older systems where `default-gettimeofday` works.
+Resolution is limited to 1 nanosecond. Can be almost as good as a cycle
+counter, or orders of magnitude worse, depending on the OS and CPU.
+Results are scaled by libcpucycles.
+
+`default-perfevent`: Requires the Linux `perf_event` interface, and a
+CPU where `perf_event` supports `PERF_COUNT_HW_CPU_CYCLES`. Similar
+variations in quality to `default-monotonic`, without the 1-nanosecond
+limitation.
+
+`default-zero`: The horrifying last resort if nothing else works.
+
+## Examples
+
+These are examples of `cpucycles-info` output on various machines. The
+machines named `gcc*` are from the
+[GCC Compile Farm](https://gcc.gnu.org/wiki/CompileFarm).
+
+A `median` line saying, e.g., `47 +47+28+0+2-5+0+2-5...` means that the
+differences between adjacent cycle counts were 47+47, 47+28, 47+0, 47+2,
+47−5, 47+0, 47+2, 47−5, etc., with median difference 47. The first few
+differences are typically larger because of cache effects.
+
+`pi3aplus`,
+Broadcom BCM2837B0:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 arm64-pmc precision 9 scaling 1.000000 only32 0
+cpucycles tracesetup 1 arm64-vct precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 189 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 272 scaling 1.400000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 1600 scaling 1400.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1400000000
+cpucycles implementation arm64-pmc
+cpucycles median 10 +10+8+3+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 1032000000...4224666667 with 1024 loops 4 microseconds
+cpucycles observed persecond 1286000000...1756000000 with 2048 loops 7 microseconds
+cpucycles observed persecond 1368266666...1598000000 with 4096 loops 14 microseconds
+cpucycles observed persecond 1366700000...1473428572 with 8192 loops 29 microseconds
+cpucycles observed persecond 1366100000...1417534483 with 16384 loops 59 microseconds
+cpucycles observed persecond 1332739837...1357132232 with 32768 loops 122 microseconds
+cpucycles observed persecond 1354483471...1366945834 with 65536 loops 241 microseconds
+cpucycles observed persecond 1385684989...1392195330 with 131072 loops 472 microseconds
+cpucycles observed persecond 1347223021...1350328528 with 262144 loops 972 microseconds
+cpucycles observed persecond 1375460125...1377069853 with 524288 loops 1905 microseconds
+cpucycles observed persecond 1376527697...1377335961 with 1048576 loops 3808 microseconds
+```
+
+`bblack`,
+TI Sitara XAM3359AZCZ100:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 arm32-cortex precision 8 scaling 1.000000 only32 1
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 1283 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 1200 scaling 1000.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1000000000
+cpucycles implementation arm32-cortex
+cpucycles median 1260 +1506+62+31+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+13+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 622181818...2101888889 with 1024 loops 10 microseconds
+cpucycles observed persecond 806133333...1492615385 with 2048 loops 14 microseconds
+cpucycles observed persecond 879880000...1232565218 with 4096 loops 24 microseconds
+cpucycles observed persecond 939577777...1130581396 with 8192 loops 44 microseconds
+cpucycles observed persecond 956954022...1050047059 with 16384 loops 86 microseconds
+cpucycles observed persecond 982878542...1020685715 with 32768 loops 246 microseconds
+cpucycles observed persecond 988105105...1012217523 with 65536 loops 332 microseconds
+cpucycles observed persecond 993752077...1007159723 with 131072 loops 721 microseconds
+cpucycles observed persecond 995364296...1004009448 with 262144 loops 1377 microseconds
+cpucycles observed persecond 998216306...1001821536 with 524288 loops 2685 microseconds
+cpucycles observed persecond 998991848...1000914196 with 1048576 loops 5397 microseconds
+```
+
+`hiphop`,
+Intel Xeon E3-1220 v3:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 40 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 124 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 124 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 160 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 272 scaling 3.100000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 3300 scaling 3100.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3100000000
+cpucycles implementation amd64-pmc
+cpucycles median 44 +38+23+23+23-4+0-4+0-4+0-4+0+10-4-2+1-4+1-4+1+17+1-4+1-4+1-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4
+cpucycles observed persecond 2066500000...4235000000 with 8192 loops 3 microseconds
+cpucycles observed persecond 2760833333...4200250000 with 16384 loops 5 microseconds
+cpucycles observed persecond 2743416666...3313100000 with 32768 loops 11 microseconds
+cpucycles observed persecond 2986227272...3295000000 with 65536 loops 21 microseconds
+cpucycles observed persecond 3052069767...3206073171 with 131072 loops 42 microseconds
+cpucycles observed persecond 3050395348...3125523810 with 262144 loops 85 microseconds
+cpucycles observed persecond 3085123529...3123059524 with 524288 loops 169 microseconds
+cpucycles observed persecond 3084561764...3103434912 with 1048576 loops 339 microseconds
+```
+
+`nucnuc`,
+Intel Pentium N3700:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 26 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 120 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 120 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 427 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 320 scaling 1.600000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 1800 scaling 1600.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1600000000
+cpucycles implementation amd64-pmc
+cpucycles median 66 +12+12+14+14-1-1+0-1+0-1+0-1+0+1-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+1-1+0-2-1-1+0-1+0-1+0-2+0-1+2+0-1+0-1+0+0-1
+cpucycles observed persecond 1060500000...2325000000 with 2048 loops 3 microseconds
+cpucycles observed persecond 1387166666...2208250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 1376083333...1705500000 with 8192 loops 11 microseconds
+cpucycles observed persecond 1495727272...1671800000 with 16384 loops 21 microseconds
+cpucycles observed persecond 1563428571...1655100000 with 32768 loops 41 microseconds
+cpucycles observed persecond 1580807228...1626234568 with 65536 loops 82 microseconds
+cpucycles observed persecond 1589539393...1612619632 with 131072 loops 164 microseconds
+cpucycles observed persecond 1598841463...1610230062 with 262144 loops 327 microseconds
+cpucycles observed persecond 1564336810...1569988042 with 524288 loops 670 microseconds
+cpucycles observed persecond 1599759725...1602608098 with 1048576 loops 1310 microseconds
+```
+
+`saber214`,
+AMD FX-8350:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 167 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 168 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 376 scaling 4.013452 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 4213 scaling 4013.452000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 4013452000
+cpucycles implementation amd64-tsc
+cpucycles median 77 +87-2+21+7+4+1+0+2-2-7-4+0+1+4-2+3+1-2-2+5-6+2+2+2+2+1-1-1+0-4+0-1-1-1-2+3-1-1+2-2+0+0+2+0+0+2-2-2+1-1-2+2-5+2+0+2+0+1+0+3-2-1-1
+cpucycles observed persecond 2767500000...5759000000 with 4096 loops 3 microseconds
+cpucycles observed persecond 3426000000...4893800000 with 8192 loops 6 microseconds
+cpucycles observed persecond 3724076923...4446363637 with 16384 loops 12 microseconds
+cpucycles observed persecond 3977833333...4363318182 with 32768 loops 23 microseconds
+cpucycles observed persecond 3984854166...4168739131 with 65536 loops 47 microseconds
+cpucycles observed persecond 3981709923...4048193799 with 131072 loops 130 microseconds
+cpucycles observed persecond 3982716417...4026914573 with 262144 loops 200 microseconds
+cpucycles observed persecond 4001637602...4025136987 with 524288 loops 366 microseconds
+cpucycles observed persecond 4007411111...4018600248 with 1048576 loops 809 microseconds
+```
+
+`gcc14`,
+Intel Xeon E5-2620 v3,
+Debian testing (bookworm),
+Linux kernel 6.0.0-6-amd64:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 41 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 148 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 148 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 159 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 289 scaling 3.200000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 3400 scaling 3200.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3200000000
+cpucycles implementation amd64-pmc
+cpucycles median 47 +47+28+0+2-5+0+2-5+16+2-5+0+2-5+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0
+cpucycles observed persecond 1653800000...2819333334 with 8192 loops 4 microseconds
+cpucycles observed persecond 1832111111...2389285715 with 16384 loops 8 microseconds
+cpucycles observed persecond 1936058823...2207200000 with 32768 loops 16 microseconds
+cpucycles observed persecond 2052843750...2196200000 with 65536 loops 31 microseconds
+cpucycles observed persecond 2050750000...2120048388 with 131072 loops 63 microseconds
+cpucycles observed persecond 2081896825...2117048388 with 262144 loops 125 microseconds
+cpucycles observed persecond 2089478087...2107044177 with 524288 loops 250 microseconds
+cpucycles observed persecond 2093343313...2102124249 with 1048576 loops 500 microseconds
+```
+
+`gcc23`,
+Cavium Octeon II V0.1,
+Debian 8.11,
+Linux kernel 4.1.4:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 mips64-cc precision 24 scaling 1.000000 only32 1
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 46702 scaling 2.399988 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 45799 scaling 2399.987654 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2399987654
+cpucycles implementation mips64-cc
+cpucycles median 2177 +828+17+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 641900000...1845125000 with 1024 loops 9 microseconds
+cpucycles observed persecond 745357142...1352083334 with 2048 loops 13 microseconds
+cpucycles observed persecond 809826086...1162333334 with 4096 loops 22 microseconds
+cpucycles observed persecond 897717948...1104405406 with 8192 loops 38 microseconds
+cpucycles observed persecond 957467532...1059986667 with 16384 loops 76 microseconds
+cpucycles observed persecond 973102189...1029777778 with 32768 loops 136 microseconds
+cpucycles observed persecond 986518656...1015830828 with 65536 loops 267 microseconds
+cpucycles observed persecond 993452830...1008166667 with 131072 loops 529 microseconds
+cpucycles observed persecond 996036966...1003403609 with 262144 loops 1054 microseconds
+cpucycles observed persecond 984706378...1001682630 with 524288 loops 2131 microseconds
+cpucycles observed persecond 992585292...1001178580 with 1048576 loops 4296 microseconds
+```
+
+`gcc45`,
+AMD Athlon II X4 640,
+Debian 8.11,
+Linux kernel 3.16.0-11-686-pae:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 x86-tsc precision 199 scaling 1.000000 only32 0
+cpucycles tracesetup 1 x86-tscasm precision 199 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 170 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 941 scaling 3.000000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 3200 scaling 3000.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3000000000
+cpucycles implementation default-perfevent
+cpucycles median 72 +12+0+0+0+0+0+0+0+5+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0
+cpucycles observed persecond 541500000...1812000000 with 1024 loops 3 microseconds
+cpucycles observed persecond 712333333...1212250000 with 2048 loops 5 microseconds
+cpucycles observed persecond 1193285714...1733600000 with 4096 loops 6 microseconds
+cpucycles observed persecond 1689176470...1804562500 with 8192 loops 33 microseconds
+cpucycles observed persecond 1713074626...1770600000 with 16384 loops 66 microseconds
+cpucycles observed persecond 1765107692...1795140625 with 32768 loops 129 microseconds
+cpucycles observed persecond 1785369649...1800603922 with 65536 loops 256 microseconds
+cpucycles observed persecond 1781377862...1796288462 with 131072 loops 261 microseconds
+cpucycles observed persecond 1772647398...1778247827 with 262144 loops 691 microseconds
+cpucycles observed persecond 1789670493...1794149598 with 524288 loops 870 microseconds
+cpucycles observed persecond 1860276211...1861561332 with 1048576 loops 3156 microseconds
+```
+
+`gcc92`,
+SiFive Freedom U740,
+Ubuntu 22.04,
+Linux kernel 5.15.0-1014-generic:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 riscv64-rdcycle precision 8 scaling 1.000000 only32 0
+cpucycles tracesetup 1 default-perfevent precision 3024 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 2599 scaling 2.399988 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 2599 scaling 2399.987654 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2399987654
+cpucycles implementation riscv64-rdcycle
+cpucycles median 8 +33+27+1+1+1+1+0+0+0+22+0+0+0+0+0+0+0+628+0+0+0+7+0+0+0+145+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+158+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+22+0+0+0+0+0
+cpucycles observed persecond 530250000...1978000000 with 1024 loops 3 microseconds
+cpucycles observed persecond 831000000...1915666667 with 2048 loops 4 microseconds
+cpucycles observed persecond 1055750000...1689500000 with 4096 loops 7 microseconds
+cpucycles observed persecond 1045562500...1305428572 with 8192 loops 15 microseconds
+cpucycles observed persecond 1102700000...1236357143 with 16384 loops 29 microseconds
+cpucycles observed persecond 1176053571...1247444445 with 32768 loops 55 microseconds
+cpucycles observed persecond 1173321428...1209127273 with 65536 loops 111 microseconds
+cpucycles observed persecond 1187805429...1205210046 with 131072 loops 220 microseconds
+cpucycles observed persecond 1192415909...1201157535 with 262144 loops 439 microseconds
+cpucycles observed persecond 1194694760...1199247717 with 524288 loops 877 microseconds
+cpucycles observed persecond 1194656004...1197023034 with 1048576 loops 1781 microseconds
+```
+
+`gcc103`,
+Apple M1 (Icestorm-M1 + Firestorm-M1),
+Debian unstable (bookworm),
+Linux kernel 6.0.0-rc5-asahi-00001-gc62bd3fe430f:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 arm64-pmc precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 1 arm64-vct precision 186 scaling 86.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 285 scaling 2.064000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 2264 scaling 2064.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2064000000
+cpucycles implementation arm64-vct
+cpucycles median 0 +0+86+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0
+cpucycles observed persecond 1784500000...3655000000 with 8192 loops 3 microseconds
+cpucycles observed persecond 1773750000...2393666667 with 16384 loops 7 microseconds
+cpucycles observed persecond 1897733333...2222769231 with 32768 loops 14 microseconds
+cpucycles observed persecond 1951310344...2114962963 with 65536 loops 28 microseconds
+cpucycles observed persecond 2024071428...2107000000 with 131072 loops 55 microseconds
+cpucycles observed persecond 2041531531...2082935780 with 262144 loops 110 microseconds
+cpucycles observed persecond 2051158371...2071461188 with 524288 loops 220 microseconds
+cpucycles observed persecond 2058539682...2068309795 with 1048576 loops 440 microseconds
+```
+
+`gcc112` (`gcc2-power8`),
+IBM POWER8E,
+CentOS 7.9 AltArch,
+Linux kernel 3.10.0-1127.13.1.el7.ppc64le:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 ppc64-mftb precision 251 scaling 7.207031 only32 0
+cpucycles tracesetup 1 default-perfevent precision 295 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 536 scaling 3.690000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 3890 scaling 3690.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3690000000
+cpucycles implementation ppc64-mftb
+cpucycles median 195 +2969-8+14+0-8+7-8-7+7+6-7-1+0-1+0+7+7-15+7-1-7+6+0+0-8+0+6+0-8+7+0+7-8-8-7-1+7-8+7+0-8+0+14-8-7+6+0-8+7+7-15+0-1+0-1+14+0-15+14+0-1+7+0
+cpucycles observed persecond 2603750000...5510000000 with 2048 loops 3 microseconds
+cpucycles observed persecond 3430500000...6052250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 3411333333...4457500000 with 8192 loops 11 microseconds
+cpucycles observed persecond 3548695652...4060333334 with 16384 loops 22 microseconds
+cpucycles observed persecond 3624977777...3876534884 with 32768 loops 44 microseconds
+cpucycles observed persecond 3621855555...3745363637 with 65536 loops 89 microseconds
+cpucycles observed persecond 3660157303...3722227273 with 131072 loops 177 microseconds
+cpucycles observed persecond 3680471751...3711622160 with 262144 loops 353 microseconds
+cpucycles observed persecond 3685321074...3700886525 with 524288 loops 706 microseconds
+cpucycles observed persecond 3687745930...3695537208 with 1048576 loops 1412 microseconds
+```
+
+`gcc202`,
+UltraSparc T5,
+Debian unstable (bookworm),
+Linux kernel 5.19.0-2-sparc64-smp:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 sparc64-rdtick precision 65 scaling 1.000000 only32 0
+cpucycles tracesetup 1 default-perfevent precision 386 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 442 scaling 3.599910 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 3799 scaling 3599.910000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3599910000
+cpucycles implementation sparc64-rdtick
+cpucycles median 73 +24+0+24+24+24+24+24+24+0+1+24+0+1+24+0+1+24+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 2751500000...4258250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 3289200000...4206875000 with 8192 loops 9 microseconds
+cpucycles observed persecond 3454789473...3900823530 with 16384 loops 18 microseconds
+cpucycles observed persecond 3452026315...3659888889 with 32768 loops 37 microseconds
+cpucycles observed persecond 3543770270...3650916667 with 65536 loops 73 microseconds
+cpucycles observed persecond 3567299319...3620662069 with 131072 loops 146 microseconds
+cpucycles observed persecond 3591373287...3618220690 with 262144 loops 291 microseconds
+cpucycles observed persecond 3597353344...3610774527 with 524288 loops 582 microseconds
+cpucycles observed persecond 3595899403...3603058071 with 1048576 loops 1172 microseconds
+```
+
+IBM z15:
+```
+cpucycles version 20230106
+cpucycles tracesetup 0 s390x-stckf precision 250 scaling 1.269531 only32 0
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 272 scaling 5.200000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 5400 scaling 5200.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 5200000000
+cpucycles implementation s390x-stckf
+cpucycles median 48 +87+8+0-2+0+0+38-2+0+1-3+1+28+0+3-3+1+0+28+0-2+3+0-2+36+0+0+0+1+0+28+0-2+0+3-2+35+1+0-2+0+3+28+0-2+0+0-2+3+25+3+0-2+0+1+35+1+0+0-2+0+28+0
+cpucycles observed persecond 4948941176...5627733334 with 8192 loops 16 microseconds
+cpucycles observed persecond 4104125000...5515666667 with 16384 loops 7 microseconds
+cpucycles observed persecond 5047076923...5987818182 with 32768 loops 12 microseconds
+cpucycles observed persecond 5044846153...5475708334 with 65536 loops 25 microseconds
+cpucycles observed persecond 5141313725...5357428572 with 131072 loops 50 microseconds
+cpucycles observed persecond 5150892156...5257250000 with 262144 loops 101 microseconds
+cpucycles observed persecond 5183421568...5236549505 with 524288 loops 203 microseconds
+cpucycles observed persecond 5190282555...5216582717 with 1048576 loops 406 microseconds
+```
diff --git a/cpu-cycles/libcpucycles/doc/download.md b/cpu-cycles/libcpucycles/doc/download.md
new file mode 100644
index 0000000000..6f72ddb220
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/download.md
@@ -0,0 +1,30 @@
+To download and unpack the latest version of libcpucycles:
+
+ wget -m https://cpucycles.cr.yp.to/libcpucycles-latest-version.txt
+ version=$(cat cpucycles.cr.yp.to/libcpucycles-latest-version.txt)
+ wget -m https://cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
+ tar -xzf cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
+ cd libcpucycles-$version
+
+Then [install](install.html).
+
+### Archives and changelog (reverse chronological)
+
+[`libcpucycles-20230115.tar.gz`](libcpucycles-20230115.tar.gz) [browse](libcpucycles-20230115.html)
+
+Update actual `cpucycles_version()` behavior to match documentation.
+
+[`libcpucycles-20230110.tar.gz`](libcpucycles-20230110.tar.gz) [browse](libcpucycles-20230110.html)
+
+`doc/api.md`: Document `cpucycles_version()`.
+
+Add `s390x-stckf` counter.
+
+`cpucycles/default-perfevent.c`: Read into `int64_t` instead of `long long`.
+Add comment explaining issues with `PERF_FORMAT_TOTAL_TIME_RUNNING`.
+
+`configure`: Improve `uname` handling.
+
+`doc/api.md`: Update description of default frequency.
+
+[`libcpucycles-20230105.tar.gz`](libcpucycles-20230105.tar.gz) [browse](libcpucycles-20230105.html)
diff --git a/cpu-cycles/libcpucycles/doc/html/api.html b/cpu-cycles/libcpucycles/doc/html/api.html
new file mode 100644
index 0000000000..1547c19647
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/api.html
@@ -0,0 +1,91 @@
+
+
+
+
+
+
+API
+
+
+
+libcpucycles
+
+
+
+
NAME
+
cpucycles - count CPU cycles
+
SYNOPSIS
+
#include <cpucycles.h>
+
+long long count = cpucycles();
+long long persecond = cpucycles_persecond();
+const char *implementation = cpucycles_implementation();
+const char *version = cpucycles_version();
+
+
Link with -lcpucycles. Old systems may also need -lrt.
+
DESCRIPTION
+
cpucycles() returns an estimate for the number of CPU cycles that have
+occurred since an unspecified time in the past (perhaps system boot,
+perhaps program startup).
+
Accessing true cycle counters can be difficult on some CPUs and
+operating systems. cpucycles() does its best to produce accurate
+results, but selects a low-precision counter if the only other option is
+failure.
+
cpucycles_persecond() returns an estimate for the number of CPU cycles
+per second. This estimate comes from /etc/cpucyclespersecond if that
+file exists, otherwise from various OS mechanisms, otherwise from the
+cpucyclespersecond environment variable if that is set, otherwise
+2399987654.
+
cpucycles_implementation() returns the name of the counter in use:
+e.g., "amd64-pmc".
+
cpucycles_version() returns the libcpucycles version number as a
+string: e.g., "20230115". Results of cpucycles_implementation()
+should be interpreted relative to cpucycles_version().
+
cpucycles is actually a function pointer. The first call to
+cpucycles() or cpucycles_persecond() or cpucycles_implementation()
+selects one of the available counters and updates the cpucycles
+pointer accordingly. Subsequent calls to cpucycles() are thread-safe.
+
SEE ALSO
+
gettimeofday(2), clock_gettime(2)
Version:
+This is version 2023.01.15 of the "API" web page.
+
+
+
+
diff --git a/cpu-cycles/libcpucycles/doc/html/counters.html b/cpu-cycles/libcpucycles/doc/html/counters.html
new file mode 100644
index 0000000000..6ebc06222c
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/counters.html
@@ -0,0 +1,456 @@
+
+
+
+
+
+
+Counters
+
+
+
+libcpucycles
+
+
+
+
Currently libcpucycles supports the following cycle counters. Some
+cycle counters are actually other forms of counters that libcpucycles
+scales to imitate a cycle counter. There is
+separate documentation
+for how libcpucycles makes a choice of cycle counter. See also
+security considerations regarding enabling or disabling
+counters and regarding Turbo Boost.
+
amd64-pmc: Requires a 64-bit Intel/AMD platform. Requires the Linux
+perf_event interface. Accesses a cycle counter through RDPMC. Requires
+/proc/sys/kernel/perf_event_paranoid to be at most 2 for user-level
+RDPMC access. This counter runs at the clock frequency of the CPU core.
+
amd64-tsc, amd64-tscasm: Requires a 64-bit Intel/AMD platform.
+Requires RDTSC to be enabled, which it is by default. Uses RDTSC to
+access the CPU's time-stamp counter. On current CPUs, this is an
+off-core clock rather than a cycle counter, but it is typically a very
+fast off-core clock, making it adequate for seeing cycle counts if
+overclocking and underclocking are disabled. The difference between
+tsc and tscasm is that tsc uses the compiler's __rdtsc() while
+tscasm uses inline assembly.
+
arm32-cortex: Requires a 32-bit ARMv7-A platform. Uses
+mrc p15, 0, %0, c9, c13, 0 to read the cycle counter. Requires user
+access to the cycle counter, which is not enabled by default but can be
+enabled under Linux via
+a kernel module.
+This counter is natively 32 bits, but libcpucycles watches how the
+counter and gettimeofday increase to compute a 64-bit extension of the
+counter.
+
arm64-pmc: Requires a 64-bit ARMv8-A platform. Uses
+mrs %0, PMCCNTR_EL0 to read the cycle counter. Requires user access
+to the cycle counter, which is not enabled by default but can be enabled
+under Linux via
+a kernel module.
+
arm64-vct: Requires a 64-bit ARMv8-A platform. Uses
+mrs %0, CNTVCT_EL0 to read a "virtual count" timer. This is an
+off-core clock, typically running at 24MHz. Results are scaled by
+libcpucycles.
+
mips64-cc: Requires a 64-bit MIPS platform. (Maybe the same code would
+also work as mips32-cc, but this has not been tested yet.) Uses RDHWR
+to read the hardware cycle counter (hardware register 2 times a constant
+scale factor in hardware register 3). This counter is natively 32 bits,
+but libcpucycles watches how the counter and gettimeofday increase to
+compute a 64-bit extension of the counter.
+
ppc32-mftb: Requires a 32-bit PowerPC platform. Uses mftb and
+mftbu to read the "time base". This is an off-core clock, typically
+running at 24MHz.
+
ppc64-mftb: Requires a 64-bit PowerPC platform. Uses mftb and
+mftbu to read the "time base". This is an off-core clock, typically
+running at 512MHz. Results are scaled by libcpucycles.
+
riscv32-rdcycle: Requires a 32-bit RISC-V platform. Uses rdcycle
+and rdcycleh to read a cycle counter.
+
riscv64-rdcycle: Requires a 64-bit RISC-V platform. Uses rdcycle
+to read a cycle counter.
+
s390x-stckf: Requires a 64-bit z/Architecture platform. Uses stckf
+to read the TOD clock, which is documented to run at 4096MHz. On the
+z15, this looks like a doubling of an off-core 2048MHz clock. Results
+are scaled by libcpucycles.
+
sparc64-rdtick: Requires a 64-bit SPARC platform. Uses rd %tick
+to read a cycle counter.
+
x86-tsc, x86-tscasm: Same as amd64-tsc and amd64-tscasm, but
+for 32-bit Intel/AMD platforms instead of 64-bit Intel/AMD platforms.
+
default-gettimeofday: Reasonably portable. Resolution is limited to 1
+microsecond. Results are scaled by libcpucycles.
+
default-mach: Requires an OS with mach_absolute_time(). Typically
+runs at 24MHz. Results are scaled by libcpucycles.
+
default-monotonic: Requires CLOCK_MONOTONIC. Reasonably portable,
+although might fail on older systems where default-gettimeofday works.
+Resolution is limited to 1 nanosecond. Can be almost as good as a cycle
+counter, or orders of magnitude worse, depending on the OS and CPU.
+Results are scaled by libcpucycles.
+
default-perfevent: Requires the Linux perf_event interface, and a
+CPU where perf_event supports PERF_COUNT_HW_CPU_CYCLES. Similar
+variations in quality to default-monotonic, without the 1-nanosecond
+limitation.
+
default-zero: The horrifying last resort if nothing else works.
+
Examples
+
These are examples of cpucycles-info output on various machines. The
+machines named gcc* are from the
+GCC Compile Farm.
+
A median line saying, e.g., 47 +47+28+0+2-5+0+2-5... means that the
+differences between adjacent cycle counts were 47+47, 47+28, 47+0, 47+2,
+47−5, 47+0, 47+2, 47−5, etc., with median difference 47. The first few
+differences are typically larger because of cache effects.
+
pi3aplus,
+Broadcom BCM2837B0:
+
cpucycles version 20230105
+cpucycles tracesetup 0 arm64-pmc precision 9 scaling 1.000000 only32 0
+cpucycles tracesetup 1 arm64-vct precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 189 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 272 scaling 1.400000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 1600 scaling 1400.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1400000000
+cpucycles implementation arm64-pmc
+cpucycles median 10 +10+8+3+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 1032000000...4224666667 with 1024 loops 4 microseconds
+cpucycles observed persecond 1286000000...1756000000 with 2048 loops 7 microseconds
+cpucycles observed persecond 1368266666...1598000000 with 4096 loops 14 microseconds
+cpucycles observed persecond 1366700000...1473428572 with 8192 loops 29 microseconds
+cpucycles observed persecond 1366100000...1417534483 with 16384 loops 59 microseconds
+cpucycles observed persecond 1332739837...1357132232 with 32768 loops 122 microseconds
+cpucycles observed persecond 1354483471...1366945834 with 65536 loops 241 microseconds
+cpucycles observed persecond 1385684989...1392195330 with 131072 loops 472 microseconds
+cpucycles observed persecond 1347223021...1350328528 with 262144 loops 972 microseconds
+cpucycles observed persecond 1375460125...1377069853 with 524288 loops 1905 microseconds
+cpucycles observed persecond 1376527697...1377335961 with 1048576 loops 3808 microseconds
+
+
bblack,
+TI Sitara XAM3359AZCZ100:
+
cpucycles version 20230105
+cpucycles tracesetup 0 arm32-cortex precision 8 scaling 1.000000 only32 1
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 1283 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 1200 scaling 1000.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1000000000
+cpucycles implementation arm32-cortex
+cpucycles median 1260 +1506+62+31+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+13+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 622181818...2101888889 with 1024 loops 10 microseconds
+cpucycles observed persecond 806133333...1492615385 with 2048 loops 14 microseconds
+cpucycles observed persecond 879880000...1232565218 with 4096 loops 24 microseconds
+cpucycles observed persecond 939577777...1130581396 with 8192 loops 44 microseconds
+cpucycles observed persecond 956954022...1050047059 with 16384 loops 86 microseconds
+cpucycles observed persecond 982878542...1020685715 with 32768 loops 246 microseconds
+cpucycles observed persecond 988105105...1012217523 with 65536 loops 332 microseconds
+cpucycles observed persecond 993752077...1007159723 with 131072 loops 721 microseconds
+cpucycles observed persecond 995364296...1004009448 with 262144 loops 1377 microseconds
+cpucycles observed persecond 998216306...1001821536 with 524288 loops 2685 microseconds
+cpucycles observed persecond 998991848...1000914196 with 1048576 loops 5397 microseconds
+
+
hiphop,
+Intel Xeon E3-1220 v3:
+
cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 40 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 124 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 124 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 160 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 272 scaling 3.100000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 3300 scaling 3100.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3100000000
+cpucycles implementation amd64-pmc
+cpucycles median 44 +38+23+23+23-4+0-4+0-4+0-4+0+10-4-2+1-4+1-4+1+17+1-4+1-4+1-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4
+cpucycles observed persecond 2066500000...4235000000 with 8192 loops 3 microseconds
+cpucycles observed persecond 2760833333...4200250000 with 16384 loops 5 microseconds
+cpucycles observed persecond 2743416666...3313100000 with 32768 loops 11 microseconds
+cpucycles observed persecond 2986227272...3295000000 with 65536 loops 21 microseconds
+cpucycles observed persecond 3052069767...3206073171 with 131072 loops 42 microseconds
+cpucycles observed persecond 3050395348...3125523810 with 262144 loops 85 microseconds
+cpucycles observed persecond 3085123529...3123059524 with 524288 loops 169 microseconds
+cpucycles observed persecond 3084561764...3103434912 with 1048576 loops 339 microseconds
+
+
nucnuc,
+Intel Pentium N3700:
+
cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 26 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 120 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 120 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 427 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 320 scaling 1.600000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 1800 scaling 1600.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1600000000
+cpucycles implementation amd64-pmc
+cpucycles median 66 +12+12+14+14-1-1+0-1+0-1+0-1+0+1-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+1-1+0-2-1-1+0-1+0-1+0-2+0-1+2+0-1+0-1+0+0-1
+cpucycles observed persecond 1060500000...2325000000 with 2048 loops 3 microseconds
+cpucycles observed persecond 1387166666...2208250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 1376083333...1705500000 with 8192 loops 11 microseconds
+cpucycles observed persecond 1495727272...1671800000 with 16384 loops 21 microseconds
+cpucycles observed persecond 1563428571...1655100000 with 32768 loops 41 microseconds
+cpucycles observed persecond 1580807228...1626234568 with 65536 loops 82 microseconds
+cpucycles observed persecond 1589539393...1612619632 with 131072 loops 164 microseconds
+cpucycles observed persecond 1598841463...1610230062 with 262144 loops 327 microseconds
+cpucycles observed persecond 1564336810...1569988042 with 524288 loops 670 microseconds
+cpucycles observed persecond 1599759725...1602608098 with 1048576 loops 1310 microseconds
+
+
saber214,
+AMD FX-8350:
+
cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 167 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 168 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 376 scaling 4.013452 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 4213 scaling 4013.452000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 4013452000
+cpucycles implementation amd64-tsc
+cpucycles median 77 +87-2+21+7+4+1+0+2-2-7-4+0+1+4-2+3+1-2-2+5-6+2+2+2+2+1-1-1+0-4+0-1-1-1-2+3-1-1+2-2+0+0+2+0+0+2-2-2+1-1-2+2-5+2+0+2+0+1+0+3-2-1-1
+cpucycles observed persecond 2767500000...5759000000 with 4096 loops 3 microseconds
+cpucycles observed persecond 3426000000...4893800000 with 8192 loops 6 microseconds
+cpucycles observed persecond 3724076923...4446363637 with 16384 loops 12 microseconds
+cpucycles observed persecond 3977833333...4363318182 with 32768 loops 23 microseconds
+cpucycles observed persecond 3984854166...4168739131 with 65536 loops 47 microseconds
+cpucycles observed persecond 3981709923...4048193799 with 131072 loops 130 microseconds
+cpucycles observed persecond 3982716417...4026914573 with 262144 loops 200 microseconds
+cpucycles observed persecond 4001637602...4025136987 with 524288 loops 366 microseconds
+cpucycles observed persecond 4007411111...4018600248 with 1048576 loops 809 microseconds
+
+
gcc14,
+Intel Xeon E5-2620 v3,
+Debian testing (bookworm),
+Linux kernel 6.0.0-6-amd64:
+
cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 41 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 148 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 148 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 159 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 289 scaling 3.200000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 3400 scaling 3200.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3200000000
+cpucycles implementation amd64-pmc
+cpucycles median 47 +47+28+0+2-5+0+2-5+16+2-5+0+2-5+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0
+cpucycles observed persecond 1653800000...2819333334 with 8192 loops 4 microseconds
+cpucycles observed persecond 1832111111...2389285715 with 16384 loops 8 microseconds
+cpucycles observed persecond 1936058823...2207200000 with 32768 loops 16 microseconds
+cpucycles observed persecond 2052843750...2196200000 with 65536 loops 31 microseconds
+cpucycles observed persecond 2050750000...2120048388 with 131072 loops 63 microseconds
+cpucycles observed persecond 2081896825...2117048388 with 262144 loops 125 microseconds
+cpucycles observed persecond 2089478087...2107044177 with 524288 loops 250 microseconds
+cpucycles observed persecond 2093343313...2102124249 with 1048576 loops 500 microseconds
+
+
gcc23,
+Cavium Octeon II V0.1,
+Debian 8.11,
+Linux kernel 4.1.4:
+
cpucycles version 20230105
+cpucycles tracesetup 0 mips64-cc precision 24 scaling 1.000000 only32 1
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 46702 scaling 2.399988 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 45799 scaling 2399.987654 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2399987654
+cpucycles implementation mips64-cc
+cpucycles median 2177 +828+17+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 641900000...1845125000 with 1024 loops 9 microseconds
+cpucycles observed persecond 745357142...1352083334 with 2048 loops 13 microseconds
+cpucycles observed persecond 809826086...1162333334 with 4096 loops 22 microseconds
+cpucycles observed persecond 897717948...1104405406 with 8192 loops 38 microseconds
+cpucycles observed persecond 957467532...1059986667 with 16384 loops 76 microseconds
+cpucycles observed persecond 973102189...1029777778 with 32768 loops 136 microseconds
+cpucycles observed persecond 986518656...1015830828 with 65536 loops 267 microseconds
+cpucycles observed persecond 993452830...1008166667 with 131072 loops 529 microseconds
+cpucycles observed persecond 996036966...1003403609 with 262144 loops 1054 microseconds
+cpucycles observed persecond 984706378...1001682630 with 524288 loops 2131 microseconds
+cpucycles observed persecond 992585292...1001178580 with 1048576 loops 4296 microseconds
+
+
gcc45,
+AMD Athlon II X4 640,
+Debian 8.11,
+Linux kernel 3.16.0-11-686-pae:
+
cpucycles version 20230105
+cpucycles tracesetup 0 x86-tsc precision 199 scaling 1.000000 only32 0
+cpucycles tracesetup 1 x86-tscasm precision 199 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 170 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 941 scaling 3.000000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 3200 scaling 3000.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3000000000
+cpucycles implementation default-perfevent
+cpucycles median 72 +12+0+0+0+0+0+0+0+5+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0
+cpucycles observed persecond 541500000...1812000000 with 1024 loops 3 microseconds
+cpucycles observed persecond 712333333...1212250000 with 2048 loops 5 microseconds
+cpucycles observed persecond 1193285714...1733600000 with 4096 loops 6 microseconds
+cpucycles observed persecond 1689176470...1804562500 with 8192 loops 33 microseconds
+cpucycles observed persecond 1713074626...1770600000 with 16384 loops 66 microseconds
+cpucycles observed persecond 1765107692...1795140625 with 32768 loops 129 microseconds
+cpucycles observed persecond 1785369649...1800603922 with 65536 loops 256 microseconds
+cpucycles observed persecond 1781377862...1796288462 with 131072 loops 261 microseconds
+cpucycles observed persecond 1772647398...1778247827 with 262144 loops 691 microseconds
+cpucycles observed persecond 1789670493...1794149598 with 524288 loops 870 microseconds
+cpucycles observed persecond 1860276211...1861561332 with 1048576 loops 3156 microseconds
+
+
gcc92,
+SiFive Freedom U740,
+Ubuntu 22.04,
+Linux kernel 5.15.0-1014-generic:
+
cpucycles version 20230105
+cpucycles tracesetup 0 riscv64-rdcycle precision 8 scaling 1.000000 only32 0
+cpucycles tracesetup 1 default-perfevent precision 3024 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 2599 scaling 2.399988 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 2599 scaling 2399.987654 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2399987654
+cpucycles implementation riscv64-rdcycle
+cpucycles median 8 +33+27+1+1+1+1+0+0+0+22+0+0+0+0+0+0+0+628+0+0+0+7+0+0+0+145+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+158+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+22+0+0+0+0+0
+cpucycles observed persecond 530250000...1978000000 with 1024 loops 3 microseconds
+cpucycles observed persecond 831000000...1915666667 with 2048 loops 4 microseconds
+cpucycles observed persecond 1055750000...1689500000 with 4096 loops 7 microseconds
+cpucycles observed persecond 1045562500...1305428572 with 8192 loops 15 microseconds
+cpucycles observed persecond 1102700000...1236357143 with 16384 loops 29 microseconds
+cpucycles observed persecond 1176053571...1247444445 with 32768 loops 55 microseconds
+cpucycles observed persecond 1173321428...1209127273 with 65536 loops 111 microseconds
+cpucycles observed persecond 1187805429...1205210046 with 131072 loops 220 microseconds
+cpucycles observed persecond 1192415909...1201157535 with 262144 loops 439 microseconds
+cpucycles observed persecond 1194694760...1199247717 with 524288 loops 877 microseconds
+cpucycles observed persecond 1194656004...1197023034 with 1048576 loops 1781 microseconds
+
+
gcc103,
+Apple M1 (Icestorm-M1 + Firestorm-M1),
+Debian unstable (bookworm),
+Linux kernel 6.0.0-rc5-asahi-00001-gc62bd3fe430f:
+
cpucycles version 20230105
+cpucycles tracesetup 0 arm64-pmc precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 1 arm64-vct precision 186 scaling 86.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 285 scaling 2.064000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 2264 scaling 2064.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2064000000
+cpucycles implementation arm64-vct
+cpucycles median 0 +0+86+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0
+cpucycles observed persecond 1784500000...3655000000 with 8192 loops 3 microseconds
+cpucycles observed persecond 1773750000...2393666667 with 16384 loops 7 microseconds
+cpucycles observed persecond 1897733333...2222769231 with 32768 loops 14 microseconds
+cpucycles observed persecond 1951310344...2114962963 with 65536 loops 28 microseconds
+cpucycles observed persecond 2024071428...2107000000 with 131072 loops 55 microseconds
+cpucycles observed persecond 2041531531...2082935780 with 262144 loops 110 microseconds
+cpucycles observed persecond 2051158371...2071461188 with 524288 loops 220 microseconds
+cpucycles observed persecond 2058539682...2068309795 with 1048576 loops 440 microseconds
+
+
gcc112 (gcc2-power8),
+IBM POWER8E,
+CentOS 7.9 AltArch,
+Linux kernel 3.10.0-1127.13.1.el7.ppc64le:
+
cpucycles version 20230105
+cpucycles tracesetup 0 ppc64-mftb precision 251 scaling 7.207031 only32 0
+cpucycles tracesetup 1 default-perfevent precision 295 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 536 scaling 3.690000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 3890 scaling 3690.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3690000000
+cpucycles implementation ppc64-mftb
+cpucycles median 195 +2969-8+14+0-8+7-8-7+7+6-7-1+0-1+0+7+7-15+7-1-7+6+0+0-8+0+6+0-8+7+0+7-8-8-7-1+7-8+7+0-8+0+14-8-7+6+0-8+7+7-15+0-1+0-1+14+0-15+14+0-1+7+0
+cpucycles observed persecond 2603750000...5510000000 with 2048 loops 3 microseconds
+cpucycles observed persecond 3430500000...6052250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 3411333333...4457500000 with 8192 loops 11 microseconds
+cpucycles observed persecond 3548695652...4060333334 with 16384 loops 22 microseconds
+cpucycles observed persecond 3624977777...3876534884 with 32768 loops 44 microseconds
+cpucycles observed persecond 3621855555...3745363637 with 65536 loops 89 microseconds
+cpucycles observed persecond 3660157303...3722227273 with 131072 loops 177 microseconds
+cpucycles observed persecond 3680471751...3711622160 with 262144 loops 353 microseconds
+cpucycles observed persecond 3685321074...3700886525 with 524288 loops 706 microseconds
+cpucycles observed persecond 3687745930...3695537208 with 1048576 loops 1412 microseconds
+
+
gcc202,
+UltraSparc T5,
+Debian unstable (bookworm),
+Linux kernel 5.19.0-2-sparc64-smp:
+
cpucycles version 20230105
+cpucycles tracesetup 0 sparc64-rdtick precision 65 scaling 1.000000 only32 0
+cpucycles tracesetup 1 default-perfevent precision 386 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 442 scaling 3.599910 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 3799 scaling 3599.910000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3599910000
+cpucycles implementation sparc64-rdtick
+cpucycles median 73 +24+0+24+24+24+24+24+24+0+1+24+0+1+24+0+1+24+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 2751500000...4258250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 3289200000...4206875000 with 8192 loops 9 microseconds
+cpucycles observed persecond 3454789473...3900823530 with 16384 loops 18 microseconds
+cpucycles observed persecond 3452026315...3659888889 with 32768 loops 37 microseconds
+cpucycles observed persecond 3543770270...3650916667 with 65536 loops 73 microseconds
+cpucycles observed persecond 3567299319...3620662069 with 131072 loops 146 microseconds
+cpucycles observed persecond 3591373287...3618220690 with 262144 loops 291 microseconds
+cpucycles observed persecond 3597353344...3610774527 with 524288 loops 582 microseconds
+cpucycles observed persecond 3595899403...3603058071 with 1048576 loops 1172 microseconds
+
+
IBM z15:
+
cpucycles version 20230106
+cpucycles tracesetup 0 s390x-stckf precision 250 scaling 1.269531 only32 0
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 272 scaling 5.200000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 5400 scaling 5200.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 5200000000
+cpucycles implementation s390x-stckf
+cpucycles median 48 +87+8+0-2+0+0+38-2+0+1-3+1+28+0+3-3+1+0+28+0-2+3+0-2+36+0+0+0+1+0+28+0-2+0+3-2+35+1+0-2+0+3+28+0-2+0+0-2+3+25+3+0-2+0+1+35+1+0+0-2+0+28+0
+cpucycles observed persecond 4948941176...5627733334 with 8192 loops 16 microseconds
+cpucycles observed persecond 4104125000...5515666667 with 16384 loops 7 microseconds
+cpucycles observed persecond 5047076923...5987818182 with 32768 loops 12 microseconds
+cpucycles observed persecond 5044846153...5475708334 with 65536 loops 25 microseconds
+cpucycles observed persecond 5141313725...5357428572 with 131072 loops 50 microseconds
+cpucycles observed persecond 5150892156...5257250000 with 262144 loops 101 microseconds
+cpucycles observed persecond 5183421568...5236549505 with 524288 loops 203 microseconds
+cpucycles observed persecond 5190282555...5216582717 with 1048576 loops 406 microseconds
+
Version:
+This is version 2023.01.06 of the "Counters" web page.
+
+
+
+
diff --git a/cpu-cycles/libcpucycles/doc/html/download.html b/cpu-cycles/libcpucycles/doc/html/download.html
new file mode 100644
index 0000000000..9a4230bf1b
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/download.html
@@ -0,0 +1,75 @@
+
+
+
+
+
+
+Download
+
+
+
+libcpucycles
+
+
+
+
To download and unpack the latest version of libcpucycles:
+
wget -m https://cpucycles.cr.yp.to/libcpucycles-latest-version.txt
+ version=$(cat cpucycles.cr.yp.to/libcpucycles-latest-version.txt)
+ wget -m https://cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
+ tar -xzf cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
+ cd libcpucycles-$version
+
+
Then install.
+
Archives and changelog (reverse chronological)
+
libcpucycles-20230115.tar.gz browse
+
Update actual cpucycles_version behavior to match documentation.
+
libcpucycles-20230110.tar.gz browse
+
doc/api.md: Document cpucycles_version().
+
Add s390x-stckf counter.
+
cpucycles/default-perfevent.c: Read into int64_t instead of long long.
+Add comment explaining issues with PERF_FORMAT_TOTAL_TIME_RUNNING.
+
configure: Improve uname handling.
+
doc/api.md: Update description of default frequency.
+
libcpucycles-20230105.tar.gz browse
Version:
+This is version 2023.01.15 of the "Download" web page.
+
+
+
+
diff --git a/cpu-cycles/libcpucycles/doc/html/index.html b/cpu-cycles/libcpucycles/doc/html/index.html
new file mode 100644
index 0000000000..c6ecb3e1d1
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/index.html
@@ -0,0 +1,88 @@
+
+
+
+
+
+
+Intro
+
+
+
+libcpucycles
+
+
+
+
libcpucycles is a public-domain microlibrary for counting CPU cycles.
+Cycle counts are not as detailed as
+Falk diagrams
+but are the most precise timers available to typical software; they are
+central tools used in understanding and improving software performance.
+
The libcpucycles API is simple: include <cpucycles.h>, call
+cpucycles() to receive a long long whenever desired, and link with
+-lcpucycles.
+
Internally, libcpucycles understands machine-level
+cycle counters for amd64 (both PMC and TSC), arm32, arm64 (both PMC and
+VCT), mips64, ppc32, ppc64, riscv32, riscv64, s390x, sparc64, and x86.
+libcpucycles also understands four OS-level mechanisms, which give
+varying levels of accuracy: mach_absolute_time, perf_event,
+CLOCK_MONOTONIC, and, as a fallback, microsecond-resolution
+gettimeofday.
+
When the program first calls cpucycles(), libcpucycles automatically
+benchmarks the available mechanisms and selects the
+mechanism that does the best job. Subsequent cpucycles() calls are
+thread-safe and very fast. An accompanying cpucycles-info program
+prints a summary of cycle-counter accuracy.
+
For comparison, there is a simple-sounding __rdtsc() API provided by
+compilers, but this works only on Intel/AMD CPUs and is generally noisier
+than PMC. There is a __builtin_readcyclecounter() that works on more
+CPUs, but this works only with clang and has the same noise problems.
+Both of these mechanisms put the burden on the caller to figure out what
+can be done on other CPUs. Various packages include their own more
+portable abstraction layers for counting cycles (see, e.g., FFTW's
+cycle.h,
+used to automatically select from among multiple implementations
+provided by FFTW), but this creates per-package effort to keep up with
+the latest cycle counters. The goal of libcpucycles is to provide
+state-of-the-art cycle counting centrally for all packages to use.
Version:
+This is version 2023.01.06 of the "Intro" web page.
+
+
+
+
diff --git a/cpu-cycles/libcpucycles/doc/html/install.html b/cpu-cycles/libcpucycles/doc/html/install.html
new file mode 100644
index 0000000000..4d2899f41c
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/install.html
@@ -0,0 +1,101 @@
+
+
+
+
+
+
+Install
+
+
+
+libcpucycles
+
+
+
+
Prerequisites: python3; gcc and/or clang. Currently tested only
+under Linux, but porting to other systems shouldn't be difficult.
+
For sysadmins, to install in /usr/local/{include,lib,bin}:
+
./configure && make -j8 install
+
+
For developers with an unprivileged account (typically with
+
export LD_LIBRARY_PATH="$HOME/lib"
+ export LIBRARY_PATH="$HOME/lib"
+ export CPATH="$HOME/include"
+ export PATH="$HOME/bin:$PATH"
+
+
in $HOME/.profile), to install in $HOME/{include,lib,bin}:
+
./configure --prefix=$HOME && make -j8 install
+
+
For distributors creating a package: Run
+
./configure --prefix=/usr && make -j8
+
+
and then follow your usual packaging procedures for the
+build/0/package files:
+
build/0/package/man/man3/cpucycles.3
+ build/0/package/include/cpucycles.h
+ build/0/package/lib/libcpucycles*
+ build/0/package/bin/cpucycles-info
+
+
There are some old systems where libcpucycles requires -lrt for
+clock_gettime; currently libcpucycles.so doesn't link to -lrt,
+so it's up to the caller to link to -lrt.
+
More options: You can run
+
./configure --host=amd64
+
+
to override ./configure's guess of the architecture that it should
+compile for. The architecture controls which cycle counters to try
+compiling: e.g., amd64 tries compiling cpucycles/amd64* and
+cpucycles/default*.
+
Inside the build directory, 0 is symlinked to amd64 for
+--host=amd64. Running make clean removes build/amd64. Re-running
+./configure automatically starts with make clean.
+
A subsequent ./configure --host=arm64 will create build/arm64 and
+symlink 0 -> arm64, without touching an existing build/amd64.
+However, cross-compilers aren't yet selected automatically.
+
Compilers tried are listed in compilers/default. Each compiler
+includes -fPIC to create a shared library, -fvisibility=hidden to
+hide non-public symbols in the library, and -fwrapv to switch to a
+slightly less dangerous version of C. The first compiler that seems to
+work is used to compile everything.
Version:
+This is version 2023.01.05 of the "Install" web page.
+
+
+
+
diff --git a/cpu-cycles/libcpucycles/doc/html/security.html b/cpu-cycles/libcpucycles/doc/html/security.html
new file mode 100644
index 0000000000..e978bcddb4
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/security.html
@@ -0,0 +1,122 @@
+
+
+
+
+
+
+Security
+
+
+
+libcpucycles
+
+
+
+
Many security systems have been shown to be breakable by "timing
+attacks". These attacks extract secrets by analyzing timings of the
+legitimate user's operations on secret data. See the June 2022 survey
+page https://timing.attacks.cr.yp.to
+for an overview and further references.
+
Sometimes these attacks are used as motivation to disable the attacker's
+access to various timing mechanisms. For example, Firefox rounds its
+performance.now timer to 1-millisecond resolution
+"to mitigate potential security threats".
+
As another example, reducing /proc/sys/kernel/perf_event_paranoid
+under Linux to 2 (from 3 or higher), so that libcpucycles has access to
+the best available Intel/AMD cycle counter (RDPMC), also means making
+this cycle counter and other performance-monitoring counters available
+to any attacker-controlled software running on the computer. Perhaps
+this helps timing attacks, not to mention the possibility of opening up
+other vulnerabilities via the complicated perf_event interface.
+
As yet another example, ARM CPUs disable user access to the main CPU
+cycle counter by default. Installing a kernel module to enable user
+access to the cycle counter could help attacks.
+
Given the availability of simple mechanisms to disable RDPMC etc., it is
+easy to recommend using those mechanisms. To avoid creating unnecessary
+tension between those recommendations and the use of libcpucycles,
+applications that use libcpucycles should be structured so that
+high-resolution timers are used only on controlled development and
+benchmarking machines, not on general end-user machines.
+
This structure might seem incompatible with using cycle counts to
+automatically select the best of multiple options, as in FFTW. However,
+new infrastructure introduced in lib25519
+automatically selects options on end-user machines based on cycle counts
+that were collected on benchmarking machines.
+
The above text should not be understood as endorsing the idea that
+disabling timers is an effective defense against timing attacks.
+Certainly disabling high-resolution timers is not sufficient for
+security: there are many ways for attackers to amplify timing signals
+and to statistically filter out noise from low-resolution timers.
+Disabling every standard timing mechanism on the machine does not stop
+the attacker from accessing a remote timer or a counter maintained by
+the attacker's software. Perhaps disabling timers sometimes makes the
+difference between a feasible attack and an infeasible attack, but
+evaluating this is extremely difficult.
+
Meanwhile there is an auditable methodology available to stop timing
+attacks: constant-time programming, which systematically cuts off data
+flow from secrets to timings.
+
For example, secrets affect a CPU's power consumption, and Turbo Boost
+creates data flow from power consumption to timings, as illustrated by
+the Hertzbleed attack extracting secret
+keys from the SIKE cryptosystem (before SIKE was broken in other ways),
+and an independent attack
+extracting secret AES keys. Consequently, the constant-time methodology
+does not allow Turbo Boost.
+
This is why https://timing.attacks.cr.yp.to
+recommends turning off Turbo Boost "right now", and explains the
+mechanisms available to do this. One non-security reason that it was
+already normal (although not universal) for manufacturers to provide
+these mechanisms to end users is that Turbo Boost has a reputation for
+causing premature hardware failures. Turbo Boost also provides very
+little speed benefit for modern multithreaded vectorized applications.
+
Another reaction to timing attacks is to apply "masking" techniques.
+These techniques seem to make it more difficult for attackers to
+extract secrets from power consumption and other side channels. However,
+as https://timing.attacks.cr.yp.to
+explains, it is "practically impossible for an auditor to obtain any
+real assurance that these techniques are secure". See the December 2022
+paper
+"Breaking a fifth-order masked implementation of CRYSTALS-Kyber by copy-paste"
+for a newer example of a security failure in a masked implementation.
Version:
+This is version 2023.01.05 of the "Security" web page.
+
+
+
+
diff --git a/cpu-cycles/libcpucycles/doc/html/selection.html b/cpu-cycles/libcpucycles/doc/html/selection.html
new file mode 100644
index 0000000000..eab7c4ba9a
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/selection.html
@@ -0,0 +1,158 @@
+
+
+
+
+
+
+Selection
+
+
+
+libcpucycles
+
+
+
+
Here is how libcpucycles decides which cycle counter to use. The
+underlying principles are as follows:
+
+-
+
Failure is not allowed. Using a low-resolution timer such as
+ gettimeofday() to estimate cycle counts is not desirable but is better
+ than providing no information.
+
+-
+
A counter that does well on some CPUs and OSes can do badly on others.
+ The counter selection in libcpucycles is based not just on rules set
+ at compile time but also on measurements of how well the counters
+ perform when the program first calls cpucycles().
+
+-
+
A critical application of cycle counting is collecting cycle counts
+ for multiple options to see which option is faster. It is the caller's
+ responsibility to compute medians of cycle counts for many runs of
+ whatever is being benchmarked: medians filter out occasional
+ cycle-count jumps caused by migration to another core (if the
+ benchmark is not pinned to a single core) or interrupts from other OS
+ activity. libcpucycles does not reject an otherwise attractive counter
+ merely because of occasional jumps.
+
+-
+
Cycle-counting overhead is not desirable, but does not directly affect
+ comparisons of multiple options measured using the same cycle counter,
+ so it is less important than consistent major errors such as treating
+ 2^32 + x cycles as x cycles. (Performance experts seeing a function
+ that takes billions of cycles usually focus on smaller subroutines,
+ but libcpucycles should not break larger measurements.) This is why
+ libcpucycles does not provide direct access to 32-bit cycle counters:
+ it provides wrappers that combine the counters with gettimeofday() to
+ produce 64 bits, even though this incurs some extra overhead.
+
+-
+
The noise introduced by typical off-core clocks, such as multiplying a
+ 24MHz clock by 86 to estimate cycles on a 2.064GHz CPU core, comes in
+ small part from low resolution but much more from changes in CPU
+ frequency: e.g., a 10000-cycle computation might be measured as 20000
+ cycles when the CPU enters a power-saving mode. When libcpucycles has
+ access to what is believed to be an on-core cycle counter, it uses
+ that even when its measurements show some noise. (Choosing an on-core
+ cycle counter does not magically eliminate the change in the relative
+ speed of the CPU and DRAM; the usual advice to warm up the CPU and set
+ constant frequencies if possible still applies.)
+
+
+
When cpucycles() is first called, libcpucycles tries running each
+cycle counter that has been compiled into the library. For example, for
+64-bit ARM CPUs, libcpucycles will try arm64-pmc, arm64-vct,
+default-gettimeofday, default-mach, default-monotonic, and
+default-perfevent, minus any of those that failed to compile.
+
Cycle counters that fail at run time with SIGILL (or SIGFPE or SIGBUS or
+SIGSEGV) are eliminated from the list. For example, arm64-pmc will
+fail with SIGILL if the kernel does not allow user access to
+PMCCNTR_EL0. Beware that libcpucycles does not catch SIGILL after its
+initial tests: if the kernel initially allows user access to
+PMCCNTR_EL0 but later turns it off then arm64-pmc will crash.
+
Independently of these counters, libcpucycles uses various OS mechanisms
+to obtain an estimate of the CPU frequency. This estimate is also
+available to the caller as cpucycles_persecond().
+
The methods that libcpucycles uses to ask the OS for an estimated CPU
+frequency fail on some OS-CPU combinations, in which case libcpucycles
+falls back to a cpucyclespersecond environment variable, or, if that
+variable does not exist, an estimate of 2399987654 cycles per second.
+(This estimate is in a realistic range of CPU speeds, and is close to
+multiples of 24MHz, 25MHz, and 19.2MHz, which are common crystal
+frequencies.) The sysadmin can create /etc/cpucyclespersecond to
+override all of the OS mechanisms.
+
For counters that do not ask for scaling, the estimated CPU frequency is
+shown in cpucycles-info as a double-check on the counter results. For
+counters that ask for scaling, libcpucycles uses the estimated CPU
+frequency to compute the scaling, so this is not a double-check. If a
+counter asks for scaling and the estimated CPU frequency does not seem
+close to a multiple of the counter frequency (possibly with a small
+power-of-2 denominator) then libcpucycles will throw the counter away,
+except in the case of fixed-resolution OS counters such as
+gettimeofday and CLOCK_MONOTONIC.
+
libcpucycles computes a precision estimate for each counter (times any
+applicable scaling) as follows. Call the counter 1000 times. Check that
+the counter has never decreased, and has increased at least once. (A
+counter where the decrease/increase checks fail is retried 10 times, so
+10000 calls overall, and removed if it fails all 10 times.) The
+precision estimate is then the smallest nonzero difference between
+adjacent counter results, plus a penalty explained below.
+
The penalty is 100 cycles for off-core counters (including RDTSC) and
+default-perfevent, and 200 cycles for fixed-resolution OS counters.
+For example, an on-core CPU cycle counter will be selected even if it
+actually has, e.g., a resolution of 8 cycles and 50 cycles of overhead.
+
Finally, libcpucycles selects the counter where the precision estimate
+is the smallest number of cycles. Note that an inaccurate estimate of
+CPU frequency can influence the choice between a scaled counter and an
+unscaled counter.
+
libcpucycles does not carry out its counter selection (typically tens
+of milliseconds, sometimes even more) as a static initializer; callers
+are presumed to not want to incur the cost of initialization unless and
+until they are actually using cpucycles(). A multithreaded caller thus
+has to place locks around any possibly-first call to cpucycles(), or
+create its own static initializer (an __attribute__((constructor))
+function) with an initial cpucycles() call so that all subsequent
+cpucycles() calls are thread-safe.
Version:
+This is version 2023.01.05 of the "Selection" web page.
+
+
+
+
diff --git a/cpu-cycles/libcpucycles/doc/install.md b/cpu-cycles/libcpucycles/doc/install.md
new file mode 100644
index 0000000000..9642ead64b
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/install.md
@@ -0,0 +1,56 @@
+Prerequisites: `python3`; `gcc` and/or `clang`. Currently tested only
+under Linux, but porting to other systems shouldn't be difficult.
+
+For sysadmins, to install in `/usr/local/{include,lib,bin}`:
+
+ ./configure && make -j8 install
+
+For developers with an unprivileged account (typically with
+
+ export LD_LIBRARY_PATH="$HOME/lib"
+ export LIBRARY_PATH="$HOME/lib"
+ export CPATH="$HOME/include"
+ export PATH="$HOME/bin:$PATH"
+
+in `$HOME/.profile`), to install in `$HOME/{include,lib,bin}`:
+
+ ./configure --prefix=$HOME && make -j8 install
+
+For distributors creating a package: Run
+
+ ./configure --prefix=/usr && make -j8
+
+and then follow your usual packaging procedures for the
+`build/0/package` files:
+
+ build/0/package/man/man3/cpucycles.3
+ build/0/package/include/cpucycles.h
+ build/0/package/lib/libcpucycles*
+ build/0/package/bin/cpucycles-info
+
+There are some old systems where libcpucycles requires `-lrt` for
+`clock_gettime`; currently `libcpucycles.so` doesn't link to `-lrt`,
+so it's up to the caller to link to `-lrt`.
+
+More options: You can run
+
+ ./configure --host=amd64
+
+to override `./configure`'s guess of the architecture that it should
+compile for. The architecture controls which cycle counters to try
+compiling: e.g., `amd64` tries compiling `cpucycles/amd64*` and
+`cpucycles/default*`.
+
+Inside the `build` directory, `0` is symlinked to `amd64` for
+`--host=amd64`. Running `make clean` removes `build/amd64`. Re-running
+`./configure` automatically starts with `make clean`.
+
+A subsequent `./configure --host=arm64` will create `build/arm64` and
+symlink `0 -> arm64`, without touching an existing `build/amd64`.
+However, cross-compilers aren't yet selected automatically.
+
+Compilers tried are listed in `compilers/default`. Each compiler
+includes `-fPIC` to create a shared library, `-fvisibility=hidden` to
+hide non-public symbols in the library, and `-fwrapv` to switch to a
+slightly less dangerous version of C. The first compiler that seems to
+work is used to compile everything.
diff --git a/cpu-cycles/libcpucycles/doc/man/cpucycles.3 b/cpu-cycles/libcpucycles/doc/man/cpucycles.3
new file mode 100644
index 0000000000..bb7f9134fb
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/man/cpucycles.3
@@ -0,0 +1,57 @@
+.\" Automatically generated by Pandoc 2.9.2.1
+.\"
+.TH "cpucycles" "3" "" "" ""
+.hy
+.SS NAME
+.PP
+cpucycles - count CPU cycles
+.SS SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <cpucycles.h>
+
+long long count = cpucycles();
+long long persecond = cpucycles_persecond();
+const char *implementation = cpucycles_implementation();
+const char *version = cpucycles_version();
+\f[R]
+.fi
+.PP
+Link with \f[C]-lcpucycles\f[R].
+Old systems may also need \f[C]-lrt\f[R].
+.SS DESCRIPTION
+.PP
+\f[C]cpucycles()\f[R] returns an estimate for the number of CPU cycles
+that have occurred since an unspecified time in the past (perhaps system
+boot, perhaps program startup).
+.PP
+Accessing true cycle counters can be difficult on some CPUs and
+operating systems.
+\f[C]cpucycles()\f[R] does its best to produce accurate results, but
+selects a low-precision counter if the only other option is failure.
+.PP
+\f[C]cpucycles_persecond()\f[R] returns an estimate for the number of
+CPU cycles per second.
+This estimate comes from \f[C]/etc/cpucyclespersecond\f[R] if that file
+exists, otherwise from various OS mechanisms, otherwise from the
+\f[C]cpucyclespersecond\f[R] environment variable if that is set,
+otherwise 2399987654.
+.PP
+\f[C]cpucycles_implementation()\f[R] returns the name of the counter in
+use: e.g., \f[C]\[dq]amd64-pmc\[dq]\f[R].
+.PP
+\f[C]cpucycles_version()\f[R] returns the \f[C]libcpucycles\f[R] version
+number as a string: e.g., \f[C]\[dq]20230115\[dq]\f[R].
+Results of \f[C]cpucycles_implementation()\f[R] should be interpreted
+relative to \f[C]cpucycles_version()\f[R].
+.PP
+\f[C]cpucycles\f[R] is actually a function pointer.
+The first call to \f[C]cpucycles()\f[R] or
+\f[C]cpucycles_persecond()\f[R] or \f[C]cpucycles_implementation()\f[R]
+selects one of the available counters and updates the
+\f[C]cpucycles\f[R] pointer accordingly.
+Subsequent calls to \f[C]cpucycles()\f[R] are thread-safe.
+.SS SEE ALSO
+.PP
+\f[B]gettimeofday\f[R](2), \f[B]clock_gettime\f[R](2)
diff --git a/cpu-cycles/libcpucycles/doc/readme.md b/cpu-cycles/libcpucycles/doc/readme.md
new file mode 100644
index 0000000000..98a42eea41
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/readme.md
@@ -0,0 +1,36 @@
+libcpucycles is a public-domain microlibrary for counting CPU cycles.
+Cycle counts are not as detailed as
+[Falk diagrams](https://gamozolabs.github.io/metrology/2019/08/19/sushi_roll.html)
+but are the most precise timers available to typical software; they are
+central tools used in understanding and improving software performance.
+
+The libcpucycles [API](api.html) is simple: include `<cpucycles.h>`, call
+`cpucycles()` to receive a `long long` whenever desired, and link with
+`-lcpucycles`.
+
+[Internally](counters.html), libcpucycles understands machine-level
+cycle counters for amd64 (both PMC and TSC), arm32, arm64 (both PMC and
+VCT), mips64, ppc32, ppc64, riscv32, riscv64, s390x, sparc64, and x86.
+libcpucycles also understands four OS-level mechanisms, which give
+varying levels of accuracy: `mach_absolute_time`, `perf_event`,
+`CLOCK_MONOTONIC`, and, as a fallback, microsecond-resolution
+`gettimeofday`.
+
+When the program first calls `cpucycles()`, libcpucycles automatically
+benchmarks the available mechanisms and [selects](selection.html) the
+mechanism that does the best job. Subsequent `cpucycles()` calls are
+thread-safe and very fast. An accompanying `cpucycles-info` program
+prints a summary of cycle-counter accuracy.
+
+For comparison, there is a simple-sounding `__rdtsc()` API provided by
+compilers, but this works only on Intel/AMD CPUs and is generally noisier
+than PMC. There is a `__builtin_readcyclecounter()` that works on more
+CPUs, but this works only with `clang` and has the same noise problems.
+Both of these mechanisms put the burden on the caller to figure out what
+can be done on other CPUs. Various packages include their own more
+portable abstraction layers for counting cycles (see, e.g., FFTW's
+[`cycle.h`](https://github.com/FFTW/fftw3/blob/master/kernel/cycle.h),
+used to automatically select from among multiple implementations
+provided by FFTW), but this creates per-package effort to keep up with
+the latest cycle counters. The goal of libcpucycles is to provide
+state-of-the-art cycle counting centrally for all packages to use.
diff --git a/cpu-cycles/libcpucycles/doc/security.md b/cpu-cycles/libcpucycles/doc/security.md
new file mode 100644
index 0000000000..554a20f0e1
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/security.md
@@ -0,0 +1,76 @@
+Many security systems have been shown to be breakable by "timing
+attacks". These attacks extract secrets by analyzing timings of the
+legitimate user's operations on secret data. See the June 2022 survey
+page [https://timing.attacks.cr.yp.to](https://timing.attacks.cr.yp.to)
+for an overview and further references.
+
+Sometimes these attacks are used as motivation to disable the attacker's
+access to various timing mechanisms. For example, Firefox rounds its
+`performance.now` timer to 1-millisecond resolution
+["to mitigate potential security threats"](https://developer.mozilla.org/en-US/docs/Web/API/Performance/now).
+
+As another example, reducing `/proc/sys/kernel/perf_event_paranoid`
+under Linux to 2 (from 3 or higher), so that libcpucycles has access to
+the best available Intel/AMD cycle counter (RDPMC), also means making
+this cycle counter and other performance-monitoring counters available
+to any attacker-controlled software running on the computer. Perhaps
+this helps timing attacks, not to mention the possibility of opening up
+other vulnerabilities via the complicated `perf_event` interface.
+
+As yet another example, ARM CPUs disable user access to the main CPU
+cycle counter by default. Installing a kernel module to enable user
+access to the cycle counter could help attacks.
+
+Given the availability of simple mechanisms to disable RDPMC etc., it is
+easy to recommend using those mechanisms. To avoid creating unnecessary
+tension between those recommendations and the use of libcpucycles,
+applications that use libcpucycles should be structured so that
+high-resolution timers are used only on controlled development and
+benchmarking machines, not on general end-user machines.
+
+This structure might seem incompatible with using cycle counts to
+automatically select the best of multiple options, as in FFTW. However,
+new infrastructure introduced in [lib25519](https://lib25519.cr.yp.to)
+automatically selects options on end-user machines based on cycle counts
+that were _collected on benchmarking machines_.
+
+The above text should not be understood as endorsing the idea that
+disabling timers is an _effective_ defense against timing attacks.
+Certainly disabling high-resolution timers is not sufficient for
+security: there are many ways for attackers to amplify timing signals
+and to statistically filter out noise from low-resolution timers.
+Disabling _every_ standard timing mechanism on the machine does not stop
+the attacker from accessing a remote timer or a counter maintained by
+the attacker's software. Perhaps disabling timers sometimes makes the
+difference between a feasible attack and an infeasible attack, but
+evaluating this is extremely difficult.
+
+Meanwhile there is an auditable methodology available to stop timing
+attacks: constant-time programming, which systematically cuts off data
+flow from secrets to timings.
+
+For example, secrets affect a CPU's power consumption, and Turbo Boost
+creates data flow from power consumption to timings, as illustrated by
+the [Hertzbleed attack](https://www.hertzbleed.com) extracting secret
+keys from the SIKE cryptosystem (before SIKE was broken in other ways),
+and an [independent attack](https://arxiv.org/abs/2206.07012)
+extracting secret AES keys. Consequently, the constant-time methodology
+does not allow Turbo Boost.
+
+This is why [https://timing.attacks.cr.yp.to](https://timing.attacks.cr.yp.to)
+recommends turning off Turbo Boost "right now", and explains the
+mechanisms available to do this. One non-security reason that it was
+already normal (although not universal) for manufacturers to provide
+these mechanisms to end users is that Turbo Boost has a reputation for
+causing premature hardware failures. Turbo Boost also provides very
+little speed benefit for modern multithreaded vectorized applications.
+
+Another reaction to timing attacks is to apply "masking" techniques.
+These techniques _seem_ to make it more difficult for attackers to
+extract secrets from power consumption and other side channels. However,
+as [https://timing.attacks.cr.yp.to](https://timing.attacks.cr.yp.to)
+explains, it is "practically impossible for an auditor to obtain any
+real assurance that these techniques are secure". See the December 2022
+paper
+["Breaking a fifth-order masked implementation of CRYSTALS-Kyber by copy-paste"](https://eprint.iacr.org/2022/1713)
+for a newer example of a security failure in a masked implementation.
diff --git a/cpu-cycles/libcpucycles/doc/selection.md b/cpu-cycles/libcpucycles/doc/selection.md
new file mode 100644
index 0000000000..847f7820dc
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/selection.md
@@ -0,0 +1,104 @@
+Here is how libcpucycles decides which cycle counter to use. The
+underlying principles are as follows:
+
+* Failure is not allowed. Using a low-resolution timer such as
+ `gettimeofday()` to estimate cycle counts is not desirable but is better
+ than providing no information.
+
+* A counter that does well on some CPUs and OSes can do badly on others.
+ The counter selection in libcpucycles is based not just on rules set
+ at compile time but also on measurements of how well the counters
+ perform when the program first calls `cpucycles()`.
+
+* A critical application of cycle counting is collecting cycle counts
+ for multiple options to see which option is faster. It is the caller's
+ responsibility to compute medians of cycle counts for many runs of
+ whatever is being benchmarked: medians filter out occasional
+ cycle-count jumps caused by migration to another core (if the
+ benchmark is not pinned to a single core) or interrupts from other OS
+ activity. libcpucycles does not reject an otherwise attractive counter
+ merely because of occasional jumps.
+
+* Cycle-counting overhead is not desirable, but does not directly affect
+ comparisons of multiple options measured using the same cycle counter,
+ so it is less important than consistent major errors such as treating
+ 2^32 + x cycles as x cycles. (Performance experts seeing a function
+ that takes billions of cycles usually focus on smaller subroutines,
+ but libcpucycles should not break larger measurements.) This is why
+ libcpucycles does not provide direct access to 32-bit cycle counters:
+ it provides wrappers that combine the counters with gettimeofday() to
+ produce 64 bits, even though this incurs some extra overhead.
+
+* The noise introduced by typical off-core clocks, such as multiplying a
+ 24MHz clock by 86 to estimate cycles on a 2.064GHz CPU core, comes in
+ small part from low resolution but much more from changes in CPU
+ frequency: e.g., a 10000-cycle computation might be measured as 20000
+ cycles when the CPU enters a power-saving mode. When libcpucycles has
+ access to what is believed to be an on-core cycle counter, it uses
+ that even when its measurements show some noise. (Choosing an on-core
+ cycle counter does not magically eliminate the change in the relative
+ speed of the CPU and DRAM; the usual advice to warm up the CPU and set
+ constant frequencies if possible still applies.)
+
+When `cpucycles()` is first called, libcpucycles tries running each
+cycle counter that has been compiled into the library. For example, for
+64-bit ARM CPUs, libcpucycles will try `arm64-pmc`, `arm64-vct`,
+`default-gettimeofday`, `default-mach`, `default-monotonic`, and
+`default-perfevent`, minus any of those that failed to compile.
+
+Cycle counters that fail at run time with SIGILL (or SIGFPE or SIGBUS or
+SIGSEGV) are eliminated from the list. For example, `arm64-pmc` will
+fail with SIGILL if the kernel does not allow user access to
+`PMCCNTR_EL0`. Beware that libcpucycles does not catch SIGILL after its
+initial tests: if the kernel initially allows user access to
+`PMCCNTR_EL0` but later turns it off then `arm64-pmc` will crash.
+
+Independently of these counters, libcpucycles uses various OS mechanisms
+to obtain an _estimate_ of the CPU frequency. This estimate is also
+available to the caller as `cpucycles_persecond()`.
+
+The methods that libcpucycles uses to ask the OS for an estimated CPU
+frequency fail on some OS-CPU combinations, in which case libcpucycles
+falls back to a `cpucyclespersecond` environment variable, or, if that
+variable does not exist, an estimate of 2399987654 cycles per second.
+(This estimate is in a realistic range of CPU speeds, and is close to
+multiples of 24MHz, 25MHz, and 19.2MHz, which are common crystal
+frequencies.) The sysadmin can create `/etc/cpucyclespersecond` to
+override all of the OS mechanisms.
+
+For counters that do not ask for scaling, the estimated CPU frequency is
+shown in `cpucycles-info` as a double-check on the counter results. For
+counters that ask for scaling, libcpucycles uses the estimated CPU
+frequency to compute the scaling, so this is not a double-check. If a
+counter asks for scaling and the estimated CPU frequency does not seem
+close to a multiple of the counter frequency (possibly with a small
+power-of-2 denominator) then libcpucycles will throw the counter away,
+except in the case of fixed-resolution OS counters such as
+`gettimeofday` and `CLOCK_MONOTONIC`.
+
+libcpucycles computes a precision estimate for each counter (times any
+applicable scaling) as follows. Call the counter 1000 times. Check that
+the counter has never decreased, and has increased at least once. (A
+counter where the decrease/increase checks fail is retried 10 times, so
+10000 calls overall, and removed if it fails all 10 times.) The
+precision estimate is then the smallest nonzero difference between
+adjacent counter results, plus a penalty explained below.
+
+The penalty is 100 cycles for off-core counters (including RDTSC) and
+`default-perfevent`, and 200 cycles for fixed-resolution OS counters.
+For example, an on-core CPU cycle counter will be selected even if it
+actually has, e.g., a resolution of 8 cycles and 50 cycles of overhead.
+
+Finally, libcpucycles selects the counter where the precision estimate
+is the smallest number of cycles. Note that an inaccurate estimate of
+CPU frequency can influence the choice between a scaled counter and an
+unscaled counter.
+
+libcpucycles does _not_ carry out its counter selection (typically tens
+of milliseconds, sometimes even more) as a static initializer; callers
+are presumed to not want to incur the cost of initialization unless and
+until they are actually using `cpucycles()`. A multithreaded caller thus
+has to place locks around any possibly-first call to `cpucycles()`, or
+create its own static initializer (an `__attribute__((constructor))`
+function) with an initial `cpucycles()` call so that all subsequent
+`cpucycles()` calls are thread-safe.
diff --git a/cpu-cycles/libcpucycles/scripts-build/install b/cpu-cycles/libcpucycles/scripts-build/install
new file mode 100755
index 0000000000..7ea5c77f67
--- /dev/null
+++ b/cpu-cycles/libcpucycles/scripts-build/install
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import shutil
+import tempfile
+
+prefix = sys.argv[1]
+dirs = 'man/man3','lib','include','bin'
+install = {}
+
+os.umask(0o22)
+
+for target in dirs:
+ install[target] = '%s/%s'%(prefix,target)
+ os.makedirs(install[target],exist_ok=True)
+
+os.umask(0o77)
+
+for target in dirs:
+ with tempfile.TemporaryDirectory(dir=install[target]) as t:
+ for fn in sorted(os.listdir('package/'+target)):
+ try:
+ shutil.copy2('package/%s/%s' % (target,fn),'%s/%s' % (t,fn),follow_symlinks=False)
+ except TypeError: # XXX: old python3; should copy symlinks manually
+ shutil.copy2('package/%s/%s' % (target,fn),'%s/%s' % (t,fn))
+ os.rename('%s/%s' % (t,fn),'%s/%s' % (install[target],fn))
diff --git a/cpu-cycles/libcpucycles/scripts-build/staticlib b/cpu-cycles/libcpucycles/scripts-build/staticlib
new file mode 100755
index 0000000000..bb23658fd5
--- /dev/null
+++ b/cpu-cycles/libcpucycles/scripts-build/staticlib
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Build the static archive package/lib/libcpucycles.a from the files passed
+# as arguments.
+archive=package/lib/libcpucycles.a
+
+# Remove any earlier archive first so that `ar` starts from an empty one.
+rm -f "$archive"
+ar cr "$archive" "$@"
+# A failing ranlib is deliberately ignored.
+ranlib "$archive" || true
+chmod 644 "$archive"
diff --git a/cpu-cycles/libcpucycles/version b/cpu-cycles/libcpucycles/version
new file mode 100644
index 0000000000..dbdecdf7fc
--- /dev/null
+++ b/cpu-cycles/libcpucycles/version
@@ -0,0 +1 @@
+20230115
diff --git a/cpu-cycles/src/bindings.rs b/cpu-cycles/src/bindings.rs
new file mode 100644
index 0000000000..4065d3fd18
--- /dev/null
+++ b/cpu-cycles/src/bindings.rs
@@ -0,0 +1,9 @@
+// Raw FFI declarations for the statically linked libcpucycles C library.
+#[link(name = "cpucycles", kind = "static")]
+extern "C" {
+    /// Function pointer to the cycle counter. According to the bundled
+    /// cpucycles(3) man page, the first call through it (or to
+    /// `cpucycles_persecond()` / `cpucycles_implementation()`) selects one of
+    /// the available counters and updates this pointer accordingly.
+    pub static mut cpucycles:
+        ::std::option::Option<unsafe extern "C" fn() -> ::std::os::raw::c_longlong>;
+    /// Returns the name of the counter in use as a C string, e.g. `"amd64-pmc"`.
+    pub fn cpucycles_implementation() -> *const ::std::os::raw::c_char;
+    /// Returns the libcpucycles version number as a C string, e.g. `"20230115"`.
+    pub fn cpucycles_version() -> *const ::std::os::raw::c_char;
+    /// Returns an estimate of the number of CPU cycles per second.
+    pub fn cpucycles_persecond() -> ::std::os::raw::c_longlong;
+    /// NOTE(review): not described in the bundled cpucycles(3) man page;
+    /// check the C header for its contract before relying on it.
+    pub fn cpucycles_tracesetup();
+}
diff --git a/cpu-cycles/src/lib.rs b/cpu-cycles/src/lib.rs
new file mode 100644
index 0000000000..660545e602
--- /dev/null
+++ b/cpu-cycles/src/lib.rs
@@ -0,0 +1,82 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+
+mod bindings;
+use bindings as c;
+
+use std::fmt;
+use std::{
+ error::Error,
+ ffi::{CStr, CString, IntoStringError},
+};
+
+#[derive(Debug)]
+pub struct CpuCyclesError {
+ message: String,
+}
+
+impl fmt::Display for CpuCyclesError {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "{}", self.message)
+ }
+}
+
+impl Error for CpuCyclesError {}
+
+/// Calls the C library's `cpucycles_tracesetup()` routine.
+pub fn cpucycles_tracesetup() {
+    // SAFETY: the binding declares a function with no arguments and no return
+    // value, so there is nothing for the caller to get wrong at the call site.
+    // NOTE(review): any preconditions imposed by the C side are not visible here.
+    unsafe {
+        c::cpucycles_tracesetup();
+    }
+}
+
+/// Returns the C library's estimate of the number of CPU cycles elapsed since
+/// an unspecified point in the past.
+///
+/// The libcpucycles documentation states that the first call performs the
+/// counter selection and that only subsequent calls are thread-safe, so a
+/// multithreaded caller has to serialize any possibly-first call.
+///
+/// # Errors
+///
+/// Returns [`CpuCyclesError`] when the library's `cpucycles` function pointer
+/// is null.
+pub fn cpucycles() -> Result<i64, CpuCyclesError> {
+    // SAFETY: the pointer is copied out of the `static mut` by value, so no
+    // reference to it is created. NOTE(review): this read can race with the
+    // C library updating the pointer during a first call on another thread.
+    let counter = unsafe { c::cpucycles };
+    match counter {
+        // SAFETY: a non-null pointer is the counter function installed by the
+        // C library; it takes no arguments.
+        Some(read_counter) => Ok(unsafe { read_counter() }),
+        None => Err(CpuCyclesError {
+            message: "Could not execute cpucycles!".to_string(),
+        }),
+    }
+}
+
+/// Returns the C library's estimate of the number of CPU cycles per second.
+///
+/// # Errors
+///
+/// This wrapper currently always returns `Ok`; the `Result` is kept for
+/// symmetry with [`cpucycles`].
+pub fn cpucycles_persecond() -> Result<i64, CpuCyclesError> {
+    // SAFETY: the binding declares a function that takes no arguments and
+    // returns a plain integer.
+    let per_second = unsafe { c::cpucycles_persecond() };
+    Ok(per_second)
+}
+
+/// Returns the name of the cycle counter in use (e.g. `"amd64-pmc"`) as an
+/// owned `String`.
+///
+/// # Errors
+///
+/// Returns [`IntoStringError`] if the C string is not valid UTF-8.
+pub fn cpucycles_implementation() -> Result<String, IntoStringError> {
+    // SAFETY: NOTE(review) this assumes the C library returns a non-null,
+    // NUL-terminated string that stays valid while it is copied; confirm
+    // against the C implementation.
+    let implementation = unsafe { CString::from(CStr::from_ptr(c::cpucycles_implementation())) };
+    implementation.into_string()
+}
+
+/// Returns the libcpucycles version number (e.g. `"20230115"`) as an owned
+/// `String`.
+///
+/// # Errors
+///
+/// Returns [`IntoStringError`] if the C string is not valid UTF-8.
+pub fn cpucycles_version() -> Result<String, IntoStringError> {
+    // SAFETY: NOTE(review) this assumes the C library returns a non-null,
+    // NUL-terminated string that stays valid while it is copied; confirm
+    // against the C implementation.
+    let version = unsafe { CString::from(CStr::from_ptr(c::cpucycles_version())) };
+    version.into_string()
+}
+
+#[cfg(test)]
+mod test {
+    use crate::*;
+
+    // Smoke tests: each one only checks that the wrapper returns `Ok`.
+
+    #[test]
+    fn cpucycles_test() {
+        assert!(cpucycles().is_ok());
+    }
+
+    #[test]
+    fn cpucycles_persecond_test() {
+        assert!(cpucycles_persecond().is_ok());
+    }
+
+    #[test]
+    fn cpucycles_implementation_test() {
+        assert!(cpucycles_implementation().is_ok());
+    }
+
+    #[test]
+    fn cpucycles_version_test() {
+        assert!(cpucycles_version().is_ok());
+    }
+}
diff --git a/mixnode/Cargo.toml b/mixnode/Cargo.toml
index 1be77655b9..3a1a21b9af 100644
--- a/mixnode/Cargo.toml
+++ b/mixnode/Cargo.toml
@@ -29,32 +29,38 @@ log = { workspace = true }
pretty_env_logger = "0.4.0"
rand = "0.7.3"
rocket = { version = "0.5.0-rc.2", features = ["json"] }
-serde = { version="1.0", features = ["derive"] }
+serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
sysinfo = "0.27.7"
-tokio = { version="1.21.2", features = ["rt-multi-thread", "net", "signal"] }
-tokio-util = { version="0.7.3", features = ["codec"] }
+tokio = { version = "1.21.2", features = ["rt-multi-thread", "net", "signal"] }
+tokio-util = { version = "0.7.3", features = ["codec"] }
toml = "0.5.8"
url = { version = "2.2", features = ["serde"] }
atty = "0.2"
## internal
-nym-config = { path="../common/config" }
-nym-crypto = { path="../common/crypto" }
+nym-config = { path = "../common/config" }
+nym-crypto = { path = "../common/crypto" }
nym-contracts-common = { path = "../common/cosmwasm-smart-contracts/contracts-common" }
-mixnet-client = { path="../common/client-libs/mixnet-client" }
-mixnode-common = { path="../common/mixnode-common" }
-nym-nonexhaustive-delayqueue = { path="../common/nonexhaustive-delayqueue" }
-nym-sphinx = { path="../common/nymsphinx" }
+mixnet-client = { path = "../common/client-libs/mixnet-client" }
+mixnode-common = { path = "../common/mixnode-common" }
+nym-nonexhaustive-delayqueue = { path = "../common/nonexhaustive-delayqueue" }
+nym-sphinx = { path = "../common/nymsphinx" }
nym-pemstore = { path = "../common/pemstore", version = "0.2.0" }
nym-task = { path = "../common/task" }
nym-types = { path = "../common/types" }
-nym-topology = { path="../common/topology" }
-validator-client = { path="../common/client-libs/validator-client" }
-nym-bin-common = { path="../common/bin-common" }
+nym-topology = { path = "../common/topology" }
+validator-client = { path = "../common/client-libs/validator-client" }
+nym-bin-common = { path = "../common/bin-common" }
+cpu-cycles = { path = "../cpu-cycles", optional = true }
[dev-dependencies]
-tokio = { version="1.21.2", features = ["rt-multi-thread", "net", "signal", "test-util"] }
+tokio = { version = "1.21.2", features = [
+ "rt-multi-thread",
+ "net",
+ "signal",
+ "test-util",
+] }
nym-sphinx-types = { path = "../common/nymsphinx/types" }
nym-sphinx-params = { path = "../common/nymsphinx/params" }
diff --git a/mixnode/src/main.rs b/mixnode/src/main.rs
index 92b28cb6b8..609e63a0ad 100644
--- a/mixnode/src/main.rs
+++ b/mixnode/src/main.rs
@@ -60,6 +60,11 @@ impl Cli {
}
}
+/// Logs the current CPU cycle count reported by the `cpu-cycles` crate.
+///
+/// `cpucycles()` returns a `Result`, which has no `Display` impl, so the two
+/// outcomes are unwrapped and logged separately: the count at info level,
+/// a failure at error level.
+#[cfg(feature = "cpu-cycles")]
+pub fn cpu_cycles() {
+    match cpu_cycles::cpucycles() {
+        Ok(cycles) => info!("{}", cycles),
+        Err(err) => log::error!("{}", err),
+    }
+}
+
#[tokio::main]
async fn main() {
setup_logging();